Zingg Entity Resolution Python Package
Zingg Python APIs for entity resolution, identity resolution, record linkage, data mastering and deduplication using ML (https://www.zingg.ai)
Note
Requires Python 3.6+ and Spark 3.5.0; otherwise, zingg.client.Zingg() cannot be executed.
- Zingg Entity Resolution Package
- zingg.client
Arguments
Arguments.copyArgs()
Arguments.createArgumentsFromJSON()
Arguments.createArgumentsFromJSONString()
Arguments.getArgs()
Arguments.getModelId()
Arguments.getZinggBaseModelDir()
Arguments.getZinggBaseTrainingDataDir()
Arguments.getZinggModelDir()
Arguments.getZinggTrainingDataMarkedDir()
Arguments.getZinggTrainingDataUnmarkedDir()
Arguments.setArgs()
Arguments.setColumn()
Arguments.setData()
Arguments.setFieldDefinition()
Arguments.setLabelDataSampleSize()
Arguments.setModelId()
Arguments.setNumPartitions()
Arguments.setOutput()
Arguments.setStopWordsCutoff()
Arguments.setTrainingSamples()
Arguments.setZinggDir()
Arguments.writeArgumentsToJSON()
Arguments.writeArgumentsToJSONString()
ClientOptions
ClientOptions.COLUMN
ClientOptions.CONF
ClientOptions.EMAIL
ClientOptions.LICENSE
ClientOptions.LOCATION
ClientOptions.MODEL_ID
ClientOptions.PHASE
ClientOptions.REMOTE
ClientOptions.ZINGG_DIR
ClientOptions.getClientOptions()
ClientOptions.getConf()
ClientOptions.getLocation()
ClientOptions.getOptionValue()
ClientOptions.getPhase()
ClientOptions.hasLocation()
ClientOptions.setOptionValue()
ClientOptions.setPhase()
FieldDefinition
Zingg
Zingg.execute()
Zingg.executeLabel()
Zingg.executeLabelUpdate()
Zingg.getArguments()
Zingg.getMarkedRecords()
Zingg.getMarkedRecordsStat()
Zingg.getMatchedMarkedRecordsStat()
Zingg.getOptions()
Zingg.getUnmarkedRecords()
Zingg.getUnmatchedMarkedRecordsStat()
Zingg.getUnsureMarkedRecordsStat()
Zingg.init()
Zingg.initAndExecute()
Zingg.processRecordsCli()
Zingg.processRecordsCliLabelUpdate()
Zingg.setArguments()
Zingg.setOptions()
Zingg.writeLabelledOutput()
Zingg.writeLabelledOutputFromPandas()
ZinggWithSpark
getDfFromDs()
getGateway()
getJVM()
getPandasDfFromDs()
getSparkContext()
getSparkSession()
getSqlContext()
initClient()
initDataBricksConectClient()
initSparkClient()
parseArguments()
- zingg.pipes
BigQueryPipe
CsvPipe
InMemoryPipe
Pipe
SnowflakePipe
SnowflakePipe.DATABASE
SnowflakePipe.DBTABLE
SnowflakePipe.PASSWORD
SnowflakePipe.SCHEMA
SnowflakePipe.URL
SnowflakePipe.USER
SnowflakePipe.WAREHOUSE
SnowflakePipe.setDatabase()
SnowflakePipe.setDbTable()
SnowflakePipe.setPassword()
SnowflakePipe.setSFSchema()
SnowflakePipe.setURL()
SnowflakePipe.setUser()
SnowflakePipe.setWarehouse()
API Reference
Example API Usage
from zingg.client import *
from zingg.pipes import *

#build the arguments for zingg
args = Arguments()
#set field definitions
fname = FieldDefinition("fname", "string", MatchType.FUZZY)
lname = FieldDefinition("lname", "string", MatchType.FUZZY)
stNo = FieldDefinition("stNo", "string", MatchType.FUZZY)
add1 = FieldDefinition("add1","string", MatchType.FUZZY)
add2 = FieldDefinition("add2", "string", MatchType.FUZZY)
city = FieldDefinition("city", "string", MatchType.FUZZY)
areacode = FieldDefinition("areacode", "string", MatchType.FUZZY)
state = FieldDefinition("state", "string", MatchType.FUZZY)
dob = FieldDefinition("dob", "string", MatchType.FUZZY)
ssn = FieldDefinition("ssn", "string", MatchType.FUZZY)

fieldDefs = [fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn]

args.setFieldDefinition(fieldDefs)
#set the modelid and the zingg dir
args.setModelId("100")
args.setZinggDir("models")
args.setNumPartitions(4)
args.setLabelDataSampleSize(0.5)

#reading dataset into inputPipe and setting it up in 'args'
#below line should not be required if you are reading from in memory dataset
#in that case, replace df with input df
schema = "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string"
inputPipe = CsvPipe("testFebrl", "examples/febrl/test.csv", schema)
args.setData(inputPipe)
outputPipe = CsvPipe("resultFebrl", "/tmp/febrlOutput")

args.setOutput(outputPipe)

options = ClientOptions([ClientOptions.PHASE,"match"])

#Zingg execution for the given phase
zingg = Zingg(args, options)
zingg.initAndExecute()