Zingg Entity Resolution Python Package
Zingg Python APIs for entity resolution, identity resolution, record linkage, data mastering and deduplication using ML (https://www.zingg.ai)
Note
Requires Python 3.6+ and Spark 3.5.0.
Without these, zingg.client.Zingg() cannot be executed.
API Reference
Example API Usage
1from zingg.client import *
2from zingg.pipes import *
3
4#build the arguments for zingg
5args = Arguments()
6#set field definitions
7fname = FieldDefinition("fname", "string", MatchType.FUZZY)
8lname = FieldDefinition("lname", "string", MatchType.FUZZY)
9stNo = FieldDefinition("stNo", "string", MatchType.FUZZY)
10add1 = FieldDefinition("add1","string", MatchType.FUZZY)
11add2 = FieldDefinition("add2", "string", MatchType.FUZZY)
12city = FieldDefinition("city", "string", MatchType.FUZZY)
13areacode = FieldDefinition("areacode", "string", MatchType.FUZZY)
14state = FieldDefinition("state", "string", MatchType.FUZZY)
15dob = FieldDefinition("dob", "string", MatchType.FUZZY)
16ssn = FieldDefinition("ssn", "string", MatchType.FUZZY)
17
18fieldDefs = [fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn]
19
20args.setFieldDefinition(fieldDefs)
21#set the modelid and the zingg dir
22args.setModelId("100")
23args.setZinggDir("models")
24args.setNumPartitions(4)
25args.setLabelDataSampleSize(0.5)
26
27#reading the dataset into inputPipe and setting it up in 'args'
28#the line below should not be required if you are reading from an in-memory dataset
29#in that case, replace df with input df
30schema = "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string"
31inputPipe = CsvPipe("testFebrl", "examples/febrl/test.csv", schema)
32args.setData(inputPipe)
33outputPipe = CsvPipe("resultFebrl", "/tmp/febrlOutput")
34
35args.setOutput(outputPipe)
36
37options = ClientOptions([ClientOptions.PHASE,"match"])
38
39#Zingg execution for the given phase
40zingg = Zingg(args, options)
41zingg.initAndExecute()