Zingg Entity Resolution Python Package

Zingg Python APIs for entity resolution, identity resolution, record linkage, data mastering and deduplication using ML (https://www.zingg.ai)

Note

Requires Python 3.6+ and Spark 3.5.0; otherwise, zingg.client.Zingg() cannot be executed.

API Reference

Example API Usage

 1from zingg.client import *
 2from zingg.pipes import *
 3
 4#build the arguments for zingg
 5args = Arguments()
 6#set field definitions
 7fname = FieldDefinition("fname", "string", MatchType.FUZZY)
 8lname = FieldDefinition("lname", "string", MatchType.FUZZY)
 9stNo = FieldDefinition("stNo", "string", MatchType.FUZZY)
10add1 = FieldDefinition("add1","string", MatchType.FUZZY)
11add2 = FieldDefinition("add2", "string", MatchType.FUZZY)
12city = FieldDefinition("city", "string", MatchType.FUZZY)
13areacode = FieldDefinition("areacode", "string", MatchType.FUZZY)
14state = FieldDefinition("state", "string", MatchType.FUZZY)
15dob = FieldDefinition("dob", "string", MatchType.FUZZY)
16ssn = FieldDefinition("ssn", "string", MatchType.FUZZY)
17
18fieldDefs = [fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn]
19
20args.setFieldDefinition(fieldDefs)
21#set the modelid and the zingg dir
22args.setModelId("100")
23args.setZinggDir("models")
24args.setNumPartitions(4)
25args.setLabelDataSampleSize(0.5)
26
27#reading dataset into inputPipe and setting it up in 'args'
28#below line should not be required if you are reading from in memory dataset
29#in that case, replace df with input df
30schema = "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn  string"
31inputPipe = CsvPipe("testFebrl", "examples/febrl/test.csv", schema)
32args.setData(inputPipe)
33outputPipe = CsvPipe("resultFebrl", "/tmp/febrlOutput")
34
35args.setOutput(outputPipe)
36
37options = ClientOptions([ClientOptions.PHASE,"match"])
38
39#Zingg execution for the given phase
40zingg = Zingg(args, options)
41zingg.initAndExecute()