diff --git a/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java b/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java
new file mode 100644
index 000000000..c664e095d
--- /dev/null
+++ b/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java
@@ -0,0 +1,90 @@
+package zingg.similarity.function;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+public class TestJaccSimFunction {
+
+    @Test
+    public void testFirstStringNull() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call(null, "text 2"));
+    }
+
+    @Test
+    public void testFirstStringEmpty() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("", "text 2"));
+    }
+
+    @Test
+    public void testSecondStringNull() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("text 1", null));
+    }
+
+    @Test
+    public void testSecondStringEmpty() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("text 1", ""));
+    }
+
+    @Test
+    public void testBothEmpty() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("", ""));
+    }
+
+    @Test
+    public void testBothNull() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call(null, null));
+    }
+
+    @Test
+    public void testBothSame() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("sample text", "sample text"));
+    }
+
+    @Test
+    public void testBothSameButCaseDifferent() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("sample text", "sAmPle TeXt"));
+    }
+
+    @Test
+    public void testBothNotEmptyDifferent() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        Double score = strDistanceFn.call("sample text first", "sample text second");
+        assertEquals(0.5d, score);
+    }
+
+    @Test
+    public void testSpecificInputsDifferent() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        String first = "sonicwall client/server ";
+        String second = "sonicwall businesses ";
+        Double score = strDistanceFn.call(first, second);
+        assertEquals(0.25d, score);
+    }
+
+    @Test
+    public void testInputsSameWithSlashes() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        String first = "sample/string";
+        String second = "sample/string";
+        Double score = strDistanceFn.call(first, second);
+        assertEquals(1d, score);
+    }
+
+    @Test
+    public void testInputsDifferentWithSlashesAndColons() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        String first = "slashes/and:colons.,";
+        String second = "slashes and colons";
+        Double score = strDistanceFn.call(first, second);
+        assertEquals(1d, score);
+    }
+}
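
The expected values above are consistent with a token-set Jaccard similarity that lower-cases its inputs, tokenizes on whitespace and on punctuation such as slashes, colons, dots and commas, and treats null or empty input as a trivial match (score 1.0). For "sample text first" vs "sample text second", the intersection {sample, text} has two tokens and the union four, hence 0.5. A minimal Python sketch under those assumptions, inferred from the tests rather than taken from the Java implementation:

import re

def jaccard_score(first, second):
    # Degenerate inputs score 1.0, mirroring the null/empty tests above.
    if not first or not second:
        return 1.0
    # Split on whitespace and punctuation, mirroring the slash/colon tests.
    a = set(re.split(r"[\s/:.,]+", first.lower())) - {""}
    b = set(re.split(r"[\s/:.,]+", second.lower())) - {""}
    if not a and not b:
        return 1.0
    return len(a & b) / len(a | b)

assert jaccard_score(None, "text 2") == 1.0
assert jaccard_score("sample text", "sAmPle TeXt") == 1.0
assert jaccard_score("sample text first", "sample text second") == 0.5
assert jaccard_score("sonicwall client/server ", "sonicwall businesses ") == 0.25
assert jaccard_score("slashes/and:colons.,", "slashes and colons") == 1.0
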
").load("examples/amazon-google/Amazon.csv") dfSchemaAmazon = str(dfAmazon.schema.json()) -inputPipeAmazon = CsvPipe("testAmazon") -inputPipeAmazon.setLocation("examples/amazon-google/Amazon.csv") -inputPipeAmazon.setSchema(dfSchemaAmazon) +inputPipeAmazon = CsvPipe("testAmazon", dfSchemaAmazon, "examples/amazon-google/Amazon.csv") dfGoogle = spark.read.format("csv").schema("id string, title string, description string, manufacturer string, price double ").load("examples/amazon-google/GoogleProducts.csv") dfSchemaGoogle = str(dfGoogle.schema.json()) -inputPipeGoogle = CsvPipe("testGoogle") -inputPipeGoogle.setLocation("examples/amazon-google/GoogleProducts.csv") -inputPipeGoogle.setSchema(dfSchemaGoogle) +inputPipeGoogle = CsvPipe("testGoogle", dfSchemaGoogle, "examples/amazon-google/GoogleProducts.csv") args.setData(inputPipeAmazon,inputPipeGoogle) #setting outputpipe in 'args' -outputPipe = CsvPipe("resultAmazonGoogle") -outputPipe.setLocation("/tmp/AwsGoogleOutput") +outputPipe = CsvPipe("resultAmazonGoogle", None, "/tmp/AwsGoogleOutput") + args.setOutput(outputPipe) options = ClientOptions() diff --git a/examples/febrl/FebrlExample.py b/examples/febrl/FebrlExample.py index 6d1963824..97592a8fb 100644 --- a/examples/febrl/FebrlExample.py +++ b/examples/febrl/FebrlExample.py @@ -28,18 +28,13 @@ #below line should not be required if you are reading from in memory dataset #in that case, replace df with input df df = spark.read.format("csv").schema("id string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string").load("examples/febrl/test.csv") - -inputPipe = CsvPipe("test") -inputPipe.setLocation("examples/febrl/test.csv") - dfSchema = str(df.schema.json()) -inputPipe.setSchema(dfSchema) +inputPipe = CsvPipe("test", dfSchema, "examples/febrl/test.csv") args.setData(inputPipe) #setting outputpipe in 'args' -outputPipe = CsvPipe("csv") -outputPipe.setLocation("/tmp") +outputPipe = CsvPipe("csv", None, "/tmp") args.setOutput(outputPipe) diff --git a/examples/iTunes-amazon/iTunesAmazon.py b/examples/iTunes-amazon/iTunesAmazon.py index 6b1f6f706..9e1aecf4c 100644 --- a/examples/iTunes-amazon/iTunesAmazon.py +++ b/examples/iTunes-amazon/iTunesAmazon.py @@ -28,22 +28,17 @@ #below line should not be required if you are reading from in memory dataset #in that case, replace df with input df dfiTunes = spark.read.format("csv").schema("id string, Song_Name string, Artist_Name string, Album_Name string, Genre string, Price double, CopyRight string, Time string, Released string").load("examples/iTunes-amazon/iTunesMusic.csv") -dfSchema = str(dfiTunes.schema.json()) -inputPipeiTunes = CsvPipe("testiTunes") -inputPipeiTunes.setLocation("examples/iTunes-amazon/iTunesMusic.csv") -inputPipeiTunes.setSchema(dfSchema) +dfSchemaiTunes = str(dfiTunes.schema.json()) +inputPipeiTunes = CsvPipe("testiTunes", dfSchemaiTunes, "examples/iTunes-amazon/iTunesMusic.csv") dfAmazon = spark.read.format("csv").schema("id string, Song_Name string, Artist_Name string, Album_Name string, Genre string, Price double, CopyRight string, Time string, Released string").load("examples/iTunes-amazon/AmazonMusic.csv") dfSchemaAmazon = str(dfAmazon.schema.json()) -inputPipeAmazon = CsvPipe("testAmazon") -inputPipeAmazon.setLocation("examples/iTunes-amazon/AmazonMusic.csv") -inputPipeAmazon.setSchema(dfSchemaAmazon) +inputPipeAmazon = CsvPipe("testAmazon", dfSchemaAmazon, "examples/iTunes-amazon/AmazonMusic.csv") args.setData(inputPipeiTunes,inputPipeAmazon) 
diff --git a/examples/ncVoters5M/ncVoters.py b/examples/ncVoters5M/ncVoters.py
index 5af7e6d1e..35525e4e6 100644
--- a/examples/ncVoters5M/ncVoters.py
+++ b/examples/ncVoters5M/ncVoters.py
@@ -22,16 +22,14 @@
 #below line should not be required if you are reading from in memory dataset
 #in that case, replace df with input df
 df = spark.read.format("csv").schema("recid string, givenname string, surname string, suburb string, postcode double ").load("examples/ncVoters5M/5Party-ocp20/")
-dfSchemaA = str(df.schema.json())
+dfSchema = str(df.schema.json())
+inputPipe = CsvPipe("test", dfSchema, "examples/ncVoters5M/5Party-ocp20/")
 
-inputPipe = CsvPipe("test")
-inputPipe.setLocation("examples/ncVoters5M/5Party-ocp20/")
-inputPipe.setSchema(dfSchemaA)
 args.setData(inputPipe)
 
 #setting outputpipe in 'args'
-outputPipe = CsvPipe("ncVotersResult")
-outputPipe.setLocation("/tmp/ncVotersOutput")
+outputPipe = CsvPipe("ncVotersResult", None, "/tmp/ncVotersOutput")
+
 args.setOutput(outputPipe)
 
 options = ClientOptions()
diff --git a/python/zingg/pipes/pipes.py b/python/zingg/pipes/pipes.py
index 88953e20e..8d0b70ba9 100644
--- a/python/zingg/pipes/pipes.py
+++ b/python/zingg/pipes/pipes.py
@@ -19,9 +19,17 @@ class CsvPipe(Pipe):
 
     :param name: name of the pipe.
     :type name: String
+    :param schema: (optional) json schema for the pipe
+    :type schema: Schema or None
+    :param location: (optional) location from where we read data
+    :type location: String or None
     """
-    def __init__(self, name):
+    def __init__(self, name, schema=None, location=None):
         Pipe.__init__(self, name, Format.CSV.type())
+        if schema is not None:
+            Pipe.setSchema(self, schema)
+        if location is not None:
+            Pipe.addProperty(self, FilePipe.LOCATION, location)
 
     def setDelimiter(self, delimiter):
         """ This method is used to define delimiter of CsvPipe
@@ -42,10 +50,6 @@ def setLocation(self, location):
         Pipe.addProperty(self, FilePipe.LOCATION, location)
 
     def setHeader(self, header):
         Pipe.addProperty(self, FilePipe.HEADER, header)
-class BigQueryPipe(Pipe):
-
-=======
-
 class BigQueryPipe(Pipe):
     """ Pipe Class for working with BigQuery pipeline
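
The new arguments default to None and the existing setters are untouched, so the change is backward compatible: scripts written against the old API keep working. The two spellings below are equivalent (a sketch, with dfSchema derived as in the examples):

# old style, still supported
inputPipe = CsvPipe("test")
inputPipe.setSchema(dfSchema)
inputPipe.setLocation("examples/febrl/test.csv")

# new style introduced by this change
inputPipe = CsvPipe("test", dfSchema, "examples/febrl/test.csv")
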
diff --git a/test/testFebrl/testFebrl.py b/test/testFebrl/testFebrl.py
new file mode 100644
index 000000000..ebf4995b2
--- /dev/null
+++ b/test/testFebrl/testFebrl.py
@@ -0,0 +1,64 @@
+import unittest
+from unittest.case import TestCase
+from io import StringIO
+
+
+from zingg import *
+from zingg.pipes import *
+
+args = Arguments()
+fname = FieldDefinition("fname", "string", MatchType.FUZZY)
+lname = FieldDefinition("lname", "string", MatchType.FUZZY)
+stNo = FieldDefinition("stNo", "string", MatchType.FUZZY)
+add1 = FieldDefinition("add1", "string", MatchType.FUZZY)
+add2 = FieldDefinition("add2", "string", MatchType.FUZZY)
+city = FieldDefinition("city", "string", MatchType.FUZZY)
+areacode = FieldDefinition("areacode", "string", MatchType.FUZZY)
+state = FieldDefinition("state", "string", MatchType.FUZZY)
+dob = FieldDefinition("dob", "string", MatchType.FUZZY)
+ssn = FieldDefinition("ssn", "string", MatchType.FUZZY)
+
+fieldDefs = [fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn]
+
+args.setFieldDefinition(fieldDefs)
+args.setModelId("100")
+args.setZinggDir("models")
+args.setNumPartitions(4)
+args.setLabelDataSampleSize(0.5)
+
+df = spark.read.format("csv").schema("id string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string").load("examples/febrl/test.csv")
+inputPipe = CsvPipe("test")
+inputPipe.setLocation("examples/febrl/test.csv")
+dfSchema = str(df.schema.json())
+inputPipe.setSchema(dfSchema)
+
+outputPipe = CsvPipe("result")
+outputPipe.setLocation("/tmp/pythonTest")
+
+args.setData(inputPipe)
+args.setOutput(outputPipe)
+
+options = ClientOptions()
+# options.setPhase("trainMatch")
+options.setPhase("trainMatch")
+
+#testing
+
+class Accuracy_recordCount(TestCase):
+    def test_recordCount(self):
+        client = Zingg(args, options)
+        client.initAndExecute()
+        pMarkedDF = client.getPandasDfFromDs(client.getMarkedRecords())
+        labelledData = spark.createDataFrame(pMarkedDF)
+
+        total_marked = pMarkedDF.shape[0]
+
+        # marked record count test
+        self.assertEqual(total_marked, 76)
+
+        pMarkedDF.drop(pMarkedDF[pMarkedDF[ColName.PREDICTION_COL] == -1].index, inplace=True)
+        acc = (pMarkedDF[ColName.MATCH_FLAG_COL] == pMarkedDF[ColName.PREDICTION_COL]).mean()
+
+        # accuracy test
+        self.assertGreater(acc, 0.9)
+
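
The accuracy assertion works on the pandas frame of marked records: rows with a prediction of -1 were never scored and are dropped first; accuracy is then the share of remaining rows whose match flag equals the prediction. On a toy frame — the column labels below merely stand in for zingg's ColName constants:

import pandas as pd

df = pd.DataFrame({
    "match_flag": [1, 0, 1],   # stand-in for ColName.MATCH_FLAG_COL
    "prediction": [1, 0, -1],  # stand-in for ColName.PREDICTION_COL
})
df.drop(df[df["prediction"] == -1].index, inplace=True)  # drop unscored rows
acc = (df["match_flag"] == df["prediction"]).mean()      # 1.0 on this toy data
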