From 35c6be7889e180e58b302d0ce97d449d21227d17 Mon Sep 17 00:00:00 2001
From: Navin Singh
Date: Thu, 30 Jun 2022 11:18:20 +0530
Subject: [PATCH 1/4] TCs for String Similarity Distance function

---
 .../function/TestJaccSimFunction.java | 91 +++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java

diff --git a/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java b/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java
new file mode 100644
index 000000000..23d4669e5
--- /dev/null
+++ b/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java
@@ -0,0 +1,91 @@
+package zingg.similarity.function;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+
+import org.junit.jupiter.api.Test;
+
+public class TestJaccSimFunction {
+
+    @Test
+    public void testFirstStringNull() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call(null, "text 2"));
+    }
+
+    @Test
+    public void testFirstStringEmpty() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("", "text 2"));
+    }
+
+    @Test
+    public void testSecondStringNull() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("text 1", null));
+    }
+
+    @Test
+    public void testSecondStringEmpty() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("text 1", ""));
+    }
+
+    @Test
+    public void testBothEmpty() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("", ""));
+    }
+
+    @Test
+    public void testBothNull() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call(null, null));
+    }
+
+    @Test
+    public void testBothSame() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("sample text", "sample text"));
+    }
+
+    @Test
+    public void testBothSameButCaseDifferent() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("sample text", "sAmPle TeXt"));
+    }
+
+    @Test
+    public void testBothNotEmptyDifferent() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        Double score = strDistanceFn.call("sample text first", "sample text second");
+        assertFalse(score == 0d || score == 1d);
+    }
+
+    @Test
+    public void testSpecificInputsDifferent() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        String first = "sonicwall 01-ssc-6997 : usually ships in 24 hours : : sonicwall client/server anti-virus suite leverages the award-winning mcafee netshield and groupshield applications for networks with windows -based file print and exchange servers.,";
+        String second = "sonicwall 01-ssc-5670 : usually ships in 24 hours : : more and more businesses schools government agencies and libraries are connecting to the internet to meet their organizational and educational goals.";
+        Double score = strDistanceFn.call(first, second);
+        assertFalse(score == 0d || score == 1d);
+    }
+
+    @Test
+    public void testInputsSameWithSlashes() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
String first = "sample/string/with/slashes"; + String second = "sample/string/with/slashes"; + Double score = strDistanceFn.call(first, second); + assertEquals(1d, score); + } + + @Test + public void testInputsDifferentWithSlashesAndColons() { + StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test"); + String first = "sample/string/with/slashes:and:colons.,"; + String second = "sample string/with slash:and,."; + Double score = strDistanceFn.call(first, second); + assertFalse(score == 0d || score == 1d); + } +} From b2a4251f0f7aca4594d3b4a19fe19a7a031757e1 Mon Sep 17 00:00:00 2001 From: Navin Singh Date: Tue, 5 Jul 2022 15:14:22 +0530 Subject: [PATCH 2/4] Added actual similarty score to assert --- .../function/TestJaccSimFunction.java | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java b/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java index 23d4669e5..c664e095d 100644 --- a/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java +++ b/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java @@ -1,7 +1,6 @@ package zingg.similarity.function; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; import org.junit.jupiter.api.Test; @@ -59,23 +58,23 @@ public void testBothSameButCaseDifferent() { public void testBothNotEmptyDifferent() { StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test"); Double score = strDistanceFn.call("sample text first", "sample text second"); - assertFalse(score == 0d || score == 1d); + assertEquals(0.5d, score); } @Test public void testSpecificInputsDifferent() { StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test"); - String first = "sonicwall 01-ssc-6997 : usually ships in 24 hours : : sonicwall client/server anti-virus suite leverages the award-winning mcafee netshield and groupshield applications for networks with windows -based file print and exchange servers.,"; - String second = "sonicwall 01-ssc-5670 : usually ships in 24 hours : : more and more businesses schools government agencies and libraries are connecting to the internet to meet their organizational and educational goals."; + String first = "sonicwall client/server "; + String second = "sonicwall businesses "; Double score = strDistanceFn.call(first, second); - assertFalse(score == 0d || score == 1d); + assertEquals(0.25d, score); } @Test public void testInputsSameWithSlashes() { StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test"); - String first = "sample/string/with/slashes"; - String second = "sample/string/with/slashes"; + String first = "sample/string"; + String second = "sample/string"; Double score = strDistanceFn.call(first, second); assertEquals(1d, score); } @@ -83,9 +82,9 @@ public void testInputsSameWithSlashes() { @Test public void testInputsDifferentWithSlashesAndColons() { StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test"); - String first = "sample/string/with/slashes:and:colons.,"; - String second = "sample string/with slash:and,."; + String first = "slashes/and:colons.,"; + String second = "slashes and colons"; Double score = strDistanceFn.call(first, second); - assertFalse(score == 0d || score == 1d); + assertEquals(1d, score); } } From 0bffcbc8652ac2ad96cbe751683275e9b953623f Mon Sep 17 00:00:00 2001 From: Raviraj Baraiya Date: Sun, 17 Jul 2022 
Subject: [PATCH 3/4] CsvPipe python api issue #401

---
 examples/amazon-google/AmazonGoogle.py | 12 ++----
 examples/febrl/FebrlExample.py         |  9 +----
 examples/iTunes-amazon/iTunesAmazon.py | 13 ++----
 examples/ncVoters5M/ncVoters.py        | 10 ++---
 python/zingg/pipes/pipes.py            | 19 ++++++---
 test/testFebrl/testFebrl.py            | 56 ++++++++++++++++++++++++++
 6 files changed, 84 insertions(+), 35 deletions(-)
 create mode 100644 test/testFebrl/testFebrl.py

diff --git a/examples/amazon-google/AmazonGoogle.py b/examples/amazon-google/AmazonGoogle.py
index e3f2573e3..af0c9a893 100644
--- a/examples/amazon-google/AmazonGoogle.py
+++ b/examples/amazon-google/AmazonGoogle.py
@@ -23,21 +23,17 @@
 #in that case, replace df with input df
 dfAmazon = spark.read.format("csv").schema("id string, title string, description string, manufacturer string, price double ").load("examples/amazon-google/Amazon.csv")
 dfSchemaAmazon = str(dfAmazon.schema.json())
-inputPipeAmazon = CsvPipe("testAmazon")
-inputPipeAmazon.setLocation("examples/amazon-google/Amazon.csv")
-inputPipeAmazon.setSchema(dfSchemaAmazon)
+inputPipeAmazon = CsvPipe("testAmazon", dfSchemaAmazon, "examples/amazon-google/Amazon.csv")
 
 dfGoogle = spark.read.format("csv").schema("id string, title string, description string, manufacturer string, price double ").load("examples/amazon-google/GoogleProducts.csv")
 dfSchemaGoogle = str(dfGoogle.schema.json())
-inputPipeGoogle = CsvPipe("testGoogle")
-inputPipeGoogle.setLocation("examples/amazon-google/GoogleProducts.csv")
-inputPipeGoogle.setSchema(dfSchemaGoogle)
+inputPipeGoogle = CsvPipe("testGoogle", dfSchemaGoogle, "examples/amazon-google/GoogleProducts.csv")
 
 args.setData(inputPipeAmazon,inputPipeGoogle)
 
 #setting outputpipe in 'args'
-outputPipe = CsvPipe("resultAmazonGoogle")
-outputPipe.setLocation("/tmp/AwsGoogleOutput")
+outputPipe = CsvPipe("resultAmazonGoogle", None, "/tmp/AwsGoogleOutput")
+
 args.setOutput(outputPipe)
 
 options = ClientOptions()
diff --git a/examples/febrl/FebrlExample.py b/examples/febrl/FebrlExample.py
index 6d1963824..97592a8fb 100644
--- a/examples/febrl/FebrlExample.py
+++ b/examples/febrl/FebrlExample.py
@@ -28,18 +28,13 @@
 #below line should not be required if you are reading from in memory dataset
 #in that case, replace df with input df
 df = spark.read.format("csv").schema("id string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string").load("examples/febrl/test.csv")
-
-inputPipe = CsvPipe("test")
-inputPipe.setLocation("examples/febrl/test.csv")
-
 dfSchema = str(df.schema.json())
-inputPipe.setSchema(dfSchema)
+inputPipe = CsvPipe("test", dfSchema, "examples/febrl/test.csv")
 
 args.setData(inputPipe)
 
 #setting outputpipe in 'args'
-outputPipe = CsvPipe("csv")
-outputPipe.setLocation("/tmp")
+outputPipe = CsvPipe("csv", None, "/tmp")
 
 args.setOutput(outputPipe)
 
diff --git a/examples/iTunes-amazon/iTunesAmazon.py b/examples/iTunes-amazon/iTunesAmazon.py
index 6b1f6f706..9e1aecf4c 100644
--- a/examples/iTunes-amazon/iTunesAmazon.py
+++ b/examples/iTunes-amazon/iTunesAmazon.py
@@ -28,22 +28,17 @@
 #below line should not be required if you are reading from in memory dataset
 #in that case, replace df with input df
 dfiTunes = spark.read.format("csv").schema("id string, Song_Name string, Artist_Name string, Album_Name string, Genre string, Price double, CopyRight string, Time string, Released string").load("examples/iTunes-amazon/iTunesMusic.csv")
-dfSchema = str(dfiTunes.schema.json())
-inputPipeiTunes = CsvPipe("testiTunes")
CsvPipe("testiTunes") -inputPipeiTunes.setLocation("examples/iTunes-amazon/iTunesMusic.csv") -inputPipeiTunes.setSchema(dfSchema) +dfSchemaiTunes = str(dfiTunes.schema.json()) +inputPipeiTunes = CsvPipe("testiTunes", dfSchemaiTunes, "examples/iTunes-amazon/iTunesMusic.csv") dfAmazon = spark.read.format("csv").schema("id string, Song_Name string, Artist_Name string, Album_Name string, Genre string, Price double, CopyRight string, Time string, Released string").load("examples/iTunes-amazon/AmazonMusic.csv") dfSchemaAmazon = str(dfAmazon.schema.json()) -inputPipeAmazon = CsvPipe("testAmazon") -inputPipeAmazon.setLocation("examples/iTunes-amazon/AmazonMusic.csv") -inputPipeAmazon.setSchema(dfSchemaAmazon) +inputPipeAmazon = CsvPipe("testAmazon", dfSchemaAmazon, "examples/iTunes-amazon/AmazonMusic.csv") args.setData(inputPipeiTunes,inputPipeAmazon) #setting outputpipe in 'args' -outputPipe = CsvPipe("iTunesAmazonresult") -outputPipe.setLocation("/tmp/iTunesAmazonOutput") +outputPipe = CsvPipe("iTunesAmazonresult", None, "/tmp/iTunesAmazonOutput") args.setOutput(outputPipe) diff --git a/examples/ncVoters5M/ncVoters.py b/examples/ncVoters5M/ncVoters.py index 5af7e6d1e..35525e4e6 100644 --- a/examples/ncVoters5M/ncVoters.py +++ b/examples/ncVoters5M/ncVoters.py @@ -22,16 +22,14 @@ #below line should not be required if you are reading from in memory dataset #in that case, replace df with input df df = spark.read.format("csv").schema("recid string, givenname string, surname string, suburb string, postcode double ").load("examples/ncVoters5M/5Party-ocp20/") -dfSchemaA = str(df.schema.json()) +dfSchema = str(df.schema.json()) +inputPipe = CsvPipe("test", dfSchema, "examples/ncVoters5M/5Party-ocp20/") -inputPipe = CsvPipe("test") -inputPipe.setLocation("examples/ncVoters5M/5Party-ocp20/") -inputPipe.setSchema(dfSchemaA) args.setData(inputPipe) #setting outputpipe in 'args' -outputPipe = CsvPipe("ncVotersResult") -outputPipe.setLocation("/tmp/ncVotersOutput") +outputPipe = CsvPipe("ncVotersResult", None, "/tmp/ncVotersOutput") + args.setOutput(outputPipe) options = ClientOptions() diff --git a/python/zingg/pipes/pipes.py b/python/zingg/pipes/pipes.py index 88953e20e..3d7ab1bc9 100644 --- a/python/zingg/pipes/pipes.py +++ b/python/zingg/pipes/pipes.py @@ -19,9 +19,17 @@ class CsvPipe(Pipe): :param name: name of the pipe. 
     :type name: String
+    :param schema: (optional) json schema for the pipe
+    :type schema: Schema or None
+    :param location: (optional) location from where we read data
+    :type location: String or None
     """
-    def __init__(self, name):
+    def __init__(self, name, schema = None, location = None):
         Pipe.__init__(self, name, Format.CSV.type())
+        if(schema != None):
+            Pipe.setSchema(self, schema)
+        if(location != None):
+            Pipe.addProperty(self, FilePipe.LOCATION, location)
 
     def setDelimiter(self, delimiter):
         """ This method is used to define delimiter of CsvPipe
@@ -40,11 +48,12 @@ def setLocation(self, location):
         Pipe.addProperty(self, FilePipe.LOCATION, location)
 
     def setHeader(self, header):
-        Pipe.addProperty(self, FilePipe.HEADER, header)
-
-class BigQueryPipe(Pipe):
-
-=======
+        """ Method to set header property of pipe
+
+        :param header: true if data contains header otherwise false
+        :type header: Bool
+        """
+        Pipe.addProperty(self, FilePipe.HEADER, header)
 
 class BigQueryPipe(Pipe):
     """ Pipe Class for working with BigQuery pipeline
diff --git a/test/testFebrl/testFebrl.py b/test/testFebrl/testFebrl.py
new file mode 100644
index 000000000..318ae1eaf
--- /dev/null
+++ b/test/testFebrl/testFebrl.py
@@ -0,0 +1,56 @@
+import unittest
+from unittest.case import TestCase
+import unittest
+from io import StringIO
+
+
+from zingg import *
+from zingg.pipes import *
+
+args = Arguments()
+fname = FieldDefinition("fname", "string", MatchType.FUZZY)
+lname = FieldDefinition("lname", "string", MatchType.FUZZY)
+stNo = FieldDefinition("stNo", "string", MatchType.FUZZY)
+add1 = FieldDefinition("add1","string", MatchType.FUZZY)
+add2 = FieldDefinition("add2", "string", MatchType.FUZZY)
+city = FieldDefinition("city", "string", MatchType.FUZZY)
+areacode = FieldDefinition("areacode", "string", MatchType.FUZZY)
+state = FieldDefinition("state", "string", MatchType.FUZZY)
+dob = FieldDefinition("dob", "string", MatchType.FUZZY)
+ssn = FieldDefinition("ssn", "string", MatchType.FUZZY)
+
+fieldDefs = [fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn]
+
+args.setFieldDefinition(fieldDefs)
+args.setModelId("100")
+args.setZinggDir("models")
+args.setNumPartitions(4)
+args.setLabelDataSampleSize(0.5)
+
+df = spark.read.format("csv").schema("id string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string").load("examples/febrl/test.csv")
+dfSchema = str(df.schema.json())
+inputPipe = CsvPipe("test", dfSchema, "examples/febrl/test.csv")
+outputPipe = CsvPipe("result", None, "/tmp/febrlTest")
+args.setData(inputPipe)
+args.setOutput(outputPipe)
+options = ClientOptions()
+options.setPhase("trainMatch")
+
+#testing
+class Accuracy_recordCount(TestCase):
+    def test_recordCount(self):
+        client = Zingg(args, options)
+        client.initAndExecute()
+        pMarkedDF = client.getPandasDfFromDs(client.getMarkedRecords())
+        labelledData = spark.createDataFrame(pMarkedDF)
+
+        total_marked = pMarkedDF.shape[0]
+
+        # marked record count test
+        self.assertEqual(total_marked, 76)
+
+        pMarkedDF.drop(pMarkedDF[pMarkedDF[ColName.PREDICTION_COL] == -1].index, inplace=True)
+        acc = (pMarkedDF[ColName.MATCH_FLAG_COL]== pMarkedDF[ColName.PREDICTION_COL]).mean()
+
+        # accuracy test
+        self.assertGreater(acc, 0.9)
\ No newline at end of file

From f353eca5370774877f6f588d7a094d0d070d37f5 Mon Sep 17 00:00:00 2001
From: Raviraj Baraiya
Date: Sun, 17 Jul 2022 21:15:16 +0530
Subject: [PATCH 4/4] conflict resolved

---
 python/zingg/pipes/pipes.py | 5 -----
 test/testFebrl/testFebrl.py | 15 ++++++++++++---
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/python/zingg/pipes/pipes.py b/python/zingg/pipes/pipes.py
index 3d7ab1bc9..8d0b70ba9 100644
--- a/python/zingg/pipes/pipes.py
+++ b/python/zingg/pipes/pipes.py
@@ -48,11 +48,6 @@ def setLocation(self, location):
         Pipe.addProperty(self, FilePipe.LOCATION, location)
 
     def setHeader(self, header):
-        """ Method to set header property of pipe
-
-        :param header: true if data contains header otherwise false
-        :type header: Bool
-        """
         Pipe.addProperty(self, FilePipe.HEADER, header)
 
 class BigQueryPipe(Pipe):
diff --git a/test/testFebrl/testFebrl.py b/test/testFebrl/testFebrl.py
index 318ae1eaf..ebf4995b2 100644
--- a/test/testFebrl/testFebrl.py
+++ b/test/testFebrl/testFebrl.py
@@ -28,15 +28,23 @@
 args.setLabelDataSampleSize(0.5)
 
 df = spark.read.format("csv").schema("id string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string").load("examples/febrl/test.csv")
+inputPipe = CsvPipe("test")
+inputPipe.setLocation("examples/febrl/test.csv")
 dfSchema = str(df.schema.json())
-inputPipe = CsvPipe("test", dfSchema, "examples/febrl/test.csv")
-outputPipe = CsvPipe("result", None, "/tmp/febrlTest")
+inputPipe.setSchema(dfSchema)
+
+outputPipe = CsvPipe("result")
+outputPipe.setLocation("/tmp/pythonTest")
+
 args.setData(inputPipe)
 args.setOutput(outputPipe)
+
 options = ClientOptions()
+# options.setPhase("trainMatch")
 options.setPhase("trainMatch")
 
 #testing
+
 class Accuracy_recordCount(TestCase):
     def test_recordCount(self):
         client = Zingg(args, options)
@@ -53,4 +61,5 @@ def test_recordCount(self):
         acc = (pMarkedDF[ColName.MATCH_FLAG_COL]== pMarkedDF[ColName.PREDICTION_COL]).mean()
 
         # accuracy test
-        self.assertGreater(acc, 0.9)
\ No newline at end of file
+        self.assertGreater(acc, 0.9)
+
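
For reference, a minimal sketch of how the CsvPipe constructor signature introduced in PATCH 3/4 is meant to be used from the Python client. It assumes, as the examples above do, that a SparkSession named spark is already available and that the zingg package is importable; the schema string and paths below are illustrative only.

    from zingg import *
    from zingg.pipes import *

    # Read the data once with Spark so a schema string can be derived for the pipe.
    df = spark.read.format("csv").schema("id string, fname string, lname string").load("examples/febrl/test.csv")

    # Constructor style from PATCH 3/4: name, schema and location in a single call.
    inputPipe = CsvPipe("test", str(df.schema.json()), "examples/febrl/test.csv")

    # Schema and location are optional; an output pipe typically only needs a location.
    outputPipe = CsvPipe("result", None, "/tmp/febrlTest")

    args = Arguments()
    args.setData(inputPipe)
    args.setOutput(outputPipe)

Both extra arguments default to None, so the older style of constructing the pipe with just a name and then calling setSchema and setLocation on it continues to work.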