From 35c6be7889e180e58b302d0ce97d449d21227d17 Mon Sep 17 00:00:00 2001
From: Navin Singh
Date: Thu, 30 Jun 2022 11:18:20 +0530
Subject: [PATCH 1/4] TCs for String Similarity Distance function

---
 .../function/TestJaccSimFunction.java | 91 +++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java

diff --git a/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java b/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java
new file mode 100644
index 000000000..23d4669e5
--- /dev/null
+++ b/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java
@@ -0,0 +1,91 @@
+package zingg.similarity.function;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+
+import org.junit.jupiter.api.Test;
+
+public class TestJaccSimFunction {
+
+    @Test
+    public void testFirstStringNull() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call(null, "text 2"));
+    }
+
+    @Test
+    public void testFirstStringEmpty() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("", "text 2"));
+    }
+
+    @Test
+    public void testSecondStringNull() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("text 1", null));
+    }
+
+    @Test
+    public void testSecondStringEmpty() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("text 1", ""));
+    }
+
+    @Test
+    public void testBothEmpty() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("", ""));
+    }
+
+    @Test
+    public void testBothNull() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call(null, null));
+    }
+
+    @Test
+    public void testBothSame() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("sample text", "sample text"));
+    }
+
+    @Test
+    public void testBothSameButCaseDifferent() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        assertEquals(1d, strDistanceFn.call("sample text", "sAmPle TeXt"));
+    }
+
+    @Test
+    public void testBothNotEmptyDifferent() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        Double score = strDistanceFn.call("sample text first", "sample text second");
+        assertFalse(score == 0d || score == 1d);
+    }
+
+    @Test
+    public void testSpecificInputsDifferent() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
+        String first = "sonicwall 01-ssc-6997 : usually ships in 24 hours : : sonicwall client/server anti-virus suite leverages the award-winning mcafee netshield and groupshield applications for networks with windows -based file print and exchange servers.,";
+        String second = "sonicwall 01-ssc-5670 : usually ships in 24 hours : : more and more businesses schools government agencies and libraries are connecting to the internet to meet their organizational and educational goals.";
+        Double score = strDistanceFn.call(first, second);
+        assertFalse(score == 0d || score == 1d);
+    }
+
+    @Test
+    public void testInputsSameWithSlashes() {
+        StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test");
String first = "sample/string/with/slashes"; + String second = "sample/string/with/slashes"; + Double score = strDistanceFn.call(first, second); + assertEquals(1d, score); + } + + @Test + public void testInputsDifferentWithSlashesAndColons() { + StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test"); + String first = "sample/string/with/slashes:and:colons.,"; + String second = "sample string/with slash:and,."; + Double score = strDistanceFn.call(first, second); + assertFalse(score == 0d || score == 1d); + } +} From b2a4251f0f7aca4594d3b4a19fe19a7a031757e1 Mon Sep 17 00:00:00 2001 From: Navin Singh Date: Tue, 5 Jul 2022 15:14:22 +0530 Subject: [PATCH 2/4] Added actual similarty score to assert --- .../function/TestJaccSimFunction.java | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java b/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java index 23d4669e5..c664e095d 100644 --- a/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java +++ b/core/src/test/java/zingg/similarity/function/TestJaccSimFunction.java @@ -1,7 +1,6 @@ package zingg.similarity.function; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; import org.junit.jupiter.api.Test; @@ -59,23 +58,23 @@ public void testBothSameButCaseDifferent() { public void testBothNotEmptyDifferent() { StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test"); Double score = strDistanceFn.call("sample text first", "sample text second"); - assertFalse(score == 0d || score == 1d); + assertEquals(0.5d, score); } @Test public void testSpecificInputsDifferent() { StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test"); - String first = "sonicwall 01-ssc-6997 : usually ships in 24 hours : : sonicwall client/server anti-virus suite leverages the award-winning mcafee netshield and groupshield applications for networks with windows -based file print and exchange servers.,"; - String second = "sonicwall 01-ssc-5670 : usually ships in 24 hours : : more and more businesses schools government agencies and libraries are connecting to the internet to meet their organizational and educational goals."; + String first = "sonicwall client/server "; + String second = "sonicwall businesses "; Double score = strDistanceFn.call(first, second); - assertFalse(score == 0d || score == 1d); + assertEquals(0.25d, score); } @Test public void testInputsSameWithSlashes() { StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test"); - String first = "sample/string/with/slashes"; - String second = "sample/string/with/slashes"; + String first = "sample/string"; + String second = "sample/string"; Double score = strDistanceFn.call(first, second); assertEquals(1d, score); } @@ -83,9 +82,9 @@ public void testInputsSameWithSlashes() { @Test public void testInputsDifferentWithSlashesAndColons() { StringSimilarityDistanceFunction strDistanceFn = new JaccSimFunction("test"); - String first = "sample/string/with/slashes:and:colons.,"; - String second = "sample string/with slash:and,."; + String first = "slashes/and:colons.,"; + String second = "slashes and colons"; Double score = strDistanceFn.call(first, second); - assertFalse(score == 0d || score == 1d); + assertEquals(1d, score); } } From 0bffcbc8652ac2ad96cbe751683275e9b953623f Mon Sep 17 00:00:00 2001 From: Raviraj Baraiya Date: Sun, 17 Jul 2022 
Subject: [PATCH 3/4] CsvPipe python api issue #401

---
 examples/amazon-google/AmazonGoogle.py | 12 ++----
 examples/febrl/FebrlExample.py         |  9 +----
 examples/iTunes-amazon/iTunesAmazon.py | 13 ++----
 examples/ncVoters5M/ncVoters.py        | 10 ++---
 python/zingg/pipes/pipes.py            | 19 ++++++---
 test/testFebrl/testFebrl.py            | 56 ++++++++++++++++++++++++++
 6 files changed, 84 insertions(+), 35 deletions(-)
 create mode 100644 test/testFebrl/testFebrl.py

diff --git a/examples/amazon-google/AmazonGoogle.py b/examples/amazon-google/AmazonGoogle.py
index e3f2573e3..af0c9a893 100644
--- a/examples/amazon-google/AmazonGoogle.py
+++ b/examples/amazon-google/AmazonGoogle.py
@@ -23,21 +23,17 @@
 #in that case, replace df with input df
 dfAmazon = spark.read.format("csv").schema("id string, title string, description string, manufacturer string, price double ").load("examples/amazon-google/Amazon.csv")
 dfSchemaAmazon = str(dfAmazon.schema.json())
-inputPipeAmazon = CsvPipe("testAmazon")
-inputPipeAmazon.setLocation("examples/amazon-google/Amazon.csv")
-inputPipeAmazon.setSchema(dfSchemaAmazon)
+inputPipeAmazon = CsvPipe("testAmazon", dfSchemaAmazon, "examples/amazon-google/Amazon.csv")
 
 dfGoogle = spark.read.format("csv").schema("id string, title string, description string, manufacturer string, price double ").load("examples/amazon-google/GoogleProducts.csv")
 dfSchemaGoogle = str(dfGoogle.schema.json())
-inputPipeGoogle = CsvPipe("testGoogle")
-inputPipeGoogle.setLocation("examples/amazon-google/GoogleProducts.csv")
-inputPipeGoogle.setSchema(dfSchemaGoogle)
+inputPipeGoogle = CsvPipe("testGoogle", dfSchemaGoogle, "examples/amazon-google/GoogleProducts.csv")
 
 args.setData(inputPipeAmazon,inputPipeGoogle)
 
 #setting outputpipe in 'args'
-outputPipe = CsvPipe("resultAmazonGoogle")
-outputPipe.setLocation("/tmp/AwsGoogleOutput")
+outputPipe = CsvPipe("resultAmazonGoogle", None, "/tmp/AwsGoogleOutput")
+
 args.setOutput(outputPipe)
 
 options = ClientOptions()
diff --git a/examples/febrl/FebrlExample.py b/examples/febrl/FebrlExample.py
index 6d1963824..97592a8fb 100644
--- a/examples/febrl/FebrlExample.py
+++ b/examples/febrl/FebrlExample.py
@@ -28,18 +28,13 @@
 #below line should not be required if you are reading from in memory dataset
 #in that case, replace df with input df
 df = spark.read.format("csv").schema("id string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string").load("examples/febrl/test.csv")
-
-inputPipe = CsvPipe("test")
-inputPipe.setLocation("examples/febrl/test.csv")
-
 dfSchema = str(df.schema.json())
-inputPipe.setSchema(dfSchema)
+inputPipe = CsvPipe("test", dfSchema, "examples/febrl/test.csv")
 
 args.setData(inputPipe)
 
 #setting outputpipe in 'args'
-outputPipe = CsvPipe("csv")
-outputPipe.setLocation("/tmp")
+outputPipe = CsvPipe("csv", None, "/tmp")
 
 args.setOutput(outputPipe)
 
diff --git a/examples/iTunes-amazon/iTunesAmazon.py b/examples/iTunes-amazon/iTunesAmazon.py
index 6b1f6f706..9e1aecf4c 100644
--- a/examples/iTunes-amazon/iTunesAmazon.py
+++ b/examples/iTunes-amazon/iTunesAmazon.py
@@ -28,22 +28,17 @@
 #below line should not be required if you are reading from in memory dataset
 #in that case, replace df with input df
 dfiTunes = spark.read.format("csv").schema("id string, Song_Name string, Artist_Name string, Album_Name string, Genre string, Price double, CopyRight string, Time string, Released string").load("examples/iTunes-amazon/iTunesMusic.csv")
-dfSchema = str(dfiTunes.schema.json())
-inputPipeiTunes = CsvPipe("testiTunes")
CsvPipe("testiTunes") -inputPipeiTunes.setLocation("examples/iTunes-amazon/iTunesMusic.csv") -inputPipeiTunes.setSchema(dfSchema) +dfSchemaiTunes = str(dfiTunes.schema.json()) +inputPipeiTunes = CsvPipe("testiTunes", dfSchemaiTunes, "examples/iTunes-amazon/iTunesMusic.csv") dfAmazon = spark.read.format("csv").schema("id string, Song_Name string, Artist_Name string, Album_Name string, Genre string, Price double, CopyRight string, Time string, Released string").load("examples/iTunes-amazon/AmazonMusic.csv") dfSchemaAmazon = str(dfAmazon.schema.json()) -inputPipeAmazon = CsvPipe("testAmazon") -inputPipeAmazon.setLocation("examples/iTunes-amazon/AmazonMusic.csv") -inputPipeAmazon.setSchema(dfSchemaAmazon) +inputPipeAmazon = CsvPipe("testAmazon", dfSchemaAmazon, "examples/iTunes-amazon/AmazonMusic.csv") args.setData(inputPipeiTunes,inputPipeAmazon) #setting outputpipe in 'args' -outputPipe = CsvPipe("iTunesAmazonresult") -outputPipe.setLocation("/tmp/iTunesAmazonOutput") +outputPipe = CsvPipe("iTunesAmazonresult", None, "/tmp/iTunesAmazonOutput") args.setOutput(outputPipe) diff --git a/examples/ncVoters5M/ncVoters.py b/examples/ncVoters5M/ncVoters.py index 5af7e6d1e..35525e4e6 100644 --- a/examples/ncVoters5M/ncVoters.py +++ b/examples/ncVoters5M/ncVoters.py @@ -22,16 +22,14 @@ #below line should not be required if you are reading from in memory dataset #in that case, replace df with input df df = spark.read.format("csv").schema("recid string, givenname string, surname string, suburb string, postcode double ").load("examples/ncVoters5M/5Party-ocp20/") -dfSchemaA = str(df.schema.json()) +dfSchema = str(df.schema.json()) +inputPipe = CsvPipe("test", dfSchema, "examples/ncVoters5M/5Party-ocp20/") -inputPipe = CsvPipe("test") -inputPipe.setLocation("examples/ncVoters5M/5Party-ocp20/") -inputPipe.setSchema(dfSchemaA) args.setData(inputPipe) #setting outputpipe in 'args' -outputPipe = CsvPipe("ncVotersResult") -outputPipe.setLocation("/tmp/ncVotersOutput") +outputPipe = CsvPipe("ncVotersResult", None, "/tmp/ncVotersOutput") + args.setOutput(outputPipe) options = ClientOptions() diff --git a/python/zingg/pipes/pipes.py b/python/zingg/pipes/pipes.py index 88953e20e..3d7ab1bc9 100644 --- a/python/zingg/pipes/pipes.py +++ b/python/zingg/pipes/pipes.py @@ -19,9 +19,17 @@ class CsvPipe(Pipe): :param name: name of the pipe. 
     :type name: String
+    :param schema: (optional) json schema for the pipe
+    :type schema: Schema or None
+    :param location: (optional) location from where we read data
+    :type location: String or None
     """
-    def __init__(self, name):
+    def __init__(self, name, schema = None, location = None):
         Pipe.__init__(self, name, Format.CSV.type())
+        if(schema != None):
+            Pipe.setSchema(self, schema)
+        if(location != None):
+            Pipe.addProperty(self, FilePipe.LOCATION, location)
 
     def setDelimiter(self, delimiter):
         """ This method is used to define delimiter of CsvPipe
@@ -40,11 +48,12 @@ def setLocation(self, location):
         Pipe.addProperty(self, FilePipe.LOCATION, location)
 
     def setHeader(self, header):
-        Pipe.addProperty(self, FilePipe.HEADER, header)
-
-class BigQueryPipe(Pipe):
-
-=======
+        """ Method to set header property of pipe
+
+        :param header: true if data contains header otherwise false
+        :type header: Bool
+        """
+        Pipe.addProperty(self, FilePipe.HEADER, header)
 
 class BigQueryPipe(Pipe):
     """ Pipe Class for working with BigQuery pipeline
diff --git a/test/testFebrl/testFebrl.py b/test/testFebrl/testFebrl.py
new file mode 100644
index 000000000..318ae1eaf
--- /dev/null
+++ b/test/testFebrl/testFebrl.py
@@ -0,0 +1,56 @@
+import unittest
+from unittest.case import TestCase
+import unittest
+from io import StringIO
+
+
+from zingg import *
+from zingg.pipes import *
+
+args = Arguments()
+fname = FieldDefinition("fname", "string", MatchType.FUZZY)
+lname = FieldDefinition("lname", "string", MatchType.FUZZY)
+stNo = FieldDefinition("stNo", "string", MatchType.FUZZY)
+add1 = FieldDefinition("add1","string", MatchType.FUZZY)
+add2 = FieldDefinition("add2", "string", MatchType.FUZZY)
+city = FieldDefinition("city", "string", MatchType.FUZZY)
+areacode = FieldDefinition("areacode", "string", MatchType.FUZZY)
+state = FieldDefinition("state", "string", MatchType.FUZZY)
+dob = FieldDefinition("dob", "string", MatchType.FUZZY)
+ssn = FieldDefinition("ssn", "string", MatchType.FUZZY)
+
+fieldDefs = [fname, lname, stNo, add1, add2, city, areacode, state, dob, ssn]
+
+args.setFieldDefinition(fieldDefs)
+args.setModelId("100")
+args.setZinggDir("models")
+args.setNumPartitions(4)
+args.setLabelDataSampleSize(0.5)
+
+df = spark.read.format("csv").schema("id string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string").load("examples/febrl/test.csv")
+dfSchema = str(df.schema.json())
+inputPipe = CsvPipe("test", dfSchema, "examples/febrl/test.csv")
+outputPipe = CsvPipe("result", None, "/tmp/febrlTest")
+args.setData(inputPipe)
+args.setOutput(outputPipe)
+options = ClientOptions()
+options.setPhase("trainMatch")
+
+#testing
+class Accuracy_recordCount(TestCase):
+    def test_recordCount(self):
+        client = Zingg(args, options)
+        client.initAndExecute()
+        pMarkedDF = client.getPandasDfFromDs(client.getMarkedRecords())
+        labelledData = spark.createDataFrame(pMarkedDF)
+
+        total_marked = pMarkedDF.shape[0]
+
+        # marked record count test
+        self.assertEqual(total_marked, 76)
+
+        pMarkedDF.drop(pMarkedDF[pMarkedDF[ColName.PREDICTION_COL] == -1].index, inplace=True)
+        acc = (pMarkedDF[ColName.MATCH_FLAG_COL]== pMarkedDF[ColName.PREDICTION_COL]).mean()
+
+        # accuracy test
+        self.assertGreater(acc, 0.9)
\ No newline at end of file

From f353eca5370774877f6f588d7a094d0d070d37f5 Mon Sep 17 00:00:00 2001
From: Raviraj Baraiya
Date: Sun, 17 Jul 2022 21:15:16 +0530
Subject: [PATCH 4/4] conflict resolved

---
 python/zingg/pipes/pipes.py | 5 -----
 test/testFebrl/testFebrl.py | 15 ++++++++++++---
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/python/zingg/pipes/pipes.py b/python/zingg/pipes/pipes.py
index 3d7ab1bc9..8d0b70ba9 100644
--- a/python/zingg/pipes/pipes.py
+++ b/python/zingg/pipes/pipes.py
@@ -48,11 +48,6 @@ def setLocation(self, location):
         Pipe.addProperty(self, FilePipe.LOCATION, location)
 
     def setHeader(self, header):
-        """ Method to set header property of pipe
-
-        :param header: true if data contains header otherwise false
-        :type header: Bool
-        """
         Pipe.addProperty(self, FilePipe.HEADER, header)
 
 class BigQueryPipe(Pipe):
diff --git a/test/testFebrl/testFebrl.py b/test/testFebrl/testFebrl.py
index 318ae1eaf..ebf4995b2 100644
--- a/test/testFebrl/testFebrl.py
+++ b/test/testFebrl/testFebrl.py
@@ -28,15 +28,23 @@
 args.setLabelDataSampleSize(0.5)
 
 df = spark.read.format("csv").schema("id string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string").load("examples/febrl/test.csv")
+inputPipe = CsvPipe("test")
+inputPipe.setLocation("examples/febrl/test.csv")
 dfSchema = str(df.schema.json())
-inputPipe = CsvPipe("test", dfSchema, "examples/febrl/test.csv")
-outputPipe = CsvPipe("result", None, "/tmp/febrlTest")
+inputPipe.setSchema(dfSchema)
+
+outputPipe = CsvPipe("result")
+outputPipe.setLocation("/tmp/pythonTest")
+
 args.setData(inputPipe)
 args.setOutput(outputPipe)
+
 options = ClientOptions()
+# options.setPhase("trainMatch")
 options.setPhase("trainMatch")
 
 #testing
+
 class Accuracy_recordCount(TestCase):
     def test_recordCount(self):
         client = Zingg(args, options)
@@ -53,4 +61,5 @@ def test_recordCount(self):
         acc = (pMarkedDF[ColName.MATCH_FLAG_COL]== pMarkedDF[ColName.PREDICTION_COL]).mean()
 
         # accuracy test
-        self.assertGreater(acc, 0.9)
\ No newline at end of file
+        self.assertGreater(acc, 0.9)
+
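
For reference, a minimal sketch of how the CsvPipe constructor signature introduced in PATCH 3/4 is meant to be used from the Python client. It assumes, as the examples above do, that a SparkSession named spark is already available and that the zingg package is importable; the schema string and paths below are illustrative only.

    from zingg import *
    from zingg.pipes import *

    # Read the data once with Spark so a schema string can be derived for the pipe.
    df = spark.read.format("csv").schema("id string, fname string, lname string").load("examples/febrl/test.csv")

    # Constructor style from PATCH 3/4: name, schema and location in a single call.
    inputPipe = CsvPipe("test", str(df.schema.json()), "examples/febrl/test.csv")

    # Schema and location are optional; an output pipe typically only needs a location.
    outputPipe = CsvPipe("result", None, "/tmp/febrlTest")

    args = Arguments()
    args.setData(inputPipe)
    args.setOutput(outputPipe)

Both extra arguments default to None, so the older style of constructing the pipe with just a name and then calling setSchema and setLocation on it continues to work.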