Skip to content

Commit

Permalink
fixing Float and long not supported data types issue #490
Browse files Browse the repository at this point in the history
  • Loading branch information
vikasgupta78 committed Mar 22, 2023
1 parent 6218fc0 commit d7ab150
Show file tree
Hide file tree
Showing 13 changed files with 310 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package zingg.common.core.feature;

import zingg.common.client.FieldDefinition;
import zingg.common.client.MatchType;
import zingg.common.core.similarity.function.FloatSimilarityFunction;


/**
 * Feature definition for fields holding {@code Float} values.
 *
 * <p>During {@link #init(FieldDefinition)} the field definition is stored and,
 * when the field's match types include {@link MatchType#FUZZY}, a
 * {@link FloatSimilarityFunction} is registered for the field.
 */
public class FloatFeature extends BaseFeature<Float> {

    private static final long serialVersionUID = 1L;

    /** Creates a feature with no field definition; call {@link #init(FieldDefinition)} next. */
    public FloatFeature() {
        super();
    }

    /**
     * Stores the field definition and registers the similarity function it asks for.
     *
     * @param newParam definition of the field this feature describes
     */
    public void init(FieldDefinition newParam) {
        setFieldDefinition(newParam);

        final boolean fuzzyRequested = newParam.getMatchType().contains(MatchType.FUZZY);
        if (!fuzzyRequested) {
            // No other match type adds a similarity function for floats here.
            return;
        }
        addSimFunction(new FloatSimilarityFunction());
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package zingg.common.core.hash;

/**
 * Hash function that reports whether a {@code Float} value is strictly negative.
 *
 * <p>It is registered under the name {@code "lessThanZeroFloat"}.
 */
public class LessThanZeroFloat extends BaseHash<Float,Boolean>{

    private static final long serialVersionUID = 1L;

    /** Creates the hash function and sets its name. */
    public LessThanZeroFloat() {
        setName("lessThanZeroFloat");
    }

    /**
     * Tests the sign of the given value.
     *
     * @param field value to test; may be {@code null}
     * @return {@code true} only when {@code field} is non-null and less than zero;
     *         {@code null}, zero, positive values and NaN all yield {@code false}
     */
    public Boolean call(Float field) {
        // The null check short-circuits before the value is unboxed.
        return field != null && field < 0;
    }

}
33 changes: 33 additions & 0 deletions common/core/src/main/java/zingg/common/core/hash/RangeFloat.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package zingg.common.core.hash;

/**
 * Hash function that flags whether a {@code Float} value lies in the half-open
 * interval {@code [lowerLimit, upperLimit)}.
 *
 * <p>The function name is built from the limits, for example
 * {@code "rangeBetween0And10Float"}.
 */
public class RangeFloat extends BaseHash<Float,Integer>{
    private static final long serialVersionUID = 1L;

    /** Inclusive lower bound of the range. */
    private int lowerLimit;
    /** Exclusive upper bound of the range. */
    private int upperLimit;

    /**
     * Creates a range check over {@code [lower, upper)}.
     *
     * @param lower inclusive lower bound
     * @param upper exclusive upper bound
     */
    public RangeFloat(int lower, int upper) {
        final String hashName = "rangeBetween" + lower + "And" + upper + "Float";
        setName(hashName);
        this.lowerLimit = lower;
        this.upperLimit = upper;
    }

    /**
     * Checks whether the value falls inside the configured range.
     *
     * @param field value to test; may be {@code null}
     * @return {@code 1} when {@code field} is non-null and
     *         {@code lowerLimit <= field < upperLimit}, otherwise {@code 0}
     */
    public Integer call(Float field) {
        if (field == null) {
            return 0;
        }
        final boolean inside = field >= lowerLimit && field < upperLimit;
        return inside ? 1 : 0;
    }

    /** @return the inclusive lower bound */
    public int getLowerLimit() {
        return lowerLimit;
    }

    /** @return the exclusive upper bound */
    public int getUpperLimit() {
        return upperLimit;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package zingg.common.core.hash;

/**
 * Hash function that drops the last {@code numDigits} integer digits of a
 * {@code Float} by dividing it by {@code 10^numDigits} and flooring the result.
 *
 * <p>The divisor comes from {@link #POWERS_OF_10}, so {@code numDigits} must be a
 * valid index into that table (0 to 5); any other value makes {@link #call(Float)}
 * throw {@link ArrayIndexOutOfBoundsException} for non-null input.
 */
public class TrimLastDigitsFloat extends BaseHash<Float,Float>{
    private static final long serialVersionUID = 1L;

    /** Lookup table: {@code POWERS_OF_10[i] == 10^i}. */
    static final int[] POWERS_OF_10 = {1, 10, 100, 1000, 10000, 100000};

    /** Number of trailing digits to remove. */
    private int numDigits;

    /**
     * Creates the hash function and names it {@code "trimLast<count>DigitsFloat"}.
     *
     * @param count number of trailing digits to remove
     */
    public TrimLastDigitsFloat(int count) {
        final String hashName = "trimLast" + count + "DigitsFloat";
        setName(hashName);
        this.numDigits = count;
    }

    /**
     * Trims the configured number of digits from the value.
     *
     * @param field value to trim; may be {@code null}
     * @return {@code null} for {@code null} input, otherwise
     *         {@code floor(field / 10^numDigits)} as a float
     */
    public Float call(Float field) {
        if (field == null) {
            return null;
        }
        final int divisor = POWERS_OF_10[numDigits];
        return (float) Math.floor(field / divisor);
    }

    /** @return the number of trailing digits this function removes */
    public int getNumDigits() {
        return numDigits;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package zingg.common.core.hash;

/**
 * Hash function that cuts a {@code Float} down to a fixed number of decimal
 * places by scaling it up, flooring it and scaling it back down.
 *
 * <p>The scale factor comes from {@link #POWERS_OF_10}, so
 * {@code numDecimalPlaces} must be a valid index into that table (0 to 5); any
 * other value makes {@link #call(Float)} throw
 * {@link ArrayIndexOutOfBoundsException} for non-null input.
 */
public class TruncateFloat extends BaseHash<Float,Float>{
    private static final long serialVersionUID = 1L;

    /** Lookup table: {@code POWERS_OF_10[i] == 10^i}. */
    static final int[] POWERS_OF_10 = {1, 10, 100, 1000, 10000, 100000};

    /** Number of decimal places kept in the result. */
    private int numDecimalPlaces;

    /**
     * Creates the hash function and names it
     * {@code "truncateFloatTo<numDecimalPlaces>Places"}.
     *
     * @param numDecimalPlaces number of decimal places to keep
     */
    public TruncateFloat(int numDecimalPlaces) {
        final String hashName = "truncateFloatTo" + numDecimalPlaces + "Places";
        setName(hashName);
        this.numDecimalPlaces = numDecimalPlaces;
    }

    /**
     * Reduces the value to the configured number of decimal places.
     *
     * @param field value to reduce; may be {@code null}
     * @return {@code null} for {@code null} input, otherwise
     *         {@code floor(field * 10^n) / 10^n} as a float, where {@code n} is
     *         the configured number of decimal places
     */
    public Float call(Float field) {
        if (field == null) {
            return null;
        }
        final int scale = POWERS_OF_10[numDecimalPlaces];
        // The float product is widened to double by Math.floor, as is the division.
        final double shifted = Math.floor(field * scale);
        return (float) (shifted / scale);
    }

    /** @return the number of decimal places this function keeps */
    public int getNumDecimalPlaces() {
        return numDecimalPlaces;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package zingg.common.core.similarity.function;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Similarity function for {@code Float} values.
 *
 * <p>The score is {@code 1 - |first - second| / (1 + |first| + |second|)}, so
 * identical values score {@code 1.0} and the score drops as the values move
 * apart. For finite inputs the score stays within {@code (0, 1]}.
 */
public class FloatSimilarityFunction extends SimFunction<Float> {
    private static final long serialVersionUID = 1L;
    public static final Log LOG = LogFactory
            .getLog(FloatSimilarityFunction.class);

    /** Creates the function and registers it under the name {@code "FloatSimilarityFunction"}. */
    public FloatSimilarityFunction() {
        super("FloatSimilarityFunction");
    }

    /**
     * Computes the similarity of two float values.
     *
     * @param first  first value; may be {@code null} or NaN
     * @param second second value; may be {@code null} or NaN
     * @return {@code 1.0} when either value is {@code null} or NaN, otherwise
     *         {@code 1 - |first - second| / (1 + |first| + |second|)}
     */
    @Override
    public Double call(Float first, Float second) {
        if (first == null || first.isNaN()) return 1d;
        if (second == null || second.isNaN()) return 1d;
        // We want similarity, hence we subtract from 1 so that closer values have a higher score.
        // The denominator uses magnitudes: with the raw sum (1 + first + second) negative
        // inputs could make it zero or negative, yielding NaN or scores outside [0, 1].
        // For non-negative inputs the result is unchanged.
        double score = 1 - (Math.abs(first - second)) / (1.0 + Math.abs(first) + Math.abs(second));
        // Guarded so the message string is only built when debug logging is on.
        if (LOG.isDebugEnabled()) {
            LOG.debug(" FloatSim bw " + first + " and second " + second + " is "
                    + score);
        }
        return score;
    }

}
33 changes: 33 additions & 0 deletions common/core/src/main/resources/hashFunctions.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,21 @@
{
"name":"truncateDoubleTo3Places"
},
{
"name":"truncateFloatTo1Places"
},
{
"name":"truncateFloatTo2Places"
},
{
"name":"truncateFloatTo3Places"
},
{
"name":"lessThanZeroDbl"
},
{
"name":"lessThanZeroFloat"
},
{
"name":"lessThanZeroInt"
},
Expand All @@ -69,6 +81,15 @@
{
"name":"trimLast3DigitsDbl"
},
{
"name":"trimLast1DigitsFloat"
},
{
"name":"trimLast2DigitsFloat"
},
{
"name":"trimLast3DigitsFloat"
},
{
"name":"trimLast1DigitsInt"
},
Expand Down Expand Up @@ -99,6 +120,18 @@
{
"name":"rangeBetween1000And10000Dbl"
},
{
"name":"rangeBetween0And10Float"
},
{
"name":"rangeBetween10And100Float"
},
{
"name":"rangeBetween100And1000Float"
},
{
"name":"rangeBetween1000And10000Float"
},
{
"name":"rangeBetween0And10Int"
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import zingg.common.core.feature.DateFeature;
import zingg.common.core.feature.DoubleFeature;
import zingg.common.core.feature.FeatureFactory;
import zingg.common.core.feature.FloatFeature;
import zingg.common.core.feature.IntFeature;
import zingg.common.core.feature.LongFeature;
import zingg.common.core.feature.StringFeature;
Expand All @@ -24,6 +25,7 @@ public void init() {
map.put(DataTypes.IntegerType, IntFeature.class);
map.put(DataTypes.DateType, DateFeature.class);
map.put(DataTypes.DoubleType, DoubleFeature.class);
map.put(DataTypes.FloatType, FloatFeature.class);
map.put(DataTypes.LongType, LongFeature.class);

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,16 @@ public SparkHashFunctionRegistry() {

init(new SparkTruncateDouble(1));
init(new SparkTruncateDouble(2));
init(new SparkTruncateDouble(3));
init(new SparkTruncateDouble(3));

init(new SparkTruncateFloat(1));
init(new SparkTruncateFloat(2));
init(new SparkTruncateFloat(3));

init(new SparkLessThanZeroDbl());

init(new SparkLessThanZeroFloat());

init(new SparkLessThanZeroInt());

init(new SparkLessThanZeroLong());
Expand All @@ -57,6 +63,10 @@ public SparkHashFunctionRegistry() {
init(new SparkTrimLastDigitsDbl(2));
init(new SparkTrimLastDigitsDbl(3));

init(new SparkTrimLastDigitsFloat(1));
init(new SparkTrimLastDigitsFloat(2));
init(new SparkTrimLastDigitsFloat(3));

init(new SparkTrimLastDigitsInt(1));
init(new SparkTrimLastDigitsInt(2));
init(new SparkTrimLastDigitsInt(3));
Expand All @@ -69,6 +79,11 @@ public SparkHashFunctionRegistry() {
init(new SparkRangeDbl(10,100));
init(new SparkRangeDbl(100,1000));
init(new SparkRangeDbl(1000,10000));

init(new SparkRangeFloat(0,10));
init(new SparkRangeFloat(10,100));
init(new SparkRangeFloat(100,1000));
init(new SparkRangeFloat(1000,10000));

init(new SparkRangeInt(0,10));
init(new SparkRangeInt(10,100));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package zingg.spark.core.hash;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.types.DataTypes;

import zingg.common.core.hash.LessThanZeroFloat;

/**
 * Spark hash function backed by {@link LessThanZeroFloat}.
 *
 * <p>The constructor installs the base hash and declares
 * {@code DataTypes.FloatType} as the data type and
 * {@code DataTypes.BooleanType} as the return type.
 */
public class SparkLessThanZeroFloat extends SparkHashFunction<Float, Boolean>{

    public static final Log LOG = LogFactory.getLog(SparkLessThanZeroFloat.class);
    private static final long serialVersionUID = 1L;

    /** Wires the base hash and the Spark data types. */
    public SparkLessThanZeroFloat() {
        super();
        final LessThanZeroFloat delegate = new LessThanZeroFloat();
        this.setBaseHash(delegate);
        this.setDataType(DataTypes.FloatType);
        this.setReturnType(DataTypes.BooleanType);
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package zingg.spark.core.hash;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.types.DataTypes;

import zingg.common.core.hash.RangeFloat;

/**
 * Spark hash function backed by {@link RangeFloat}.
 *
 * <p>The constructor installs the base hash and declares
 * {@code DataTypes.FloatType} as the data type and
 * {@code DataTypes.IntegerType} as the return type.
 */
public class SparkRangeFloat extends SparkHashFunction<Float, Integer>{

    public static final Log LOG = LogFactory.getLog(SparkRangeFloat.class);
    private static final long serialVersionUID = 1L;

    /**
     * Wires a {@link RangeFloat} built from the given limits and the Spark data types.
     *
     * @param lower lower limit handed to {@link RangeFloat}
     * @param upper upper limit handed to {@link RangeFloat}
     */
    public SparkRangeFloat(int lower, int upper) {
        super();
        final RangeFloat delegate = new RangeFloat(lower, upper);
        this.setBaseHash(delegate);
        this.setDataType(DataTypes.FloatType);
        this.setReturnType(DataTypes.IntegerType);
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package zingg.spark.core.hash;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.types.DataTypes;

import zingg.common.core.hash.TrimLastDigitsFloat;

/**
 * Spark hash function backed by {@link TrimLastDigitsFloat}.
 *
 * <p>The constructor installs the base hash and declares
 * {@code DataTypes.FloatType} as both the data type and the return type.
 */
public class SparkTrimLastDigitsFloat extends SparkHashFunction<Float, Float>{

    public static final Log LOG = LogFactory.getLog(SparkTrimLastDigitsFloat.class);
    private static final long serialVersionUID = 1L;

    /**
     * Wires a {@link TrimLastDigitsFloat} for the given digit count and the Spark data types.
     *
     * @param count digit count handed to {@link TrimLastDigitsFloat}
     */
    public SparkTrimLastDigitsFloat(int count){
        super();
        final TrimLastDigitsFloat delegate = new TrimLastDigitsFloat(count);
        this.setBaseHash(delegate);
        this.setDataType(DataTypes.FloatType);
        this.setReturnType(DataTypes.FloatType);
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package zingg.spark.core.hash;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.types.DataTypes;

import zingg.common.core.hash.TruncateFloat;

/**
 * Spark hash function backed by {@link TruncateFloat}.
 *
 * <p>The constructor installs the base hash and declares
 * {@code DataTypes.FloatType} as both the data type and the return type.
 */
public class SparkTruncateFloat extends SparkHashFunction<Float, Float>{

    public static final Log LOG = LogFactory.getLog(SparkTruncateFloat.class);
    private static final long serialVersionUID = 1L;

    /**
     * Wires a {@link TruncateFloat} for the given number of places and the Spark data types.
     *
     * @param count number of decimal places handed to {@link TruncateFloat}
     */
    public SparkTruncateFloat(int count){
        super();
        final TruncateFloat delegate = new TruncateFloat(count);
        this.setBaseHash(delegate);
        this.setDataType(DataTypes.FloatType);
        this.setReturnType(DataTypes.FloatType);
    }

}

0 comments on commit d7ab150

Please sign in to comment.