Skip to content

Commit

Permalink
blocking tree changes (#924)
Browse files Browse the repository at this point in the history
* blocking tree changes

* moved the set logic to block

* reverted back tests

* reverted back test data

* removed unused code

* added back commented code

* ftd changes

* refactor

* simplified if-else

* removed unused imports

* ftd changes

* changed absolute path

* refactored class naming

* config changes

* changed to this()
  • Loading branch information
Nitish1814 authored Jan 3, 2025
1 parent d75daca commit c69adbe
Show file tree
Hide file tree
Showing 20 changed files with 687 additions and 75 deletions.
32 changes: 26 additions & 6 deletions assembly/dependency-reduced-pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,32 @@
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-inline</artifactId>
<version>5.2.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>5.2.0</version>
<scope>test</scope>
<exclusions>
<exclusion>
<artifactId>byte-buddy</artifactId>
<groupId>net.bytebuddy</groupId>
</exclusion>
<exclusion>
<artifactId>byte-buddy-agent</artifactId>
<groupId>net.bytebuddy</groupId>
</exclusion>
<exclusion>
<artifactId>objenesis</artifactId>
<groupId>org.objenesis</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
Expand Down Expand Up @@ -113,12 +139,6 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>1.8.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId>
Expand Down
54 changes: 11 additions & 43 deletions common/core/src/main/java/zingg/common/core/block/Block.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public abstract class Block<D,R,C,T> implements Serializable {
private static final long serialVersionUID = 1L;

public static final Log LOG = LogFactory.getLog(Block.class);
private final IHashFunctionUtility<D, R, C, T> hashFunctionUtility;

protected ZFrame<D,R,C> dupes;
// Class[] types;
Expand All @@ -30,10 +31,11 @@ public abstract class Block<D,R,C,T> implements Serializable {
protected ListMap<HashFunction<D,R,C,T>, String> childless;

public Block() {

this.hashFunctionUtility = HashFunctionUtilityFactory.getHashFunctionUtility(HashUtility.CACHED);
}

public Block(ZFrame<D,R,C> training, ZFrame<D,R,C> dupes) {
this();
this.training = training;
this.dupes = dupes;
childless = new ListMap<HashFunction<D,R,C,T>, String>();
Expand Down Expand Up @@ -145,7 +147,7 @@ public void estimateElimCount(Canopy<R> c, long elimCount) {
for (HashFunction function : functions) {
// /if (!used.contains(field.getIndex(), function) &&
if (least ==0) break;//how much better can it get?
if (!isFunctionUsed(tree, node, field.fieldName, function) //&&
if (!hashFunctionUtility.isHashFunctionUsed(field, function, tree, node) //&&
//!childless.contains(function, field.fieldName)
)
{
Expand Down Expand Up @@ -231,6 +233,8 @@ public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,
LOG.debug("Size is bigger ");
Canopy<R>best = getBestNode(tree, parent, node, fieldsOfInterest);
if (best != null) {
//add function, context info for this best node in set
hashFunctionUtility.addHashFunctionIfRequired(best);
if (LOG.isDebugEnabled()) {
LOG.debug(" HashFunction is " + best + " and node is " + node);
}
Expand Down Expand Up @@ -258,6 +262,8 @@ public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,

getBlockingTree(tree, node, n, fieldsOfInterest);
}
//remove function, context info for this best node as we are returning from best node
hashFunctionUtility.removeHashFunctionIfRequired(best);
}
else {
node.clearBeforeSaving();
Expand All @@ -279,48 +285,10 @@ public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,
return tree;
}

public boolean checkFunctionInNode(Canopy<R>node, String name,
HashFunction function) {
if (node.getFunction() != null && node.getFunction().equals(function)
&& node.context.fieldName.equals(name)) {
return true;
}
return false;
}
// public boolean isFunctionUsed(FieldDefinition fieldDefinition, HashFunction<D, R, C, T> function) {
// return hashFunctionsInCurrentNodePath.contains(getKey(fieldDefinition, function));
// }

public boolean isFunctionUsed(Tree<Canopy<R>> tree, Canopy<R>node, String fieldName,
HashFunction function) {
// //LOG.debug("Tree " + tree);
// //LOG.debug("Node " + node);
// //LOG.debug("Index " + index);
// //LOG.debug("Function " + function);
boolean isUsed = false;
if (node == null || tree == null)
return false;
if (checkFunctionInNode(node, fieldName, function))
return true;
Tree<Canopy<R>> nodeTree = tree.getTree(node);
if (nodeTree == null)
return false;

Tree<Canopy<R>> parent = nodeTree.getParent();
if (parent != null) {
Canopy<R>head = parent.getHead();
while (head != null) {
// check siblings of node
/*for (Tree<Canopy<R>> siblings : parent.getSubTrees()) {
Canopy<R>sibling = siblings.getHead();
if (checkFunctionInNode(sibling, index, function))
return true;
}*/
// check parent of node
return isFunctionUsed(tree, head, fieldName, function);
}
}
return isUsed;
}


public List<Canopy<R>> getHashSuccessors(Collection<Canopy<R>> successors, Object hash) {
List<Canopy<R>> retCanopy = new ArrayList<Canopy<R>>();
for (Canopy<R>c: successors) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package zingg.common.core.block;

import zingg.common.client.FieldDefinition;
import zingg.common.core.hash.HashFunction;

import java.util.HashSet;
import java.util.Set;

/**
 * Tracks which hash functions have been applied along the current
 * blocking-tree path using an in-memory set, so the "already used?" lookup
 * is O(1) instead of a walk up the tree.
 */
public class CacheBasedHashFunctionUtility<D, R, C, T> implements IHashFunctionUtility<D, R, C, T> {

    private static final String DELIMITER = ":";

    // Keys of the form "<fieldName>:<hashFunctionName>" for every node on the active path.
    private final Set<String> hashFunctionsInCurrentNodePath;

    public CacheBasedHashFunctionUtility() {
        this.hashFunctionsInCurrentNodePath = new HashSet<String>();
    }

    /**
     * Returns true when this field/function pair is already present on the
     * current node path. The tree and node arguments are ignored here; the
     * cached path set is authoritative for this strategy.
     */
    @Override
    public boolean isHashFunctionUsed(FieldDefinition fieldDefinition, HashFunction<D, R, C, T> hashFunction, Tree<Canopy<R>> tree, Canopy<R> node) {
        String key = keyOf(fieldDefinition, hashFunction);
        return hashFunctionsInCurrentNodePath.contains(key);
    }

    /** Records the node's field/function pair as used on the current path. */
    @Override
    public void addHashFunctionIfRequired(Canopy<R> node) {
        hashFunctionsInCurrentNodePath.add(keyOf(node.getContext(), node.getFunction()));
    }

    /** Forgets the node's field/function pair when backtracking off the path. */
    @Override
    public void removeHashFunctionIfRequired(Canopy<R> node) {
        hashFunctionsInCurrentNodePath.remove(keyOf(node.getContext(), node.getFunction()));
    }

    // Composite lookup key: field name and hash function name joined by DELIMITER.
    private String keyOf(FieldDefinition fieldDefinition, HashFunction<D, R, C, T> hashFunction) {
        return fieldDefinition.getName() + DELIMITER + hashFunction.getName();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package zingg.common.core.block;

import zingg.common.client.FieldDefinition;
import zingg.common.core.hash.HashFunction;

/**
 * Default (cache-free) strategy: decides whether a hash function has already
 * been applied for a field by recursively walking up the blocking tree from
 * the given node towards the root. Add/remove hooks are no-ops because no
 * state is kept between lookups.
 */
public class DefaultHashFunctionUtility<D, R, C, T> implements IHashFunctionUtility<D, R, C, T> {

    /**
     * Returns true if {@code hashFunction} was already applied to
     * {@code fieldDefinition}'s field on the path from the tree root down to
     * {@code node}; false when either argument is null or the node is not in
     * the tree.
     */
    @Override
    public boolean isHashFunctionUsed(FieldDefinition fieldDefinition, HashFunction<D, R, C, T> hashFunction, Tree<Canopy<R>> tree, Canopy<R> node) {
        if (node == null || tree == null) {
            return false;
        }
        if (checkFunctionInNode(node, fieldDefinition.fieldName, hashFunction)) {
            return true;
        }
        Tree<Canopy<R>> nodeTree = tree.getTree(node);
        if (nodeTree == null) {
            return false;
        }
        Tree<Canopy<R>> parent = nodeTree.getParent();
        if (parent != null) {
            Canopy<R> head = parent.getHead();
            // Recurse on the parent's head node. The original while-loop here
            // always returned on its first iteration, so a plain if is equivalent.
            if (head != null) {
                return isHashFunctionUsed(fieldDefinition, hashFunction, tree, head);
            }
        }
        return false;
    }

    @Override
    public void addHashFunctionIfRequired(Canopy<R> node) {
        // No-op: the default strategy keeps no cache.
    }

    @Override
    public void removeHashFunctionIfRequired(Canopy<R> node) {
        // No-op: the default strategy keeps no cache.
    }

    // True when the node carries exactly this hash function applied to this field name.
    private boolean checkFunctionInNode(Canopy<R> node, String name,
            HashFunction<D, R, C, T> function) {
        return node.getFunction() != null && node.getFunction().equals(function)
                && node.context.fieldName.equals(name);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package zingg.common.core.block;

/**
 * Static factory for {@link IHashFunctionUtility} strategies.
 */
public final class HashFunctionUtilityFactory {

    private HashFunctionUtilityFactory() {
        // Utility class; not meant to be instantiated.
    }

    /**
     * Returns the lookup strategy matching {@code hashUtility}: the
     * tree-walking {@code DefaultHashFunctionUtility} for
     * {@link HashUtility#DEFAULT}, otherwise the set-backed
     * {@code CacheBasedHashFunctionUtility}.
     *
     * The method is generic so callers get a properly typed
     * {@code IHashFunctionUtility<D, R, C, T>} instead of a raw type
     * (the original raw-typed signature produced unchecked warnings at
     * every call site).
     */
    public static <D, R, C, T> IHashFunctionUtility<D, R, C, T> getHashFunctionUtility(HashUtility hashUtility) {
        if (HashUtility.DEFAULT.equals(hashUtility)) {
            return new DefaultHashFunctionUtility<D, R, C, T>();
        }
        return new CacheBasedHashFunctionUtility<D, R, C, T>();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package zingg.common.core.block;

/**
 * Selects the strategy used to check whether a hash function has already been
 * applied on the current blocking-tree path.
 */
public enum HashUtility {
    DEFAULT, // walk the tree on every lookup; keeps no extra state
    CACHED   // keep a set of used field/function keys for O(1) lookups
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package zingg.common.core.block;

import zingg.common.client.FieldDefinition;
import zingg.common.core.hash.HashFunction;

/**
 * Strategy interface for tracking which hash functions have been applied
 * along the current blocking-tree path while the tree is being built.
 * Implementations may answer by walking the tree or by consulting a cache
 * maintained through the add/remove hooks.
 */
public interface IHashFunctionUtility<D, R, C, T> {

    /**
     * Returns true if {@code hashFunction} was already applied to
     * {@code fieldDefinition}'s field on the path leading to {@code node}
     * in {@code tree}.
     */
    boolean isHashFunctionUsed(FieldDefinition fieldDefinition, HashFunction<D, R, C, T> hashFunction, Tree<Canopy<R>> tree, Canopy<R>node);

    // Called when a node is chosen as best, before descending into its subtree.
    void addHashFunctionIfRequired(Canopy<R> node);

    // Called when returning from a node's subtree, undoing the matching add.
    void removeHashFunctionIfRequired(Canopy<R> node);
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package zingg.common.core.executor;

import java.util.Arrays;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

Expand Down Expand Up @@ -87,7 +85,7 @@ public void execute() throws ZinggClientException {

ZFrame<D,R,C> sample = getStopWords().preprocessForStopWords(sampleOrginal);

Tree<Canopy<R>> tree = getBlockingTreeUtil().createBlockingTree(sample, posPairs, 1, -1, args, getHashUtil().getHashFunctionList());
Tree<Canopy<R>> tree = getBlockingTreeUtil().createBlockingTree(sample, posPairs, 1, -1, args, getHashUtil().getHashFunctionList());
//tree.print(2);
ZFrame<D,R,C> blocked = getBlockingTreeUtil().getBlockHashes(sample, tree);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
public abstract class BlockingTreeUtil<S, D,R,C,T> {

public final Log LOG = LogFactory.getLog(BlockingTreeUtil.class);


private PipeUtilBase<S, D, R, C> pipeUtil;

Expand All @@ -32,8 +31,6 @@ public PipeUtilBase<S, D, R, C> getPipeUtil() {
}




public void setPipeUtil(PipeUtilBase<S, D, R, C> pipeUtil) {
this.pipeUtil = pipeUtil;
}
Expand All @@ -43,10 +40,10 @@ public abstract Block<D,R,C,T> getBlock(ZFrame<D,R,C> sample, ZFrame<D,R,C> posi
ListMap<T, HashFunction<D,R,C,T>>hashFunctions, long blockSize);


public Tree<Canopy<R>> createBlockingTree(ZFrame<D,R,C> testData,
ZFrame<D,R,C> positives, double sampleFraction, long blockSize,
IArguments args,
ListMap<T, HashFunction<D,R,C,T>> hashFunctions) throws Exception, ZinggClientException {
public Tree<Canopy<R>> createBlockingTree(ZFrame<D,R,C> testData,
ZFrame<D,R,C> positives, double sampleFraction, long blockSize,
IArguments args,
ListMap<T, HashFunction<D,R,C,T>> hashFunctions) throws Exception, ZinggClientException {
ZFrame<D,R,C> sample = testData.sample(false, sampleFraction);
sample = sample.cache();
long totalCount = sample.count();
Expand All @@ -68,9 +65,7 @@ public Tree<Canopy<R>> createBlockingTree(ZFrame<D,R,C> testData,
fd.add(def);
}
}

Tree<Canopy<R>> blockingTree = cblock.getBlockingTree(null, null, root,
fd);
Tree<Canopy<R>> blockingTree = cblock.getBlockingTree(null, null, root, fd);
if (LOG.isDebugEnabled()) {
LOG.debug("The blocking tree is ");
blockingTree.print(2);
Expand Down
Loading

0 comments on commit c69adbe

Please sign in to comment.