Skip to content

Commit

Permalink
In labeller, nos of pairs that MATCH and DO_NOT_MATCH are printed zin…
Browse files Browse the repository at this point in the history
  • Loading branch information
navinrathore committed Dec 29, 2021
1 parent 614349f commit e4d8416
Showing 1 changed file with 22 additions and 2 deletions.
24 changes: 22 additions & 2 deletions core/src/main/java/zingg/Labeller.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public class Labeller extends ZinggBase {

protected static String name = "zingg.Labeller";
public static final Log LOG = LogFactory.getLog(Labeller.class);
long positivePairsCount, negativePairsCount;

public Labeller() {
setZinggOptions(ZinggOptions.LABEL);
Expand Down Expand Up @@ -55,6 +56,8 @@ public Dataset<Row> getUnmarkedRecords() throws ZinggClientException {
unmarkedRecords = unmarkedRecords.join(markedRecords,
unmarkedRecords.col(ColName.CLUSTER_COLUMN).equalTo(markedRecords.col(ColName.CLUSTER_COLUMN)),
"left_anti");
positivePairsCount = markedRecords.filter(markedRecords.col(ColName.MATCH_FLAG_COL).equalTo(ColValues.MATCH_TYPE_MATCH)).count() / 2;
negativePairsCount = markedRecords.filter(markedRecords.col(ColName.MATCH_FLAG_COL).equalTo(ColValues.MATCH_TYPE_NOT_A_MATCH)).count() / 2;
}
} catch (Exception e) {
LOG.warn("No unmarked record for labelling");
Expand All @@ -64,7 +67,7 @@ public Dataset<Row> getUnmarkedRecords() throws ZinggClientException {

public void processRecordsCli(Dataset<Row> lines) throws ZinggClientException {
LOG.info("Processing Records for CLI Labelling");

printMarkedRecordsStat();
if (lines == null || lines.count() == 0) {
LOG.info("It seems there are no unmarked records at this moment. Please run findTrainingData job to build some pairs to be labelled and then run this labeler.");
return;
Expand All @@ -88,14 +91,15 @@ public void processRecordsCli(Dataset<Row> lines) throws ZinggClientException {

score = currentPair.head().getAs(ColName.SCORE_COL);
prediction = currentPair.head().getAs(ColName.PREDICTION_COL);
msg1 = String.format("\tRecord pair %d out of %d records to be labelled by the user.\n", index, totalPairs);
String matchType = LabelMatchType.get(prediction).msg;
msg2 = String.format("\tZingg predicts the records %s with a similarity score of %.2f\n",
matchType, score);
String msgHeader = msg1 + msg2;

selected_option = displayRecordsAndGetUserInput(DSUtil.select(currentPair, displayCols), msgHeader);
updateLabellerStat(selected_option);
if (selected_option == 9) {
LOG.info("User has quit in the middle. Updating the records.");
break;
Expand Down Expand Up @@ -188,6 +192,22 @@ int readCliInput() {
return selection;
}

/**
 * Folds the user's latest labelling choice into the running pair counters
 * and then prints the refreshed totals.
 *
 * <p>{@code positivePairsCount} grows by one when the choice equals
 * {@code ColValues.IS_MATCH_PREDICTION}; {@code negativePairsCount} grows by
 * one when it equals {@code ColValues.IS_NOT_A_MATCH_PREDICTION}. Any other
 * value leaves both counters untouched. The totals are printed in every case.
 *
 * @param selected_option the option value chosen by the user for the current pair
 */
private void updateLabellerStat(int selected_option) {
    final boolean markedAsMatch = selected_option == ColValues.IS_MATCH_PREDICTION;
    final boolean markedAsNotAMatch = selected_option == ColValues.IS_NOT_A_MATCH_PREDICTION;

    if (markedAsMatch) {
        positivePairsCount += 1;
    } else if (markedAsNotAMatch) {
        negativePairsCount += 1;
    }

    // The summary is refreshed after every choice, whether or not a counter moved.
    printMarkedRecordsStat();
}

/**
 * Writes a one-line summary of the current labelling totals to standard output:
 * the number of pairs marked {@code LabelMatchType.MATCH} followed by the number
 * marked {@code LabelMatchType.DO_NOT_MATCH}, terminated by a line break.
 */
private void printMarkedRecordsStat() {
    final StringBuilder summary = new StringBuilder();
    summary.append("\tIn all, ")
            .append(positivePairsCount)
            .append(" pair(s) have been marked ")
            .append(LabelMatchType.MATCH);
    summary.append(" and ")
            .append(negativePairsCount)
            .append(" pair(s) have been marked ")
            .append(LabelMatchType.DO_NOT_MATCH);
    System.out.println(summary.toString());
}

void writeLabelledOutput(Dataset<Row> records) {
if (records == null) {
LOG.warn("No records to be labelled.");
Expand Down

0 comments on commit e4d8416

Please sign in to comment.