Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Z columns doc to have different template. #257

Merged
merged 6 commits into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 16 additions & 33 deletions core/src/main/java/zingg/documenter/ColumnDocumenter.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
package zingg.documenter;

import static org.apache.spark.sql.functions.desc;
import static org.apache.spark.sql.functions.explode;
import static org.apache.spark.sql.functions.split;

import java.util.HashMap;
import java.util.Map;

Expand All @@ -17,18 +13,19 @@
import zingg.client.FieldDefinition;
import zingg.client.MatchType;
import zingg.client.ZinggClientException;
import zingg.client.util.ColName;
import zingg.util.PipeUtil;

public class ColumnDocumenter extends DocumenterBase {
protected static String name = "zingg.ColumnDocumenter";
public static final Log LOG = LogFactory.getLog(ColumnDocumenter.class);

private final String CSV_TEMPLATE = "stopWordsCSV.ftlh";
private final String HTML_TEMPLATE = "stopWordsHTML.ftlh";
private final String COLUMN_DOC_TEMPLATE = "columnDocTemplate.ftlh";
private final String Z_COLUMN_TEMPLATE = "zColumnTemplate.ftlh";
protected StopWordsDocumenter stopWordsDoc;

public ColumnDocumenter(SparkSession spark, Arguments args) {
super(spark, args);
stopWordsDoc = new StopWordsDocumenter(spark, args);
}

public void process() throws ZinggClientException {
Expand All @@ -41,49 +38,35 @@ private void createColumnDocuments() throws ZinggClientException {
Dataset<Row> data = PipeUtil.read(spark, false, false, args.getData());
LOG.info("Read input data : " + data.count());

String stopWordsDir = args.getZinggDocDir() + "/stopWords/";
String columnsDir = args.getZinggDocDir();
checkAndCreateDir(stopWordsDir);
checkAndCreateDir(columnsDir);

for (FieldDefinition field: args.getFieldDefinition()) {
if ((field.getMatchType() == null || field.getMatchType().contains(MatchType.DONT_USE))) {
prepareAndWriteColumnDocument(spark.emptyDataFrame(), field.fieldName, stopWordsDir, columnsDir);
if ((field.getMatchType() == null || field.getMatchType().equals(MatchType.DONT_USE))) {
prepareAndWriteColumnDocument(spark.emptyDataFrame(), field.fieldName, columnsDir);
continue;
}
prepareAndWriteColumnDocument(data, field.fieldName, stopWordsDir, columnsDir);
prepareAndWriteColumnDocument(data, field.fieldName, columnsDir);
}

for (String col: getZColumnList()) {
prepareAndWriteColumnDocument(spark.emptyDataFrame(), col, stopWordsDir, columnsDir);
prepareAndWriteColumnDocument(spark.emptyDataFrame(), col, columnsDir);
}

LOG.info("Column Documents generation finishes");
}
private void prepareAndWriteColumnDocument(Dataset<Row> data, String fieldName, String stopWordsDir, String columnsDir) throws ZinggClientException {

private void prepareAndWriteColumnDocument(Dataset<Row> data, String fieldName, String columnsDir) throws ZinggClientException {
Map<String, Object> root = new HashMap<String, Object>();
root.put(TemplateFields.TITLE, fieldName);
root.put(TemplateFields.MODEL_ID, args.getModelId());
root = addStopWords(data, fieldName, root);

String filenameCSV = stopWordsDir + fieldName + ".csv";
String filenameHTML = columnsDir + fieldName + ".html";
writeDocument(CSV_TEMPLATE, root, filenameCSV);
writeDocument(HTML_TEMPLATE, root, filenameHTML);
}


public Map<String, Object> addStopWords(Dataset<Row> data, String fieldName, Map<String, Object> params) {
LOG.debug("Field: " + fieldName);
if(!data.isEmpty()) {
data = data.select(split(data.col(fieldName), "\\s+").as("split"));
data = data.select(explode(data.col("split")).as("word"));
data = data.filter(data.col("word").notEqual(""));
data = data.groupBy("word").count().orderBy(desc("count"));
data = data.limit(Math.round(data.count()*args.getStopWordsCutoff()));
if (isZColumn(fieldName)) {
writeDocument(Z_COLUMN_TEMPLATE, root, filenameHTML);
} else {
root = stopWordsDoc.addStopWords(data, fieldName, root);
writeDocument(COLUMN_DOC_TEMPLATE, root, filenameHTML);
}
params.put("stopWords", data.collectAsList());

return params;
}
}
}
4 changes: 4 additions & 0 deletions core/src/main/java/zingg/documenter/DocumenterBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,8 @@ protected void checkAndCreateDir(String dirName) {
/**
 * Returns the column names held in {@code zColList}.
 *
 * NOTE(review): the field itself is returned, not a copy, so a caller that
 * mutates the list would mutate this documenter's state. Presumably callers
 * only iterate over it; confirm before relying on that.
 *
 * @return the {@code zColList} field
 */
protected List<String> getZColumnList() {
return zColList;
}

/**
 * Tells whether a column name starts with {@link ColName#COL_PREFIX}.
 *
 * @param colName the column name to test; may be {@code null}
 * @return {@code true} if {@code colName} is non-null and starts with
 *         {@code ColName.COL_PREFIX}, {@code false} otherwise
 */
public boolean isZColumn(String colName) {
    // Null-safe: a missing name is simply "not a Z column" rather than a NullPointerException.
    return colName != null && colName.startsWith(ColName.COL_PREFIX);
}
}
49 changes: 49 additions & 0 deletions core/src/main/java/zingg/documenter/StopWordsDocumenter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package zingg.documenter;

import static org.apache.spark.sql.functions.desc;
import static org.apache.spark.sql.functions.explode;
import static org.apache.spark.sql.functions.split;

import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import zingg.client.Arguments;
import zingg.client.ZinggClientException;

/**
 * Derives candidate stop words for a field from its most frequent tokens and
 * writes them to a per-field CSV document under the Zingg documentation directory.
 */
public class StopWordsDocumenter extends DocumenterBase {
    protected static String name = "zingg.StopWordsDocumenter";
    public static final Log LOG = LogFactory.getLog(StopWordsDocumenter.class);

    /** FreeMarker template used to render the per-field stop-words CSV. */
    private static final String STOP_WORDS_CSV_TEMPLATE = "stopWordsCSVTemplate.ftl";

    /**
     * Creates a documenter bound to the given Spark session and arguments.
     *
     * @param spark the Spark session, passed to the superclass
     * @param args the Zingg arguments, passed to the superclass
     */
    public StopWordsDocumenter(SparkSession spark, Arguments args) {
        super(spark, args);
    }

    /**
     * Computes the most frequent words of {@code fieldName}, stores them in
     * {@code params} under the key {@code "stopWords"}, and writes the
     * stop-words CSV for the field as a side effect.
     *
     * <p>For a non-empty dataset the field is split on runs of whitespace, empty
     * tokens are dropped, occurrences are counted per word and sorted by count
     * descending. Only the first {@code round(distinctWords * cutoff)} rows are
     * kept, where the cutoff comes from {@code args.getStopWordsCutoff()}
     * (presumably a fraction between 0 and 1; confirm against Arguments).
     * An empty dataset is collected as-is and yields an empty list.
     *
     * @param data the input rows, or an empty dataset when no words should be computed
     * @param fieldName the column to analyse; also used as the CSV file name
     * @param params the template model to add the result to; it is mutated
     * @return the same {@code params} map, now containing {@code "stopWords"}
     * @throws ZinggClientException if writing the stop-words document fails
     */
    public Map<String, Object> addStopWords(Dataset<Row> data, String fieldName, Map<String, Object> params) throws ZinggClientException {
        LOG.debug("Field: " + fieldName);
        if (!data.isEmpty()) {
            data = data.select(split(data.col(fieldName), "\\s+").as("split"));
            data = data.select(explode(data.col("split")).as("word"));
            // Splitting can produce empty tokens (e.g. leading whitespace); ignore them.
            data = data.filter(data.col("word").notEqual(""));
            data = data.groupBy("word").count().orderBy(desc("count"));
            // data.count() is now the number of distinct words, so the limit is a share of those.
            data = data.limit(Math.round(data.count()*args.getStopWordsCutoff()));
        }
        params.put("stopWords", data.collectAsList());

        writeStopWordsDocument(fieldName, params);

        return params;
    }

    /**
     * Renders the stop-words CSV template for one field into
     * {@code <zinggDocDir>/stopWords/<fieldName>.csv}, creating the directory first.
     *
     * @param fieldName the field whose stop words are written; used as the file name
     * @param root the template model passed to the CSV template
     * @throws ZinggClientException if the document cannot be written
     */
    public void writeStopWordsDocument(String fieldName, Map<String, Object> root) throws ZinggClientException {
        String stopWordsDir = args.getZinggDocDir() + "/stopWords/";
        checkAndCreateDir(stopWordsDir);
        String filenameCSV = stopWordsDir + fieldName + ".csv";
        writeDocument(STOP_WORDS_CSV_TEMPLATE, root, filenameCSV);
    }
}
24 changes: 24 additions & 0 deletions core/src/main/resources/columnDocTemplate.ftlh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<#--
  Page template for the documentation of a regular (non Z) field.
  Model values read here: title (field name) and modelId.
  The stop-words table is rendered by the included stopWordsDocTemplate.ftlh,
  which reads the stopWords sequence from the same model.
-->
<html>
<head>
<title>${title!}</title>
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
<#-- Styles belong in <head>; markup after </html> is invalid HTML. -->
<style>
.header{
position:sticky;
top: 0 ;
}
</style>
</head>
<body>
<nav class="navbar navbar-light bg-light">
<a class="navbar-brand" href="https://www.zingg.ai">
<img src="https://github.com/zinggai/zingg/raw/main/assets/zinggWhiteTransparent.png" class="d-inline-block align-top" alt="">
</a>
<h1> Field - ${title!} </h1>
<a href="../model.html">
<div class="justify-content-end">Model ${modelId}</div>
</a>
</nav>
<#include "stopWordsDocTemplate.ftlh">
</body>
</html>
22 changes: 22 additions & 0 deletions core/src/main/resources/stopWordsDocTemplate.ftlh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<#--
  Fragment that renders the stop-words table; meant to be pulled in with #include.
  Reads the stopWords sequence from the model. For each entry, index 0 is printed
  as the word and index 1 as its count, formatted with the "0" number pattern
  (integer digits only, no grouping separators).
-->
<p>
<table class="table table-borderless" style="width:auto" >
<thead class="thead thead-dark">
</thead>
<tbody>
<tr>
<th>StopWord</th>
<th>Count</th>
</tr>
<#list stopWords as words>
<tr>
<td>
${words[0]!}
</td>
<td>
${words[1]?string["0"]!}
</td>
</tr>
</#list>
</tbody>
</table>
</p>
49 changes: 0 additions & 49 deletions core/src/main/resources/stopWordsHTML.ftlh

This file was deleted.

49 changes: 49 additions & 0 deletions core/src/main/resources/zColumnTemplate.ftlh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<#--
  Page template for the documentation of a Zingg-internal (Z) column.
  Model values read here: title (column name) and modelId.
  The body is a fixed description chosen by the column name; unknown names
  fall through to a generic sentence.
-->
<html>
<head>
<title>${title!}</title>
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
<#-- Styles belong in <head>; markup after </html> is invalid HTML. -->
<style>
.header{
position:sticky;
top: 0 ;
}
</style>
</head>

<body>
<nav class="navbar navbar-light bg-light">
<a class="navbar-brand" href="https://www.zingg.ai">
<img src="https://github.com/zinggai/zingg/raw/main/assets/zinggWhiteTransparent.png" class="d-inline-block align-top" alt="">
</a>
<h1> Field - ${title!} </h1>
<a href="../model.html">
<div class="justify-content-end">Model ${modelId}</div>
</a>
</nav>
<#-- Each branch emits its own <p>; no outer <p> wrapper, since paragraphs cannot nest. -->
<#if title == "z_cluster">
<p>z_cluster - identifies a group of records which match or don't match with each other. For each group, z_cluster is unique. Member records of a group share the same z_cluster.</p>
<#elseif title == "z_zid">
<p>z_zid - an internal id given by Zingg to uniquely identify the record.</p>
<#elseif title == "z_prediction">
<p>z_prediction - what Zingg thinks about this group/pair of records - 0 for not a match, 1 for a match.</p>
<#elseif title == "z_score">
<p>z_score - the probability of a pair of records matching. The higher the score, the more likely they are a match.</p>
<#elseif title == "z_isMatch">
<p>z_isMatch - this is the label provided by the user.</p>
<#elseif title == "z_source">
<p>z_source - the source of data as set in the name property of the data in the Zingg configuration file.</p>
<#else>
<p>${title} - this field is internally used by Zingg.</p>
</#if>
</body>
</html>