Skip to content

Commit

Permalink
Merge pull request #322 from ncats/html_truncation_fix_mm
Browse the repository at this point in the history
Fix and optimize HTML string truncation
  • Loading branch information
blueSwordfish authored Mar 19, 2024
2 parents 18f2b20 + 2fc8b36 commit ff6a675
Showing 1 changed file with 15 additions and 12 deletions.
Original file line number Diff line number Diff line change
@@ -54,15 +54,20 @@ public void head(Node node, int depth) {
String curText = ((TextNode) node).getWholeText();
if (resHtmlLen + nodeHtmlLen > maxLen) {
StringBuilder sb = new StringBuilder(curText);
int curHtmlLen = maxNodeLen + 1;
int curHtmlLen = maxNodeLen;
sb.setLength(curHtmlLen);
sb.setLength(sb.length() - (Long.valueOf(sb.chars().filter(c -> c == '&').count()).intValue() * 4));
sb.setLength(sb.length() - (Long.valueOf(sb.chars().filter(c -> (c == '<' || c == '>')).count()).intValue() * 3));
curHtmlLen += Long.valueOf(sb.chars().filter(c -> c == '&').count()).intValue() * 4;
curHtmlLen += Long.valueOf(sb.chars().filter(c -> (c == '<' || c == '>')).count()).intValue() * 3;
while (curHtmlLen > maxNodeLen) {
char lastChar = sb.charAt(sb.length() - 1);
if (lastChar == '&') {
curHtmlLen = curHtmlLen - 5;
} else if (lastChar == '<' || lastChar == '>') {
curHtmlLen = curHtmlLen - 4;
} else {
curHtmlLen = curHtmlLen - 1;
}
sb.setLength(sb.length() - 1);
curHtmlLen = sb.toString().getBytes(StandardCharsets.UTF_8).length;
curHtmlLen += Long.valueOf(sb.chars().filter(c -> c == '&').count()).intValue() * 4;
curHtmlLen += Long.valueOf(sb.chars().filter(c -> (c == '<' || c == '>')).count()).intValue() * 3;
}
cur.appendText(sb.toString());
throw new IllegalStateException();
@@ -84,6 +89,9 @@ public void tail(Node node, int depth) {
public static String truncate(String s, int len){
Document srcDoc = Parser.parseBodyFragment(s, "");
srcDoc.outputSettings().prettyPrint(false);
if (srcDoc.body().html().length() <= len) {
return srcDoc.body().html();
}

int maxLength = len-3; //for final ...

@@ -98,12 +106,7 @@ public static String truncate(String s, int len){
t.traverse(v, srcDoc.body());
} catch (IllegalStateException ex) {}

String htmlReturn = dst.html();
if(htmlReturn.length() < s.length() && htmlReturn.length() <= maxLength){
return htmlReturn + "...";
}else{
return htmlReturn;
}
return dst.html() + "...";
}

public static String clean(String content, String charset) {
Expand Down

0 comments on commit ff6a675

Please sign in to comment.