Skip to content

Commit

Permalink
improve the way the candidate identification for bad figures/tables i…
Browse files Browse the repository at this point in the history
…s loosen up
  • Loading branch information
lfoppiano committed Dec 19, 2024
1 parent f036e0a commit b0b5051
Showing 1 changed file with 21 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -277,19 +277,28 @@ else if (config.getConsolidateCitations() == 2)
}
}

long numberFiguresFulltextModel = Arrays.stream(bodyResults.split("\n"))
.filter(r -> r.endsWith("I-" + TaggingLabels.FIGURE_LABEL))
.count();

List<Figure> badFigures = figures.stream()
.filter(f -> !f.isCompleteForTEI())
.collect(Collectors.toList());

LOGGER.info("Identified bad figures: " + badFigures.size());
bodyResults = revertResultsForBadItems(badFigures, bodyResults, TaggingLabels.FIGURE_LABEL);
LOGGER.info("Number of figures badly formatted or incomplete we identified: " + badFigures.size());
bodyResults = revertResultsForBadItems(badFigures, bodyResults, TaggingLabels.FIGURE_LABEL,
!(figures.size() > numberFiguresFulltextModel));

figures = figures.stream()
.filter(f -> !badFigures.contains(f))
.collect(Collectors.toList());

tables = processTables(bodyResults, bodyLayoutTokens.getTokenization(), doc);

// Count the labelled result lines that end with the "I-" prefixed TABLE label.
// The count is compared with tables.size() to decide whether the candidate lookup for bad
// tables runs in strict mode, so it must be based on TABLE_LABEL; using FIGURE_LABEL here
// would compare the number of tables against the number of figures.
long numberTablesFulltextModel = Arrays.stream(bodyResults.split("\n"))
.filter(r -> r.endsWith("I-" + TaggingLabels.TABLE_LABEL))
.count();

// We deal with tables considered bad by reverting them to <paragraph>, to reduce the risk of them
// being dropped later on.

Expand All @@ -299,8 +308,9 @@ else if (config.getConsolidateCitations() == 2)
.filter(t -> !(t.isCompleteForTEI() && t.validateTable()))
.collect(Collectors.toList());

LOGGER.info("Identified bad tables: " + badTables.size());
bodyResults = revertResultsForBadItems(badTables, bodyResults, TaggingLabels.TABLE_LABEL);
LOGGER.info("Number of tables badly formatted or incomplete we identified: " + badTables.size());
bodyResults = revertResultsForBadItems(badTables, bodyResults, TaggingLabels.TABLE_LABEL,
!(tables.size() > numberTablesFulltextModel));

tables = tables.stream()
.filter(t-> !badTables.contains(t))
Expand Down Expand Up @@ -367,21 +377,21 @@ else if (config.getConsolidateCitations() == 2)
}

/**
 * Convenience overload that reverts the given bad items using strict candidate matching.
 *
 * @param badFiguresOrTables the figures or tables to revert
 * @param resultBody the labelled result sequence to update
 * @param itemLabel the label of the items being reverted
 * @return the result of the four-argument overload called with {@code strict} set to {@code true}
 */
static String revertResultsForBadItems(List<? extends Figure> badFiguresOrTables, String resultBody, String itemLabel) {
    // Callers that do not specify a matching mode get the strict one.
    final boolean strict = true;
    return revertResultsForBadItems(badFiguresOrTables, resultBody, itemLabel, strict);
}

static String revertResultsForBadItems(List<? extends Figure> badFiguresOrTables, String resultBody, String itemLabel, boolean strict) {
//LF: we update the resultBody sequence by reverting these tables as <paragraph> elements
if (CollectionUtils.isNotEmpty(badFiguresOrTables)) {
List<List<String>> labelledResultsAsList = Arrays.stream(resultBody.split("\n"))
.map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList()))
.collect(Collectors.toList());

long numberItems = labelledResultsAsList.stream()
.filter(r -> Iterables.getLast(r).startsWith("I-" + itemLabel))
.count();

for (Figure badItem : badFiguresOrTables) {
// Find the index of the first layoutToken of the table in the tokenization
List<LayoutToken> layoutTokenItem = badItem.getLayoutTokens();
List<Integer> candidateIndexes = findCandidateIndex(layoutTokenItem, labelledResultsAsList,
itemLabel, !(badFiguresOrTables.size() > numberItems));
itemLabel, strict);
if (candidateIndexes.isEmpty()) {
LOGGER.info("Cannot find the candidate index for fixing the tables.");
continue;
Expand Down Expand Up @@ -455,7 +465,8 @@ static int consolidateResultCandidateThroughSequence(List<Integer> candidateInde
* Find a set of candidates representing the indexes from the labelledResults which could correspond
* to the first token of the figure/table
*
* strict = True check the I-<table> or I-<figure> first and then the <table> or <figure> only if there are not candidates
* strict = True: candidates are first searched among the items labelled I-<table> or I-<figure>,
* and among the <table> or <figure> items only if no candidate was found
* strict = False: usually necessary when there are more tables than I- tokens, because a figure/table
* could be identified within the sequence initially provided by the fulltext model
*
Expand Down

0 comments on commit b0b5051

Please sign in to comment.