Skip to content

Commit

Permalink
Citavi Importer - Import all knowledge items (#9043)
Browse files Browse the repository at this point in the history
* Import all knowledge items
Filter out empty strings and replace quot

* Use HtmlToLatexFormatter

* Use StringJoiner (instead of StringBuilder)

* Add trimming

* HtmlFormatter keeps <

* Fix 3 bib

* Add cleanUpText

Co-authored-by: Christoph <[email protected]>

* Fix bib files

Co-authored-by: Christoph <[email protected]>

* fix checkstyle

* checkstyle import

* Add quotationIndex and quotationTypes as well

* Adjust tests

* make checkstyle happy

* fix empty line

Co-authored-by: Oliver Kopp <[email protected]>
  • Loading branch information
Siedlerchr and koppor authored Aug 13, 2022
1 parent 0b58079 commit bd77d73
Show file tree
Hide file tree
Showing 10 changed files with 3,433 additions and 79 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve

### Fixed

- The [HtmlToLaTeXFormatter](https://docs.jabref.org/finding-sorting-and-cleaning-entries/saveactions#html-to-latex) keeps single `<` characters.
- We fixed a performance regression when opening large libraries [#9041](https://github.com/JabRef/jabref/issues/9041)

### Removed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,12 @@ public String format(String text) {
int c = result.charAt(i);

if (c == '<') {
int oldI = i;
i = readTag(result, i);
if (oldI == i) {
// just a single <, which needs to be kept
sb.append('<');
}
} else {
sb.append((char) c);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.StringJoiner;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
Expand All @@ -28,6 +31,7 @@
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.jabref.logic.formatter.bibtexfields.HtmlToLatexFormatter;
import org.jabref.logic.formatter.bibtexfields.NormalizePagesFormatter;
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.Parser;
Expand Down Expand Up @@ -59,6 +63,8 @@ public class CitaviXmlImporter extends Importer implements Parser {
private static final Logger LOGGER = LoggerFactory.getLogger(CitaviXmlImporter.class);
private static final byte UUID_LENGTH = 36;
private static final byte UUID_SEMICOLON_OFFSET_INDEX = 37;
private static final EnumSet<QuotationTypeMapping> QUOTATION_TYPES = EnumSet.allOf(QuotationTypeMapping.class);
private final HtmlToLatexFormatter htmlToLatexFormatter = new HtmlToLatexFormatter();
private final NormalizePagesFormatter pagesFormatter = new NormalizePagesFormatter();

private final Map<String, Author> knownPersons = new HashMap<>();
Expand Down Expand Up @@ -363,17 +369,40 @@ private String getPublisher(CitaviExchangeData.References.Reference data) {
}

/**
 * Builds the comment text for a reference from all knowledge items attached to it.
 * <p>
 * Every knowledge item whose reference ID equals the ID of {@code data} contributes the following
 * parts, each only when present: the core statement as a {@code "# "} heading, the text, the page
 * range number (skipped when it is -1), the name of the quotation type, and the quotation index.
 * All parts of all items are separated by a blank line.
 *
 * @param data the reference whose knowledge items are collected
 * @return the joined comment text; the empty string when no part was added
 */
private String getKnowledgeItem(CitaviExchangeData.References.Reference data) {
    StringJoiner comment = new StringJoiner("\n\n");
    List<KnowledgeItem> foundItems = knowledgeItems.getKnowledgeItem().stream()
            .filter(p -> data.getId().equals(p.getReferenceID()))
            .toList();
    for (KnowledgeItem knowledgeItem : foundItems) {
        Optional<String> title = Optional.ofNullable(knowledgeItem.getCoreStatement()).filter(Predicate.not(String::isEmpty));
        title.ifPresent(t -> comment.add("# " + cleanUpText(t)));

        Optional<String> text = Optional.ofNullable(knowledgeItem.getText()).filter(Predicate.not(String::isEmpty));
        text.ifPresent(t -> comment.add(cleanUpText(t)));

        // -1 is treated as "no page range" and is not written to the comment
        Optional<Integer> pages = Optional.ofNullable(knowledgeItem.getPageRangeNumber()).filter(range -> range != -1);
        pages.ifPresent(p -> comment.add("page range: " + p));

        // Translate the numeric quotation type into its name; unknown numbers are skipped
        Optional<String> quotationTypeDesc = Optional.ofNullable(knowledgeItem.getQuotationType()).flatMap(type ->
                QUOTATION_TYPES.stream()
                               .filter(qt -> type == qt.getCitaviIndexType())
                               .map(QuotationTypeMapping::getName).findFirst());
        quotationTypeDesc.ifPresent(qt -> comment.add(String.format("quotation type: %s", qt)));

        Optional<Short> quotationIndex = Optional.ofNullable(knowledgeItem.getQuotationIndex());
        quotationIndex.ifPresent(index -> comment.add(String.format("quotation index: %d", index)));
    }
    return comment.toString();
}

StringBuilder comment = new StringBuilder();
Optional<String> title = knowledgeItem.map(item -> item.getCoreStatement());
title.ifPresent(t -> comment.append("# ").append(t).append("\n\n"));
Optional<String> text = knowledgeItem.map(item -> item.getText());
text.ifPresent(t -> comment.append(t).append("\n\n"));
Optional<Integer> pages = knowledgeItem.map(item -> item.getPageRangeNumber()).filter(range -> range != -1);
pages.ifPresent(p -> comment.append("page range: ").append(p));
/**
 * Prepares a piece of knowledge-item text for use in the comment field.
 * <p>
 * Spaces directly in front of a line break are removed first. Afterwards every opening and
 * closing curly brace that is not already preceded by a backslash gets a backslash prepended.
 *
 * @param text the raw text
 * @return the text without spaces before line breaks and with its curly braces escaped
 */
String cleanUpText(String text) {
    // The negative lookbehind leaves braces that are already escaped untouched.
    return removeSpacesBeforeLineBreak(text)
            .replaceAll("(?<!\\\\)\\{", "\\\\{")
            .replaceAll("(?<!\\\\)}", "\\\\}");
}

return comment.toString();
/**
 * Strips runs of space characters that directly precede a line break.
 * Both {@code \r\n} and {@code \n} line breaks are handled; the line break itself is kept.
 *
 * @param string the text to process
 * @return the text without spaces in front of line breaks
 */
private String removeSpacesBeforeLineBreak(String string) {
    // Windows line endings first: the "\n" pattern cannot see spaces in front of the "\r".
    String withoutSpacesBeforeCrLf = string.replaceAll(" +\r\n", "\r\n");
    String withoutSpacesBeforeLf = withoutSpacesBeforeCrLf.replaceAll(" +\n", "\n");
    return withoutSpacesBeforeLf;
}

private void initUnmarshaller() throws JAXBException {
Expand Down Expand Up @@ -453,8 +482,35 @@ private static InputStream checkForUtf8BOMAndDiscardIfAny(InputStream inputStrea
}

private String clean(String input) {
return StringUtil.unifyLineBreaks(input, " ")
String result = StringUtil.unifyLineBreaks(input, " ")
.trim()
.replaceAll(" +", " ");
return htmlToLatexFormatter.format(result);
}

/**
 * Maps the numeric quotation type of a knowledge item to a human-readable name.
 * Each constant is immutable: the number and the name are fixed at construction.
 */
enum QuotationTypeMapping {
    IMAGE_QUOTATION(0, "Image quotation"),
    DIRECT_QUOTATION(1, "Direct quotation"),
    INDIRECT_QUOTATION(2, "Indirect quotation"),
    SUMMARY(3, "Summary"),
    COMMENT(4, "Comment"),
    HIGHLIGHT(5, "Highlight"),
    HIGHLIGHT_RED(6, "Highlight in red");

    // Enum constants are shared singletons, so their state must not be reassignable.
    private final int citaviType;
    private final String name;

    QuotationTypeMapping(int citaviType, String name) {
        this.name = name;
        this.citaviType = citaviType;
    }

    /**
     * @return the human-readable name of this quotation type
     */
    String getName() {
        return name;
    }

    /**
     * @return the numeric type this constant was declared with
     */
    int getCitaviIndexType() {
        return citaviType;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import org.jabref.logic.layout.LayoutFormatter;

/**
* Remove non printable character formatter.
* Remove non-printable character formatter.
*/
public class RemoveWhitespace implements LayoutFormatter {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ public void testHTMLCombiningAccents() {
assertEquals("{\\\"{a}}b", formatter.format("a&#x308;b"));
}

@Test
public void keepsSingleLessThan() {
    // A '<' that does not start an HTML tag has to survive formatting unchanged.
    String input = "(p < 0.01)";
    String formatted = formatter.format(input);
    assertEquals(input, formatted);
}

@Test
public void formatExample() {
assertEquals("JabRef", formatter.format(formatter.getExampleInput()));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package org.jabref.logic.importer.fileformat;

import java.util.stream.Stream;

import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for the curly-brace escaping done by {@code CitaviXmlImporter#cleanUpText(String)}.
 */
class CitaviXmlImporterTest {

    CitaviXmlImporter citaviXmlImporter = new CitaviXmlImporter();

    /**
     * Supplies the argument pairs for the parameterized test of the same name.
     * Each pair is (expected output, input).
     */
    public static Stream<Arguments> cleanUpText() {
        Arguments textWithoutBraces = Arguments.of("no action", "no action");
        Arguments textInBraces = Arguments.of("\\{action\\}", "{action}");
        Arguments singleClosingBrace = Arguments.of("\\}", "}");
        return Stream.of(textWithoutBraces, textInBraces, singleClosingBrace);
    }

    @ParameterizedTest
    @MethodSource
    void cleanUpText(String expected, String input) {
        String actual = citaviXmlImporter.cleanUpText(input);
        assertEquals(expected, actual);
    }
}
Loading

0 comments on commit bd77d73

Please sign in to comment.