From 5e27b55495a49adf018eb0ee1c60798ec07a542d Mon Sep 17 00:00:00 2001 From: Martin Wiesner Date: Thu, 16 Jan 2025 12:09:51 +0100 Subject: [PATCH] OPENNLP-1695: Add more tests for classes in formats package - introduces AbstractSampleStreamFactoryTest as common base class - reduces code duplication in format factory classes - adds a ton of new test classes for format factories (+ sample data) - adds two more Evalita samples taken from Appendix of: https://www.evalita.it/wp-content/uploads/2021/11/Guidelines_evalita09_NER.pdf - adds two OntoNotes samples from the public, official v5.0 release notes (sec 6.4 + 6.8), see: https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf --- .../tools/chunker/ChunkSampleStream.java | 12 +- .../formats/AbstractSampleStreamFactory.java | 48 +++++ .../formats/BioNLP2004NameSampleStream.java | 38 ++-- .../BioNLP2004NameSampleStreamFactory.java | 30 ++-- .../formats/ChunkerSampleStreamFactory.java | 28 +-- .../Conll02NameSampleStreamFactory.java | 13 +- .../Conll03NameSampleStreamFactory.java | 12 +- .../formats/ConllXPOSSampleStreamFactory.java | 26 +-- .../ConllXSentenceSampleStreamFactory.java | 16 +- .../ConllXTokenSampleStreamFactory.java | 14 +- .../formats/DocumentSampleStreamFactory.java | 27 +-- .../formats/EvalitaNameSampleStream.java | 59 ++++--- .../EvalitaNameSampleStreamFactory.java | 13 +- .../LanguageDetectorSampleStreamFactory.java | 31 +--- .../LemmatizerSampleStreamFactory.java | 28 +-- .../formats/NameSampleDataStreamFactory.java | 29 +-- .../formats/ParseSampleStreamFactory.java | 26 +-- .../formats/SentenceSampleStreamFactory.java | 28 +-- .../formats/TokenSampleStreamFactory.java | 28 +-- .../formats/TwentyNewsgroupSampleStream.java | 9 + .../TwentyNewsgroupSampleStreamFactory.java | 54 +++--- .../formats/WordTagSampleStreamFactory.java | 27 +-- .../tools/formats/ad/ADChunkSampleStream.java | 4 +- .../ad/ADChunkSampleStreamFactory.java | 32 +--- .../tools/formats/ad/ADNameSampleStream.java | 4 
+- .../formats/ad/ADNameSampleStreamFactory.java | 32 +--- .../tools/formats/ad/ADPOSSampleStream.java | 2 +- .../formats/ad/ADPOSSampleStreamFactory.java | 33 +--- .../formats/ad/ADSentenceSampleStream.java | 15 +- .../ad/ADSentenceSampleStreamFactory.java | 34 +--- .../tools/formats/ad/ADSentenceStream.java | 40 +---- .../ad/ADTokenSampleStreamFactory.java | 12 +- .../ad/PortugueseContractionUtility.java | 9 +- .../formats/brat/AnnotationConfiguration.java | 114 +++++++----- .../formats/brat/BratAnnotationStream.java | 11 +- .../tools/formats/brat/BratDocument.java | 35 ++-- .../formats/brat/BratNameSampleStream.java | 44 +++-- .../brat/BratNameSampleStreamFactory.java | 47 +++-- .../formats/brat/SegmenterObjectStream.java | 1 + .../ConlluLemmaSampleStreamFactory.java | 15 +- .../conllu/ConlluPOSSampleStreamFactory.java | 16 +- .../tools/formats/conllu/ConlluSentence.java | 4 +- .../ConlluSentenceSampleStreamFactory.java | 16 +- .../ConlluTokenSampleStreamFactory.java | 17 +- .../NameToSentenceSampleStreamFactory.java | 9 +- .../NameToTokenSampleStreamFactory.java | 11 +- .../POSToSentenceSampleStreamFactory.java | 11 +- .../POSToTokenSampleStreamFactory.java | 11 +- .../ParseToPOSSampleStreamFactory.java | 19 +- .../ParseToSentenceSampleStreamFactory.java | 18 +- .../ParseToTokenSampleStreamFactory.java | 17 +- .../ConstitDocumentHandler.java | 3 +- .../ConstitParseSampleStreamFactory.java | 19 +- ...rishSentenceBankSentenceStreamFactory.java | 21 +-- ...hSentenceBankTokenSampleStreamFactory.java | 21 +-- .../LeipzigLanguageSampleStreamFactory.java | 32 ++-- .../formats/leipzig/SampleShuffleStream.java | 17 +- .../formats/leipzig/SampleSkipStream.java | 14 +- .../letsmt/LetsmtSentenceStreamFactory.java | 22 +-- .../java/opennlp/tools/formats/masc/Masc.java | 27 +++ .../tools/formats/masc/MascDocument.java | 1 - .../MascNamedEntitySampleStreamFactory.java | 53 +++--- .../masc/MascPOSSampleStreamFactory.java | 52 +++--- .../tools/formats/masc/MascSentence.java | 4 
+- .../masc/MascSentenceSampleStreamFactory.java | 53 +++--- .../opennlp/tools/formats/masc/MascToken.java | 3 + .../masc/MascTokenSampleStreamFactory.java | 54 +++--- .../opennlp/tools/formats/masc/MascWord.java | 3 + .../moses/MosesSentenceSampleStream.java | 12 +- .../MosesSentenceSampleStreamFactory.java | 35 ++-- .../muc/Muc6NameSampleStreamFactory.java | 28 ++- .../tools/formats/muc/MucElementNames.java | 6 +- .../formats/muc/MucNameSampleStream.java | 2 +- .../nkjp/NKJPSegmentationDocument.java | 58 +++--- .../nkjp/NKJPSentenceSampleStream.java | 4 +- .../nkjp/NKJPSentenceSampleStreamFactory.java | 23 ++- .../tools/formats/nkjp/NKJPTextDocument.java | 6 +- .../ontonotes/OntoNotesNameSampleStream.java | 111 ++++++------ .../OntoNotesNameSampleStreamFactory.java | 25 ++- .../OntoNotesPOSSampleStreamFactory.java | 19 +- .../ontonotes/OntoNotesParseSampleStream.java | 2 +- .../OntoNotesParseSampleStreamFactory.java | 30 +++- .../tools/namefind/NameSampleDataStream.java | 10 +- .../tools/parser/ParseSampleStream.java | 4 + .../tools/postag/WordTagSampleStream.java | 12 +- .../AbstractSampleStreamFactoryTest.java | 68 +++++++ .../formats/AbstractSampleStreamTest.java | 3 +- ...BioNLP2004NameSampleStreamFactoryTest.java | 113 ++++++++++++ .../ChunkerSampleStreamFactoryTest.java | 79 +++++++++ .../Conll02NameSampleStreamFactoryTest.java | 127 +++++++++++++ .../Conll03NameSampleStreamFactoryTest.java | 127 +++++++++++++ .../ConllXPOSSampleStreamFactoryTest.java | 78 ++++++++ ...ConllXSentenceSampleStreamFactoryTest.java | 99 +++++++++++ .../ConllXTokenSampleStreamFactoryTest.java | 98 ++++++++++ .../EvalitaNameSampleStreamFactoryTest.java | 107 +++++++++++ .../formats/EvalitaNameSampleStreamTest.java | 88 ++++++--- ...nguageDetectorSampleStreamFactoryTest.java | 79 +++++++++ .../LemmatizerSampleStreamFactoryTest.java | 79 +++++++++ .../NameSampleDataStreamFactoryTest.java | 85 +++++++++ .../formats/ParseSampleStreamFactoryTest.java | 80 +++++++++ 
.../SentenceSampleStreamFactoryTest.java | 79 +++++++++ .../formats/TokenSampleStreamFactoryTest.java | 79 +++++++++ ...wentyNewsgroupSampleStreamFactoryTest.java | 151 ++++++++++++++++ .../WordTagSampleStreamFactoryTest.java | 83 +++++++++ .../ad/ADChunkSampleStreamFactoryTest.java | 102 +++++++++++ .../ad/ADPOSSampleStreamFactoryTest.java | 105 +++++++++++ .../formats/ad/ADParagraphStreamTest.java | 2 +- .../ad/ADSentenceSampleStreamFactoryTest.java | 105 +++++++++++ .../ad/ADTokenSampleStreamFactoryTest.java | 108 +++++++++++ .../formats/ad/ADTokenSampleStreamTest.java | 6 +- .../ad/AbstractADSampleStreamTest.java | 4 +- .../brat/BratAnnotationStreamTest.java | 39 ++-- .../brat/BratNameSampleStreamFactoryTest.java | 167 ++++++++++++++++++ .../brat/BratNameSampleStreamTest.java | 17 +- .../ConlluLemmaSampleStreamFactoryTest.java | 113 ++++++++++++ .../ConlluPOSSampleStreamFactoryTest.java | 113 ++++++++++++ ...ConlluSentenceSampleStreamFactoryTest.java | 99 +++++++++++ .../ConlluTokenSampleStreamFactoryTest.java | 82 +++++++++ ...NameToSentenceSampleStreamFactoryTest.java | 101 +++++++++++ .../NameToTokenSampleStreamFactoryTest.java | 101 +++++++++++ .../POSToSentenceSampleStreamFactoryTest.java | 101 +++++++++++ .../POSToTokenSampleStreamFactoryTest.java | 101 +++++++++++ .../ParseToPOSSampleStreamFactoryTest.java | 81 +++++++++ ...arseToSentenceSampleStreamFactoryTest.java | 101 +++++++++++ .../ParseToTokenSampleStreamFactoryTest.java | 101 +++++++++++ .../ConstitParseSampleStreamFactoryTest.java | 93 ++++++++++ ...SentenceBankSentenceStreamFactoryTest.java | 83 +++++++++ ...tenceBankTokenSampleStreamFactoryTest.java | 83 +++++++++ ...eipzigLanguageSampleStreamFactoryTest.java | 110 ++++++++++++ .../LetsmtSentenceStreamFactoryTest.java | 84 +++++++++ ...ascNamedEntitySampleStreamFactoryTest.java | 102 +++++++++++ .../masc/MascPOSSampleStreamFactoryTest.java | 102 +++++++++++ .../MascSentenceSampleStreamFactoryTest.java | 102 +++++++++++ 
.../MascTokenSampleStreamFactoryTest.java | 102 +++++++++++ .../MosesSentenceSampleStreamFactoryTest.java | 82 +++++++++ .../muc/Muc6NameSampleStreamFactoryTest.java | 114 ++++++++++++ .../NKJPSentenceSampleStreamFactoryTest.java | 102 +++++++++++ .../OntoNotesNameSampleStreamFactoryTest.java | 96 ++++++++++ .../OntoNotesPOSSampleStreamFactoryTest.java | 96 ++++++++++ ...OntoNotesParseSampleStreamFactoryTest.java | 102 +++++++++++ .../20newsgroup/sci.electronics/52794.sample | 59 +++++++ .../opennlp/tools/formats/{ => ad}/ad.sample | 0 .../tools/formats/bionlp2004-01.sample | 33 ++++ .../opennlp/tools/formats/brat/brat-ann.conf | 7 + .../opennlp/tools/formats/chunker-01.sample | 16 ++ ...ner-it.sample => evalita-ner-it-01.sample} | 0 .../tools/formats/evalita-ner-it-02.sample | 29 +++ .../tools/formats/evalita-ner-it-03.sample | 22 +++ .../formats/evalita-ner-it-broken.sample | 2 + .../formats/evalita-ner-it-incorrect.sample | 3 + .../tools/formats/lang-detect-01.sample | 1 + .../opennlp/tools/formats/lemma-01.sample | 1 + .../tools/formats/moses/moses-tiny.sample | 3 + .../opennlp/tools/formats/muc/LDC2003T13.sgm | 73 ++++++++ .../opennlp/tools/formats/name-data-01.sample | 1 + .../ontonotes/ontonotes-sample-01.name | 9 + .../ontonotes/ontonotes-sample-02.parse | 29 +++ .../opennlp/tools/formats/parse-01.sample | 1 + .../opennlp/tools/formats/sentences-01.sample | 2 + .../opennlp/tools/formats/tokens-01.sample | 1 + .../opennlp/tools/formats/word-tags-01.sample | 1 + 161 files changed, 6152 insertions(+), 1060 deletions(-) create mode 100644 opennlp-tools/src/main/java/opennlp/tools/formats/masc/Masc.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/AbstractSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ChunkerSampleStreamFactoryTest.java create mode 100644 
opennlp-tools/src/test/java/opennlp/tools/formats/Conll02NameSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/Conll03NameSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ConllXPOSSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ConllXTokenSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/LemmatizerSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/NameSampleDataStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ParseSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/SentenceSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/TokenSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/WordTagSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactoryTest.java create mode 100644 
opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactoryTest.java create mode 100644 
opennlp-tools/src/test/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactoryTest.java create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/20newsgroup/sci.electronics/52794.sample rename opennlp-tools/src/test/resources/opennlp/tools/formats/{ => ad}/ad.sample (100%) create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/bionlp2004-01.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/brat/brat-ann.conf create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/chunker-01.sample rename opennlp-tools/src/test/resources/opennlp/tools/formats/{evalita-ner-it.sample => evalita-ner-it-01.sample} (100%) create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-02.sample create mode 100644 
opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-03.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-broken.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-incorrect.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/lang-detect-01.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/lemma-01.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/moses/moses-tiny.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/muc/LDC2003T13.sgm create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/name-data-01.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/ontonotes/ontonotes-sample-01.name create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/ontonotes/ontonotes-sample-02.parse create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/parse-01.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/sentences-01.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/tokens-01.sample create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/word-tags-01.sample diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java index 2bc23eedf9..53f6e979a6 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java @@ -31,8 +31,8 @@ * Parses the conll 2000 shared task shallow parser training data. *

* Data format is specified on the conll page:
- * - * http://www.cnts.ua.ac.be/conll2000/chunking/ + * + * https://www.cnts.ua.ac.be/conll2000/chunking/ */ public class ChunkSampleStream extends FilterObjectStream { @@ -57,7 +57,7 @@ public ChunkSample read() throws IOException { for (String line = samples.read(); line != null && !line.isEmpty(); line = samples.read()) { String[] parts = line.split(" "); if (parts.length != 3) { - logger.error("Skipping corrupt line: {}", line); + logger.warn("Skipping corrupt line: {}", line); } else { toks.add(parts[0]); @@ -66,11 +66,11 @@ public ChunkSample read() throws IOException { } } - if (toks.size() > 0) { + if (!toks.isEmpty()) { return new ChunkSample(toks.toArray(new String[0]), tags.toArray(new String[0]), preds.toArray(new String[0])); + } else { + return null; } - - return null; } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java index 663980ac5a..da9a36d18e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java @@ -17,7 +17,15 @@ package opennlp.tools.formats; +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; /** * Base class for sample stream factories. @@ -40,4 +48,44 @@ public String getLang() { public Class

getParameters() { return params; } + + /** + * Creates an {@link ObjectStream} for the specified arguments and + * the generic type {@code P}. + * + * @param args A set of command line arguments. + * @return The created {@link ObjectStream} instance. + */ + protected

ObjectStream readData(String[] args, + Class

parametersClass) { + P params = validateBasicFormatParameters(args, parametersClass); + ObjectStream lineStream = null; + try { + InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); + lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); + } catch (IOException ex) { + CmdLineUtil.handleCreateObjectStreamError(ex); + } + return lineStream; + } + + /** + * Validates the specified arguments ({@code args}) given the + * context of the generic type {@code P} which provides at least all + * {@link BasicFormatParams}. + * + * @implNote Additional checks for the basic {@code -data} argument are conducted, that is + * whether the file exists or not. + * + * @param args A set of command line arguments. + * @return The parsed (basic format) parameter instance. + */ + protected

P validateBasicFormatParameters(String[] args, Class

clazz) { + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } + P params = ArgumentParser.parse(args, clazz); + CmdLineUtil.checkInputFile("Data", params.getData()); + return params; + } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java index e2ad4ef43a..5bdff9327f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java @@ -45,25 +45,24 @@ *

* Data can be found on this * website, - * or in - * this repository. + * or in this + * GitHub repository. *

- * The BioNLP/NLPBA 2004 data were originally published here: - *

- * - * http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html, + * The BioNLP/NLPBA 2004 data were originally published + * here, *

* yet this page was gone when last checked in December 2022. *

- * It looks like this repo contains a copy of the data located on the original page: - * The BioNLP 2004 seems to be related to http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004 - *

* Note: * Do not use this class, internal use only! */ @Internal public class BioNLP2004NameSampleStream implements ObjectStream { + private static final String CODEC_TAG_O = "O"; + private static final String CODEC_TAG_B = "B-"; + private static final String CODEC_TAG_I = "I-"; + public static final int GENERATE_DNA_ENTITIES = 0x01; public static final int GENERATE_PROTEIN_ENTITIES = 0x01 << 1; public static final int GENERATE_CELLTYPE_ENTITIES = 0x01 << 2; @@ -96,7 +95,6 @@ public NameSample read() throws IOException { boolean isClearAdaptiveData = false; // Empty line indicates end of sentence - String line; while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line.trim())) { @@ -121,7 +119,7 @@ public NameSample read() throws IOException { } } - if (sentence.size() > 0) { + if (!sentence.isEmpty()) { // convert name tags into spans List names = new ArrayList<>(); @@ -133,34 +131,32 @@ public NameSample read() throws IOException { String tag = tags.get(i); if (tag.endsWith("DNA") && (types & GENERATE_DNA_ENTITIES) == 0) - tag = "O"; + tag = CODEC_TAG_O; if (tag.endsWith("protein") && (types & GENERATE_PROTEIN_ENTITIES) == 0) - tag = "O"; + tag = CODEC_TAG_O; if (tag.endsWith("cell_type") && (types & GENERATE_CELLTYPE_ENTITIES) == 0) - tag = "O"; + tag = CODEC_TAG_O; if (tag.endsWith("cell_line") && (types & GENERATE_CELLTYPE_ENTITIES) == 0) - tag = "O"; + tag = CODEC_TAG_O; if (tag.endsWith("RNA") && (types & GENERATE_RNA_ENTITIES) == 0) - tag = "O"; + tag = CODEC_TAG_O; - if (tag.startsWith("B-")) { + if (tag.startsWith(CODEC_TAG_B)) { if (beginIndex != -1) { names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2))); - beginIndex = -1; - endIndex = -1; } beginIndex = i; endIndex = i + 1; } - else if (tag.startsWith("I-")) { + else if (tag.startsWith(CODEC_TAG_I)) { endIndex++; } - else if (tag.equals("O")) { + else if (tag.equals(CODEC_TAG_O)) { if (beginIndex != -1) { names.add(new Span(beginIndex, endIndex, 
tags.get(beginIndex).substring(2))); beginIndex = -1; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java index 422cd4c632..0b7bfe3cfc 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java @@ -19,7 +19,6 @@ import java.io.IOException; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; @@ -30,46 +29,46 @@ /** * @see BioNLP2004NameSampleStream */ -public class BioNLP2004NameSampleStreamFactory

extends AbstractSampleStreamFactory { +public class BioNLP2004NameSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { @ParameterDescription(valueName = "DNA,protein,cell_type,cell_line,RNA") String getTypes(); } public static void registerFactory() { StreamFactoryRegistry.registerFactory(NameSample.class, - "bionlp2004", new BioNLP2004NameSampleStreamFactory<>(Parameters.class)); + "bionlp2004", new BioNLP2004NameSampleStreamFactory(Parameters.class)); } - protected BioNLP2004NameSampleStreamFactory(Class

params) { + protected BioNLP2004NameSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); int typesToGenerate = 0; - - if (params.getTypes().contains("DNA")) { + String types = params.getTypes(); + if (types.contains("DNA")) { typesToGenerate = typesToGenerate | BioNLP2004NameSampleStream.GENERATE_DNA_ENTITIES; } - else if (params.getTypes().contains("protein")) { + if (types.contains("protein")) { typesToGenerate = typesToGenerate | BioNLP2004NameSampleStream.GENERATE_PROTEIN_ENTITIES; } - else if (params.getTypes().contains("cell_type")) { + if (types.contains("cell_type")) { typesToGenerate = typesToGenerate | BioNLP2004NameSampleStream.GENERATE_CELLTYPE_ENTITIES; } - else if (params.getTypes().contains("cell_line")) { + if (types.contains("cell_line")) { typesToGenerate = typesToGenerate | BioNLP2004NameSampleStream.GENERATE_CELLLINE_ENTITIES; } - else if (params.getTypes().contains("RNA")) { + if (types.contains("RNA")) { typesToGenerate = typesToGenerate | BioNLP2004NameSampleStream.GENERATE_RNA_ENTITIES; } @@ -77,8 +76,9 @@ else if (params.getTypes().contains("RNA")) { try { return new BioNLP2004NameSampleStream( CmdLineUtil.createInputStreamFactory(params.getData()), typesToGenerate); - } catch (IOException e) { - throw new IllegalStateException(e); + } catch (IOException ex) { + CmdLineUtil.handleCreateObjectStreamError(ex); } + return null; } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java index 36f8b58efa..8925d1960e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java @@ -17,49 +17,33 @@ package 
opennlp.tools.formats; -import java.io.IOException; - import opennlp.tools.chunker.ChunkSample; import opennlp.tools.chunker.ChunkSampleStream; -import opennlp.tools.cmdline.ArgumentParser; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Factory producing OpenNLP {@link ChunkSampleStream}s. */ -public class ChunkerSampleStreamFactory

extends AbstractSampleStreamFactory { +public class ChunkerSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(ChunkSample.class, - StreamFactoryRegistry.DEFAULT_FORMAT, new ChunkerSampleStreamFactory<>(Parameters.class)); + StreamFactoryRegistry.DEFAULT_FORMAT, new ChunkerSampleStreamFactory(Parameters.class)); } - protected ChunkerSampleStreamFactory(Class

params) { + protected ChunkerSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new ChunkSampleStream(lineStream); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java index d417df0cc1..91cca66714 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java @@ -19,7 +19,6 @@ import java.io.IOException; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; @@ -37,9 +36,10 @@ * @see Conll02NameSampleStream */ @Internal -public class Conll02NameSampleStreamFactory

extends LanguageSampleStreamFactory { +public class Conll02NameSampleStreamFactory extends + LanguageSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { @ParameterDescription(valueName = "spa|nld") String getLang(); @@ -49,17 +49,17 @@ interface Parameters extends BasicFormatParams { public static void registerFactory() { StreamFactoryRegistry.registerFactory(NameSample.class, - "conll02", new Conll02NameSampleStreamFactory<>(Parameters.class)); + "conll02", new Conll02NameSampleStreamFactory(Parameters.class)); } - protected Conll02NameSampleStreamFactory(Class

params) { + protected Conll02NameSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); LANGUAGE lang; if ("nl".equals(params.getLang()) || "nld".equals(params.getLang())) { @@ -93,7 +93,6 @@ else if ("es".equals(params.getLang()) || "spa".equals(params.getLang())) { Conll02NameSampleStream.GENERATE_MISC_ENTITIES; } - try { return new Conll02NameSampleStream(lang, CmdLineUtil.createInputStreamFactory(params.getData()), typesToGenerate); diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java index dae580cbba..d1a6150928 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java @@ -19,7 +19,6 @@ import java.io.IOException; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; @@ -32,9 +31,10 @@ /** * @see Conll03NameSampleStream */ -public class Conll03NameSampleStreamFactory

extends LanguageSampleStreamFactory { +public class Conll03NameSampleStreamFactory extends + LanguageSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { @ParameterDescription(valueName = "eng|deu") String getLang(); @@ -44,17 +44,17 @@ interface Parameters extends BasicFormatParams { public static void registerFactory() { StreamFactoryRegistry.registerFactory(NameSample.class, - "conll03", new Conll03NameSampleStreamFactory<>(Parameters.class)); + "conll03", new Conll03NameSampleStreamFactory(Parameters.class)); } - protected Conll03NameSampleStreamFactory(Class

params) { + protected Conll03NameSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); // TODO: support the other languages with this CoNLL. LANGUAGE lang; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java index 18b48b5a42..a7a586d9b8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java @@ -18,13 +18,10 @@ package opennlp.tools.formats; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; -import opennlp.tools.cmdline.TerminateToolException; import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.commons.Internal; import opennlp.tools.postag.POSSample; @@ -35,40 +32,35 @@ * Note: * Do not use this class, internal use only! * + * @see POSSample * @see ConllXPOSSampleStream */ @Internal -public class ConllXPOSSampleStreamFactory

extends AbstractSampleStreamFactory { +public class ConllXPOSSampleStreamFactory extends + AbstractSampleStreamFactory { public static final String CONLLX_FORMAT = "conllx"; - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(POSSample.class, - CONLLX_FORMAT, new ConllXPOSSampleStreamFactory<>(Parameters.class)); + CONLLX_FORMAT, new ConllXPOSSampleStreamFactory(Parameters.class)); } - protected ConllXPOSSampleStreamFactory(Class

params) { + protected ConllXPOSSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - InputStreamFactory inFactory = - CmdLineUtil.createInputStreamFactory(params.getData()); + Parameters params = validateBasicFormatParameters(args, Parameters.class); try { + InputStreamFactory inFactory = CmdLineUtil.createInputStreamFactory(params.getData()); return new ConllXPOSSampleStream(inFactory, StandardCharsets.UTF_8); - } catch (UnsupportedEncodingException e) { - // this shouldn't happen - throw new TerminateToolException(-1, "UTF-8 encoding is not supported: " + e.getMessage(), e); - } - catch (IOException e) { - // That will throw an exception + } catch (IOException e) { CmdLineUtil.handleCreateObjectStreamError(e); return null; } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java index 505f94f2c4..2d622e0fcf 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java @@ -29,29 +29,31 @@ /** * Note: * Do not use this class, internal use only! + * + * @see SentenceSample + * @see POSToSentenceSampleStream */ @Internal -public class ConllXSentenceSampleStreamFactory

extends - DetokenizerSampleStreamFactory { +public class ConllXSentenceSampleStreamFactory extends + DetokenizerSampleStreamFactory { - interface Parameters extends ConllXPOSSampleStreamFactory.Parameters, DetokenizerParameter { + public interface Parameters extends ConllXPOSSampleStreamFactory.Parameters, DetokenizerParameter { // TODO: make chunk size configurable } public static void registerFactory() { StreamFactoryRegistry.registerFactory(SentenceSample.class, ConllXPOSSampleStreamFactory.CONLLX_FORMAT, - new ConllXSentenceSampleStreamFactory<>(Parameters.class)); + new ConllXSentenceSampleStreamFactory(Parameters.class)); } - protected ConllXSentenceSampleStreamFactory(Class

params) { + protected ConllXSentenceSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - + Parameters params = validateBasicFormatParameters(args, Parameters.class); ObjectStream posSampleStream = StreamFactoryRegistry.getFactory(POSSample.class, ConllXPOSSampleStreamFactory.CONLLX_FORMAT).create( ArgumentParser.filter(args, ConllXPOSSampleStreamFactory.Parameters.class)); diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java index c894be0a3a..81c1eb217e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java @@ -29,25 +29,29 @@ /** * Note: * Do not use this class, internal use only! + * + * @see TokenSample + * @see POSToTokenSampleStream */ @Internal -public class ConllXTokenSampleStreamFactory

extends DetokenizerSampleStreamFactory { +public class ConllXTokenSampleStreamFactory extends + DetokenizerSampleStreamFactory { - interface Parameters extends ConllXPOSSampleStreamFactory.Parameters, DetokenizerParameter { + public interface Parameters extends ConllXPOSSampleStreamFactory.Parameters, DetokenizerParameter { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(TokenSample.class, - ConllXPOSSampleStreamFactory.CONLLX_FORMAT, new ConllXTokenSampleStreamFactory<>(Parameters.class)); + ConllXPOSSampleStreamFactory.CONLLX_FORMAT, new ConllXTokenSampleStreamFactory(Parameters.class)); } - protected ConllXTokenSampleStreamFactory(Class

params) { + protected ConllXTokenSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); ObjectStream samples = StreamFactoryRegistry.getFactory(POSSample.class, ConllXPOSSampleStreamFactory.CONLLX_FORMAT).create( diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java index 2ecf56641f..1775eaa46b 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java @@ -17,48 +17,33 @@ package opennlp.tools.formats; -import java.io.IOException; - -import opennlp.tools.cmdline.ArgumentParser; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.doccat.DocumentSample; import opennlp.tools.doccat.DocumentSampleStream; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Factory producing OpenNLP {@link DocumentSampleStream}s. */ -public class DocumentSampleStreamFactory

extends AbstractSampleStreamFactory { +public class DocumentSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(DocumentSample.class, - StreamFactoryRegistry.DEFAULT_FORMAT, new DocumentSampleStreamFactory<>(Parameters.class)); + StreamFactoryRegistry.DEFAULT_FORMAT, new DocumentSampleStreamFactory(Parameters.class)); } - protected DocumentSampleStreamFactory(Class

params) { + protected DocumentSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new DocumentSampleStream(lineStream); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java index 76574500ed..a1382c5dcb 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java @@ -32,7 +32,7 @@ import opennlp.tools.util.StringUtil; /** - * Parser for the Italian NER training files of the Evalita 2007 and 2009 NER shared tasks. + * Parser for the Italian NER training files of the Evalita 2007 and 2009 NER shared tasks. *

* The data does not contain article boundaries, * adaptive data will be cleared for every sentence. @@ -46,12 +46,12 @@ * 2. The Entity type tag: PER (for Person), ORG (for Organization), * GPE (for Geo-Political Entity), or LOC (for Location). *

- * Each file consists of four columns separated by a blank, containing - * respectively the token, the Elsnet PoS-tag, the Adige news story to - * which the token belongs, and the Named Entity tag. + * Each file consists of four columns separated by a blank, containing respectively the token, the + * Elsnet + * PoS-tag, the Adige news story to which the token belongs, and the Named Entity tag. *

* Data can be found on this - * web site. + * web site. *

* Note: * Do not use this class, internal use only! @@ -59,6 +59,15 @@ @Internal public class EvalitaNameSampleStream implements ObjectStream { + public static final String DOCSTART = "-DOCSTART-"; + private static final String CODEC_TAG_O = "O"; + private static final String CODEC_TAG_B = "B-"; + private static final String CODEC_TAG_I = "I-"; + private static final String ENT_TYPE_PER = "PER"; // Person + private static final String ENT_TYPE_LOC = "LOC"; // Location + private static final String ENT_TYPE_GPE = "GPE"; // Geo-Political Entity + private static final String ENT_TYPE_ORG = "ORG"; // Organization + public enum LANGUAGE { IT } @@ -68,8 +77,6 @@ public enum LANGUAGE { public static final int GENERATE_LOCATION_ENTITIES = 0x01 << 2; public static final int GENERATE_GPE_ENTITIES = 0x01 << 3; - public static final String DOCSTART = "-DOCSTART-"; - private final LANGUAGE lang; private final ObjectStream lineStream; @@ -82,7 +89,7 @@ public EvalitaNameSampleStream(LANGUAGE lang, ObjectStream lineStream, i } public EvalitaNameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException { - this(lang, new PlainTextByLineStream(in, StandardCharsets.UTF_8),types); + this(lang, new PlainTextByLineStream(in, StandardCharsets.UTF_8), types); } private static Span extract(int begin, int end, String beginTag) throws InvalidFormatException { @@ -90,17 +97,16 @@ private static Span extract(int begin, int end, String beginTag) throws InvalidF String type = beginTag.substring(2); type = switch (type) { - case "PER" -> "person"; - case "LOC" -> "location"; - case "GPE" -> "gpe"; - case "ORG" -> "organization"; + case ENT_TYPE_PER -> "person"; + case ENT_TYPE_LOC -> "location"; + case ENT_TYPE_GPE -> "gpe"; + case ENT_TYPE_ORG -> "organization"; default -> throw new InvalidFormatException("Unknown type: " + type); }; return new Span(begin, end, type); } - @Override public NameSample read() throws IOException { @@ -110,7 +116,6 @@ public NameSample read() 
throws IOException { boolean isClearAdaptiveData = false; // Empty line indicates end of sentence - String line; while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) { @@ -140,7 +145,7 @@ public NameSample read() throws IOException { if (LANGUAGE.IT.equals(lang)) isClearAdaptiveData = true; - if (sentence.size() > 0) { + if (!sentence.isEmpty()) { // convert name tags into spans List names = new ArrayList<>(); @@ -151,33 +156,31 @@ public NameSample read() throws IOException { String tag = tags.get(i); - if (tag.endsWith("PER") && (types & GENERATE_PERSON_ENTITIES) == 0) - tag = "O"; + if (tag.endsWith(ENT_TYPE_PER) && (types & GENERATE_PERSON_ENTITIES) == 0) + tag = CODEC_TAG_O; - if (tag.endsWith("ORG") && (types & GENERATE_ORGANIZATION_ENTITIES) == 0) - tag = "O"; + if (tag.endsWith(ENT_TYPE_ORG) && (types & GENERATE_ORGANIZATION_ENTITIES) == 0) + tag = CODEC_TAG_O; - if (tag.endsWith("LOC") && (types & GENERATE_LOCATION_ENTITIES) == 0) - tag = "O"; + if (tag.endsWith(ENT_TYPE_LOC) && (types & GENERATE_LOCATION_ENTITIES) == 0) + tag = CODEC_TAG_O; - if (tag.endsWith("GPE") && (types & GENERATE_GPE_ENTITIES) == 0) - tag = "O"; + if (tag.endsWith(ENT_TYPE_GPE) && (types & GENERATE_GPE_ENTITIES) == 0) + tag = CODEC_TAG_O; - if (tag.startsWith("B-")) { + if (tag.startsWith(CODEC_TAG_B)) { if (beginIndex != -1) { names.add(extract(beginIndex, endIndex, tags.get(beginIndex))); - beginIndex = -1; - endIndex = -1; } beginIndex = i; endIndex = i + 1; } - else if (tag.startsWith("I-")) { + else if (tag.startsWith(CODEC_TAG_I)) { endIndex++; } - else if (tag.equals("O")) { + else if (tag.equals(CODEC_TAG_O)) { if (beginIndex != -1) { names.add(extract(beginIndex, endIndex, tags.get(beginIndex))); beginIndex = -1; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java index 7fa9db404b..066861f303 100644 --- 
a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java @@ -19,7 +19,6 @@ import java.io.IOException; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; @@ -37,9 +36,10 @@ * @see EvalitaNameSampleStream */ @Internal -public class EvalitaNameSampleStreamFactory

extends LanguageSampleStreamFactory { +public class EvalitaNameSampleStreamFactory extends + LanguageSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { @ParameterDescription(valueName = "it") String getLang(); @@ -49,17 +49,16 @@ interface Parameters extends BasicFormatParams { public static void registerFactory() { StreamFactoryRegistry.registerFactory(NameSample.class, - "evalita", new EvalitaNameSampleStreamFactory<>(Parameters.class)); + "evalita", new EvalitaNameSampleStreamFactory(Parameters.class)); } - protected EvalitaNameSampleStreamFactory(Class

params) { + protected EvalitaNameSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); LANGUAGE lang; if ("it".equals(params.getLang())) { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java index bda9d482fb..9aedc4bbea 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java @@ -17,51 +17,34 @@ package opennlp.tools.formats; -import java.io.IOException; - -import opennlp.tools.cmdline.ArgumentParser; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; -import opennlp.tools.doccat.DocumentSampleStream; import opennlp.tools.langdetect.LanguageDetectorSampleStream; import opennlp.tools.langdetect.LanguageSample; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** - * Factory producing OpenNLP {@link DocumentSampleStream}s. + * Factory producing OpenNLP {@link LanguageDetectorSampleStream lang detector sample streams}. */ -public class LanguageDetectorSampleStreamFactory

- extends AbstractSampleStreamFactory { +public class LanguageDetectorSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(LanguageSample.class, StreamFactoryRegistry.DEFAULT_FORMAT, - new LanguageDetectorSampleStreamFactory<>(Parameters.class)); + new LanguageDetectorSampleStreamFactory(Parameters.class)); } - protected LanguageDetectorSampleStreamFactory(Class

params) { + protected LanguageDetectorSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new LanguageDetectorSampleStream(lineStream); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java index dfb137e747..f220917097 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java @@ -17,49 +17,33 @@ package opennlp.tools.formats; -import java.io.IOException; - -import opennlp.tools.cmdline.ArgumentParser; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.lemmatizer.LemmaSample; import opennlp.tools.lemmatizer.LemmaSampleStream; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Factory producing OpenNLP {@link LemmaSampleStream}s. */ -public class LemmatizerSampleStreamFactory

extends AbstractSampleStreamFactory { +public class LemmatizerSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(LemmaSample.class, - StreamFactoryRegistry.DEFAULT_FORMAT, new LemmatizerSampleStreamFactory<>(Parameters.class)); + StreamFactoryRegistry.DEFAULT_FORMAT, new LemmatizerSampleStreamFactory(Parameters.class)); } - protected LemmatizerSampleStreamFactory(Class

params) { + protected LemmatizerSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new LemmaSampleStream(lineStream); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java index 508359bbfe..efc786e1e2 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java @@ -17,50 +17,35 @@ package opennlp.tools.formats; -import java.io.IOException; - -import opennlp.tools.cmdline.ArgumentParser; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; import opennlp.tools.namefind.NameSample; import opennlp.tools.namefind.NameSampleDataStream; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Factory producing OpenNLP {@link NameSampleDataStream}s. */ -public class NameSampleDataStreamFactory

extends AbstractSampleStreamFactory { +@Internal +public class NameSampleDataStreamFactory extends + AbstractSampleStreamFactory { public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(NameSample.class, - StreamFactoryRegistry.DEFAULT_FORMAT, new NameSampleDataStreamFactory<>(Parameters.class)); + StreamFactoryRegistry.DEFAULT_FORMAT, new NameSampleDataStreamFactory(Parameters.class)); } - protected NameSampleDataStreamFactory(Class

params) { + protected NameSampleDataStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new NameSampleDataStream(lineStream); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java index 6ed457430e..2ce9ec36c0 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java @@ -17,49 +17,33 @@ package opennlp.tools.formats; -import java.io.IOException; - -import opennlp.tools.cmdline.ArgumentParser; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.parser.Parse; import opennlp.tools.parser.ParseSampleStream; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Factory producing OpenNLP {@link ParseSampleStream}s. */ -public class ParseSampleStreamFactory

extends AbstractSampleStreamFactory { +public class ParseSampleStreamFactory extends + AbstractSampleStreamFactory { public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(Parse.class, - StreamFactoryRegistry.DEFAULT_FORMAT, new ParseSampleStreamFactory<>(Parameters.class)); + StreamFactoryRegistry.DEFAULT_FORMAT, new ParseSampleStreamFactory(Parameters.class)); } - protected ParseSampleStreamFactory(Class

params) { + protected ParseSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new ParseSampleStream(lineStream); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java index e002bbb1d7..61b8e6bb50 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java @@ -17,50 +17,34 @@ package opennlp.tools.formats; -import java.io.IOException; - -import opennlp.tools.cmdline.ArgumentParser; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.sentdetect.SentenceSample; import opennlp.tools.sentdetect.SentenceSampleStream; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Factory producing OpenNLP {@link SentenceSampleStream}s. */ -public class SentenceSampleStreamFactory

extends AbstractSampleStreamFactory { +public class SentenceSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(SentenceSample.class, - StreamFactoryRegistry.DEFAULT_FORMAT, new SentenceSampleStreamFactory<>(Parameters.class)); + StreamFactoryRegistry.DEFAULT_FORMAT, new SentenceSampleStreamFactory(Parameters.class)); } - protected SentenceSampleStreamFactory(Class

params) { + protected SentenceSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new SentenceSampleStream(lineStream); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java index ffbd1e6c64..75dc62d42b 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java @@ -17,49 +17,33 @@ package opennlp.tools.formats; -import java.io.IOException; - -import opennlp.tools.cmdline.ArgumentParser; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.tokenize.TokenSample; import opennlp.tools.tokenize.TokenSampleStream; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Factory producing OpenNLP {@link TokenSampleStream}s. */ -public class TokenSampleStreamFactory

extends LanguageSampleStreamFactory { +public class TokenSampleStreamFactory extends + LanguageSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(TokenSample.class, - StreamFactoryRegistry.DEFAULT_FORMAT, new TokenSampleStreamFactory<>(Parameters.class)); + StreamFactoryRegistry.DEFAULT_FORMAT, new TokenSampleStreamFactory(Parameters.class)); } - protected TokenSampleStreamFactory(Class

params) { + protected TokenSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new TokenSampleStream(lineStream); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java index eb3ab1dff5..c69319ce3c 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java @@ -28,6 +28,15 @@ import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.util.ObjectStream; +/** + * An {@link ObjectStream} implementation for the Twenty Newsgroups text corpus. + *

+ * The document collection was created and donated by: Tom Mitchell, + * School of Computer Science, Carnegie Mellon University. + *

+ * Details and the data can be found via this DOI: + * 10.24432/C5C323. + */ public class TwentyNewsgroupSampleStream implements ObjectStream { private final Tokenizer tokenizer; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java index edf3d5d3e2..85179eaefa 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java @@ -26,35 +26,55 @@ import opennlp.tools.cmdline.params.EncodingParameter; import opennlp.tools.doccat.DocumentSample; import opennlp.tools.tokenize.SimpleTokenizer; +import opennlp.tools.tokenize.ThreadSafeTokenizerME; import opennlp.tools.tokenize.Tokenizer; -import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.tokenize.WhitespaceTokenizer; import opennlp.tools.util.ObjectStream; -public class TwentyNewsgroupSampleStreamFactory

extends AbstractSampleStreamFactory { +/** + * Note: Do not use this class, internal use only! + * + * @see TwentyNewsgroupSampleStream + */ +public class TwentyNewsgroupSampleStreamFactory extends + AbstractSampleStreamFactory { + + public interface Parameters extends EncodingParameter { + @ArgumentParser.ParameterDescription(valueName = "dataDir", + description = "dir containing the 20newsgroup folders") + File getDataDir(); + + @ArgumentParser.ParameterDescription(valueName = "modelFile") + @ArgumentParser.OptionalParameter + File getTokenizerModel(); + + @ArgumentParser.ParameterDescription(valueName = "name") + @ArgumentParser.OptionalParameter + String getRuleBasedTokenizer(); + } public static void registerFactory() { StreamFactoryRegistry.registerFactory(DocumentSample.class, "20newsgroup", - new TwentyNewsgroupSampleStreamFactory<>(TwentyNewsgroupSampleStreamFactory.Parameters.class)); + new TwentyNewsgroupSampleStreamFactory(TwentyNewsgroupSampleStreamFactory.Parameters.class)); } - protected TwentyNewsgroupSampleStreamFactory(Class

params) { + protected TwentyNewsgroupSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - TwentyNewsgroupSampleStreamFactory.Parameters params = - ArgumentParser.parse(args, TwentyNewsgroupSampleStreamFactory.Parameters.class); + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } + Parameters params = ArgumentParser.parse(args, Parameters.class); Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE; - if (params.getTokenizerModel() != null) { try { - tokenizer = new TokenizerME(new TokenizerModel(params.getTokenizerModel())); + tokenizer = new ThreadSafeTokenizerME(new TokenizerModel(params.getTokenizerModel())); } catch (IOException e) { throw new TerminateToolException(-1, "Failed to load tokenizer model!", e); } @@ -74,24 +94,10 @@ else if ("whitespace".equals(tokenizerName)) { } try { - return new TwentyNewsgroupSampleStream( - tokenizer, params.getDataDir().toPath()); + return new TwentyNewsgroupSampleStream(tokenizer, params.getDataDir().toPath()); } catch (IOException e) { throw new TerminateToolException(-1, "IO error while opening sample data: " + e.getMessage(), e); } } - interface Parameters extends EncodingParameter { - @ArgumentParser.ParameterDescription(valueName = "dataDir", - description = "dir containing the 20newsgroup folders") - File getDataDir(); - - @ArgumentParser.ParameterDescription(valueName = "modelFile") - @ArgumentParser.OptionalParameter - File getTokenizerModel(); - - @ArgumentParser.ParameterDescription(valueName = "name") - @ArgumentParser.OptionalParameter - String getRuleBasedTokenizer(); - } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java index 4972b4d1db..6b92c41a39 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java +++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java @@ -17,52 +17,37 @@ package opennlp.tools.formats; -import java.io.IOException; - -import opennlp.tools.cmdline.ArgumentParser; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.commons.Internal; import opennlp.tools.postag.POSSample; import opennlp.tools.postag.WordTagSampleStream; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Note: * Do not use this class, internal use only! */ @Internal -public class WordTagSampleStreamFactory

extends AbstractSampleStreamFactory { +public class WordTagSampleStreamFactory extends + AbstractSampleStreamFactory { public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(POSSample.class, - StreamFactoryRegistry.DEFAULT_FORMAT, new WordTagSampleStreamFactory<>(Parameters.class)); + StreamFactoryRegistry.DEFAULT_FORMAT, new WordTagSampleStreamFactory(Parameters.class)); } - protected WordTagSampleStreamFactory(Class

params) { + protected WordTagSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new WordTagSampleStream(lineStream); } + } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java index cdcbd9d1ae..6c51ee6882 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java @@ -104,14 +104,14 @@ public ChunkSample read() throws IOException { index++; // skip this one } else { - Node root = paragraph.getRoot(); + Node root = paragraph.root(); List sentence = new ArrayList<>(); List tags = new ArrayList<>(); List target = new ArrayList<>(); processRoot(root, sentence, tags, target); - if (sentence.size() > 0) { + if (!sentence.isEmpty()) { index++; return new ChunkSample(sentence, tags, target); } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java index 49922f8c22..95183dad26 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java @@ -17,21 +17,16 @@ package opennlp.tools.formats.ad; -import java.io.File; -import java.io.IOException; import 
java.nio.charset.Charset; import opennlp.tools.chunker.ChunkSample; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.commons.Internal; import opennlp.tools.formats.LanguageSampleStreamFactory; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * A Factory to create a Arvores Deitadas ChunkStream from the command line @@ -41,18 +36,16 @@ * Do not use this class, internal use only! */ @Internal -public class ADChunkSampleStreamFactory

extends LanguageSampleStreamFactory { +public class ADChunkSampleStreamFactory extends + LanguageSampleStreamFactory { - interface Parameters { + public interface Parameters extends BasicFormatParams { //all have to be repeated, because encoding is not optional, //according to the check if (encoding == null) { below (now removed) @ParameterDescription(valueName = "charsetName", description = "encoding for reading and writing text, if absent the system default is used.") Charset getEncoding(); - @ParameterDescription(valueName = "sampleData", description = "data to be used, usually a file name.") - File getData(); - @ParameterDescription(valueName = "language", description = "language which is being processed.") String getLang(); @@ -67,26 +60,17 @@ interface Parameters { public static void registerFactory() { StreamFactoryRegistry.registerFactory(ChunkSample.class, - "ad", new ADChunkSampleStreamFactory<>(Parameters.class)); + "ad", new ADChunkSampleStreamFactory(Parameters.class)); } - protected ADChunkSampleStreamFactory(Class

params) { + protected ADChunkSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); - language = params.getLang(); - - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } + Parameters params = validateBasicFormatParameters(args, Parameters.class); + ObjectStream lineStream = readData(args, Parameters.class); ADChunkSampleStream sampleStream = new ADChunkSampleStream(lineStream); if (params.getStart() != null && params.getStart() > -1) { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java index dae804be5f..d2db063515 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java @@ -206,7 +206,7 @@ public NameSample read() throws IOException { textID = currentTextID; } - Node root = paragraph.getRoot(); + Node root = paragraph.root(); List sentence = new ArrayList<>(); List names = new ArrayList<>(); process(root, sentence, names); @@ -438,7 +438,7 @@ enum Type { private int getTextID(Sentence paragraph) { - final String meta = paragraph.getMetadata(); + final String meta = paragraph.metadata(); Type corpusType; Pattern metaPattern; int textIdMeta2 = -1; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java index 525b40972c..b1af32eb61 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java +++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java @@ -17,21 +17,16 @@ package opennlp.tools.formats.ad; -import java.io.File; -import java.io.IOException; import java.nio.charset.Charset; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.commons.Internal; import opennlp.tools.formats.LanguageSampleStreamFactory; import opennlp.tools.namefind.NameSample; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * A Factory to create a Arvores Deitadas NameSampleDataStream from the command line @@ -41,18 +36,16 @@ * Do not use this class, internal use only! */ @Internal -public class ADNameSampleStreamFactory

extends LanguageSampleStreamFactory { +public class ADNameSampleStreamFactory extends + LanguageSampleStreamFactory { - interface Parameters { + public interface Parameters extends BasicFormatParams { //all have to be repeated, because encoding is not optional, //according to the check if (encoding == null) { below (now removed) @ParameterDescription(valueName = "charsetName", description = "encoding for reading and writing text, if absent the system default is used.") Charset getEncoding(); - @ParameterDescription(valueName = "sampleData", description = "data to be used, usually a file name.") - File getData(); - @ParameterDescription(valueName = "split", description = "if true all hyphenated tokens will be separated (default true)") @OptionalParameter(defaultValue = "true") @@ -64,27 +57,18 @@ interface Parameters { public static void registerFactory() { StreamFactoryRegistry.registerFactory(NameSample.class, - "ad", new ADNameSampleStreamFactory<>(Parameters.class)); + "ad", new ADNameSampleStreamFactory(Parameters.class)); } - protected ADNameSampleStreamFactory(Class

params) { + protected ADNameSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); language = params.getLang(); - - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new ADNameSampleStream(lineStream, params.getSplitHyphenatedTokens()); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java index 742e27e696..8b7bd96416 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java @@ -84,7 +84,7 @@ public ADPOSSampleStream(InputStreamFactory in, String charsetName, public POSSample read() throws IOException { Sentence paragraph; if ((paragraph = this.adSentenceStream.read()) != null) { - Node root = paragraph.getRoot(); + Node root = paragraph.root(); List sentence = new ArrayList<>(); List tags = new ArrayList<>(); process(root, sentence, tags); diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java index 80dff4767c..88771d7100 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java @@ -17,38 +17,30 @@ package opennlp.tools.formats.ad; -import java.io.File; -import java.io.IOException; import 
java.nio.charset.Charset; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.commons.Internal; import opennlp.tools.formats.LanguageSampleStreamFactory; import opennlp.tools.postag.POSSample; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Note: * Do not use this class, internal use only! */ @Internal -public class ADPOSSampleStreamFactory

extends - LanguageSampleStreamFactory { +public class ADPOSSampleStreamFactory extends + LanguageSampleStreamFactory { - interface Parameters { + public interface Parameters extends BasicFormatParams { @ParameterDescription(valueName = "charsetName", description = "encoding for reading and writing text, if absent the system default is used.") Charset getEncoding(); - @ParameterDescription(valueName = "sampleData", description = "data to be used, usually a file name.") - File getData(); - @ParameterDescription(valueName = "language", description = "language which is being processed.") String getLang(); @@ -64,27 +56,18 @@ interface Parameters { public static void registerFactory() { StreamFactoryRegistry.registerFactory(POSSample.class, "ad", - new ADPOSSampleStreamFactory<>(Parameters.class)); + new ADPOSSampleStreamFactory(Parameters.class)); } - protected ADPOSSampleStreamFactory(Class

params) { + protected ADPOSSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); language = params.getLang(); - - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new ADPOSSampleStream(lineStream, params.getExpandME(), params.getIncludeFeatures()); } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java index d5a3401b9b..c78bf2b9a4 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java @@ -99,9 +99,9 @@ public SentenceSample read() throws IOException { do { do { if (!isTitle || isIncludeTitles) { - if (hasPunctuation(sent.getText())) { + if (hasPunctuation(sent.text())) { int start = document.length(); - document.append(sent.getText()); + document.append(sent.text()); sentences.add(new Span(start, document.length())); document.append(" "); } @@ -116,7 +116,7 @@ public SentenceSample read() throws IOException { while (isSameText); String doc; - if (document.length() > 0) { + if (!document.isEmpty()) { doc = document.substring(0, document.length() - 1); } else { doc = document.toString(); @@ -127,7 +127,7 @@ public SentenceSample read() throws IOException { private boolean hasPunctuation(String text) { text = text.trim(); - if (text.length() > 0) { + if (!text.isEmpty()) { char lastChar = text.charAt(text.length() - 
1); return Arrays.binarySearch(ptEosCharacters, lastChar) >= 0; } @@ -135,13 +135,12 @@ private boolean hasPunctuation(String text) { } // there are some different types of metadata depending on the corpus. - // TODO Merge this patterns - private static final Pattern META_1 = Pattern - .compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*"); + // TODO Merge these patterns + private static final Pattern META_1 = Pattern.compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*"); private void updateMeta() { if (this.sent != null) { - String meta = this.sent.getMetadata(); + String meta = this.sent.metadata(); Matcher m = META_1.matcher(meta); int currentText; int currentPara; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java index 48748c9205..50c8f593fc 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java @@ -17,37 +17,29 @@ package opennlp.tools.formats.ad; -import java.io.File; -import java.io.IOException; import java.nio.charset.Charset; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.commons.Internal; import opennlp.tools.formats.LanguageSampleStreamFactory; import opennlp.tools.sentdetect.SentenceSample; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Note: * Do not use this class, internal use only! */ @Internal -public class ADSentenceSampleStreamFactory

extends - LanguageSampleStreamFactory { +public class ADSentenceSampleStreamFactory extends + LanguageSampleStreamFactory { - interface Parameters { + public interface Parameters extends BasicFormatParams { @ParameterDescription(valueName = "charsetName", description = "encoding for reading and writing text.") Charset getEncoding(); - @ParameterDescription(valueName = "sampleData", description = "data to be used, usually a file name.") - File getData(); - @ParameterDescription(valueName = "language", description = "language which is being processed.") String getLang(); @@ -59,29 +51,19 @@ interface Parameters { public static void registerFactory() { StreamFactoryRegistry.registerFactory(SentenceSample.class, "ad", - new ADSentenceSampleStreamFactory<>(Parameters.class)); + new ADSentenceSampleStreamFactory(Parameters.class)); } - protected ADSentenceSampleStreamFactory(Class

params) { + protected ADSentenceSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); language = params.getLang(); - boolean includeTitle = params.getIncludeTitles(); - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new ADSentenceSampleStream(lineStream, includeTitle); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java index 27e9174940..1445f153b5 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java @@ -51,38 +51,8 @@ @Internal public class ADSentenceStream extends FilterObjectStream { - public static class Sentence { - - private String text; - private Node root; - private String metadata; - + public record Sentence (String text, Node root, String metadata) { public static final String META_LABEL_FINAL = "final"; - - public String getText() { - return text; - } - - public void setText(String text) { - this.text = text; - } - - public Node getRoot() { - return root; - } - - public void setRoot(Node root) { - this.root = root; - } - - public void setMetadata(String metadata) { - this.metadata = metadata; - } - - public String getMetadata() { - return metadata; - } - } /** @@ -116,7 +86,7 @@ public static class SentenceParser { * @return A {@link Sentence} instance parsed from {@code sentenceString}. 
*/ public Sentence parse(String sentenceString, int para, boolean isTitle, boolean isBox) { - Sentence sentence = new Sentence(); + Sentence sentence; Node root = new Node(); try (BufferedReader reader = new BufferedReader(new StringReader(sentenceString))) { // first line is @@ -153,8 +123,7 @@ public Sentence parse(String sentenceString, int para, boolean isTitle, boolean meta = line.substring(0, start) + " p=" + para + titleTag + boxTag + metaFromSource; } } - sentence.setText(text); - sentence.setMetadata(meta); + sentence = new Sentence(text, root, meta); // now we look for the root node do { line = reader.readLine(); @@ -232,10 +201,9 @@ public Sentence parse(String sentenceString, int para, boolean isTitle, boolean } catch (Exception e) { logger.warn("Caught exception for the given sentence: '{}'", sentenceString, e); - return sentence; + return null; } // second line should be SOURCE - sentence.setRoot(root); return sentence; } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java index 7a93006a3c..a720b1abbd 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java @@ -32,24 +32,24 @@ * Do not use this class, internal use only! */ @Internal -public class ADTokenSampleStreamFactory

extends - DetokenizerSampleStreamFactory { +public class ADTokenSampleStreamFactory extends + DetokenizerSampleStreamFactory { - interface Parameters extends ADNameSampleStreamFactory.Parameters, DetokenizerParameter { + public interface Parameters extends ADNameSampleStreamFactory.Parameters, DetokenizerParameter { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(TokenSample.class, "ad", - new ADTokenSampleStreamFactory<>(Parameters.class)); + new ADTokenSampleStreamFactory(Parameters.class)); } - protected ADTokenSampleStreamFactory(Class

params) { + protected ADTokenSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); ObjectStream samples = StreamFactoryRegistry.getFactory( NameSample.class, "ad").create( diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java index c734ba46c2..36b1f79a9d 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java @@ -40,6 +40,7 @@ public class PortugueseContractionUtility { protected static final Map CONTRACTIONS; + private static final String SYMBOL_PLUS = "+"; static { Map elems = new HashMap<>(); @@ -162,7 +163,7 @@ public class PortugueseContractionUtility { * @return The merged contraction. 
*/ public static String toContraction(String left, String right) { - String key = left + "+" + right; + String key = left + SYMBOL_PLUS + right; if (CONTRACTIONS.containsKey(key)) { return CONTRACTIONS.get(key); } else { @@ -171,7 +172,7 @@ public static String toContraction(String left, String right) { for (int i = 0; i < parts.length - 1; i++) { sb.append(parts[i]).append(" "); } - key = parts[parts.length - 1] + "+" + right; + key = parts[parts.length - 1] + SYMBOL_PLUS + right; if (CONTRACTIONS.containsKey(key)) { sb.append(CONTRACTIONS.get(key)); return sb.toString(); @@ -180,7 +181,7 @@ public static String toContraction(String left, String right) { if (right.contains("_")) { parts = right.split("_"); - key = left + "+" + parts[0]; + key = left + SYMBOL_PLUS + parts[0]; if (CONTRACTIONS.containsKey(key)) { sb.append(CONTRACTIONS.get(key)).append(" "); @@ -194,7 +195,7 @@ public static String toContraction(String left, String right) { } String leftLower = StringUtil.toLowerCase(parts[parts.length - 1]); - key = leftLower + "+" + right; + key = leftLower + SYMBOL_PLUS + right; if (CONTRACTIONS.containsKey(key)) { String r = CONTRACTIONS.get(key); String firstChar = r.substring(0, 1); diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java index b1c7703f5c..1b669d7b76 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java @@ -30,6 +30,13 @@ import opennlp.tools.tokenize.WhitespaceTokenizer; +/** + * Encapsulates a type to class mapping for entities, relations, events, etc. + *

+ * Details on how an annotation configuration file should be structured can be found + * in the brat annotation configuration + * section of the official BRAT documentation. + */ public class AnnotationConfiguration { public static final String SPAN_TYPE = "Span"; @@ -38,68 +45,93 @@ public class AnnotationConfiguration { public static final String ATTRIBUTE_TYPE = "Attribute"; public static final String EVENT_TYPE = "Event"; + private static final String SYMBOL_HASH = "#"; + private static final String BRACKET_OPEN = "["; + private static final String BRACKET_CLOSE = "]"; + private final Map typeToClassMap; + /** + * Initializes an {@link AnnotationConfiguration} with the specified {@code typeToClassMap}. + * @param typeToClassMap A type to class mapping. Must not be {@code null}. + */ public AnnotationConfiguration(Map typeToClassMap) { this.typeToClassMap = Map.copyOf(typeToClassMap); } + /** + * @param type The type to get the type class for. + * @return Retrieves the class for the specified {@code type}, {@code null} if not found. + */ public String getTypeClass(String type) { return typeToClassMap.get(type); } - + /** + * Parses a given {@link InputStream} into an {@link AnnotationConfiguration}. + * + * @param in A valid {@link InputStream} from which the config should + * be read. Must not be {@code null} and must be in the correct format, + * see: + * Brat annotation configuration + * + * @return A valid {@link AnnotationConfiguration} instance. + * @throws IOException Thrown if IO errors occurred during parsing. 
+ */ public static AnnotationConfiguration parse(InputStream in) throws IOException { Map typeToClassMap = new HashMap<>(); - BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); - - // Note: This only supports entities and relations section - String line; - String sectionType = null; - - while ((line = reader.readLine()) != null) { - line = line.trim(); - - if (!line.isEmpty()) { - if (!line.startsWith("#")) { - if (line.startsWith("[") && line.endsWith("]")) { - sectionType = line.substring(line.indexOf('[') + 1, line.indexOf(']')); - } - else { - String typeName = WhitespaceTokenizer.INSTANCE.tokenize(line)[0]; - - switch (sectionType) { - case "entities": - typeToClassMap.put(typeName, AnnotationConfiguration.ENTITY_TYPE); - break; - - case "relations": - typeToClassMap.put(typeName, AnnotationConfiguration.RELATION_TYPE); - break; - - case "attributes": - typeToClassMap.put(typeName, AnnotationConfiguration.ATTRIBUTE_TYPE); - break; - - case "events": - typeToClassMap.put(typeName, AnnotationConfiguration.EVENT_TYPE); - break; - - default: - break; + try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) { + // Note: This only supports entities and relations section + String line; + String sectionType = null; + + while ((line = reader.readLine()) != null) { + line = line.trim(); + + if (!line.isEmpty()) { + if (!line.startsWith(SYMBOL_HASH)) { + if (line.startsWith(BRACKET_OPEN) && line.endsWith(BRACKET_CLOSE)) { + sectionType = line.substring(line.indexOf('[') + 1, line.indexOf(']')); + } else { + String typeName = WhitespaceTokenizer.INSTANCE.tokenize(line)[0]; + + switch (sectionType) { + case "entities": + typeToClassMap.put(typeName, AnnotationConfiguration.ENTITY_TYPE); + break; + case "relations": + typeToClassMap.put(typeName, AnnotationConfiguration.RELATION_TYPE); + break; + case "attributes": + typeToClassMap.put(typeName, AnnotationConfiguration.ATTRIBUTE_TYPE); 
+ break; + case "events": + typeToClassMap.put(typeName, AnnotationConfiguration.EVENT_TYPE); + break; + default: + break; + } } } } } } - return new AnnotationConfiguration(typeToClassMap); } + /** + * Parses a given {@link File annConfigFile} into a {@link AnnotationConfiguration}. + * + * @param annConfigFile A valid {@link File annConfigFile} from which the config should + * be read. Must not be {@code null} and must be in the correct format, + * see: + * Brat annotation configuration + * + * @return A valid {@link AnnotationConfiguration} instance. + * @throws IOException Thrown if IO errors occurred during parsing. + */ public static AnnotationConfiguration parse(File annConfigFile) throws IOException { - try (InputStream in = new BufferedInputStream(new FileInputStream(annConfigFile))) { - return parse(in); - } + return parse(new BufferedInputStream(new FileInputStream(annConfigFile))); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java index e45797d414..43dc3a2535 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java @@ -199,8 +199,7 @@ static class AnnotatorNoteParser extends BratAnnotationParser { @Override BratAnnotation parse(Span[] tokens, CharSequence line) throws IOException { - - Span noteSpan = new Span( tokens[START_VALUE_OFFSET].getStart(), + Span noteSpan = new Span( tokens[START_VALUE_OFFSET].getStart(), tokens[tokens.length - 1].getEnd() ); return new AnnotatorNoteAnnotation(tokens[ID_OFFSET].getCoveredText(line).toString(), @@ -208,6 +207,7 @@ BratAnnotation parse(Span[] tokens, CharSequence line) throws IOException { noteSpan.getCoveredText(line).toString()); } } + private final AnnotationConfiguration config; private final BufferedReader reader; private final String id; @@ -219,10 +219,9 
@@ BratAnnotation parse(Span[] tokens, CharSequence line) throws IOException { reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); } + @Override public BratAnnotation read() throws IOException { - String line = reader.readLine(); - if (line != null) { Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(line); @@ -259,7 +258,7 @@ public BratAnnotation read() throws IOException { } break; default: - // Skip it, do that for everything unsupported (e.g. "*" id) + // Skip it, do that for everything unsupported (e.g. "*" id) return read(); } @@ -275,10 +274,12 @@ public BratAnnotation read() throws IOException { return null; } + @Override public void reset() throws IOException, UnsupportedOperationException { reader.reset(); } + @Override public void close() throws IOException { reader.close(); } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java index 8f786749b9..8f14d54b3a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java @@ -100,25 +100,22 @@ public Collection getAnnotations() { public static BratDocument parseDocument(AnnotationConfiguration config, String id, InputStream txtIn, InputStream annIn) throws IOException { - Reader txtReader = new InputStreamReader(txtIn, StandardCharsets.UTF_8); - - StringBuilder text = new StringBuilder(); - - char[] cbuf = new char[1024]; - - int len; - while ((len = txtReader.read(cbuf)) > 0) { - text.append(cbuf, 0, len); - } - - Collection annotations = new ArrayList<>(); - ObjectStream annStream = new BratAnnotationStream(config, id, annIn); - BratAnnotation ann; - while ((ann = annStream.read()) != null) { - annotations.add(ann); + try (Reader txtReader = new InputStreamReader(txtIn, StandardCharsets.UTF_8); + ObjectStream annStream = new BratAnnotationStream(config, id, 
annIn)) { + + StringBuilder text = new StringBuilder(); + char[] cbuf = new char[1024]; + int len; + while ((len = txtReader.read(cbuf)) > 0) { + text.append(cbuf, 0, len); + } + Collection annotations = new ArrayList<>(); + BratAnnotation ann; + while ((ann = annStream.read()) != null) { + annotations.add(ann); + } + return new BratDocument(config, id, text.toString(), annotations); } - annStream.close(); - - return new BratDocument(config, id, text.toString(), annotations); } + } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java index d799b4c474..feb3a4c3a2 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java @@ -38,44 +38,41 @@ public class BratNameSampleStream extends SegmenterObjectStream samples) { - super(samples); - - this.parser = new BratDocumentParser(sentDetector, tokenizer, null); + this(sentDetector, tokenizer, samples, null); } /** - * Creates a new {@link BratNameSampleStream}. - * @param sentModel a {@link SentenceModel} model - * @param tokenModel a {@link TokenizerModel} model - * @param samples a {@link BratDocument} {@link ObjectStream} + * Initializes a new {@link BratNameSampleStream} with the specified (model) parameters. + * + * @param sentModel A valid {@link SentenceModel sentence detection model}. + * @param tokenModel A valid {@link TokenizerModel tokenizer model}. + * @param samples The {@link BratDocument} {@link ObjectStream} to process. */ public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel, ObjectStream samples) { - super(samples); - - // TODO: We can pass in custom validators here ... 
- this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel), - new TokenizerME(tokenModel), null); + this(new SentenceDetectorME(sentModel), new TokenizerME(tokenModel), samples, null); } /** * Creates a new {@link BratNameSampleStream}. - * @param sentDetector a {@link SentenceDetector} instance - * @param tokenizer a {@link Tokenizer} instance - * @param samples a {@link BratDocument} {@link ObjectStream} - * @param nameTypes the name types to use or null if all name types + * @param sentDetector A valid {@link SentenceDetector} instance. + * @param tokenizer A valid {@link Tokenizer} instance. + * @param samples The {@link BratDocument} {@link ObjectStream} to process. + * + * @param nameTypes the name types to use or {@code null} if all name types. */ public BratNameSampleStream(SentenceDetector sentDetector, Tokenizer tokenizer, ObjectStream samples, Set nameTypes) { super(samples); - this.parser = new BratDocumentParser(sentDetector, tokenizer, nameTypes); } @@ -84,13 +81,12 @@ public BratNameSampleStream(SentenceDetector sentDetector, * @param sentModel a {@link SentenceModel} model * @param tokenModel a {@link TokenizerModel} model * @param samples a {@link BratDocument} {@link ObjectStream} - * @param nameTypes the name types to use or null if all name types + * @param nameTypes the name types to use or {@code null} if all name types */ public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel, ObjectStream samples, Set nameTypes) { super(samples); - - // TODO: We can pass in custom validators here ... + // Hint: We can pass in custom validators here ... 
this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel), new TokenizerME(tokenModel), nameTypes); } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java index d5e8793ee1..9017c944e4 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java @@ -41,10 +41,15 @@ import opennlp.tools.tokenize.WhitespaceTokenizer; import opennlp.tools.util.ObjectStream; +/** + * Note: Do not use this class, internal use only! + * + * @see BratNameSampleStream + */ public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory { - interface Parameters { + public interface Parameters { @ParameterDescription(valueName = "bratDataDir", description = "location of brat data dir") File getBratDataDir(); @@ -76,37 +81,40 @@ protected BratNameSampleStreamFactory() { super(Parameters.class); } + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(NameSample.class, "brat", + new BratNameSampleStreamFactory()); + } + /** - * Checks that non of the passed values are null. + * Checks that none of the passed values are {@code null}. * - * @param objects - * @return true or false + * @param objects The objects to check for {@code null}. + * @return {@code true} if no object is {@code null}, {@code false} otherwise. */ private boolean notNull(Object... 
objects) { - for (Object obj : objects) { if (obj == null) return false; } - return true; } @Override public ObjectStream create(String[] args) { - + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } Parameters params = ArgumentParser.parse(args, Parameters.class); if (notNull(params.getRuleBasedTokenizer(), params.getTokenizerModel())) { throw new TerminateToolException(-1, "Either use rule based or statistical tokenizer!"); } - // TODO: Provide the file name to the annotation.conf file and implement the parser ... AnnotationConfiguration annConfig; try { annConfig = AnnotationConfiguration.parse(params.getAnnotationConfig()); - } - catch (IOException e) { + } catch (IOException e) { throw new TerminateToolException(1, "Failed to parse annotation.conf file!"); } @@ -121,37 +129,30 @@ public ObjectStream create(String[] args) { } SentenceDetector sentDetector; - if (params.getSentenceDetectorModel() != null) { try { sentDetector = new SentenceDetectorME(new SentenceModel(params.getSentenceDetectorModel())); } catch (IOException e) { throw new TerminateToolException(-1, "Failed to load sentence detector model!", e); } - } - else { + } else { sentDetector = new NewlineSentenceDetector(); } Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE; - if (params.getTokenizerModel() != null) { try { tokenizer = new TokenizerME(new TokenizerModel(params.getTokenizerModel())); } catch (IOException e) { throw new TerminateToolException(-1, "Failed to load tokenizer model!", e); } - } - else if (params.getRuleBasedTokenizer() != null) { + } else if (params.getRuleBasedTokenizer() != null) { String tokenizerName = params.getRuleBasedTokenizer(); - if ("simple".equals(tokenizerName)) { tokenizer = SimpleTokenizer.INSTANCE; - } - else if ("whitespace".equals(tokenizerName)) { + } else if ("whitespace".equals(tokenizerName)) { tokenizer = WhitespaceTokenizer.INSTANCE; - } - else { + } else { throw new TerminateToolException(-1, "Unknown 
tokenizer: " + tokenizerName); } } @@ -167,8 +168,4 @@ else if ("whitespace".equals(tokenizerName)) { return new BratNameSampleStream(sentDetector, tokenizer, samples, nameTypes); } - public static void registerFactory() { - StreamFactoryRegistry.registerFactory(NameSample.class, "brat", - new BratNameSampleStreamFactory()); - } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java index 82c4e4aa41..05ee3e3dbb 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java @@ -35,6 +35,7 @@ public SegmenterObjectStream(ObjectStream in) { protected abstract List read(S sample) throws IOException; + @Override public final T read() throws IOException { if (sampleIt.hasNext()) { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java index c7b1f77bf8..2dedf86cea 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java @@ -32,11 +32,15 @@ /** * Note: Do not use this class, internal use only! + * + * @see LemmaSample + * @see ConlluLemmaSampleStream */ @Internal -public class ConlluLemmaSampleStreamFactory

extends AbstractSampleStreamFactory { +public class ConlluLemmaSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { @ArgumentParser.ParameterDescription(valueName = "tagset", description = "u|x u for unified tags and x for language-specific part-of-speech tags") @ArgumentParser.OptionalParameter(defaultValue = "u") @@ -46,15 +50,15 @@ interface Parameters extends BasicFormatParams { public static void registerFactory() { StreamFactoryRegistry.registerFactory(LemmaSample.class, ConlluPOSSampleStreamFactory.CONLLU_FORMAT, - new ConlluLemmaSampleStreamFactory<>(Parameters.class)); + new ConlluLemmaSampleStreamFactory(Parameters.class)); } - protected ConlluLemmaSampleStreamFactory(Class

params) { + protected ConlluLemmaSampleStreamFactory(Class params) { super(params); } public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); ConlluTagset tagset = switch (params.getTagset()) { case "u" -> ConlluTagset.U; @@ -68,7 +72,6 @@ public ObjectStream create(String[] args) { try { return new ConlluLemmaSampleStream(new ConlluStream(inFactory), tagset); } catch (IOException e) { - // That will throw an exception CmdLineUtil.handleCreateObjectStreamError(e); } return null; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java index 6601dbb64d..9a40b0a178 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java @@ -32,13 +32,17 @@ /** * Note: Do not use this class, internal use only! + * + * @see POSSample + * @see ConlluPOSSampleStream */ @Internal -public class ConlluPOSSampleStreamFactory

extends AbstractSampleStreamFactory { +public class ConlluPOSSampleStreamFactory extends + AbstractSampleStreamFactory { public static final String CONLLU_FORMAT = "conllu"; - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { @ArgumentParser.ParameterDescription(valueName = "tagset", description = "u|x u for unified tags and x for language-specific part-of-speech tags") @ArgumentParser.OptionalParameter(defaultValue = "u") @@ -47,15 +51,16 @@ interface Parameters extends BasicFormatParams { public static void registerFactory() { StreamFactoryRegistry.registerFactory(POSSample.class, - CONLLU_FORMAT, new ConlluPOSSampleStreamFactory<>(Parameters.class)); + CONLLU_FORMAT, new ConlluPOSSampleStreamFactory(Parameters.class)); } - protected ConlluPOSSampleStreamFactory(Class

params) { + protected ConlluPOSSampleStreamFactory(Class params) { super(params); } + @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); ConlluTagset tagset = switch (params.getTagset()) { case "u" -> ConlluTagset.U; @@ -69,7 +74,6 @@ public ObjectStream create(String[] args) { try { return new ConlluPOSSampleStream(new ConlluStream(inFactory), tagset); } catch (IOException e) { - // That will throw an exception CmdLineUtil.handleCreateObjectStreamError(e); } return null; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java index 695534d188..2150be309f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java @@ -44,9 +44,7 @@ public class ConlluSentence { public ConlluSentence(List wordLines, String sentenceIdComment, String textComment, boolean newDocument, String documentId, boolean newParagraph, String paragraphId, Map textLang, String translit) { - this.wordLines = wordLines; - this.sentenceIdComment = sentenceIdComment; - this.textComment = textComment; + this(wordLines, sentenceIdComment, textComment); this.newDocument = newDocument; this.documentId = documentId; this.newParagraph = newParagraph; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java index 3b1164caf6..6e66575954 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java @@ -30,15 +30,16 @@ import 
opennlp.tools.util.ObjectStream; /** - * Note: - * Do not use this class, internal use only! + * Note: Do not use this class, internal use only! * + * @see SentenceSample * @see ConlluSentenceSampleStream */ @Internal -public class ConlluSentenceSampleStreamFactory

extends AbstractSampleStreamFactory { +public class ConlluSentenceSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample", description = "number of sentences per sample") String getSentencesPerSample(); @@ -47,16 +48,16 @@ interface Parameters extends BasicFormatParams { public static void registerFactory() { StreamFactoryRegistry.registerFactory(SentenceSample.class, ConlluPOSSampleStreamFactory.CONLLU_FORMAT, - new ConlluSentenceSampleStreamFactory<>(ConlluSentenceSampleStreamFactory.Parameters.class)); + new ConlluSentenceSampleStreamFactory(ConlluSentenceSampleStreamFactory.Parameters.class)); } - protected ConlluSentenceSampleStreamFactory(Class

params) { + protected ConlluSentenceSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); InputStreamFactory inFactory = CmdLineUtil.createInputStreamFactory(params.getData()); @@ -65,7 +66,6 @@ public ObjectStream create(String[] args) { return new ConlluSentenceSampleStream(new ConlluStream(inFactory), Integer.parseInt(params.getSentencesPerSample())); } catch (IOException e) { - // That will throw an exception CmdLineUtil.handleCreateObjectStreamError(e); } return null; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java index 5f813a65fe..ac3b18d479 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java @@ -19,7 +19,6 @@ import java.io.IOException; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; @@ -30,30 +29,31 @@ import opennlp.tools.util.ObjectStream; /** - * Note: - * Do not use this class, internal use only! + * Note: Do not use this class, internal use only! * + * @see TokenSample * @see ConlluTokenSampleStream */ @Internal -public class ConlluTokenSampleStreamFactory

extends AbstractSampleStreamFactory { +public class ConlluTokenSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(TokenSample.class, ConlluPOSSampleStreamFactory.CONLLU_FORMAT, - new ConlluTokenSampleStreamFactory<>(ConlluTokenSampleStreamFactory.Parameters.class)); + new ConlluTokenSampleStreamFactory(ConlluTokenSampleStreamFactory.Parameters.class)); } - protected ConlluTokenSampleStreamFactory(Class

params) { + protected ConlluTokenSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); InputStreamFactory inFactory = CmdLineUtil.createInputStreamFactory(params.getData()); @@ -61,7 +61,6 @@ public ObjectStream create(String[] args) { try { return new ConlluTokenSampleStream(new ConlluStream(inFactory)); } catch (IOException e) { - // That will throw an exception CmdLineUtil.handleCreateObjectStreamError(e); } return null; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java index ad5354e4c3..7297126bad 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java @@ -34,23 +34,24 @@ * @see NameToSentenceSampleStream */ @Internal -public class NameToSentenceSampleStreamFactory

extends DetokenizerSampleStreamFactory { +public class NameToSentenceSampleStreamFactory extends + DetokenizerSampleStreamFactory { interface Parameters extends NameSampleDataStreamFactory.Parameters, DetokenizerParameter { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(SentenceSample.class, - "namefinder", new NameToSentenceSampleStreamFactory<>(Parameters.class)); + "namefinder", new NameToSentenceSampleStreamFactory(Parameters.class)); } - protected NameToSentenceSampleStreamFactory(Class

params) { + protected NameToSentenceSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); ObjectStream nameSampleStream = StreamFactoryRegistry.getFactory( NameSample.class, StreamFactoryRegistry.DEFAULT_FORMAT).create( diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java index 4d2c10bc61..54392331bc 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java @@ -34,23 +34,24 @@ * @see NameToTokenSampleStream */ @Internal -public class NameToTokenSampleStreamFactory

extends DetokenizerSampleStreamFactory { +public class NameToTokenSampleStreamFactory extends + DetokenizerSampleStreamFactory { - interface Parameters extends NameSampleDataStreamFactory.Parameters, DetokenizerParameter { + public interface Parameters extends NameSampleDataStreamFactory.Parameters, DetokenizerParameter { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(TokenSample.class, - "namefinder", new NameToTokenSampleStreamFactory<>(Parameters.class)); + "namefinder", new NameToTokenSampleStreamFactory(Parameters.class)); } - protected NameToTokenSampleStreamFactory(Class

params) { + protected NameToTokenSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); ObjectStream nameSampleStream = StreamFactoryRegistry.getFactory( NameSample.class, StreamFactoryRegistry.DEFAULT_FORMAT).create( diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java index 25162683ff..8aaf104fc0 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java @@ -34,23 +34,24 @@ * @see POSToSentenceSampleStream */ @Internal -public class POSToSentenceSampleStreamFactory

extends DetokenizerSampleStreamFactory { +public class POSToSentenceSampleStreamFactory extends + DetokenizerSampleStreamFactory { - interface Parameters extends WordTagSampleStreamFactory.Parameters, DetokenizerParameter { + public interface Parameters extends WordTagSampleStreamFactory.Parameters, DetokenizerParameter { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(SentenceSample.class, - "pos", new POSToSentenceSampleStreamFactory<>(Parameters.class)); + "pos", new POSToSentenceSampleStreamFactory(Parameters.class)); } - protected POSToSentenceSampleStreamFactory(Class

params) { + protected POSToSentenceSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); ObjectStream posSampleStream = StreamFactoryRegistry.getFactory(POSSample.class, StreamFactoryRegistry.DEFAULT_FORMAT).create( diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactory.java index e5750a3345..5f7eb51d27 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactory.java @@ -34,23 +34,24 @@ * @see POSToTokenSampleStream */ @Internal -public class POSToTokenSampleStreamFactory

extends DetokenizerSampleStreamFactory { +public class POSToTokenSampleStreamFactory extends + DetokenizerSampleStreamFactory { - interface Parameters extends WordTagSampleStreamFactory.Parameters, DetokenizerParameter { + public interface Parameters extends WordTagSampleStreamFactory.Parameters, DetokenizerParameter { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(TokenSample.class, - "pos", new POSToTokenSampleStreamFactory<>(Parameters.class)); + "pos", new POSToTokenSampleStreamFactory(Parameters.class)); } - protected POSToTokenSampleStreamFactory(Class

params) { + protected POSToTokenSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); ObjectStream posSampleStream = StreamFactoryRegistry.getFactory(POSSample.class, StreamFactoryRegistry.DEFAULT_FORMAT).create( diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactory.java index 73f12a3cdc..eb9855e9eb 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactory.java @@ -19,6 +19,7 @@ import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.DetokenizerParameter; import opennlp.tools.commons.Internal; import opennlp.tools.formats.LanguageSampleStreamFactory; import opennlp.tools.formats.ParseSampleStreamFactory; @@ -34,17 +35,25 @@ */ @Internal public class ParseToPOSSampleStreamFactory - extends LanguageSampleStreamFactory { + extends LanguageSampleStreamFactory { + public interface Parameters extends ParseSampleStreamFactory.Parameters, DetokenizerParameter { + } + private ParseToPOSSampleStreamFactory() { - super(ParseSampleStreamFactory.Parameters.class); + super(ParseToPOSSampleStreamFactory.Parameters.class); + } + + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(POSSample.class, + "parse", new ParseToPOSSampleStreamFactory()); } @Override public ObjectStream create(String[] args) { ParseSampleStreamFactory.Parameters params = - ArgumentParser.parse(args, ParseSampleStreamFactory.Parameters.class); + validateBasicFormatParameters(args, ParseSampleStreamFactory.Parameters.class); 
ObjectStream parseSampleStream = StreamFactoryRegistry.getFactory(Parse.class, StreamFactoryRegistry.DEFAULT_FORMAT).create( @@ -53,8 +62,4 @@ public ObjectStream create(String[] args) { return new ParseToPOSSampleStream(parseSampleStream); } - public static void registerFactory() { - StreamFactoryRegistry.registerFactory(POSSample.class, - "parse", new ParseToPOSSampleStreamFactory()); - } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactory.java index cfa253b7f0..15648a90bd 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactory.java @@ -32,19 +32,24 @@ * Do not use this class, internal use only! */ @Internal -public class ParseToSentenceSampleStreamFactory extends DetokenizerSampleStreamFactory - { +public class ParseToSentenceSampleStreamFactory extends + DetokenizerSampleStreamFactory { - interface Parameters extends ParseSampleStreamFactory.Parameters, DetokenizerParameter { + public interface Parameters extends ParseSampleStreamFactory.Parameters, DetokenizerParameter { } private ParseToSentenceSampleStreamFactory() { super(Parameters.class); } + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(SentenceSample.class, + "parse", new ParseToSentenceSampleStreamFactory()); + } + @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); ObjectStream parseSampleStream = StreamFactoryRegistry.getFactory(Parse.class, StreamFactoryRegistry.DEFAULT_FORMAT).create( @@ -54,8 +59,5 @@ public ObjectStream create(String[] args) { new ParseToPOSSampleStream(parseSampleStream), 30); } - public static 
void registerFactory() { - StreamFactoryRegistry.registerFactory(SentenceSample.class, - "parse", new ParseToSentenceSampleStreamFactory()); - } + } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactory.java index 1a39c2c14c..4d0d0728e3 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactory.java @@ -33,19 +33,24 @@ * Do not use this class, internal use only! */ @Internal -public class ParseToTokenSampleStreamFactory extends DetokenizerSampleStreamFactory { +public class ParseToTokenSampleStreamFactory extends + DetokenizerSampleStreamFactory { - interface Parameters extends ParseSampleStreamFactory.Parameters, DetokenizerParameter { + public interface Parameters extends ParseSampleStreamFactory.Parameters, DetokenizerParameter { } private ParseToTokenSampleStreamFactory() { super(Parameters.class); } + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(TokenSample.class, + "parse", new ParseToTokenSampleStreamFactory()); + } + @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); + Parameters params = validateBasicFormatParameters(args, Parameters.class); ObjectStream parseSampleStream = StreamFactoryRegistry.getFactory(Parse.class, StreamFactoryRegistry.DEFAULT_FORMAT).create( @@ -55,8 +60,4 @@ public ObjectStream create(String[] args) { new ParseToPOSSampleStream(parseSampleStream)); } - public static void registerFactory() { - StreamFactoryRegistry.registerFactory(TokenSample.class, - "parse", new ParseToTokenSampleStreamFactory()); - } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java 
b/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java index a724d77bfd..f2f57f940d 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java @@ -34,7 +34,6 @@ class ConstitDocumentHandler extends DefaultHandler { private static final String SENT_ELEMENT_NAME = "SENT"; private static final String WORD_ELEMENT_NAME = "w"; - private static final String SENT_TYPE_NAME = "S"; private final List parses; @@ -139,7 +138,7 @@ public void endElement(String uri, String localName, String qName) if (WORD_ELEMENT_NAME.equals(qName)) { String token = tokenBuffer.toString().trim(); - if (token.length() > 0) { + if (!token.isEmpty()) { cons.add(new Constituent(AbstractBottomUpParser.TOK_NODE, new Span(offset, offset + token.length()))); diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java index 11cd010890..d68b9be6a2 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java @@ -28,9 +28,9 @@ import opennlp.tools.util.ObjectStream; /** - * Note: - * Do not use this class, internal use only! + * Note: Do not use this class, internal use only! 
* + * @see Parse * @see ConstitParseSampleStream */ @Internal @@ -45,18 +45,19 @@ private ConstitParseSampleStreamFactory() { super(Parameters.class); } + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(Parse.class, "frenchtreebank", + new ConstitParseSampleStreamFactory()); + } + @Override public ObjectStream create(String[] args) { - + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } Parameters params = ArgumentParser.parse(args, Parameters.class); - - return new ConstitParseSampleStream(new FileToByteArraySampleStream( new DirectorySampleStream(params.getData(), null, false))); } - public static void registerFactory() { - StreamFactoryRegistry.registerFactory(Parse.class, "frenchtreebank", - new ConstitParseSampleStreamFactory()); - } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java index 3decd8e03c..5c1036f125 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java @@ -19,39 +19,40 @@ import java.io.IOException; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.sentdetect.SentenceSample; import opennlp.tools.util.ObjectStream; /** + * Note: Do not use this class, internal use only! + * + * @see SentenceSample * @see IrishSentenceBankSentenceStream */ -public class IrishSentenceBankSentenceStreamFactory

- extends AbstractSampleStreamFactory { +@Internal +public class IrishSentenceBankSentenceStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(SentenceSample.class, - "irishsentencebank", new IrishSentenceBankSentenceStreamFactory<>( + "irishsentencebank", new IrishSentenceBankSentenceStreamFactory( IrishSentenceBankSentenceStreamFactory.Parameters.class)); } - protected IrishSentenceBankSentenceStreamFactory(Class

params) { + protected IrishSentenceBankSentenceStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); + Parameters params = validateBasicFormatParameters(args, Parameters.class); IrishSentenceBankDocument isbDoc = null; try { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java index 213c0b8a8d..b41d57413e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java @@ -19,39 +19,40 @@ import java.io.IOException; -import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.DetokenizerSampleStreamFactory; import opennlp.tools.tokenize.TokenSample; import opennlp.tools.util.ObjectStream; /** + * Note: Do not use this class, internal use only! + * + * @see TokenSample * @see IrishSentenceBankTokenSampleStream */ -public class IrishSentenceBankTokenSampleStreamFactory

- extends DetokenizerSampleStreamFactory { +@Internal +public class IrishSentenceBankTokenSampleStreamFactory extends + DetokenizerSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(TokenSample.class, - "irishsentencebank", new IrishSentenceBankTokenSampleStreamFactory<>( + "irishsentencebank", new IrishSentenceBankTokenSampleStreamFactory( IrishSentenceBankTokenSampleStreamFactory.Parameters.class)); } - protected IrishSentenceBankTokenSampleStreamFactory(Class

params) { + protected IrishSentenceBankTokenSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); + Parameters params = validateBasicFormatParameters(args, Parameters.class); IrishSentenceBankDocument isbDoc = null; try { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java index 4b3b769df5..789904f0f1 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java @@ -38,49 +38,49 @@ * @see LeipzigLanguageSampleStream */ @Internal -public class LeipzigLanguageSampleStreamFactory

- extends AbstractSampleStreamFactory { +public class LeipzigLanguageSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends EncodingParameter { + public interface Parameters extends EncodingParameter { @ParameterDescription(valueName = "sentencesDir", description = "dir with Leipzig sentences to be used") File getSentencesDir(); @ParameterDescription(valueName = "sentencesPerSample", description = "number of sentences per sample") - String getSentencesPerSample(); + Integer getSentencesPerSample(); @ParameterDescription(valueName = "samplesPerLanguage", description = "number of samples per language") - String getSamplesPerLanguage(); + Integer getSamplesPerLanguage(); @ParameterDescription(valueName = "samplesToSkip", description = "number of samples to skip before returning") @OptionalParameter(defaultValue = "0") - String getSamplesToSkip(); + Integer getSamplesToSkip(); } - protected LeipzigLanguageSampleStreamFactory(Class

params) { + protected LeipzigLanguageSampleStreamFactory(Class params) { super(params); } public static void registerFactory() { StreamFactoryRegistry.registerFactory(LanguageSample.class, - "leipzig", new LeipzigLanguageSampleStreamFactory<>(Parameters.class)); + "leipzig", new LeipzigLanguageSampleStreamFactory(Parameters.class)); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); - File sentencesFileDir = params.getSentencesDir(); + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } + Parameters p = ArgumentParser.parse(args, Parameters.class); + File sentencesFileDir = p.getSentencesDir(); try { - return new SampleSkipStream<>(new SampleShuffleStream<>( - new LeipzigLanguageSampleStream(sentencesFileDir, - Integer.parseInt(params.getSentencesPerSample()), - Integer.parseInt(params.getSamplesPerLanguage()) + Integer.parseInt(params.getSamplesToSkip()))), - Integer.parseInt(params.getSamplesToSkip())); + return new SampleSkipStream<>(new SampleShuffleStream<>(new LeipzigLanguageSampleStream( + sentencesFileDir, p.getSentencesPerSample(), + p.getSamplesPerLanguage() + p.getSamplesToSkip())), p.getSamplesToSkip()); } catch (IOException e) { throw new TerminateToolException(-1, "IO error while opening sample data.", e); } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java index 417d202ef1..1e3d5e699e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java @@ -24,33 +24,40 @@ import java.util.List; import java.util.Random; +import opennlp.tools.commons.Sample; import opennlp.tools.util.ObjectStream; +/** + * A specialization of {@link ObjectStream} that shuffles samples. 
+ * @param The template parameter which represents + * the {@link Sample} type. + */ class SampleShuffleStream implements ObjectStream { private final List bufferedSamples = new ArrayList<>(); private Iterator sampleIt; + /** + * Initializes a {@link SampleShuffleStream} with the specified parameters. + * + * @param samples The {@link ObjectStream} to process. + * @throws IOException Thrown if IO errors occurred during skip operation. + */ SampleShuffleStream(ObjectStream samples) throws IOException { - T sample; while ((sample = samples.read()) != null) { bufferedSamples.add(sample); } - Collections.shuffle(bufferedSamples, new Random(23)); - reset(); } @Override public T read() throws IOException { - if (sampleIt.hasNext()) { return sampleIt.next(); } - return null; } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleSkipStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleSkipStream.java index 2bfff5b88e..e5f72e0342 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleSkipStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleSkipStream.java @@ -19,13 +19,26 @@ import java.io.IOException; +import opennlp.tools.commons.Sample; import opennlp.tools.util.ObjectStream; +/** + * A specialization of {@link ObjectStream} that skips a number of samples. + * @param The template parameter which represents + * the {@link Sample} type. + */ class SampleSkipStream implements ObjectStream { private final ObjectStream samples; private final int samplesToSkip; + /** + * Initializes a {@link SampleSkipStream} with the specified parameters. + * + * @param samples The {@link ObjectStream} to process. + * @param samplesToSkip The number of samples to skip. Must be greater than {@code 0}. + * @throws IOException Thrown if IO errors occurred during skip operation. 
+ */ SampleSkipStream(ObjectStream samples, int samplesToSkip) throws IOException { this.samples = samples; this.samplesToSkip = samplesToSkip; @@ -46,7 +59,6 @@ public void reset() throws IOException, UnsupportedOperationException { private void skipSamples() throws IOException { int i = 0; - while (i < samplesToSkip && (samples.read()) != null) { i++; } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactory.java index e3474257af..1077ab37b8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactory.java @@ -25,6 +25,7 @@ import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.TerminateToolException; import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.sentdetect.SentenceSample; import opennlp.tools.tokenize.DetokenizationDictionary; @@ -33,11 +34,16 @@ import opennlp.tools.util.ObjectStream; /** + * Note: Do not use this class, internal use only! + * + * @see SentenceSample * @see LetsmtSentenceStream */ -public class LetsmtSentenceStreamFactory

extends AbstractSampleStreamFactory { +@Internal +public class LetsmtSentenceStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { @ArgumentParser.ParameterDescription(valueName = "dictionary", description = "specifies the file with detokenizer dictionary.") @ArgumentParser.OptionalParameter @@ -45,21 +51,17 @@ interface Parameters extends BasicFormatParams { } public static void registerFactory() { - StreamFactoryRegistry.registerFactory(SentenceSample.class, - "letsmt", new LetsmtSentenceStreamFactory<>( - LetsmtSentenceStreamFactory.Parameters.class)); + StreamFactoryRegistry.registerFactory(SentenceSample.class, "letsmt", + new LetsmtSentenceStreamFactory(LetsmtSentenceStreamFactory.Parameters.class)); } - protected LetsmtSentenceStreamFactory(Class

params) { + protected LetsmtSentenceStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); + Parameters params = validateBasicFormatParameters(args, Parameters.class); LetsmtDocument letsmtDoc = null; try { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/Masc.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/Masc.java new file mode 100644 index 0000000000..8b79b83d09 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/Masc.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.masc; + +/** + * A simple marker interface for classes that support or refer to + * the {@link #MASC_FORMAT}. 
+ */ +public interface Masc { + + String MASC_FORMAT = "masc"; +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocument.java index e4e1177cb7..7af48a765d 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocument.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocument.java @@ -40,7 +40,6 @@ import opennlp.tools.util.Span; import opennlp.tools.util.XmlUtil; - public class MascDocument { private static final Logger logger = LoggerFactory.getLogger(MascDocument.class); diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactory.java index 1adf3535c0..ab85f88d5b 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactory.java @@ -24,29 +24,49 @@ import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.namefind.NameSample; import opennlp.tools.util.ObjectStream; -public class MascNamedEntitySampleStreamFactory

extends AbstractSampleStreamFactory { - public static final String MASC_FORMAT = "masc"; +/** + * Note: Do not use this class, internal use only! + * + * @see NameSample + * @see MascPOSSampleStream + */ +@Internal +public class MascNamedEntitySampleStreamFactory extends + AbstractSampleStreamFactory + implements Masc { + + public interface Parameters extends BasicFormatParams { - protected MascNamedEntitySampleStreamFactory(Class

params) { + @ArgumentParser.ParameterDescription(valueName = "recurrentSearch", + description = "search through files recursively") + Boolean getRecurrentSearch(); + + @ArgumentParser.ParameterDescription(valueName = "fileFilterString", + description = "only include files which contain a given string in their name") + String getFileFilter(); + + } + + protected MascNamedEntitySampleStreamFactory(Class params) { super(params); } public static void registerFactory() { - StreamFactoryRegistry.registerFactory(NameSample.class, - MASC_FORMAT, - new MascNamedEntitySampleStreamFactory<>( - MascNamedEntitySampleStreamFactory.Parameters.class)); + StreamFactoryRegistry.registerFactory(NameSample.class, MASC_FORMAT, + new MascNamedEntitySampleStreamFactory(MascNamedEntitySampleStreamFactory.Parameters.class)); } @Override public ObjectStream create(String[] args) { - MascNamedEntitySampleStreamFactory.Parameters params = - ArgumentParser.parse(args, MascNamedEntitySampleStreamFactory.Parameters.class); - + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } + Parameters params = ArgumentParser.parse(args, Parameters.class); try { FileFilter fileFilter = pathname -> pathname.getName().contains(params.getFileFilter()); @@ -57,17 +77,4 @@ public ObjectStream create(String[] args) { } return null; } - - interface Parameters extends BasicFormatParams { - - @ArgumentParser.ParameterDescription(valueName = "recurrentSearch", - description = "search through files recursively") - boolean getRecurrentSearch(); - - @ArgumentParser.ParameterDescription(valueName = "fileFilterString", - description = "only include files which contain a given string in their name") - String getFileFilter(); - - } - } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactory.java index 4e910c68bb..b3bceddecd 100644 --- 
a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactory.java @@ -24,29 +24,49 @@ import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.postag.POSSample; import opennlp.tools.util.ObjectStream; -public class MascPOSSampleStreamFactory

extends AbstractSampleStreamFactory { - public static final String MASC_FORMAT = "masc"; +/** + * Note: Do not use this class, internal use only! + * + * @see POSSample + * @see MascPOSSampleStream + */ +@Internal +public class MascPOSSampleStreamFactory extends + AbstractSampleStreamFactory + implements Masc { + + public interface Parameters extends BasicFormatParams { + + @ArgumentParser.ParameterDescription(valueName = "recurrentSearch", + description = "search through files recursively") + Boolean getRecurrentSearch(); + + @ArgumentParser.ParameterDescription(valueName = "fileFilterString", + description = "only include files which contain a given string in their name") + String getFileFilter(); + + } - protected MascPOSSampleStreamFactory(Class

params) { + protected MascPOSSampleStreamFactory(Class params) { super(params); } public static void registerFactory() { - StreamFactoryRegistry.registerFactory(POSSample.class, - MASC_FORMAT, - new MascPOSSampleStreamFactory<>( - MascPOSSampleStreamFactory.Parameters.class)); + StreamFactoryRegistry.registerFactory(POSSample.class, MASC_FORMAT, + new MascPOSSampleStreamFactory(Parameters.class)); } @Override public ObjectStream create(String[] args) { - MascPOSSampleStreamFactory.Parameters params = - ArgumentParser.parse(args, MascPOSSampleStreamFactory.Parameters.class); - + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } + Parameters params = ArgumentParser.parse(args, Parameters.class); try { FileFilter fileFilter = pathname -> pathname.getName().contains(params.getFileFilter()); @@ -59,17 +79,5 @@ public ObjectStream create(String[] args) { return null; } - interface Parameters extends BasicFormatParams { - - @ArgumentParser.ParameterDescription(valueName = "recurrentSearch", - description = "search through files recursively") - boolean getRecurrentSearch(); - - @ArgumentParser.ParameterDescription(valueName = "fileFilterString", - description = "only include files which contain a given string in their name") - String getFileFilter(); - - } - } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java index 05d661e859..808bb65c8f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java @@ -18,6 +18,7 @@ package opennlp.tools.formats.masc; import java.io.IOException; +import java.io.Serial; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -34,8 +35,9 @@ public class MascSentence extends Span { - private static final Logger logger = 
LoggerFactory.getLogger(MascSentence.class); + @Serial private static final long serialVersionUID = 6295507533472650848L; + private static final Logger logger = LoggerFactory.getLogger(MascSentence.class); /** * A helper class to extract the extract a quark from the corpus file even if it is beyond the diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactory.java index d44bf78009..9b6eeb4356 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactory.java @@ -24,28 +24,52 @@ import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.sentdetect.SentenceSample; import opennlp.tools.util.ObjectStream; -public class MascSentenceSampleStreamFactory

extends AbstractSampleStreamFactory { +/** + * Note: Do not use this class, internal use only! + * + * @see SentenceSample + * @see MascSentenceSampleStream + */ +@Internal +public class MascSentenceSampleStreamFactory extends + AbstractSampleStreamFactory + implements Masc { + + public interface Parameters extends BasicFormatParams { + @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample", + description = "number of sentences per sample") + String getSentencesPerSample(); + + @ArgumentParser.ParameterDescription(valueName = "recurrentSearch", + description = "search through files recursively") + Boolean getRecurrentSearch(); + + @ArgumentParser.ParameterDescription(valueName = "fileFilterString", + description = "only include files which contain a given string in their name") + String getFileFilter(); - public static final String MASC_FORMAT = "masc"; + } - protected MascSentenceSampleStreamFactory(Class

params) { + protected MascSentenceSampleStreamFactory(Class params) { super(params); } public static void registerFactory() { StreamFactoryRegistry.registerFactory(SentenceSample.class, MASC_FORMAT, - new MascSentenceSampleStreamFactory<>(MascSentenceSampleStreamFactory.Parameters.class)); + new MascSentenceSampleStreamFactory(Parameters.class)); } @Override public ObjectStream create(String[] args) { - MascSentenceSampleStreamFactory.Parameters params = - ArgumentParser.parse(args, MascSentenceSampleStreamFactory.Parameters.class); - + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } + Parameters params = ArgumentParser.parse(args, Parameters.class); try { FileFilter fileFilter = pathname -> pathname.getName().contains(params.getFileFilter()); @@ -59,20 +83,5 @@ public ObjectStream create(String[] args) { return null; } - interface Parameters extends BasicFormatParams { - @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample", - description = "number of sentences per sample") - String getSentencesPerSample(); - - @ArgumentParser.ParameterDescription(valueName = "recurrentSearch", - description = "search through files recursively") - boolean getRecurrentSearch(); - - @ArgumentParser.ParameterDescription(valueName = "fileFilterString", - description = "only include files which contain a given string in their name") - String getFileFilter(); - - } - } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascToken.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascToken.java index 5fa96eeb7c..aef52dc086 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascToken.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascToken.java @@ -17,6 +17,8 @@ package opennlp.tools.formats.masc; +import java.io.Serial; + import opennlp.tools.util.Span; /** @@ -24,6 +26,7 @@ */ public class MascToken extends Span { + @Serial private static final long 
serialVersionUID = -780646706788037041L; private final String pos; private final String base; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactory.java index c58eb13370..8a70c0b74e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactory.java @@ -24,29 +24,52 @@ import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.tokenize.TokenSample; import opennlp.tools.util.ObjectStream; -public class MascTokenSampleStreamFactory

extends AbstractSampleStreamFactory { +/** + * Note: Do not use this class, internal use only! + * + * @see TokenSample + * @see MascTokenSampleStream + */ +@Internal +public class MascTokenSampleStreamFactory extends + AbstractSampleStreamFactory + implements Masc { + + public interface Parameters extends BasicFormatParams { + @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample", + description = "number of sentences per sample") + String getSentencesPerSample(); + + @ArgumentParser.ParameterDescription(valueName = "recurrentSearch", + description = "search through files recursively") + Boolean getRecurrentSearch(); + + @ArgumentParser.ParameterDescription(valueName = "fileFilterString", + description = "only include files which contain a given string in their name") + String getFileFilter(); - public static final String MASC_FORMAT = "masc"; + } - protected MascTokenSampleStreamFactory(Class

params) { + protected MascTokenSampleStreamFactory(Class params) { super(params); } public static void registerFactory() { StreamFactoryRegistry.registerFactory(TokenSample.class, MASC_FORMAT, - new MascTokenSampleStreamFactory<>(MascTokenSampleStreamFactory.Parameters.class)); + new MascTokenSampleStreamFactory(Parameters.class)); } - @Override public ObjectStream create(String[] args) { - MascTokenSampleStreamFactory.Parameters params = - ArgumentParser.parse(args, MascTokenSampleStreamFactory.Parameters.class); - + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } + Parameters params = ArgumentParser.parse(args, Parameters.class); try { FileFilter fileFilter = pathname -> pathname.getName().contains(params.getFileFilter()); @@ -59,20 +82,5 @@ public ObjectStream create(String[] args) { return null; } - interface Parameters extends BasicFormatParams { - @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample", - description = "number of sentences per sample") - String getSentencesPerSample(); - - @ArgumentParser.ParameterDescription(valueName = "recurrentSearch", - description = "search through files recursively") - boolean getRecurrentSearch(); - - @ArgumentParser.ParameterDescription(valueName = "fileFilterString", - description = "only include files which contain a given string in their name") - String getFileFilter(); - - } - } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWord.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWord.java index 916f6d2d04..f351927b6c 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWord.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWord.java @@ -17,10 +17,13 @@ package opennlp.tools.formats.masc; +import java.io.Serial; + import opennlp.tools.util.Span; public class MascWord extends Span { + @Serial private static final long serialVersionUID = 2133473549058189775L; private final int 
id; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStream.java index cd7b10f8e8..e0f8bc2fd6 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStream.java @@ -27,6 +27,13 @@ import opennlp.tools.util.ObjectStream; import opennlp.tools.util.Span; +/** + * Moses is a statistical machine translation system that allows you + * to automatically train translation models for any language pair. + *

+ * Details are found on the + * official website. + */ public class MosesSentenceSampleStream extends FilterObjectStream { /** @@ -58,9 +65,8 @@ public SentenceSample read() throws IOException { sentencesString.append(' '); } - if (sentenceSpans.size() > 0) { - return new SentenceSample(sentencesString.toString(), - sentenceSpans.toArray(new Span[0])); + if (!sentenceSpans.isEmpty()) { + return new SentenceSample(sentencesString.toString(), sentenceSpans.toArray(new Span[0])); } return null; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactory.java index 6e942c93ff..529f2179d5 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactory.java @@ -17,49 +17,40 @@ package opennlp.tools.formats.moses; -import java.io.IOException; - -import opennlp.tools.cmdline.ArgumentParser; -import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.sentdetect.SentenceSample; -import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; /** * Factory producing OpenNLP {@link MosesSentenceSampleStream} objects. + *

+ * Note: Do not use this class, internal use only! + * + * @see SentenceSample + * @see MosesSentenceSampleStream */ -public class MosesSentenceSampleStreamFactory

extends AbstractSampleStreamFactory { +@Internal +public class MosesSentenceSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { } public static void registerFactory() { StreamFactoryRegistry.registerFactory(SentenceSample.class, - "moses", new MosesSentenceSampleStreamFactory<>(Parameters.class)); + "moses", new MosesSentenceSampleStreamFactory(Parameters.class)); } - protected MosesSentenceSampleStreamFactory(Class

params) { + protected MosesSentenceSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); - - ObjectStream lineStream = null; - try { - lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); - } catch (IOException ex) { - CmdLineUtil.handleCreateObjectStreamError(ex); - } - + ObjectStream lineStream = readData(args, Parameters.class); return new MosesSentenceSampleStream(lineStream); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java index 28030bebf9..4acb31d1e8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java @@ -23,21 +23,33 @@ import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.cmdline.tokenizer.TokenizerModelLoader; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.formats.DirectorySampleStream; import opennlp.tools.formats.convert.FileToStringSampleStream; import opennlp.tools.namefind.NameSample; +import opennlp.tools.tokenize.ThreadSafeTokenizerME; import opennlp.tools.tokenize.Tokenizer; -import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.StringUtil; +/** + * @see NameSample + * @see 
MucNameSampleStream + */ +@Internal public class Muc6NameSampleStreamFactory extends AbstractSampleStreamFactory { + public interface Parameters extends BasicFormatParams { + @ParameterDescription(valueName = "modelFile") + File getTokenizerModel(); + } + protected Muc6NameSampleStreamFactory() { super(Parameters.class); } @@ -49,11 +61,16 @@ public static void registerFactory() { @Override public ObjectStream create(String[] args) { - + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } Parameters params = ArgumentParser.parse(args, Parameters.class); + if (!params.getData().isDirectory() || !params.getData().exists()) { + throw new TerminateToolException(-1, "The specified data directory is not valid!"); + } TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel()); - Tokenizer tokenizer = new TokenizerME(tokenizerModel); + Tokenizer tokenizer = new ThreadSafeTokenizerME(tokenizerModel); ObjectStream mucDocStream = new FileToStringSampleStream( new DirectorySampleStream(params.getData(), @@ -62,9 +79,4 @@ public ObjectStream create(String[] args) { return new MucNameSampleStream(tokenizer, mucDocStream); } - - interface Parameters extends BasicFormatParams { - @ParameterDescription(valueName = "modelFile") - File getTokenizerModel(); - } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucElementNames.java b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucElementNames.java index 23eb09f26e..d3ac523db8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucElementNames.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucElementNames.java @@ -31,10 +31,8 @@ class MucElementNames { static { CONTENT_ELEMENTS = Set.of( - MucElementNames.HEADLINE_ELEMENT, - MucElementNames.DATELINE_ELEMENT, - MucElementNames.DD_ELEMENT, - MucElementNames.SENTENCE_ELEMENT); + MucElementNames.HEADLINE_ELEMENT, MucElementNames.DATELINE_ELEMENT, + 
MucElementNames.DD_ELEMENT, MucElementNames.SENTENCE_ELEMENT); } private MucElementNames() { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameSampleStream.java index f9a8bbc040..caf63a3352 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameSampleStream.java @@ -64,7 +64,7 @@ public NameSample read() throws IOException { } } - if (storedSamples.size() > 0) { + if (!storedSamples.isEmpty()) { return storedSamples.remove(0); } else { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java index 08f615b1fc..3cbadd82fe 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java @@ -32,6 +32,7 @@ import javax.xml.xpath.XPathFactory; import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; @@ -41,6 +42,13 @@ public class NKJPSegmentationDocument { + private static final String XML_ID = "xml:id"; + private static final String SEG = "seg"; + private static final String CHOICE = "choice"; + private static final String NKJP_PAREN = "nkjp:paren"; + private static final String NKJP_REJECTED = "nkjp:rejected"; + private static final String NKJP_NPS = "nkjp:nps"; + public static class Pointer { final String doc; final String id; @@ -62,8 +70,7 @@ public Span toSpan() { @Override public String toString() { - return doc + "#string-range(" + id + "," + offset - + "," + length + ")"; + return doc + "#string-range(" + id + "," + offset + "," + length + ")"; } } @@ -103,8 +110,8 @@ public static NKJPSegmentationDocument 
parse(InputStream is) throws IOException Node sentnode = nl.item(i); String sentid = null; - if (sentnode.getAttributes().getNamedItem("xml:id") != null) { - sentid = sentnode.getAttributes().getNamedItem("xml:id").getTextContent(); + if (sentnode.getAttributes().getNamedItem(XML_ID) != null) { + sentid = sentnode.getAttributes().getNamedItem(XML_ID).getTextContent(); } Map segments = new LinkedHashMap<>(); @@ -112,18 +119,17 @@ public static NKJPSegmentationDocument parse(InputStream is) throws IOException for (int j = 0; j < segnl.getLength(); j++) { Node n = segnl.item(j); - if (n.getNodeName().equals("seg")) { + if (n.getNodeName().equals(SEG)) { String segid = xmlID(n); Pointer pointer = fromSeg(n); segments.put(segid, pointer); - } else if (n.getNodeName().equals("choice")) { - + } else if (n.getNodeName().equals(CHOICE)) { NodeList choices = n.getChildNodes(); - for (int k = 0; k < choices.getLength(); k++) { - if (choices.item(k).getNodeName().equals("nkjp:paren")) { - if (!checkRejectedParen(choices.item(k))) { - NodeList paren_segs = (NodeList) SEG_NODES_ONLY.evaluate(choices.item(k), + Node choice = choices.item(k); + if (choice.getNodeName().equals(NKJP_PAREN)) { + if (!checkRejectedParen(choice)) { + NodeList paren_segs = (NodeList) SEG_NODES_ONLY.evaluate(choice, XPathConstants.NODESET); for (int l = 0; l < paren_segs.getLength(); l++) { @@ -132,17 +138,16 @@ public static NKJPSegmentationDocument parse(InputStream is) throws IOException segments.put(segid, pointer); } } - } else if (choices.item(k).getNodeName().equals("seg")) { - if (!checkRejected(choices.item(k))) { - String segid = xmlID(choices.item(k)); - Pointer pointer = fromSeg(choices.item(k)); + } else if (choice.getNodeName().equals(SEG)) { + if (!checkRejected(choice)) { + String segid = xmlID(choice); + Pointer pointer = fromSeg(choice); segments.put(segid, pointer); } } } } } - sentences.put(sentid, segments); } @@ -154,14 +159,14 @@ public static NKJPSegmentationDocument 
parse(InputStream is) throws IOException } static boolean checkRejected(Node n) { - if (n.getAttributes() == null) { + NamedNodeMap attrs = n.getAttributes(); + if (attrs == null) { return false; } - if (n.getAttributes().getNamedItem("nkjp:rejected") == null) { + if (attrs.getNamedItem(NKJP_REJECTED) == null) { return false; } - String rejected = n.getAttributes().getNamedItem("nkjp:rejected").getTextContent(); - return rejected.equals("true"); + return attrs.getNamedItem(NKJP_REJECTED).getTextContent().equals("true"); } static boolean checkRejectedParen(Node n) { @@ -170,7 +175,7 @@ static boolean checkRejectedParen(Node n) { } else { for (int i = 0; i < n.getChildNodes().getLength(); i++) { Node m = n.getChildNodes().item(i); - if (m.getNodeName().equals("seg")) { + if (m.getNodeName().equals(SEG)) { if (!checkRejected(m)) { return false; } @@ -181,13 +186,14 @@ static boolean checkRejectedParen(Node n) { } static String xmlID(Node n) throws IOException { - if (n.getAttributes() == null || n.getAttributes().getLength() < 1) { + NamedNodeMap attr = n.getAttributes(); + if (attr == null || attr.getLength() < 1) { throw new IOException("Missing required attributes"); } String id = null; - if (n.getAttributes().getNamedItem("xml:id") != null) { - id = n.getAttributes().getNamedItem("xml:id").getTextContent(); + if (attr.getNamedItem(XML_ID) != null) { + id = attr.getNamedItem(XML_ID).getTextContent(); } if (id == null) { @@ -207,8 +213,8 @@ static Pointer fromSeg(Node n) throws IOException { ptr = n.getAttributes().getNamedItem("corresp").getTextContent(); } String spacing = ""; - if (n.getAttributes().getNamedItem("nkjp:nps") != null) { - spacing = n.getAttributes().getNamedItem("nkjp:nps").getTextContent(); + if (n.getAttributes().getNamedItem(NKJP_NPS) != null) { + spacing = n.getAttributes().getNamedItem(NKJP_NPS).getTextContent(); } if (ptr == null) { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStream.java 
b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStream.java index 03ed8348f3..b71a4c502a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStream.java @@ -28,8 +28,8 @@ import opennlp.tools.util.Span; public class NKJPSentenceSampleStream implements ObjectStream { - private final NKJPSegmentationDocument segments; + private final NKJPSegmentationDocument segments; private final NKJPTextDocument text; private Iterator>> segmentIt; @@ -89,7 +89,7 @@ public SentenceSample read() throws IOException { } // end of stream is reached, indicate that with null return value - if (sentenceSpans.size() == 0) { + if (sentenceSpans.isEmpty()) { return null; } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactory.java index d126d21e41..da9e598920 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactory.java @@ -24,13 +24,22 @@ import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.sentdetect.SentenceSample; import opennlp.tools.util.ObjectStream; -public class NKJPSentenceSampleStreamFactory

extends AbstractSampleStreamFactory { +/** + * Note: Do not use this class, internal use only! + * + * @see SentenceSample + * @see NKJPSentenceSampleStream + */ +@Internal +public class NKJPSentenceSampleStreamFactory extends + AbstractSampleStreamFactory { - interface Parameters extends BasicFormatParams { + public interface Parameters extends BasicFormatParams { @ArgumentParser.ParameterDescription(valueName = "text", description = "file containing NKJP text") File getTextFile(); @@ -38,20 +47,16 @@ interface Parameters extends BasicFormatParams { public static void registerFactory() { StreamFactoryRegistry.registerFactory(SentenceSample.class, "nkjp", - new NKJPSentenceSampleStreamFactory<>(NKJPSentenceSampleStreamFactory.Parameters.class)); + new NKJPSentenceSampleStreamFactory(NKJPSentenceSampleStreamFactory.Parameters.class)); } - protected NKJPSentenceSampleStreamFactory(Class

params) { + protected NKJPSentenceSampleStreamFactory(Class params) { super(params); } @Override public ObjectStream create(String[] args) { - - Parameters params = ArgumentParser.parse(args, Parameters.class); - - CmdLineUtil.checkInputFile("Data", params.getData()); - + Parameters params = validateBasicFormatParameters(args, Parameters.class); CmdLineUtil.checkInputFile("Text", params.getTextFile()); NKJPSegmentationDocument segDoc = null; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java index 3c20e704e9..205914ebc8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java @@ -43,14 +43,14 @@ * The National corpus of Polish (NKJP) format. *

* Information about the format are found on this - * web site. + * web site. *

* A 1-million word corpus can be found on this - * + * * web site. *

* The NKJP schema can be found - * here. + * here. */ public class NKJPTextDocument { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java index 8dc09af882..130e7386b2 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java @@ -35,10 +35,18 @@ import opennlp.tools.util.StringUtil; /** - * Name Sample Stream parser for the OntoNotes 4.0 corpus. + * Name Sample Stream parser for the OntoNotes 4.0 named entity files. */ public class OntoNotesNameSampleStream extends FilterObjectStream { + private static final String TAG_DOC_OPEN = " tokenConversionMap; private final List nameSamples = new LinkedList<>(); @@ -69,21 +77,18 @@ private String convertToken(String token) { StringBuilder convertedToken = new StringBuilder(token); - int startTagEndIndex = convertedToken.indexOf(">"); - + int startTagEndIndex = convertedToken.indexOf(SYMBOL_CLOSE); if (token.contains("=\"") && startTagEndIndex != -1) { convertedToken.delete(0, startTagEndIndex + 1); } - int endTagBeginIndex = convertedToken.indexOf("<"); - int endTagEndIndex = convertedToken.indexOf(">"); - + int endTagBeginIndex = convertedToken.indexOf(SYMBOL_OPEN); + int endTagEndIndex = convertedToken.indexOf(SYMBOL_CLOSE); if (endTagBeginIndex != -1 && endTagEndIndex != -1) { convertedToken.delete(endTagBeginIndex, endTagEndIndex + 1); } String cleanedToken = convertedToken.toString(); - if (tokenConversionMap.get(cleanedToken) != null) { cleanedToken = tokenConversionMap.get(cleanedToken); } @@ -96,74 +101,64 @@ public NameSample read() throws IOException { if (nameSamples.isEmpty()) { String doc = samples.read(); - if (doc != null) { - BufferedReader docIn = new BufferedReader(new StringReader(doc)); - boolean clearAdaptiveData = true; 
- String line; - while ((line = docIn.readLine()) != null) { - - if (line.startsWith("")) { - break; - } - - String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(line); + try (BufferedReader docIn = new BufferedReader(new StringReader(doc))) { + while ((line = docIn.readLine()) != null) { - List entities = new LinkedList<>(); - List cleanedTokens = new ArrayList<>(tokens.length); - - int tokenIndex = 0; - int entityBeginIndex = -1; - String entityType = null; - boolean insideStartEnmaxTag = false; - for (String token : tokens) { - - // Split here, next part of tag is in new token - if (token.startsWith(" entities = new LinkedList<>(); + List cleanedTokens = new ArrayList<>(tokens.length); - if (token.startsWith(typeBegin)) { + int tokenIndex = 0; + int entityBeginIndex = -1; + String entityType = null; + boolean insideStartEnmaxTag = false; + for (String token : tokens) { - int typeEnd = token.indexOf("\"", typeBegin.length()); + // Split here, next part of tag is in new token + if (token.startsWith(TAG_ENAMEX_OPEN)) { + insideStartEnmaxTag = true; + continue; + } - entityType = StringUtil.toLowerCase(token.substring(typeBegin.length(), typeEnd)); + if (insideStartEnmaxTag) { + String typeBegin = TYPE; + if (token.startsWith(typeBegin)) { + int typeEnd = token.indexOf("\"", typeBegin.length()); + entityType = StringUtil.toLowerCase(token.substring(typeBegin.length(), typeEnd)); + } + + if (token.contains(SYMBOL_CLOSE)) { + entityBeginIndex = tokenIndex; + insideStartEnmaxTag = false; + } else { + continue; + } } - if (token.contains(">")) { - entityBeginIndex = tokenIndex; - insideStartEnmaxTag = false; - } else { - continue; + if (token.endsWith(TAG_ENAMEX_CLOSE)) { + entities.add(new Span(entityBeginIndex, tokenIndex + 1, entityType)); + entityBeginIndex = -1; } - } - if (token.endsWith("")) { - entities.add(new Span(entityBeginIndex, tokenIndex + 1, - entityType)); - entityBeginIndex = -1; + cleanedTokens.add(convertToken(token)); + tokenIndex++; } - 
cleanedTokens.add(convertToken(token)); - tokenIndex++; - } + nameSamples.add(new NameSample(cleanedTokens.toArray(new String[0]), + entities.toArray(new Span[0]), clearAdaptiveData)); - nameSamples.add(new NameSample(cleanedTokens - .toArray(new String[0]), entities - .toArray(new Span[0]), clearAdaptiveData)); - - clearAdaptiveData = false; + clearAdaptiveData = false; + } } } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java index f71e304279..9bb154fa38 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java @@ -22,23 +22,42 @@ import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.formats.DirectorySampleStream; import opennlp.tools.formats.convert.FileToStringSampleStream; import opennlp.tools.namefind.NameSample; import opennlp.tools.util.ObjectStream; +/** + * Note: Do not use this class, internal use only! 
+ * + * @see OntoNotesNameSampleStream + */ +@Internal public class OntoNotesNameSampleStreamFactory extends AbstractSampleStreamFactory { + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(NameSample.class, + "ontonotes", new OntoNotesNameSampleStreamFactory()); + } + public OntoNotesNameSampleStreamFactory() { super(OntoNotesFormatParameters.class); } @Override public ObjectStream create(String[] args) { - + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class); + final File ontoDir = new File(params.getOntoNotesDir()); + if (!ontoDir.isDirectory() || !ontoDir.exists()) { + throw new TerminateToolException(-1, "The specified OntoNotes directory is not valid!"); + } ObjectStream documentStream = new DirectorySampleStream(new File( params.getOntoNotesDir()), @@ -54,8 +73,4 @@ public ObjectStream create(String[] args) { new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8)); } - public static void registerFactory() { - StreamFactoryRegistry.registerFactory(NameSample.class, - "ontonotes", new OntoNotesNameSampleStreamFactory()); - } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java index a82c0d97f6..493823f4c1 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java @@ -18,12 +18,19 @@ package opennlp.tools.formats.ontonotes; import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.formats.convert.ParseToPOSSampleStream; import opennlp.tools.parser.Parse; import 
opennlp.tools.postag.POSSample; import opennlp.tools.util.ObjectStream; +/** + * Note: Do not use this class, internal use only! + * + * @see ParseToPOSSampleStream + */ +@Internal public class OntoNotesPOSSampleStreamFactory extends AbstractSampleStreamFactory { @@ -34,14 +41,18 @@ protected OntoNotesPOSSampleStreamFactory() { super(OntoNotesFormatParameters.class); } + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(POSSample.class, "ontonotes", + new OntoNotesPOSSampleStreamFactory()); + } + @Override public ObjectStream create(String[] args) { + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } ObjectStream parseSampleStream = parseSampleStreamFactory.create(args); return new ParseToPOSSampleStream(parseSampleStream); } - public static void registerFactory() { - StreamFactoryRegistry.registerFactory(POSSample.class, "ontonotes", - new OntoNotesPOSSampleStreamFactory()); - } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java index 84fd1a5aba..a18d5558db 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java @@ -52,7 +52,7 @@ public Parse read() throws IOException { } if (parse == null || parse.isEmpty()) { - if (parseString.length() > 0) { + if (!parseString.isEmpty()) { return Parse.parseParse(parseString.toString()); } else { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java index 5b9fa0b538..35bbe33eee 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java +++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java @@ -22,12 +22,20 @@ import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.commons.Internal; import opennlp.tools.formats.AbstractSampleStreamFactory; import opennlp.tools.formats.DirectorySampleStream; import opennlp.tools.formats.convert.FileToStringSampleStream; import opennlp.tools.parser.Parse; import opennlp.tools.util.ObjectStream; +/** + * Note: Do not use this class, internal use only! + * + * @see OntoNotesParseSampleStream + */ +@Internal public class OntoNotesParseSampleStreamFactory extends AbstractSampleStreamFactory { @@ -35,10 +43,21 @@ protected OntoNotesParseSampleStreamFactory() { super(OntoNotesFormatParameters.class); } + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(Parse.class, "ontonotes", + new OntoNotesParseSampleStreamFactory()); + } + @Override public ObjectStream create(String[] args) { - + if (args == null) { + throw new IllegalArgumentException("Passed args must not be null!"); + } OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class); + final File ontoDir = new File(params.getOntoNotesDir()); + if (!ontoDir.isDirectory() || !ontoDir.exists()) { + throw new TerminateToolException(-1, "The specified OntoNotes directory is not valid!"); + } ObjectStream documentStream = new DirectorySampleStream(new File( params.getOntoNotesDir()), @@ -53,13 +72,8 @@ public ObjectStream create(String[] args) { // We need file to line here ... 
and that is probably best doen with the plain text stream // lets copy it over here, refactor it, and then at some point we replace the current version // with the refactored version - - return new OntoNotesParseSampleStream(new DocumentToLineStream(new FileToStringSampleStream( - documentStream, StandardCharsets.UTF_8))); + return new OntoNotesParseSampleStream(new DocumentToLineStream( + new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8))); } - public static void registerFactory() { - StreamFactoryRegistry.registerFactory(Parse.class, "ontonotes", - new OntoNotesParseSampleStreamFactory()); - } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleDataStream.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleDataStream.java index acfdbbae6c..f7e53dabe7 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleDataStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleDataStream.java @@ -24,12 +24,12 @@ import opennlp.tools.util.ObjectStream; /** - * The {@link NameSampleDataStream} class converts tagged {@link String strings} + * The {@link NameSampleDataStream} class converts tagged tokens * provided by a {@link DataStream} to {@link NameSample} objects. - * It uses text that is one-sentence per line and tokenized - * with names identified by: *

- * {@code <START>} and {@code <END>} tags. + * It uses text that is one-sentence per line and tokenized + * with names identified by:
+ * <{@code START}> and <{@code END}>tags. */ public class NameSampleDataStream extends FilterObjectStream { @@ -55,7 +55,7 @@ public NameSample read() throws IOException { // An empty line indicates the start of a new article // for which the adaptive data in the feature generators // must be cleared - while (token != null && token.trim().length() == 0) { + while (token != null && token.trim().isEmpty()) { isClearAdaptiveData = true; token = samples.read(); } diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/ParseSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/parser/ParseSampleStream.java index 82cfe0a71f..b75edbf6e0 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/parser/ParseSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/parser/ParseSampleStream.java @@ -22,6 +22,10 @@ import opennlp.tools.util.FilterObjectStream; import opennlp.tools.util.ObjectStream; +/** + * @see Parse + * @see FilterObjectStream + */ public class ParseSampleStream extends FilterObjectStream { /** diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java index 4aac68692a..406531171a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java @@ -28,7 +28,7 @@ import opennlp.tools.util.ObjectStream; /** - * A stream filter which reads a sentence per line which contains + * A {@link FilterObjectStream stream filter} which reads a sentence per line that contains * words and tags in {@code word_tag} format and outputs a {@link POSSample} objects. */ public class WordTagSampleStream extends FilterObjectStream { @@ -45,14 +45,14 @@ public WordTagSampleStream(ObjectStream sentences) { } /** - * Parses the next sentence and return the next {@link POSSample} object. 
+ * Reads the next tokens and parses it into the next {@link POSSample} object. *

* If an error occurs an empty {@link POSSample} object is returned * and a warning message is logged. Usually it does not matter if one * or many sentences are ignored. * * @return A valid {@link POSSample} or {@code null} if the - * {@link ObjectStream sentence stream} is exhausted. + * {@link ObjectStream stream} is exhausted. * * @throws IOException Thrown if IO errors occurred during read. */ @@ -66,17 +66,13 @@ public POSSample read() throws IOException { try { sample = POSSample.parse(sentence); } catch (InvalidFormatException e) { - // TODO: An exception in error case should be thrown. logger.warn("Error during parsing, ignoring sentence: {}", sentence, e); - sample = new POSSample(new String[]{}, new String[]{}); } - return sample; } else { - // sentences stream is exhausted - return null; + return null; // sentences stream is exhausted } } } diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/AbstractSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/AbstractSampleStreamFactoryTest.java new file mode 100644 index 0000000000..bcd2e2ecb8 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/AbstractSampleStreamFactoryTest.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats; + +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public abstract class AbstractSampleStreamFactoryTest extends AbstractFormatTest { + + protected static final String FORMAT_SAMPLE_DIR = "opennlp/tools/formats/"; + + protected abstract AbstractSampleStreamFactory getFactory(); + protected abstract String getDataFilePath(); + + @Test + void testCreateWithNullParameter() { + assertThrows(IllegalArgumentException.class, () -> { + try (ObjectStream stream = getFactory().create(null)) { + stream.read(); + } + }); + } + + @Test + void testCreateWithInvalidParameter() { + assertThrows(IllegalArgumentException.class, () -> { + try (ObjectStream stream = getFactory().create(new String[]{"X"})) { + stream.read(); + } + }); + } + + /* + * Note: + * This test case must be overridden for non-simple cases where + * more than the '-data' param is required. 
+ */ + @Test + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = getFactory().create( + new String[]{"-data", getDataFilePath() + "xyz"})) { + S sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/AbstractSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/AbstractSampleStreamTest.java index c154f21d73..f56b771161 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/AbstractSampleStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/AbstractSampleStreamTest.java @@ -19,8 +19,7 @@ import opennlp.tools.util.InputStreamFactory; - -public abstract class AbstractSampleStreamTest extends AbstractFormatTest { +abstract class AbstractSampleStreamTest extends AbstractFormatTest { protected InputStreamFactory getFactory(String resource) { return new ResourceAsStreamFactory(AbstractSampleStreamTest.class, FORMATS_BASE_DIR + resource); diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactoryTest.java new file mode 100644 index 0000000000..25c19fa2cd --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactoryTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class BioNLP2004NameSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "bionlp2004-01.sample"; + + // SUT + private BioNLP2004NameSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + BioNLP2004NameSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(NameSample.class, "bionlp2004"); + assertInstanceOf(BioNLP2004NameSampleStreamFactory.class, f); + factory = (BioNLP2004NameSampleStreamFactory) f; + assertEquals(BioNLP2004NameSampleStreamFactory.Parameters.class, factory.params); + 
sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + + try (ObjectStream stream = factory.create( + new String[]{"-types", "DNA,protein,cell_type,cell_line,RNA", "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + /* some extra checks to make sure the 'types' parameter is handled correctly */ + assertEquals(5, sample.getNames().length); + assertEquals("protein", sample.getNames()[0].getType()); + assertEquals("protein", sample.getNames()[1].getType()); + assertEquals("protein", sample.getNames()[2].getType()); + assertEquals("protein", sample.getNames()[3].getType()); + assertEquals("cell_type", sample.getNames()[4].getType()); + } + } + + @Test + void testCreateWithUnsupportedTypes() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-types", "xyz", "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-types", "DNA,protein,cell_type,cell_line,RNA", "-data", sampleFileFullPath + "xyz"})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ChunkerSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ChunkerSampleStreamFactoryTest.java new file mode 100644 index 0000000000..ad5a4e1ccb --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ChunkerSampleStreamFactoryTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.chunker.ChunkSample; +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class ChunkerSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "chunker-01.sample"; + + // SUT + private ChunkerSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ChunkerSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(ChunkSample.class, StreamFactoryRegistry.DEFAULT_FORMAT); + assertInstanceOf(ChunkerSampleStreamFactory.class, 
f); + factory = (ChunkerSampleStreamFactory) f; + assertEquals(ChunkerSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + ChunkSample sample = stream.read(); + assertNotNull(sample); + } + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/Conll02NameSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/Conll02NameSampleStreamFactoryTest.java new file mode 100644 index 0000000000..dec1fa9a12 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/Conll02NameSampleStreamFactoryTest.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class Conll02NameSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "conll2002-es.sample"; + private static final String SAMPLE_02 = "conll2002-nl.sample"; + + // SUT + private Conll02NameSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + Conll02NameSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(NameSample.class, "conll02"); + assertInstanceOf(Conll02NameSampleStreamFactory.class, f); + factory = ((Conll02NameSampleStreamFactory) f); + assertEquals(Conll02NameSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"spa", "es", "nld", "nl"}) + void testCreateWithValidParameter(String lang) throws IOException { + // prepare depending on language + if 
("spa".equals(lang) || "es".equals(lang)) { + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } else { + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_02).getPath(); + } + + try (ObjectStream stream = factory.create( + new String[]{"-lang", lang, "-types", "per,loc,org,misc", "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + @ParameterizedTest + @ValueSource(strings = {"", "per", "loc", "org", "misc", "per,loc,org,misc"}) + void testCreateWithDifferentTypes(String types) throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-lang", "spa", "-types", types, "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + @Test + void testCreateWithInvalidLanguage() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create( + new String[]{"-lang", "xyz", "-types", "per,loc,org,misc", "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + + /* + * Note: Overriding this test case, as more params are required! 
+ */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-lang", "spa", "-types", "per,loc,org,misc", "-data", sampleFileFullPath + "xyz"})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/Conll03NameSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/Conll03NameSampleStreamFactoryTest.java new file mode 100644 index 0000000000..999bdf5c60 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/Conll03NameSampleStreamFactoryTest.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class Conll03NameSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "conll2003-de.sample"; + private static final String SAMPLE_02 = "conll2003-en.sample"; + + // SUT + private Conll03NameSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + Conll03NameSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(NameSample.class, "conll03"); + assertInstanceOf(Conll03NameSampleStreamFactory.class, f); + factory = ((Conll03NameSampleStreamFactory) f); + assertEquals(Conll03NameSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"deu", "eng"}) + void testCreateWithValidParameter(String lang) throws IOException { + // prepare depending on language + if 
("deu".equals(lang)) { + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } else { + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_02).getPath(); + } + + try (ObjectStream stream = factory.create( + new String[]{"-lang", lang, "-types", "per,loc,org,misc", "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + @ParameterizedTest + @ValueSource(strings = {"", "per", "loc", "org", "misc", "per,loc,org,misc"}) + void testCreateWithDifferentTypes(String types) throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-lang", "deu", "-types", types, "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + @Test + void testCreateWithInvalidLanguage() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create( + new String[]{"-lang", "xyz", "-types", "per,loc,org,misc", "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + + /* + * Note: Overriding this test case, as more params are required! 
+ */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-lang", "deu", "-types", "per,loc,org,misc", "-data", sampleFileFullPath + "xyz"})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ConllXPOSSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ConllXPOSSampleStreamFactoryTest.java new file mode 100644 index 0000000000..095a5c0040 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ConllXPOSSampleStreamFactoryTest.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.postag.POSSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class ConllXPOSSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "conllx.sample"; + + // SUT + private ConllXPOSSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ConllXPOSSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(POSSample.class, ConllXPOSSampleStreamFactory.CONLLX_FORMAT); + assertInstanceOf(ConllXPOSSampleStreamFactory.class, f); + factory = (ConllXPOSSampleStreamFactory) f; + assertEquals(ConllXPOSSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + POSSample sample = stream.read(); + assertNotNull(sample); + } + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactoryTest.java new file mode 100644 index 
0000000000..3b00d1c58a --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactoryTest.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ConllXSentenceSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String DETOKENIZER_FILE = "opennlp/tools/tokenize/latin-detokenizer.xml"; + private static final String SAMPLE_01 = "conllx.sample"; + + // SUT + private ConllXSentenceSampleStreamFactory factory; + + private String 
detokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ConllXSentenceSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, ConllXPOSSampleStreamFactory.CONLLX_FORMAT); + assertInstanceOf(ConllXSentenceSampleStreamFactory.class, f); + factory = (ConllXSentenceSampleStreamFactory) f; + assertEquals(ConllXSentenceSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + detokFileFullPath = getResourceWithoutPrefix(DETOKENIZER_FILE).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-encoding", "UTF-8", "-detokenizer", detokFileFullPath, "-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required!
+ */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-encoding", "UTF-8", "-detokenizer", detokFileFullPath, "-data", sampleFileFullPath + "xyz"})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ConllXTokenSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ConllXTokenSampleStreamFactoryTest.java new file mode 100644 index 0000000000..1c607ced2f --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ConllXTokenSampleStreamFactoryTest.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ConllXTokenSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String DETOKENIZER_FILE = "opennlp/tools/tokenize/latin-detokenizer.xml"; + private static final String SAMPLE_01 = "conllx.sample"; + + // SUT + private ConllXTokenSampleStreamFactory factory; + + private String detokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ConllXTokenSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(TokenSample.class, ConllXPOSSampleStreamFactory.CONLLX_FORMAT); + assertInstanceOf(ConllXTokenSampleStreamFactory.class, f); + factory = (ConllXTokenSampleStreamFactory) f; + assertEquals(ConllXTokenSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + detokFileFullPath = getResourceWithoutPrefix(DETOKENIZER_FILE).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + 
{"-encoding", "UTF-8", "-detokenizer", detokFileFullPath, "-data", sampleFileFullPath})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-encoding", "UTF-8", "-detokenizer", detokFileFullPath, "-data", sampleFileFullPath + "xyz"})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamFactoryTest.java new file mode 100644 index 0000000000..eb6eebf8f3 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamFactoryTest.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class EvalitaNameSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "evalita-ner-it-01.sample"; + + // SUT + private EvalitaNameSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + EvalitaNameSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(NameSample.class, "evalita"); + assertInstanceOf(EvalitaNameSampleStreamFactory.class, f); + factory = ((EvalitaNameSampleStreamFactory) f); + assertEquals(EvalitaNameSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + + try (ObjectStream stream = factory.create( + new String[]{"-lang", "it", "-types", "per,loc,org,gpe", "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + @Test + void testCreateWithInvalidLanguage() { + 
assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create( + new String[]{"-lang", "xy", "-types", "per,loc,org,gpe", "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-lang", "it", "-types", "per,loc,org,gpe", "-data", sampleFileFullPath + "xyz"})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java index 37929e3ea7..9bfcc57a50 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java @@ -18,54 +18,102 @@ package opennlp.tools.formats; import java.io.IOException; +import java.util.stream.Stream; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import opennlp.tools.formats.EvalitaNameSampleStream.LANGUAGE; import opennlp.tools.namefind.NameSample; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.Span; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + /** * Note: * Sample training data must be UTF-8 encoded and 
uncompressed! */ public class EvalitaNameSampleStreamTest extends AbstractSampleStreamTest { - @Test - void testParsingItalianSample() throws IOException { + private static final String SAMPLE_01 = "evalita-ner-it-01.sample"; + private static final String SAMPLE_02 = "evalita-ner-it-02.sample"; + private static final String SAMPLE_03 = "evalita-ner-it-03.sample"; + private static final String SAMPLE_BROKEN = "evalita-ner-it-broken.sample"; + private static final String SAMPLE_INCORRECT = "evalita-ner-it-incorrect.sample"; - try (ObjectStream sampleStream = openData()) { - NameSample personName = sampleStream.read(); - Assertions.assertNotNull(personName); + @ParameterizedTest + @MethodSource(value = "provideData") + void testReadItalianDifferentEntityTypes(String file, int nerType, int expectedSentLength, + int expectedStart, int expectedEnd) throws IOException { - Assertions.assertEquals(11, personName.getSentence().length); - Assertions.assertEquals(1, personName.getNames().length); - Assertions.assertTrue(personName.isClearAdaptiveDataSet()); + try (ObjectStream sampleStream = openData(file, nerType)) { + NameSample ne = sampleStream.read(); + assertNotNull(ne); - Span nameSpan = personName.getNames()[0]; - Assertions.assertEquals(8, nameSpan.getStart()); - Assertions.assertEquals(10, nameSpan.getEnd()); - Assertions.assertTrue(personName.isClearAdaptiveDataSet()); + assertEquals(expectedSentLength, ne.getSentence().length); + assertEquals(1, ne.getNames().length); + assertTrue(ne.isClearAdaptiveDataSet()); - Assertions.assertEquals(0, sampleStream.read().getNames().length); + Span nameSpan = ne.getNames()[0]; + assertEquals(expectedStart, nameSpan.getStart()); + assertEquals(expectedEnd, nameSpan.getEnd()); + assertTrue(ne.isClearAdaptiveDataSet()); - Assertions.assertNull(sampleStream.read()); + if (SAMPLE_01.equals(file)) { // this file has an extra sentence + assertEquals(0, sampleStream.read().getNames().length); + } + assertNull(sampleStream.read()); } } + 
@Test + void testReadWithIncorrectInput() { + assertThrows(IOException.class, () -> { + try (ObjectStream sampleStream = openData( + SAMPLE_INCORRECT, EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES)) { + sampleStream.read(); + } + }); + } + + @Test + void testReadWithBrokenDocument() { + assertThrows(IOException.class, () -> { + try (ObjectStream sampleStream = openData( + SAMPLE_BROKEN, EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES)) { + sampleStream.read(); + } + }); + } + @Test void testReset() throws IOException { - try (ObjectStream sampleStream = openData()) { + try (ObjectStream sampleStream = openData(SAMPLE_01, + EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES)) { NameSample sample = sampleStream.read(); sampleStream.reset(); - Assertions.assertEquals(sample, sampleStream.read()); + assertEquals(sample, sampleStream.read()); } } - private ObjectStream openData() throws IOException { - return new EvalitaNameSampleStream(LANGUAGE.IT, getFactory("evalita-ner-it.sample"), - EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES); + // Note: This needs to be public as JUnit 5 requires it like this. 
+ public static Stream provideData() { + return Stream.of( + Arguments.of(SAMPLE_01, EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES, 11, 8, 10), + Arguments.of(SAMPLE_02, EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES, 27, 11, 13), + Arguments.of(SAMPLE_02, EvalitaNameSampleStream.GENERATE_ORGANIZATION_ENTITIES, 27, 10, 11), + Arguments.of(SAMPLE_03, EvalitaNameSampleStream.GENERATE_GPE_ENTITIES, 20, 18, 19) + ); + } + + private ObjectStream openData(String fileName, int nerType) throws IOException { + return new EvalitaNameSampleStream(LANGUAGE.IT, getFactory(fileName), nerType); } } diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactoryTest.java new file mode 100644 index 0000000000..ad50ec8238 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactoryTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.langdetect.LanguageSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class LanguageDetectorSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "lang-detect-01.sample"; + + // SUT + private LanguageDetectorSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + LanguageDetectorSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(LanguageSample.class, StreamFactoryRegistry.DEFAULT_FORMAT); + assertInstanceOf(LanguageDetectorSampleStreamFactory.class, f); + factory = (LanguageDetectorSampleStreamFactory) f; + assertEquals(LanguageDetectorSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + LanguageSample sample = stream.read(); + assertNotNull(sample); + } + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/LemmatizerSampleStreamFactoryTest.java 
b/opennlp-tools/src/test/java/opennlp/tools/formats/LemmatizerSampleStreamFactoryTest.java new file mode 100644 index 0000000000..03a4af0d49 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/LemmatizerSampleStreamFactoryTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.lemmatizer.LemmaSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class LemmatizerSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "lemma-01.sample"; + + // SUT + private LemmatizerSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + LemmatizerSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(LemmaSample.class, StreamFactoryRegistry.DEFAULT_FORMAT); + assertInstanceOf(LemmatizerSampleStreamFactory.class, f); + factory = (LemmatizerSampleStreamFactory) f; + assertEquals(LemmatizerSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + LemmaSample sample = stream.read(); + assertNotNull(sample); + } + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/NameSampleDataStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/NameSampleDataStreamFactoryTest.java new file mode 100644 index 
0000000000..b1f6f8f623 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/NameSampleDataStreamFactoryTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.Span; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class NameSampleDataStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "name-data-01.sample"; + + // SUT + private NameSampleDataStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + 
@BeforeAll + static void initEnv() { + NameSampleDataStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(NameSample.class, StreamFactoryRegistry.DEFAULT_FORMAT); + assertInstanceOf(NameSampleDataStreamFactory.class, f); + factory = (NameSampleDataStreamFactory) f; + assertEquals(NameSampleDataStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + Span[] hits = sample.getNames(); + assertNotNull(hits); + assertEquals(1, hits.length); + assertEquals(5, hits[0].getStart()); + assertEquals(6, hits[0].getEnd()); + } + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ParseSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ParseSampleStreamFactoryTest.java new file mode 100644 index 0000000000..6bb24b7cf4 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ParseSampleStreamFactoryTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.parser.Parse; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class ParseSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "parse-01.sample"; + + // SUT + private ParseSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ParseSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(Parse.class, StreamFactoryRegistry.DEFAULT_FORMAT); + assertInstanceOf(ParseSampleStreamFactory.class, f); + factory = (ParseSampleStreamFactory) f; + assertEquals(ParseSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + Parse sample = stream.read(); + assertNotNull(sample); + assertEquals("The test shall come today . 
", sample.getText()); + } + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/SentenceSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/SentenceSampleStreamFactoryTest.java new file mode 100644 index 0000000000..60c084b57c --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/SentenceSampleStreamFactoryTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class SentenceSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "sentences-01.sample"; + + // SUT + private SentenceSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + SentenceSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, StreamFactoryRegistry.DEFAULT_FORMAT); + assertInstanceOf(SentenceSampleStreamFactory.class, f); + factory = (SentenceSampleStreamFactory) f; + assertEquals(SentenceSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/TokenSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/TokenSampleStreamFactoryTest.java new file mode 100644 index 
0000000000..44e478fa50 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/TokenSampleStreamFactoryTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class TokenSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "tokens-01.sample"; + + // SUT + private TokenSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + 
TokenSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(TokenSample.class, StreamFactoryRegistry.DEFAULT_FORMAT); + assertInstanceOf(TokenSampleStreamFactory.class, f); + factory = (TokenSampleStreamFactory) f; + assertEquals(TokenSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactoryTest.java new file mode 100644 index 0000000000..d07f1ece0b --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactoryTest.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EmptySource; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.doccat.DocumentSample; +import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.DownloadUtil; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.fail; + +public class TwentyNewsgroupSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final Path OPENNLP_DIR = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME", + System.getProperty("user.home"))).resolve(".opennlp"); + private static final String TOKENIZER_MODEL_NAME = "opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin"; + + // SUT + private TwentyNewsgroupSampleStreamFactory factory; + + private static String tokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + TwentyNewsgroupSampleStreamFactory.registerFactory(); + try { + // ensure, the model is available locally for later test purposes + DownloadUtil.downloadModel("en", DownloadUtil.ModelType.TOKENIZER, 
TokenizerModel.class); + } catch (IOException e) { + fail(e.getLocalizedMessage()); + } + tokFileFullPath = new File(OPENNLP_DIR + File.separator + TOKENIZER_MODEL_NAME).getPath(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(DocumentSample.class, "20newsgroup"); + assertInstanceOf(TwentyNewsgroupSampleStreamFactory.class, f); + factory = (TwentyNewsgroupSampleStreamFactory) f; + assertEquals(TwentyNewsgroupSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "20newsgroup").getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"simple", "whitespace"}) + void testCreateWithValidParameter(String tokenizerType) throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-ruleBasedTokenizer", tokenizerType, "-dataDir", sampleFileFullPath})) { + DocumentSample sample = stream.read(); + assertNotNull(sample); + } + } + + @ParameterizedTest + @EmptySource + @ValueSource(strings = {" ", "unknown"}) + void testCreateWithInvalidRBTokenizer(String tokenizerType) { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-ruleBasedTokenizer", tokenizerType, "-dataDir", sampleFileFullPath})) { + DocumentSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + + @Test + void testCreateWithMETokenizer() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-tokenizerModel", tokFileFullPath, "-dataDir", sampleFileFullPath})) { + DocumentSample sample = stream.read(); + assertNotNull(sample); + } + } + + @ParameterizedTest + @EmptySource + @ValueSource(strings = {" ", "unknown"}) + void testCreateWithInvalidMETokenizer(String path) { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-tokenizerModel", path, "-dataDir", sampleFileFullPath})) { + DocumentSample 
sample = stream.read(); + assertNotNull(sample); + } + }); + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-ruleBasedTokenizer", "whitespace", "-dataDir", sampleFileFullPath + "xyz"})) { + DocumentSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/WordTagSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/WordTagSampleStreamFactoryTest.java new file mode 100644 index 0000000000..c7b7cc96e3 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/WordTagSampleStreamFactoryTest.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.postag.POSSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class WordTagSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "word-tags-01.sample"; + + // SUT + private WordTagSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + WordTagSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(POSSample.class, StreamFactoryRegistry.DEFAULT_FORMAT); + assertInstanceOf(WordTagSampleStreamFactory.class, f); + factory = (WordTagSampleStreamFactory) f; + assertEquals(WordTagSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + POSSample sample = stream.read(); + assertNotNull(sample); + assertNotNull(sample.getTags()); + assertEquals(6, sample.getTags().length); + assertNotNull(sample.getSentence()); + assertEquals(6, sample.getSentence().length); + } + } + +} diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactoryTest.java new file mode 100644 index 0000000000..052a988a65 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactoryTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.ad; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.chunker.ChunkSample; +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ADChunkSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "ad.sample"; + + // SUT + private ADChunkSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ADChunkSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(ChunkSample.class, "ad"); + assertInstanceOf(ADChunkSampleStreamFactory.class, f); + factory = (ADChunkSampleStreamFactory) f; + assertEquals(ADChunkSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix( + FORMAT_SAMPLE_DIR + "ad/" + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-start", "0", "-end", "1", "-lang", "por", "-encoding", "UTF-8", + "-data", sampleFileFullPath})) { + ChunkSample sample = 
stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-start", "0", "-end", "1", "-lang", "por", "-encoding", "UTF-8", + "-data", sampleFileFullPath + "xyz"})) { + ChunkSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactoryTest.java new file mode 100644 index 0000000000..3ba1413be3 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactoryTest.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.ad; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.postag.POSSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ADPOSSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "ad.sample"; + + // SUT + private ADPOSSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ADPOSSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(POSSample.class, "ad"); + assertInstanceOf(ADPOSSampleStreamFactory.class, f); + factory = (ADPOSSampleStreamFactory) f; + assertEquals(ADPOSSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix( + FORMAT_SAMPLE_DIR + "ad/" + SAMPLE_01).getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"true", "false"}) + void testCreateWithValidParameter(String expandME) throws IOException { + try (ObjectStream 
stream = factory.create(new String[] + {"-expandME", expandME, "-lang", "por", "-encoding", "UTF-8", + "-data", sampleFileFullPath})) { + POSSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-expandME", "false", "-lang", "por", "-encoding", "UTF-8", + "-data", sampleFileFullPath + "xyz"})) { + POSSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java index 1f55a49436..486b51f9f5 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java @@ -43,7 +43,7 @@ void testSimpleReading() throws IOException { int count = 0; ADSentenceStream.Sentence paragraph = stream.read(); - paragraph.getRoot(); + paragraph.root(); while (paragraph != null) { count++; paragraph = stream.read(); diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactoryTest.java new file mode 100644 index 0000000000..75d9b977f8 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactoryTest.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.ad; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ADSentenceSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "ad.sample"; + + // SUT + private ADSentenceSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + 
ADSentenceSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, "ad"); + assertInstanceOf(ADSentenceSampleStreamFactory.class, f); + factory = (ADSentenceSampleStreamFactory) f; + assertEquals(ADSentenceSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix( + FORMAT_SAMPLE_DIR + "ad/" + SAMPLE_01).getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"true", "false"}) + void testCreateWithValidParameter(String includeTitles) throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-includeTitles", includeTitles, "-lang", "por", "-encoding", "UTF-8", + "-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-includeTitles", "false", "-lang", "por", "-encoding", "UTF-8", + "-data", sampleFileFullPath + "xyz"})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactoryTest.java new file mode 100644 index 0000000000..2e7abf020d --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactoryTest.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.ad; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ADTokenSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String DETOKENIZER_FILE = "opennlp/tools/tokenize/latin-detokenizer.xml"; + private static final String SAMPLE_01 = "ad.sample"; + + // SUT + private ADTokenSampleStreamFactory factory; + + private String detokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + 
protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ADTokenSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(TokenSample.class, "ad"); + assertInstanceOf(ADTokenSampleStreamFactory.class, f); + factory = (ADTokenSampleStreamFactory) f; + assertEquals(ADTokenSampleStreamFactory.Parameters.class, factory.getParameters()); + detokFileFullPath = getResourceWithoutPrefix(DETOKENIZER_FILE).getPath(); + sampleFileFullPath = getResourceWithoutPrefix( + FORMAT_SAMPLE_DIR + "ad/" + SAMPLE_01).getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"true", "false"}) + void testCreateWithValidParameter(String splitHyphenatedTokens) throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-splitHyphenatedTokens", splitHyphenatedTokens, "-lang", "por", "-encoding", "UTF-8", + "-detokenizer", detokFileFullPath, "-data", sampleFileFullPath})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! 
+ */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-splitHyphenatedTokens", "false", "-lang", "por", "-encoding", "UTF-8", + "-detokenizer", detokFileFullPath, "-data", sampleFileFullPath + "xyz"})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java index 14039e48fb..2cd2b34e55 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java @@ -43,10 +43,10 @@ void testSentences() { void setup() throws IOException { super.setup(); - ADTokenSampleStreamFactory factory = - new ADTokenSampleStreamFactory<>(ADTokenSampleStreamFactory.Parameters.class); + ADTokenSampleStreamFactory factory = + new ADTokenSampleStreamFactory(ADTokenSampleStreamFactory.Parameters.class); - File data = new File(getResource("ad.sample").getFile()); + File data = new File(getResourceWithoutPrefix("opennlp/tools/formats/ad/ad.sample").getFile()); Assertions.assertNotNull(data); File dict = new File(getResourceWithoutPrefix("opennlp/tools/tokenize/latin-detokenizer.xml").getFile()); Assertions.assertNotNull(dict); diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ad/AbstractADSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/AbstractADSampleStreamTest.java index 025915a435..b176696a83 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/ad/AbstractADSampleStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ad/AbstractADSampleStreamTest.java 
@@ -37,6 +37,7 @@ public abstract class AbstractADSampleStreamTest extends Abstr @BeforeEach void setup() throws IOException { - in = new ResourceAsStreamFactory(AbstractADSampleStreamTest.class, FORMATS_BASE_DIR + "ad.sample"); + in = new ResourceAsStreamFactory(AbstractADSampleStreamTest.class, + FORMATS_BASE_DIR + "ad/ad.sample"); } } diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratAnnotationStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratAnnotationStreamTest.java index f039dd581a..1602c90806 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratAnnotationStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratAnnotationStreamTest.java @@ -21,12 +21,14 @@ import java.util.LinkedHashSet; import java.util.Set; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import opennlp.tools.util.ObjectStream; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.fail; public class BratAnnotationStreamTest extends AbstractBratTest { @@ -49,7 +51,7 @@ void testParsingEntities() throws IOException { AnnotationConfiguration annConfig = new AnnotationConfiguration(typeToClassMap); ObjectStream annStream = creatBratAnnotationStream(annConfig, "brat/voa-with-entities.ann"); - Assertions.assertNotNull(annStream); + assertNotNull(annStream); validateBratAnnotationStream(annStream, 5, 10, 3, 0, 2, 0); } @@ -62,7 +64,7 @@ void testParsingRelations() throws IOException { AnnotationConfiguration annConfig = new AnnotationConfiguration(typeToClassMap); ObjectStream annStream = creatBratAnnotationStream(annConfig, "brat/voa-with-relations.ann"); - Assertions.assertNotNull(annStream); 
+ assertNotNull(annStream); validateBratAnnotationStream(annStream, 5, 10, 3, 0, 0, 7); } @@ -87,16 +89,16 @@ private void validateBratAnnotationStream(ObjectStream annStream BratAnnotation ann; while ((ann = annStream.read()) != null) { - Assertions.assertNotNull(ann); + assertNotNull(ann); String type = ann.getType(); - Assertions.assertNotNull(type); + assertNotNull(type); String coveredText = null; RelationAnnotation rAnnotation = null; AnnotatorNoteAnnotation aAnnotation = null; if (ann instanceof SpanAnnotation sAnnotation) { coveredText = sAnnotation.getCoveredText(); - Assertions.assertNotNull(coveredText); + assertNotNull(coveredText); } else if (ann instanceof RelationAnnotation) { rAnnotation = (RelationAnnotation) ann; } else if (ann instanceof AnnotatorNoteAnnotation) { @@ -122,26 +124,26 @@ private void validateBratAnnotationStream(ObjectStream annStream break; } case BRAT_TYPE_RELATED: { relations++; - Assertions.assertNotNull(rAnnotation); + assertNotNull(rAnnotation); break; } case BRAT_TYPE_ANNOTATION: { annotations++; - Assertions.assertNotNull(aAnnotation); + assertNotNull(aAnnotation); break; } default: { fail("Found an unsupported BRAT type!"); } } } - Assertions.assertEquals(expectDates, dates); - Assertions.assertEquals(expectPersons, persons); - Assertions.assertEquals(expectLocations, locations); - Assertions.assertEquals(expectAnnotations, annotations); - Assertions.assertEquals(expectOrganizations, organizations); - Assertions.assertEquals(expectRelations, relations); - - Assertions.assertArrayEquals(VOA_DATES, annotatedDates.toArray()); - Assertions.assertArrayEquals(VOA_PERSONS, annotatedPersons.toArray()); - Assertions.assertArrayEquals(VOA_LOCATIONS, annotatedLocations.toArray()); + assertEquals(expectDates, dates); + assertEquals(expectPersons, persons); + assertEquals(expectLocations, locations); + assertEquals(expectAnnotations, annotations); + assertEquals(expectOrganizations, organizations); 
+ assertEquals(expectRelations, relations); + + assertArrayEquals(VOA_DATES, annotatedDates.toArray()); + assertArrayEquals(VOA_PERSONS, annotatedPersons.toArray()); + assertArrayEquals(VOA_LOCATIONS, annotatedLocations.toArray()); } } diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamFactoryTest.java new file mode 100644 index 0000000000..edfbd8441d --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamFactoryTest.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.brat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EmptySource; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.DownloadUtil; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.Span; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.fail; + +public class BratNameSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final Path OPENNLP_DIR = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME", + System.getProperty("user.home"))).resolve(".opennlp"); + private static final String TOKENIZER_MODEL_NAME = "opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin"; + + // SUT + private BratNameSampleStreamFactory factory; + + private static String tokFileFullPath; + private String bratFullPath; + private String configPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return bratFullPath; + } + + @BeforeAll + static void initEnv() { + BratNameSampleStreamFactory.registerFactory(); 
+ try { + // ensure, the model is available locally for later test purposes + DownloadUtil.downloadModel("en", DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class); + } catch (IOException e) { + fail(e.getLocalizedMessage()); + } + tokFileFullPath = new File(OPENNLP_DIR + File.separator + TOKENIZER_MODEL_NAME).getPath(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(NameSample.class, "brat"); + assertInstanceOf(BratNameSampleStreamFactory.class, f); + factory = (BratNameSampleStreamFactory) f; + assertEquals(BratNameSampleStreamFactory.Parameters.class, factory.getParameters()); + bratFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "brat/").getPath(); + configPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "brat/" + "brat-ann.conf").getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"simple", "whitespace"}) + void testCreateWithValidParameter(String tokType) throws IOException { + try (ObjectStream stream = factory.create(new String[]{"-ruleBasedTokenizer", tokType, + "-annotationConfig", configPath, "-bratDataDir", bratFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + Span[] names = sample.getNames(); + assertNotNull(names); + assertEquals(1, names.length); + assertEquals("Name", names[0].getType()); + assertEquals(0, names[0].getStart()); + if ("whitespace".equals(tokType)) { + assertEquals(2, names[0].getEnd()); + } else { + assertEquals(6, names[0].getEnd()); + } + } + } + + @ParameterizedTest + @EmptySource + @ValueSource(strings = {" ", "unknown"}) + void testCreateWithInvalidRBTokenizer(String tokType) { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[]{"-ruleBasedTokenizer", tokType, + "-annotationConfig", configPath, "-bratDataDir", bratFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + + @Test + void testCreateWithMETokenizer() throws IOException { + 
try (ObjectStream stream = factory.create(new String[]{"-tokenizerModel", tokFileFullPath, + "-annotationConfig", configPath, "-bratDataDir", bratFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + @ParameterizedTest + @EmptySource + @ValueSource(strings = {" ", "unknown"}) + void testCreateWithInvalidMETokenizer(String path) { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[]{"-tokenizerModel", path, + "-annotationConfig", configPath, "-bratDataDir", bratFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[]{"-ruleBasedTokenizer", "whitespace", + "-annotationConfig", configPath, "-bratDataDir", bratFullPath + "xyz"})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java index 9ddae8056d..416edd7d3d 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java @@ -22,7 +22,6 @@ import java.util.Collections; import java.util.Set; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -31,6 +30,10 @@ import opennlp.tools.tokenize.WhitespaceTokenizer; import opennlp.tools.util.ObjectStream; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + 
public class BratNameSampleStreamTest extends AbstractBratTest { @BeforeEach @@ -49,19 +52,19 @@ void readNoOverlap() throws IOException { sample = stream.read(); } - Assertions.assertEquals(8, count); + assertEquals(8, count); } @Test void readOverlapFail() { - Assertions.assertThrows(RuntimeException.class, () -> { - BratNameSampleStream stream = createNameSampleWith("overlapping", + assertThrows(RuntimeException.class, () -> { + BratNameSampleStream stream = createNameSampleWith("-overlapping", null); NameSample sample = stream.read(); while (sample != null) { sample = stream.read(); - Assertions.assertNotNull(sample); + assertNotNull(sample); } }); @@ -69,7 +72,7 @@ void readOverlapFail() { @Test void emptySample() { - Assertions.assertThrows(IllegalArgumentException.class, () -> createNameSampleWith("overlapping", + assertThrows(IllegalArgumentException.class, () -> createNameSampleWith("overlapping", Collections.emptySet())); } @@ -84,7 +87,7 @@ void readOverlapFilter() throws IOException { sample = stream.read(); } - Assertions.assertEquals(8, count); + assertEquals(8, count); } private BratNameSampleStream createNameSampleWith(String nameContainsFilter, diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactoryTest.java new file mode 100644 index 0000000000..0e7f499e2f --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactoryTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.lemmatizer.LemmaSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ConlluLemmaSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "es-ud-sample.conllu"; + + // SUT + private ConlluLemmaSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ConlluLemmaSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + 
StreamFactoryRegistry.getFactory(LemmaSample.class, ConlluPOSSampleStreamFactory.CONLLU_FORMAT); + assertInstanceOf(ConlluLemmaSampleStreamFactory.class, f); + factory = (ConlluLemmaSampleStreamFactory) f; + assertEquals(ConlluLemmaSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "conllu/" + SAMPLE_01).getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"u", "x"}) + void testCreateWithValidParameter(String tagset) throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-tagset", tagset, "-data", sampleFileFullPath})) { + LemmaSample sample = stream.read(); + assertNotNull(sample); + } + } + + @ParameterizedTest + @ValueSource(strings = {" ", "y"}) + void testCreateWithUnknownTagset(String tagset) { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create( + new String[]{"-tagset", tagset, "-data", sampleFileFullPath})) { + assertNotNull(stream.read()); + } + }); + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-tagset", "u", "-data", sampleFileFullPath + "xyz"})) { + LemmaSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactoryTest.java new file mode 100644 index 0000000000..40208bbc59 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactoryTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.postag.POSSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ConlluPOSSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "es-ud-sample.conllu"; + + // SUT + private ConlluPOSSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected 
String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ConlluPOSSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(POSSample.class, ConlluPOSSampleStreamFactory.CONLLU_FORMAT); + assertInstanceOf(ConlluPOSSampleStreamFactory.class, f); + factory = (ConlluPOSSampleStreamFactory) f; + assertEquals(ConlluPOSSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "conllu/" + SAMPLE_01).getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"u", "x"}) + void testCreateWithValidParameter(String tagset) throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-tagset", tagset, "-data", sampleFileFullPath})) { + POSSample sample = stream.read(); + assertNotNull(sample); + } + } + + @ParameterizedTest + @ValueSource(strings = {" ", "y"}) + void testCreateWithUnknownTagset(String tagset) { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create( + new String[]{"-tagset", tagset, "-data", sampleFileFullPath})) { + assertNotNull(stream.read()); + } + }); + } + + /* + * Note: Overriding this test case, as more params are required! 
+ */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-tagset", "u", "-data", sampleFileFullPath + "xyz"})) { + POSSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactoryTest.java new file mode 100644 index 0000000000..05cb3efa42 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactoryTest.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ConlluSentenceSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "es-ud-sample.conllu"; + + // SUT + private ConlluSentenceSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ConlluSentenceSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, ConlluPOSSampleStreamFactory.CONLLU_FORMAT); + assertInstanceOf(ConlluSentenceSampleStreamFactory.class, f); + factory = (ConlluSentenceSampleStreamFactory) f; + assertEquals(ConlluSentenceSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "conllu/" + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-sentencesPerSample", 
"5", "-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-sentencesPerSample", "5", "-data", sampleFileFullPath + "xyz"})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactoryTest.java new file mode 100644 index 0000000000..ea6f606c62 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactoryTest.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class ConlluTokenSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "es-ud-sample.conllu"; + + // SUT + private ConlluTokenSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ConlluTokenSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(TokenSample.class, ConlluPOSSampleStreamFactory.CONLLU_FORMAT); + assertInstanceOf(ConlluTokenSampleStreamFactory.class, f); + factory = (ConlluTokenSampleStreamFactory) f; + assertEquals(ConlluTokenSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "conllu/" + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + } + +} diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactoryTest.java new file mode 100644 index 0000000000..a42cde19ef --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactoryTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.convert; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class NameToSentenceSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String DETOKENIZER_FILE = "opennlp/tools/tokenize/latin-detokenizer.xml"; + private static final String SAMPLE_01 = "name-data-01.sample"; + + // SUT + private NameToSentenceSampleStreamFactory factory; + + private String detokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + NameToSentenceSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, "namefinder"); + assertInstanceOf(NameToSentenceSampleStreamFactory.class, f); + factory = (NameToSentenceSampleStreamFactory) f; + assertEquals(NameToSentenceSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + detokFileFullPath = 
getResourceWithoutPrefix(DETOKENIZER_FILE).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath,"-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath, "-data", sampleFileFullPath + "xyz"})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactoryTest.java new file mode 100644 index 0000000000..7c8ae45171 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactoryTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package opennlp.tools.formats.convert; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class NameToTokenSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String DETOKENIZER_FILE = "opennlp/tools/tokenize/latin-detokenizer.xml"; + private static final String SAMPLE_01 = "name-data-01.sample"; + + // SUT + private NameToTokenSampleStreamFactory factory; + + private String detokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + NameToTokenSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(TokenSample.class, "namefinder"); + assertInstanceOf(NameToTokenSampleStreamFactory.class, f); + factory = (NameToTokenSampleStreamFactory) f; + assertEquals(NameToTokenSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + detokFileFullPath = getResourceWithoutPrefix(DETOKENIZER_FILE).getPath(); + } + + @Test 
+ void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath,"-data", sampleFileFullPath})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath, "-data", sampleFileFullPath + "xyz"})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactoryTest.java new file mode 100644 index 0000000000..6e92de9d2d --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactoryTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package opennlp.tools.formats.convert; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class POSToSentenceSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String DETOKENIZER_FILE = "opennlp/tools/tokenize/latin-detokenizer.xml"; + private static final String SAMPLE_01 = "word-tags-01.sample"; + + // SUT + private POSToSentenceSampleStreamFactory factory; + + private String detokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + POSToSentenceSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, "pos"); + assertInstanceOf(POSToSentenceSampleStreamFactory.class, f); + factory = (POSToSentenceSampleStreamFactory) f; + assertEquals(POSToSentenceSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + detokFileFullPath = getResourceWithoutPrefix(DETOKENIZER_FILE).getPath(); + 
} + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath,"-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath, "-data", sampleFileFullPath + "xyz"})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactoryTest.java new file mode 100644 index 0000000000..2501e8c252 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactoryTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package opennlp.tools.formats.convert; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class POSToTokenSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String DETOKENIZER_FILE = "opennlp/tools/tokenize/latin-detokenizer.xml"; + private static final String SAMPLE_01 = "word-tags-01.sample"; + + // SUT + private POSToTokenSampleStreamFactory factory; + + private String detokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + POSToTokenSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(TokenSample.class, "pos"); + assertInstanceOf(POSToTokenSampleStreamFactory.class, f); + factory = (POSToTokenSampleStreamFactory) f; + assertEquals(POSToTokenSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + detokFileFullPath = getResourceWithoutPrefix(DETOKENIZER_FILE).getPath(); + } + + @Test + void 
testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath,"-data", sampleFileFullPath})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath, "-data", sampleFileFullPath + "xyz"})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactoryTest.java new file mode 100644 index 0000000000..e1ca0452de --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactoryTest.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package opennlp.tools.formats.convert; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.postag.POSSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class ParseToPOSSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "parse-01.sample"; + + // SUT + private ParseToPOSSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ParseToPOSSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(POSSample.class, "parse"); + assertInstanceOf(ParseToPOSSampleStreamFactory.class, f); + factory = (ParseToPOSSampleStreamFactory) f; + assertEquals(ParseToPOSSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-data", sampleFileFullPath})) { + POSSample sample = stream.read(); + assertNotNull(sample); + } + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactoryTest.java 
b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactoryTest.java new file mode 100644 index 0000000000..a11ce4f557 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactoryTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.convert; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ParseToSentenceSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String DETOKENIZER_FILE = "opennlp/tools/tokenize/latin-detokenizer.xml"; + private static final String SAMPLE_01 = "parse-01.sample"; + + // SUT + private ParseToSentenceSampleStreamFactory factory; + + private String detokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ParseToSentenceSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, "parse"); + assertInstanceOf(ParseToSentenceSampleStreamFactory.class, f); + factory = (ParseToSentenceSampleStreamFactory) f; + assertEquals(ParseToSentenceSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + detokFileFullPath = 
getResourceWithoutPrefix(DETOKENIZER_FILE).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath,"-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as different exception is expected! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath, "-data", sampleFileFullPath + "xyz"})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactoryTest.java new file mode 100644 index 0000000000..08ae5211f0 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactoryTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.convert; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ParseToTokenSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String DETOKENIZER_FILE = "opennlp/tools/tokenize/latin-detokenizer.xml"; + private static final String SAMPLE_01 = "parse-01.sample"; + + // SUT + private ParseToTokenSampleStreamFactory factory; + + private String detokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ParseToTokenSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(TokenSample.class, "parse"); + assertInstanceOf(ParseToTokenSampleStreamFactory.class, f); + factory = (ParseToTokenSampleStreamFactory) f; + assertEquals(ParseToTokenSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE_01).getPath(); + detokFileFullPath = getResourceWithoutPrefix(DETOKENIZER_FILE).getPath(); + } + + @Test + 
void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath,"-data", sampleFileFullPath})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as different exception is expected! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-detokenizer", detokFileFullPath, "-data", sampleFileFullPath + "xyz"})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactoryTest.java new file mode 100644 index 0000000000..14a267336a --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactoryTest.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.frenchtreebank; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.parser.Parse; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class ConstitParseSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + // SUT + private ConstitParseSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + ConstitParseSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(Parse.class, "frenchtreebank"); + assertInstanceOf(ConstitParseSampleStreamFactory.class, f); + factory = (ConstitParseSampleStreamFactory) f; + assertEquals(ConstitParseSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "frenchtreebank/").getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + Parse sample = stream.read(); + assertNotNull(sample); + } + } + + @Test + @Override // due to different exception + protected void 
testCreateWithInvalidDataFilePath() { + assertThrows(IllegalArgumentException.class, () -> { + try (ObjectStream stream = getFactory().create( + new String[]{"-data", getDataFilePath() + "xyz"})) { + Parse sample = stream.read(); + assertNotNull(sample); + } + }); + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactoryTest.java new file mode 100644 index 0000000000..26a1c20f9f --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactoryTest.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.irishsentencebank; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class IrishSentenceBankSentenceStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "irishsentencebank-sample.xml"; + + // SUT + private IrishSentenceBankSentenceStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + IrishSentenceBankSentenceStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, "irishsentencebank"); + assertInstanceOf(IrishSentenceBankSentenceStreamFactory.class, f); + factory = (IrishSentenceBankSentenceStreamFactory) f; + assertEquals(IrishSentenceBankSentenceStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix( + FORMAT_SAMPLE_DIR + "irishsentencebank/" + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); 
+ } + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactoryTest.java new file mode 100644 index 0000000000..ffea30b4eb --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactoryTest.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.irishsentencebank; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class IrishSentenceBankTokenSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "irishsentencebank-sample.xml"; + + // SUT + private IrishSentenceBankTokenSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + IrishSentenceBankTokenSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(TokenSample.class, "irishsentencebank"); + assertInstanceOf(IrishSentenceBankTokenSampleStreamFactory.class, f); + factory = (IrishSentenceBankTokenSampleStreamFactory) f; + assertEquals(IrishSentenceBankTokenSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix( + FORMAT_SAMPLE_DIR + "irishsentencebank/" + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + TokenSample sample = stream.read(); + 
assertNotNull(sample); + } + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactoryTest.java new file mode 100644 index 0000000000..36593b8c86 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactoryTest.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.leipzig; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.langdetect.LanguageSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class LeipzigLanguageSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + // SUT + private LeipzigLanguageSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + LeipzigLanguageSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(LanguageSample.class, "leipzig"); + assertInstanceOf(LeipzigLanguageSampleStreamFactory.class, f); + factory = (LeipzigLanguageSampleStreamFactory) f; + assertEquals(LeipzigLanguageSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "leipzig/samples").getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-sentencesPerSample", "1","-samplesPerLanguage", "1", "-samplesToSkip", "1", + "-sentencesDir", sampleFileFullPath})) { 
+ LanguageSample sample = stream.read(); + assertNotNull(sample); + } + } + + @Test // Given the sample Leipzig are super small, these parameters should not work! + void testCreateWithValidParametersTooLarge() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-sentencesPerSample", "2","-samplesPerLanguage", "2", "-samplesToSkip", "1", + "-sentencesDir", sampleFileFullPath})) { + LanguageSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + + /* + * Note: Overriding this test case, as more params are required! + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-sentencesPerSample", "2","-samplesPerLanguage", "2", "-samplesToSkip", "1", + "-sentencesDir", sampleFileFullPath + "xyz"})) { + LanguageSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactoryTest.java new file mode 100644 index 0000000000..ca94a5296b --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactoryTest.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.letsmt; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class LetsmtSentenceStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String DETOKENIZER_FILE = "opennlp/tools/tokenize/latin-detokenizer.xml"; + private static final String SAMPLE_01 = "letsmt-with-words.xml"; + + // SUT + private LetsmtSentenceStreamFactory factory; + + private String detokFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + LetsmtSentenceStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, "letsmt"); + assertInstanceOf(LetsmtSentenceStreamFactory.class, f); + factory = 
(LetsmtSentenceStreamFactory) f; + assertEquals(LetsmtSentenceStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "letsmt/" + SAMPLE_01).getPath(); + detokFileFullPath = getResourceWithoutPrefix(DETOKENIZER_FILE).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-detokenizer", detokFileFullPath, "-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactoryTest.java new file mode 100644 index 0000000000..27029cc2a3 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactoryTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.masc; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class MascNamedEntitySampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + // SUT + private MascNamedEntitySampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + MascNamedEntitySampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(NameSample.class, Masc.MASC_FORMAT); + assertInstanceOf(MascNamedEntitySampleStreamFactory.class, f); + factory = (MascNamedEntitySampleStreamFactory) f; + assertEquals(MascNamedEntitySampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "masc/").getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"True", "False"}) + void testCreateWithValidParameter(String recurrentSearch) throws IOException { + try 
(ObjectStream stream = factory.create(new String[] + {"-fileFilter", "fakeMASC", "-recurrentSearch", recurrentSearch, + "-data", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: + * Overridden more parameters than the '-data' param is required. + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = getFactory().create(new String[] + {"-fileFilter", "fakeMASC", "-recurrentSearch", "True", + "-data", getDataFilePath() + "xyz"})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactoryTest.java new file mode 100644 index 0000000000..112d0dbbee --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactoryTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.masc; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.postag.POSSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class MascPOSSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + // SUT (system under test) + private MascPOSSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + MascPOSSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(POSSample.class, Masc.MASC_FORMAT); + assertInstanceOf(MascPOSSampleStreamFactory.class, f); + factory = (MascPOSSampleStreamFactory) f; + assertEquals(MascPOSSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "masc/").getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"True", "False"}) + void testCreateWithValidParameter(String recurrentSearch) throws IOException { + try (ObjectStream stream = factory.create(new String[] + 
{"-fileFilter", "fakeMASC", "-recurrentSearch", recurrentSearch, + "-data", sampleFileFullPath})) { + POSSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: + * Overridden, as more parameters than just '-data' are required. + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = getFactory().create(new String[] + {"-fileFilter", "fakeMASC", "-recurrentSearch", "True", + "-data", getDataFilePath() + "xyz"})) { + POSSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactoryTest.java new file mode 100644 index 0000000000..c67388dcd0 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactoryTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.masc; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class MascSentenceSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + // SUT (system under test) + private MascSentenceSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + MascSentenceSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, Masc.MASC_FORMAT); + assertInstanceOf(MascSentenceSampleStreamFactory.class, f); + factory = (MascSentenceSampleStreamFactory) f; + assertEquals(MascSentenceSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "masc/").getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"True", "False"}) + void testCreateWithValidParameter(String recurrentSearch) throws IOException { + try 
(ObjectStream stream = factory.create(new String[] + {"-fileFilter", "fakeMASC", "-sentencesPerSample", "5", "-recurrentSearch", recurrentSearch, + "-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: + * Overridden, as more parameters than just '-data' are required. + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = getFactory().create(new String[] + {"-fileFilter", "fakeMASC", "-sentencesPerSample", "5", "-recurrentSearch", "True", + "-data", getDataFilePath() + "xyz"})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactoryTest.java new file mode 100644 index 0000000000..04ea91f2ff --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactoryTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.masc; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class MascTokenSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + // SUT (system under test) + private MascTokenSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + MascTokenSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(TokenSample.class, Masc.MASC_FORMAT); + assertInstanceOf(MascTokenSampleStreamFactory.class, f); + factory = (MascTokenSampleStreamFactory) f; + assertEquals(MascTokenSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "masc/").getPath(); + } + + @ParameterizedTest + @ValueSource(strings = {"True", "False"}) + void testCreateWithValidParameter(String recurrentSearch) throws IOException { + try (ObjectStream stream = 
factory.create(new String[] + {"-fileFilter", "fakeMASC", "-sentencesPerSample", "5", "-recurrentSearch", recurrentSearch, + "-data", sampleFileFullPath})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: + * Overridden, as more parameters than just '-data' are required. + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = getFactory().create(new String[] + {"-fileFilter", "fakeMASC", "-sentencesPerSample", "5", "-recurrentSearch", "True", + "-data", getDataFilePath() + "xyz"})) { + TokenSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactoryTest.java new file mode 100644 index 0000000000..1b9b618f6f --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactoryTest.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.moses; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class MosesSentenceSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "moses-tiny.sample"; + + // SUT (system under test) + private MosesSentenceSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + MosesSentenceSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, "moses"); + assertInstanceOf(MosesSentenceSampleStreamFactory.class, f); + factory = (MosesSentenceSampleStreamFactory) f; + assertEquals(MosesSentenceSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "moses/" + SAMPLE_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create( + new String[]{"-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + +} diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactoryTest.java new file mode 100644 index 0000000000..ffe1c0a688 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactoryTest.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.muc; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.DownloadUtil; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.fail; + +public class Muc6NameSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final Path OPENNLP_DIR = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME", + System.getProperty("user.home"))).resolve(".opennlp"); + private static final String TOKENIZER_MODEL_NAME = "opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin"; + + // SUT (system under test) + private Muc6NameSampleStreamFactory factory; + + private static String tokenizerFileFullPath; + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + Muc6NameSampleStreamFactory.registerFactory(); + try { + // Ensure the model is available locally for later test purposes + DownloadUtil.downloadModel("en", DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class); + } catch (IOException e) { + 
fail(e.getLocalizedMessage()); + } + tokenizerFileFullPath = new File(OPENNLP_DIR + File.separator + TOKENIZER_MODEL_NAME).getPath(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(NameSample.class, "muc6"); + assertInstanceOf(Muc6NameSampleStreamFactory.class, f); + factory = (Muc6NameSampleStreamFactory) f; + assertEquals(Muc6NameSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "muc").getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-tokenizerModel", tokenizerFileFullPath, "-data", sampleFileFullPath + File.separator})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as '-tokenizerModel' is passed in addition to '-data'. + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-tokenizerModel", tokenizerFileFullPath, "-data", sampleFileFullPath + "xyz" + File.separator})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactoryTest.java new file mode 100644 index 0000000000..c628caadf0 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactoryTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.nkjp; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class NKJPSentenceSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String SAMPLE_01 = "ann_segmentation.xml"; + private static final String TEXT_01 = "text_structure.xml"; + + // SUT (system under test) + private NKJPSentenceSampleStreamFactory factory; + + private String sampleFileFullPath; + private String textFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + 
NKJPSentenceSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(SentenceSample.class, "nkjp"); + assertInstanceOf(NKJPSentenceSampleStreamFactory.class, f); + factory = (NKJPSentenceSampleStreamFactory) f; + assertEquals(NKJPSentenceSampleStreamFactory.Parameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "nkjp/" + SAMPLE_01).getPath(); + textFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "nkjp/" + TEXT_01).getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-textFile", textFileFullPath, "-data", sampleFileFullPath})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: + * Overridden, as more parameters than just '-data' are required. + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = getFactory().create(new String[] + {"-textFile", textFileFullPath, "-data", getDataFilePath() + "xyz"})) { + SentenceSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactoryTest.java new file mode 100644 index 0000000000..59194ad8fb --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactoryTest.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.ontonotes; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class OntoNotesNameSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + // SUT (system under test) + private OntoNotesNameSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + OntoNotesNameSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(NameSample.class, 
"ontonotes"); + assertInstanceOf(OntoNotesNameSampleStreamFactory.class, f); + factory = (OntoNotesNameSampleStreamFactory) f; + assertEquals(OntoNotesFormatParameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "ontonotes").getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-ontoNotesDir", sampleFileFullPath})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as the path is passed via the '-ontoNotesDir' param here. + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-ontoNotesDir", sampleFileFullPath + "xyz"})) { + NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactoryTest.java new file mode 100644 index 0000000000..7fddc70ad9 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactoryTest.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.ontonotes; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.postag.POSSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class OntoNotesPOSSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + // SUT (system under test) + private OntoNotesPOSSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + OntoNotesPOSSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(POSSample.class, "ontonotes"); + assertInstanceOf(OntoNotesPOSSampleStreamFactory.class, f); + factory = (OntoNotesPOSSampleStreamFactory) f; + assertEquals(OntoNotesFormatParameters.class, 
factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "ontonotes").getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-ontoNotesDir", sampleFileFullPath})) { + POSSample sample = stream.read(); + assertNotNull(sample); + } + } + + /* + * Note: Overriding this test case, as the path is passed via the '-ontoNotesDir' param here. + */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-ontoNotesDir", sampleFileFullPath + "xyz"})) { + POSSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactoryTest.java new file mode 100644 index 0000000000..555d042ca5 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactoryTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.ontonotes; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.formats.AbstractSampleStreamFactoryTest; +import opennlp.tools.parser.Parse; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class OntoNotesParseSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest { + + private static final String REFERENCE = + "*PRO* Judging from the Americana in Haruki Murakami 's " + + "`` A Wild Sheep Chase '' ( Kodansha , 320 pages , $ 18.95 *U* ) , " + + "baby boomers on both sides of the Pacific have a lot in common . 
"; + + // SUT (system under test) + private OntoNotesParseSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory + getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + OntoNotesParseSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + ObjectStreamFactory f = + StreamFactoryRegistry.getFactory(Parse.class, "ontonotes"); + assertInstanceOf(OntoNotesParseSampleStreamFactory.class, f); + factory = (OntoNotesParseSampleStreamFactory) f; + assertEquals(OntoNotesFormatParameters.class, factory.getParameters()); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + "ontonotes").getPath(); + } + + @Test + void testCreateWithValidParameter() throws IOException { + try (ObjectStream stream = factory.create(new String[] + {"-ontoNotesDir", sampleFileFullPath})) { + Parse sample = stream.read(); + assertNotNull(sample); + assertEquals(REFERENCE, sample.getText()); + } + } + + /* + * Note: Overriding this test case, as the path is passed via the '-ontoNotesDir' param here. 
+ */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (ObjectStream stream = factory.create(new String[] + {"-ontoNotesDir", sampleFileFullPath + "xyz"})) { + Parse sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/20newsgroup/sci.electronics/52794.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/20newsgroup/sci.electronics/52794.sample new file mode 100644 index 0000000000..8addde4e80 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/20newsgroup/sci.electronics/52794.sample @@ -0,0 +1,59 @@ +Newsgroups: sci.electronics +Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!gatech!howland.reston.ans.net!bogus.sura.net!darwin.sura.net!uvaarpa!vdoe386!ragee +From: ragee@vdoe386.vak12ed.edu (Randy Agee) +Subject: Radar detector DETECTORS? +Message-ID: <1993Apr06.173031.9793@vdoe386.vak12ed.edu> +Organization: Virginia's Public Education Network (Richmond) +Date: Tue, 06 Apr 93 17:30:31 GMT +Lines: 50 + +Here's one I hope some knowledgeable readers will make a comment or +contribution to: + +In the State of Virginia radar detectors are illegal, period. If +you are caught with one it will be confiscated on the spot and will +not be returned until after you appear in court and pay your fine. +The fine for having a radar detector accessible in a motor vehicle +(even if it is not on) is $250.00. Sorry, tourist, ignorance of +the law is no excuse - they will get you too! + +It used to be that the only way the law could be enforced was for +an officer to actually see the radar detector. Not any more! Many +law enforcement agencies are now using radar detector detectors. +Right, a super sensitive receiver that is capable of picking up RF +from the radar detector itself. My first reaction was "no way!" 
+But, guess again, these little buggers really work and the police +are writing citations right and left for people using radar +detectors. One news story quoted an officer as saying that he had +found the radar detector in all of the cars he stopped except one, +and he could never figure out where it was - but he knew it was +there. This tends to make one assume there are few false arrest. + +Now, before I get flamed, please understand that I do drive at or +near the speed limit. I do not need a radar detector to keep me +from getting a speeding ticket. But, I do like to know when my +speed is being clocked or a speed trap is functioning. My radar +detector now stays locked in my trunk when I am in Virginia (which +is what they want - and yes, what the law says, and I intend to +obey the law!) and is only used in states where it is legal. + +For my fellow hams, I am not a microwave person - my mind only +works in the HF spectrum between 10 and 80 meters. Microwave +enlightment may be necessary. + +So, the questions are - + What do the radar detector detectors actually detect? + Would additional shielding/grounding/bypassing shield stray RF generated by + a radar detector, or is the RF actually being emitted by the detector + antenna? + Are any brands "quieter" than others? + +============================================================================== +Randy T. Agee - ARS WB4BZX | At some point, you probably pondered The +P.O. Box 2120 - 20th floor | Meaning of Life, and you came up with a +Virginia Department of Education | satisfactory answer, which has or has not +Richmond, VA 23216-2120 | stood the test of time, or you shrugged +Phone (804) 225-2669 | mightily, muttered "Beats the heck out of +ragee@vdoe386.vak12ed.edu | me," and ordered a cheeseburger. 
+============================================================================= + diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/ad/ad.sample similarity index 100% rename from opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample rename to opennlp-tools/src/test/resources/opennlp/tools/formats/ad/ad.sample diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/bionlp2004-01.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/bionlp2004-01.sample new file mode 100644 index 0000000000..f4b8522dca --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/bionlp2004-01.sample @@ -0,0 +1,33 @@ +High-dose O +growth O +hormone O +does O +not O +affect O +proinflammatory B-protein +cytokine I-protein +( O +tumor B-protein +necrosis I-protein +factor-alpha I-protein +, O +interleukin-6 B-protein +, O +and O +interferon-gamma B-protein +) O +release O +from O +activated O +peripheral B-cell_type +blood I-cell_type +mononuclear I-cell_type +cells I-cell_type +or O +after O +minimal O +to O +moderate O +surgical O +stress O +. 
O \ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/brat-ann.conf b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/brat-ann.conf new file mode 100644 index 0000000000..73247e696f --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/brat-ann.conf @@ -0,0 +1,7 @@ +[entities] +Person +Location +Date +[relations] +Related Arg1:Person, Arg2:Person +Related Arg1:Person, Arg2:Location diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/chunker-01.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/chunker-01.sample new file mode 100644 index 0000000000..2fa5b51084 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/chunker-01.sample @@ -0,0 +1,16 @@ +He PRP B-NP +reckons VBZ B-VP +the DT B-NP +current JJ I-NP +account NN I-NP +deficit NN I-NP +will MD B-VP +narrow VB I-VP +to TO B-PP +only RB B-NP +# # I-NP +1.8 CD I-NP +billion CD I-NP +in IN B-PP +September NNP B-NP +. . 
O \ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-01.sample similarity index 100% rename from opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample rename to opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-01.sample diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-02.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-02.sample new file mode 100644 index 0000000000..99be83e030 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-02.sample @@ -0,0 +1,29 @@ +-DOCSTART- + +A E adige20041008_id414157 O +circa B adige20041008_id414157 O +90 N adige20041008_id414157 O +chilometri SP adige20041008_id414157 O +dall' ES adige20041008_id414157 O +arrivo SS adige20041008_id414157 O +, XPW adige20041008_id414157 O +il RS adige20041008_id414157 O +capitano SS adige20041008_id414157 O +della ES adige20041008_id414157 O +Gerolsteiner SPN adige20041008_id414157 B-ORG +Davide SPN adige20041008_id414157 B-PER +Rebellin SPN adige20041008_id414157 I-PER +ha VIY adige20041008_id414157 O +allungato VSP adige20041008_id414157 O +su E adige20041008_id414157 O +uno RS adige20041008_id414157 O +dei EP adige20041008_id414157 O +pochi DP adige20041008_id414157 O +tratti SP adige20041008_id414157 O +in E adige20041008_id414157 O +salita SS adige20041008_id414157 O +, XPW adige20041008_id414157 O +frazionando VG adige20041008_id414157 O +il RS adige20041008_id414157 O +gruppo SS adige20041008_id414157 O +. 
XPS adige20041008_id414157 O diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-03.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-03.sample new file mode 100644 index 0000000000..dc7153f5ff --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-03.sample @@ -0,0 +1,22 @@ +-DOCSTART- + +Alla ES adige20041008_id414157 O +sua DS adige20041008_id414157 O +ruota SS adige20041008_id414157 O +si PN adige20041008_id414157 O +sono VIY adige20041008_id414157 O +portati VPP adige20041008_id414157 O +altri DP adige20041008_id414157 O +sei N adige20041008_id414157 O +corridori SP adige20041008_id414157 O +che CCHE adige20041008_id414157 O +hanno VIY adige20041008_id414157 O +poi B adige20041008_id414157 O +disputato VSP adige20041008_id414157 O +lo RS adige20041008_id414157 O +sprint SN adige20041008_id414157 O +sul ES adige20041008_id414157 O +traguardo SS adige20041008_id414157 O +di E adige20041008_id414157 O +Bourges SPN adige20041008_id414157 B-GPE +. 
XPS adige20041008_id414157 O diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-broken.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-broken.sample new file mode 100644 index 0000000000..0bda8dacf0 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-broken.sample @@ -0,0 +1,2 @@ +-DOCSTART- +xyz diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-incorrect.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-incorrect.sample new file mode 100644 index 0000000000..370c6791c3 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it-incorrect.sample @@ -0,0 +1,3 @@ +-DOCSTART- + +Alla ES adige20041008_id414157 \ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/lang-detect-01.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/lang-detect-01.sample new file mode 100644 index 0000000000..44234479a1 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/lang-detect-01.sample @@ -0,0 +1 @@ +en This is just a test. 
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/lemma-01.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/lemma-01.sample new file mode 100644 index 0000000000..94cc1e913c --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/lemma-01.sample @@ -0,0 +1 @@ +suns NOUN sun \ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/moses/moses-tiny.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/moses/moses-tiny.sample new file mode 100644 index 0000000000..627bbbe52e --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/moses/moses-tiny.sample @@ -0,0 +1,3 @@ +je|PRO vous|PRO achète|VB un|ART aardvark|NN +je|PRO vous|PRO achète|VB un|ART chat|NN +je|PRO vous|PRO achète|VB un|ART grand|ADJ chat|NN blanc|ADJ diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/muc/LDC2003T13.sgm b/opennlp-tools/src/test/resources/opennlp/tools/formats/muc/LDC2003T13.sgm new file mode 100644 index 0000000000..52030b4d8b --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/muc/LDC2003T13.sgm @@ -0,0 +1,73 @@ + + + + 123456-7890. + LoremIpsumCorp +

= 123456
+ 123456-7890. + Lorem Ipsum: +@ Dolor Sit Amet Consectetur +@ --- +@ Adipiscing Elit, Sed Do Eiusmod +@ Tempor Incididunt Ut Labore +@ ---- +@ By Lorem Ipsum +@ Placeholder Text Author +
12/12/21
+ LOREM IPSUM TIMES (LIT) + UNKNOWN + PLACEHOLDER TOPICS (PTP) +TEMPORARY SUBJECTS (TMP) + FAKE DEPARTMENT (FD) +DUMMY ORGANIZATION (DO) + LOREMTOWN + +

+ Lorem ipsum dolor sit amet, consectetur adipiscing elit. +

+

+ Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. + Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +

+

+ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. + Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +

+

+ Phasellus faucibus scelerisque eleifend donec pretium vulputate sapien. + Ultrices dui sapien eget mi proin sed libero enim. +

+

+ Non arcu risus quis varius quam quisque id diam vel. +

+

+ Amet nisl suscipit adipiscing bibendum est ultricies integer quis. + Elit eget gravida cum sociis natoque penatibus et magnis dis. +

+

+ Orci eu lobortis elementum nibh tellus molestie nunc non. + Vitae ultricies leo integer malesuada nunc vel risus commodo. +

+

+ Ultricies mi quis hendrerit dolor magna eget est lorem ipsum. +

+ \ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/name-data-01.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/name-data-01.sample new file mode 100644 index 0000000000..ebfba12acc --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/name-data-01.sample @@ -0,0 +1 @@ +This is a test from Germany . \ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/ontonotes/ontonotes-sample-01.name b/opennlp-tools/src/test/resources/opennlp/tools/formats/ontonotes/ontonotes-sample-01.name new file mode 100644 index 0000000000..a4ade3e82d --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/ontonotes/ontonotes-sample-01.name @@ -0,0 +1,9 @@ + + +Some U.S. allies are complaining that President Bush is pushing conventionalarms talks too quickly , creating a risk that negotiators will make errors that could affect the security of Western Europe for years . +Concerns about the pace of the Vienna talks -- which are aimed at the destruction of some 100,000 weapons , as well as major reductions and realignments of troops in central Europe -- also are being registered at the Pentagon . +Mr. Bush has called for an agreement by next September at the latest . +But some American defense officials believe the North Atlantic Treaty Organization should take more time to examine the long-term implications of the options being considered . +For one thing , Pentagon officials , who asked not to be identified , worry that the U.S. will have a much tougher time persuading Europeans to keep some short-range nuclear weapons on their soil once Soviet armored forces are thinned out . 
+ + \ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/ontonotes/ontonotes-sample-02.parse b/opennlp-tools/src/test/resources/opennlp/tools/formats/ontonotes/ontonotes-sample-02.parse new file mode 100644 index 0000000000..b1ef7dd298 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/ontonotes/ontonotes-sample-02.parse @@ -0,0 +1,29 @@ +((S (S-ADV (NP-SBJ (-NONE- *PRO*)) + (VP (VBG Judging) + (PP-CLR (IN from) + (NP (NP (DT the) (NNS Americana)) + (PP-LOC (IN in) + (NP (NP (NNP Haruki) (NNP Murakami) (POS 's)) + (`` ``) + (NX-TTL (NP (DT A) (NNP Wild) (NNP Sheep) (NNP Chase))) + ('' '') + (NP (-LRB- -LRB-) + (NP (NNP Kodansha)) + (, ,) + (NP (CD 320) (NNS pages)) + (, ,) + (NP ($ $) + (CD 18.95) + (-NONE- *U*)) + (-RRB- -RRB-)))))))) + (, ,) + (NP-SBJ (NP (NN baby) (NNS boomers)) + (PP-LOC (IN on) + (NP (NP (DT both) (NNS sides)) + (PP (IN of) + (NP (DT the) (NNP Pacific)))))) + (VP (VBP have) + (NP (NP (DT a) (NN lot)) + (PP (IN in) + (NP (NN common))))) + (. .))) \ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/parse-01.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/parse-01.sample new file mode 100644 index 0000000000..f814b6cccc --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/parse-01.sample @@ -0,0 +1 @@ +(TOP (S (NP-SBJ (DT The) (NN test) )(VP (MD shall) (VP (VB come) (NP-TMP (NN today) )))(. .) )) \ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/sentences-01.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/sentences-01.sample new file mode 100644 index 0000000000..c8dd92d161 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/sentences-01.sample @@ -0,0 +1,2 @@ +This is a test. +Is it a test? 
\ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/tokens-01.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/tokens-01.sample new file mode 100644 index 0000000000..b98b8167e9 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/tokens-01.sample @@ -0,0 +1 @@ +token1 token2 token3<SPLIT>token4 \ No newline at end of file diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/word-tags-01.sample b/opennlp-tools/src/test/resources/opennlp/tools/formats/word-tags-01.sample new file mode 100644 index 0000000000..724692ae97 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/word-tags-01.sample @@ -0,0 +1 @@ +The_DT day_NN has_VBZ just_RB started_VBN ._. \ No newline at end of file