Skip to content

Commit

Permalink
OPENNLP-1695: Add more tests for classes in formats package
Browse files Browse the repository at this point in the history
- introduces AbstractSampleStreamFactoryTest as common base class
- reduces code duplication in format factory classes
- adds a ton of new test classes for format factories (+ sample data)
- adds two more Evalita samples taken from Appendix of: https://www.evalita.it/wp-content/uploads/2021/11/Guidelines_evalita09_NER.pdf
- adds two OntoNotes samples from the public, official v5.0 release notes (sec 6.4 + 6.8), see: https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
  • Loading branch information
mawiesne committed Jan 20, 2025
1 parent e86b47f commit 5e27b55
Show file tree
Hide file tree
Showing 161 changed files with 6,152 additions and 1,060 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
* Parses the CoNLL-2000 shared task shallow parser training data.
* <p>
* Data format is specified on the CoNLL page:<br>
* <a href="http://www.cnts.ua.ac.be/conll2000/chunking/">
* http://www.cnts.ua.ac.be/conll2000/chunking/</a>
* <a href="https://www.cnts.ua.ac.be/conll2000/chunking/">
* https://www.cnts.ua.ac.be/conll2000/chunking/</a>
*/
public class ChunkSampleStream extends FilterObjectStream<String, ChunkSample> {

Expand All @@ -57,7 +57,7 @@ public ChunkSample read() throws IOException {
for (String line = samples.read(); line != null && !line.isEmpty(); line = samples.read()) {
String[] parts = line.split(" ");
if (parts.length != 3) {
logger.error("Skipping corrupt line: {}", line);
logger.warn("Skipping corrupt line: {}", line);
}
else {
toks.add(parts[0]);
Expand All @@ -66,11 +66,11 @@ public ChunkSample read() throws IOException {
}
}

if (toks.size() > 0) {
if (!toks.isEmpty()) {
return new ChunkSample(toks.toArray(new String[0]),
tags.toArray(new String[0]), preds.toArray(new String[0]));
} else {
return null;
}

return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,15 @@

package opennlp.tools.formats;

import java.io.IOException;

import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.ObjectStreamFactory;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;

/**
* Base class for sample stream factories.
Expand All @@ -40,4 +48,44 @@ public String getLang() {
public Class<P> getParameters() {
return params;
}

/**
 * Creates an {@link ObjectStream} of text lines for the specified arguments and
 * the generic parameter type {@code F}.
 * <p>
 * The arguments are first checked via
 * {@link #validateBasicFormatParameters(String[], Class)}; the resulting
 * {@code -data} and encoding values are then used to open a
 * {@link PlainTextByLineStream}.
 * <p>
 * An {@link IOException} raised while opening the stream is handed over to
 * {@link CmdLineUtil#handleCreateObjectStreamError(IOException)}. Should that call
 * return normally, {@code null} is returned.
 *
 * @param args A set of command line arguments.
 * @param parametersClass The parameter interface used to parse {@code args}.
 * @param <F> The concrete parameter type, providing at least all {@link BasicFormatParams}.
 * @return The created {@link ObjectStream} instance.
 */
protected <F extends BasicFormatParams> ObjectStream<String> readData(String[] args,
    Class<F> parametersClass) {
  // Named 'F' / 'formatParams' so neither the class-level type parameter 'P'
  // nor the 'params' field of this factory is hidden inside this method.
  final F formatParams = validateBasicFormatParameters(args, parametersClass);
  try {
    final InputStreamFactory sampleDataIn =
        CmdLineUtil.createInputStreamFactory(formatParams.getData());
    return new PlainTextByLineStream(sampleDataIn, formatParams.getEncoding());
  } catch (IOException ex) {
    CmdLineUtil.handleCreateObjectStreamError(ex);
  }
  return null;
}

/**
 * Validates the specified arguments ({@code args}) in the context of
 * the generic type {@code F}, which provides at least all
 * {@link BasicFormatParams}.
 *
 * @implNote Additional checks for the basic {@code -data} argument are conducted, that is
 * whether the file exists or not.
 *
 * @param args A set of command line arguments. Must not be {@code null}.
 * @param clazz The parameter interface used to parse {@code args}.
 * @param <F> The concrete parameter type, providing at least all {@link BasicFormatParams}.
 * @return The parsed (basic format) parameter instance.
 * @throws IllegalArgumentException Thrown if {@code args} is {@code null}.
 */
protected <F extends BasicFormatParams> F validateBasicFormatParameters(String[] args,
    Class<F> clazz) {
  if (args == null) {
    throw new IllegalArgumentException("Passed args must not be null!");
  }
  // Named 'F' / 'parsed' so neither the class-level type parameter 'P'
  // nor the 'params' field of this factory is hidden inside this method.
  final F parsed = ArgumentParser.parse(args, clazz);
  CmdLineUtil.checkInputFile("Data", parsed.getData());
  return parsed;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,25 +45,24 @@
* <p>
* Data can be found on this
* <a href="http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004">website</a>,
* or in
* <a href="https://github.com/spyysalo/jnlpba">this repository</a>.
* or in this
* <a href="https://github.com/spyysalo/jnlpba">GitHub repository</a>.
* <p>
* The BioNLP/NLPBA 2004 data were originally published here:
* <p>
* <a href="http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html">
* http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html</a>,
* The BioNLP/NLPBA 2004 data were originally published
* <a href="http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html">here</a>,
* <p>
* yet this page was gone when last checked in December 2022.
* <p>
* It looks like this repo contains a copy of the data located on the original page:
* The BioNLP 2004 seems to be related to http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004
* <p>
* <b>Note:</b>
* Do not use this class, internal use only!
*/
@Internal
public class BioNLP2004NameSampleStream implements ObjectStream<NameSample> {

private static final String CODEC_TAG_O = "O";
private static final String CODEC_TAG_B = "B-";
private static final String CODEC_TAG_I = "I-";

public static final int GENERATE_DNA_ENTITIES = 0x01;
public static final int GENERATE_PROTEIN_ENTITIES = 0x01 << 1;
public static final int GENERATE_CELLTYPE_ENTITIES = 0x01 << 2;
Expand Down Expand Up @@ -96,7 +95,6 @@ public NameSample read() throws IOException {
boolean isClearAdaptiveData = false;

// Empty line indicates end of sentence

String line;
while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line.trim())) {

Expand All @@ -121,7 +119,7 @@ public NameSample read() throws IOException {
}
}

if (sentence.size() > 0) {
if (!sentence.isEmpty()) {

// convert name tags into spans
List<Span> names = new ArrayList<>();
Expand All @@ -133,34 +131,32 @@ public NameSample read() throws IOException {
String tag = tags.get(i);

if (tag.endsWith("DNA") && (types & GENERATE_DNA_ENTITIES) == 0)
tag = "O";
tag = CODEC_TAG_O;

if (tag.endsWith("protein") && (types & GENERATE_PROTEIN_ENTITIES) == 0)
tag = "O";
tag = CODEC_TAG_O;

if (tag.endsWith("cell_type") && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
tag = "O";
tag = CODEC_TAG_O;

if (tag.endsWith("cell_line") && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
tag = "O";
tag = CODEC_TAG_O;
if (tag.endsWith("RNA") && (types & GENERATE_RNA_ENTITIES) == 0)
tag = "O";
tag = CODEC_TAG_O;

if (tag.startsWith("B-")) {
if (tag.startsWith(CODEC_TAG_B)) {

if (beginIndex != -1) {
names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
beginIndex = -1;
endIndex = -1;
}

beginIndex = i;
endIndex = i + 1;
}
else if (tag.startsWith("I-")) {
else if (tag.startsWith(CODEC_TAG_I)) {
endIndex++;
}
else if (tag.equals("O")) {
else if (tag.equals(CODEC_TAG_O)) {
if (beginIndex != -1) {
names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
beginIndex = -1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

import java.io.IOException;

import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
Expand All @@ -30,55 +29,56 @@
/**
* @see BioNLP2004NameSampleStream
*/
public class BioNLP2004NameSampleStreamFactory<P> extends AbstractSampleStreamFactory<NameSample, P> {
public class BioNLP2004NameSampleStreamFactory extends
AbstractSampleStreamFactory<NameSample, BioNLP2004NameSampleStreamFactory.Parameters> {

interface Parameters extends BasicFormatParams {
public interface Parameters extends BasicFormatParams {
@ParameterDescription(valueName = "DNA,protein,cell_type,cell_line,RNA")
String getTypes();
}

public static void registerFactory() {
StreamFactoryRegistry.registerFactory(NameSample.class,
"bionlp2004", new BioNLP2004NameSampleStreamFactory<>(Parameters.class));
"bionlp2004", new BioNLP2004NameSampleStreamFactory(Parameters.class));
}

protected BioNLP2004NameSampleStreamFactory(Class<P> params) {
protected BioNLP2004NameSampleStreamFactory(Class<Parameters> params) {
super(params);
}

@Override
public ObjectStream<NameSample> create(String[] args) {

Parameters params = ArgumentParser.parse(args, Parameters.class);
Parameters params = validateBasicFormatParameters(args, Parameters.class);

int typesToGenerate = 0;

if (params.getTypes().contains("DNA")) {
String types = params.getTypes();
if (types.contains("DNA")) {
typesToGenerate = typesToGenerate |
BioNLP2004NameSampleStream.GENERATE_DNA_ENTITIES;
}
else if (params.getTypes().contains("protein")) {
if (types.contains("protein")) {
typesToGenerate = typesToGenerate |
BioNLP2004NameSampleStream.GENERATE_PROTEIN_ENTITIES;
}
else if (params.getTypes().contains("cell_type")) {
if (types.contains("cell_type")) {
typesToGenerate = typesToGenerate |
BioNLP2004NameSampleStream.GENERATE_CELLTYPE_ENTITIES;
}
else if (params.getTypes().contains("cell_line")) {
if (types.contains("cell_line")) {
typesToGenerate = typesToGenerate |
BioNLP2004NameSampleStream.GENERATE_CELLLINE_ENTITIES;
}
else if (params.getTypes().contains("RNA")) {
if (types.contains("RNA")) {
typesToGenerate = typesToGenerate |
BioNLP2004NameSampleStream.GENERATE_RNA_ENTITIES;
}

try {
return new BioNLP2004NameSampleStream(
CmdLineUtil.createInputStreamFactory(params.getData()), typesToGenerate);
} catch (IOException e) {
throw new IllegalStateException(e);
} catch (IOException ex) {
CmdLineUtil.handleCreateObjectStreamError(ex);
}
return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,49 +17,33 @@

package opennlp.tools.formats;

import java.io.IOException;

import opennlp.tools.chunker.ChunkSample;
import opennlp.tools.chunker.ChunkSampleStream;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;

/**
* Factory producing OpenNLP {@link ChunkSampleStream}s.
*/
public class ChunkerSampleStreamFactory<P> extends AbstractSampleStreamFactory<ChunkSample, P> {
public class ChunkerSampleStreamFactory extends
AbstractSampleStreamFactory<ChunkSample, ChunkerSampleStreamFactory.Parameters> {

interface Parameters extends BasicFormatParams {
public interface Parameters extends BasicFormatParams {
}

public static void registerFactory() {
StreamFactoryRegistry.registerFactory(ChunkSample.class,
StreamFactoryRegistry.DEFAULT_FORMAT, new ChunkerSampleStreamFactory<>(Parameters.class));
StreamFactoryRegistry.DEFAULT_FORMAT, new ChunkerSampleStreamFactory(Parameters.class));
}

protected ChunkerSampleStreamFactory(Class<P> params) {
protected ChunkerSampleStreamFactory(Class<Parameters> params) {
super(params);
}

@Override
public ObjectStream<ChunkSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);

CmdLineUtil.checkInputFile("Data", params.getData());
InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
ObjectStream<String> lineStream = null;
try {
lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());

} catch (IOException ex) {
CmdLineUtil.handleCreateObjectStreamError(ex);
}

ObjectStream<String> lineStream = readData(args, Parameters.class);
return new ChunkSampleStream(lineStream);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

import java.io.IOException;

import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
Expand All @@ -37,9 +36,10 @@
* @see Conll02NameSampleStream
*/
@Internal
public class Conll02NameSampleStreamFactory<P> extends LanguageSampleStreamFactory<NameSample, P> {
public class Conll02NameSampleStreamFactory extends
LanguageSampleStreamFactory<NameSample, Conll02NameSampleStreamFactory.Parameters> {

interface Parameters extends BasicFormatParams {
public interface Parameters extends BasicFormatParams {
@ParameterDescription(valueName = "spa|nld")
String getLang();

Expand All @@ -49,17 +49,17 @@ interface Parameters extends BasicFormatParams {

public static void registerFactory() {
StreamFactoryRegistry.registerFactory(NameSample.class,
"conll02", new Conll02NameSampleStreamFactory<>(Parameters.class));
"conll02", new Conll02NameSampleStreamFactory(Parameters.class));
}

protected Conll02NameSampleStreamFactory(Class<P> params) {
protected Conll02NameSampleStreamFactory(Class<Parameters> params) {
super(params);
}

@Override
public ObjectStream<NameSample> create(String[] args) {

Parameters params = ArgumentParser.parse(args, Parameters.class);
Parameters params = validateBasicFormatParameters(args, Parameters.class);

LANGUAGE lang;
if ("nl".equals(params.getLang()) || "nld".equals(params.getLang())) {
Expand Down Expand Up @@ -93,7 +93,6 @@ else if ("es".equals(params.getLang()) || "spa".equals(params.getLang())) {
Conll02NameSampleStream.GENERATE_MISC_ENTITIES;
}


try {
return new Conll02NameSampleStream(lang,
CmdLineUtil.createInputStreamFactory(params.getData()), typesToGenerate);
Expand Down
Loading

0 comments on commit 5e27b55

Please sign in to comment.