From e46c576831466ee19eaab6793873334d37cf2121 Mon Sep 17 00:00:00 2001 From: amensiko Date: Tue, 10 Jan 2017 22:09:51 +0200 Subject: [PATCH] OPENNLP-855: New SentimentAnalysisParser --- .../main/java/opennlp/tools/cmdline/CLI.java | 8 + .../tools/cmdline/StreamFactoryRegistry.java | 3 + .../SentimentCrossValidatorTool.java | 118 ++++++++ .../SentimentDetailedFMeasureListener.java | 43 +++ .../SentimentEvaluationErrorListener.java | 63 ++++ .../sentiment/SentimentEvaluatorTool.java | 153 ++++++++++ .../sentiment/SentimentModelLoader.java | 49 ++++ .../sentiment/SentimentTrainerTool.java | 110 +++++++ .../formats/SentimentSampleStreamFactory.java | 84 ++++++ .../sentiment/SentimentContextGenerator.java | 83 ++++++ .../sentiment/SentimentCrossValidator.java | 240 +++++++++++++++ .../sentiment/SentimentEvaluationMonitor.java | 28 ++ .../tools/sentiment/SentimentEvaluator.java | 64 ++++ .../tools/sentiment/SentimentEventStream.java | 85 ++++++ .../tools/sentiment/SentimentFactory.java | 72 +++++ .../opennlp/tools/sentiment/SentimentME.java | 274 ++++++++++++++++++ .../tools/sentiment/SentimentModel.java | 127 ++++++++ .../tools/sentiment/SentimentSample.java | 83 ++++++ .../sentiment/SentimentSampleStream.java | 76 +++++ .../sentiment/SentimentSampleTypeFilter.java | 69 +++++ 20 files changed, 1832 insertions(+) create mode 100644 opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentCrossValidatorTool.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentDetailedFMeasureListener.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluationErrorListener.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluatorTool.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentModelLoader.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentTrainerTool.java create 
mode 100644 opennlp-tools/src/main/java/opennlp/tools/formats/SentimentSampleStreamFactory.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentContextGenerator.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentCrossValidator.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluationMonitor.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluator.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEventStream.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentFactory.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentME.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentModel.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSample.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleStream.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleTypeFilter.java diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java index 6c0413753d..010780b042 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java @@ -71,6 +71,9 @@ import opennlp.tools.cmdline.sentdetect.SentenceDetectorEvaluatorTool; import opennlp.tools.cmdline.sentdetect.SentenceDetectorTool; import opennlp.tools.cmdline.sentdetect.SentenceDetectorTrainerTool; +import opennlp.tools.cmdline.sentiment.SentimentCrossValidatorTool; +import opennlp.tools.cmdline.sentiment.SentimentEvaluatorTool; +import opennlp.tools.cmdline.sentiment.SentimentTrainerTool; import opennlp.tools.cmdline.tokenizer.DictionaryDetokenizerTool; import opennlp.tools.cmdline.tokenizer.SimpleTokenizerTool; 
import opennlp.tools.cmdline.tokenizer.TokenizerConverterTool; @@ -165,6 +168,11 @@ public final class CLI { // Entity Linker tools.add(new EntityLinkerTool()); + + // Sentiment Analysis Parser + tools.add(new SentimentTrainerTool()); + tools.add(new SentimentEvaluatorTool()); + tools.add(new SentimentCrossValidatorTool()); // Language Model tools.add(new NGramLanguageModelTool()); diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java index c4bef61fb3..fb329286b9 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java @@ -35,6 +35,7 @@ import opennlp.tools.formats.NameSampleDataStreamFactory; import opennlp.tools.formats.ParseSampleStreamFactory; import opennlp.tools.formats.SentenceSampleStreamFactory; +import opennlp.tools.formats.SentimentSampleStreamFactory; import opennlp.tools.formats.TokenSampleStreamFactory; import opennlp.tools.formats.TwentyNewsgroupSampleStreamFactory; import opennlp.tools.formats.WordTagSampleStreamFactory; @@ -140,6 +141,8 @@ public final class StreamFactoryRegistry { MascPOSSampleStreamFactory.registerFactory(); MascSentenceSampleStreamFactory.registerFactory(); MascTokenSampleStreamFactory.registerFactory(); + + SentimentSampleStreamFactory.registerFactory(); } public static final String DEFAULT_FORMAT = "opennlp"; diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentCrossValidatorTool.java new file mode 100644 index 0000000000..64aeb4cd14 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentCrossValidatorTool.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.cmdline.sentiment; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +import opennlp.tools.cmdline.AbstractCrossValidatorTool; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.BasicTrainingParams; +import opennlp.tools.cmdline.params.CVParams; +import opennlp.tools.cmdline.sentiment.SentimentCrossValidatorTool.CVToolParams; +import opennlp.tools.sentiment.SentimentCrossValidator; +import opennlp.tools.sentiment.SentimentEvaluationMonitor; +import opennlp.tools.sentiment.SentimentFactory; +import opennlp.tools.sentiment.SentimentSample; +import opennlp.tools.util.eval.EvaluationMonitor; +import opennlp.tools.util.model.ModelUtil; + +/** + * Class for helping perform cross validation on the Sentiment Analysis Parser. 
+ */
+public class SentimentCrossValidatorTool
+    extends AbstractCrossValidatorTool<SentimentSample, CVToolParams> {
+
+  /**
+   * Command line parameters: basic training options plus cross validation options.
+   */
+  interface CVToolParams extends BasicTrainingParams, CVParams {
+
+  }
+
+  /**
+   * Creates the tool for {@link SentimentSample} input and {@link CVToolParams}.
+   */
+  public SentimentCrossValidatorTool() {
+    super(SentimentSample.class, CVToolParams.class);
+  }
+
+  /**
+   * Returns the short description of the tool.
+   *
+   * @return short description
+   */
+  @Override
+  public String getShortDescription() {
+    return "K-fold cross validator for the learnable Sentiment Analysis Parser";
+  }
+
+  /**
+   * Cross-validates a sentiment model over the configured number of folds and prints
+   * the F-measure. NOTE(review): listeners are copied into a SentimentEvaluationMonitor[],
+   * which throws ArrayStoreException for any listener not implementing that interface.
+   *
+   * @param format
+   *          the format to be used
+   * @param args
+   *          the arguments
+   * @throws TerminateToolException if reading the sample stream fails
+   */
+  @Override
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), true);
+    if (mlParams == null) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    List<EvaluationMonitor<SentimentSample>> listeners = new LinkedList<>();
+    if (params.getMisclassified()) {
+      listeners.add(new SentimentEvaluationErrorListener());
+    }
+    SentimentFactory sentimentFactory = new SentimentFactory();
+
+    SentimentCrossValidator validator;
+    try {
+      validator = new SentimentCrossValidator(params.getLang(), mlParams, sentimentFactory,
+          listeners.toArray(new SentimentEvaluationMonitor[listeners.size()]));
+      validator.evaluate(sampleStream, params.getFolds());
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "IO error while reading training data or indexing data: "
+              + e.getMessage(),
+          e);
+    } finally {
+      try {
+        sampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    System.out.println("done");
+
+    System.out.println();
+
+    System.out.println(validator.getFMeasure());
+  }
+
+}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentDetailedFMeasureListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentDetailedFMeasureListener.java new file mode 100644 index 0000000000..c99fcfc6b4 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentDetailedFMeasureListener.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package opennlp.tools.cmdline.sentiment;
+
+import opennlp.tools.cmdline.DetailedFMeasureListener;
+import opennlp.tools.sentiment.SentimentEvaluationMonitor;
+import opennlp.tools.sentiment.SentimentSample;
+import opennlp.tools.util.Span;
+
+/**
+ * Detailed F-measure listener for {@link SentimentSample} evaluation.
+ */
+public class SentimentDetailedFMeasureListener
+    extends DetailedFMeasureListener<SentimentSample>
+    implements SentimentEvaluationMonitor {
+
+  /**
+   * Returns the spans of a sample. No spans are derived from sentiment samples
+   * here, so the result is an empty array (never {@code null}) that is safe to iterate.
+   *
+   * @param sample the sentiment sample to convert
+   * @return an empty span array
+   */
+  @Override
+  protected Span[] asSpanArray(SentimentSample sample) {
+    return new Span[0];
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluationErrorListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluationErrorListener.java
new file mode 100644
index 0000000000..489ec60421
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluationErrorListener.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.sentiment;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.EvaluationErrorPrinter;
+import opennlp.tools.sentiment.SentimentEvaluationMonitor;
+import opennlp.tools.sentiment.SentimentSample;
+
+/**
+ * Prints misclassified sentiment samples to an output stream during evaluation.
+ * Implements {@link SentimentEvaluationMonitor} so instances can be stored in the
+ * {@code SentimentEvaluationMonitor[]} arrays that the command line tools build.
+ * NOTE(review): assumes that interface extends EvaluationMonitor of SentimentSample.
+ */
+public class SentimentEvaluationErrorListener
+    extends EvaluationErrorPrinter<SentimentSample>
+    implements SentimentEvaluationMonitor {
+
+  /**
+   * Creates a listener that prints to {@code System.err}.
+   */
+  public SentimentEvaluationErrorListener() {
+    super(System.err);
+  }
+
+  /**
+   * Creates a listener that prints to the given output stream.
+   */
+  protected SentimentEvaluationErrorListener(OutputStream outputStream) {
+    super(outputStream);
+  }
+
+  /**
+   * Prints the error in case of a misclassification in the evaluator.
+   *
+   * @param reference the sentiment sample reference to be used
+   * @param prediction the sentiment sample prediction
+   */
+  @Override
+  public void misclassified(SentimentSample reference, SentimentSample prediction) {
+    printError(new String[] { reference.getSentiment() },
+        new String[] { prediction.getSentiment() }, reference, prediction,
+        reference.getSentence());
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluatorTool.java
new file mode 100644
index 0000000000..602cd4cc47
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluatorTool.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.cmdline.sentiment; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +import opennlp.tools.cmdline.AbstractEvaluatorTool; +import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; +import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; +import opennlp.tools.cmdline.PerformanceMonitor; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.EvaluatorParams; +import opennlp.tools.cmdline.sentiment.SentimentEvaluatorTool.EvalToolParams; +import opennlp.tools.sentiment.SentimentEvaluationMonitor; +import opennlp.tools.sentiment.SentimentEvaluator; +import opennlp.tools.sentiment.SentimentME; +import opennlp.tools.sentiment.SentimentModel; +import opennlp.tools.sentiment.SentimentSample; +import opennlp.tools.sentiment.SentimentSampleTypeFilter; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.eval.EvaluationMonitor; + +/** + * Class for creating an evaluation tool for sentiment analysis. 
+ *
+ * @see EvalToolParams
+ * @see SentimentSample
+ */
+public class SentimentEvaluatorTool
+    extends AbstractEvaluatorTool<SentimentSample, EvalToolParams> {
+
+  /**
+   * Interface for parameters to be used in evaluation
+   */
+  interface EvalToolParams extends EvaluatorParams {
+    @OptionalParameter
+    @ParameterDescription(valueName = "types", description = "name types to use for evaluation")
+    String getNameTypes();
+  }
+
+  /**
+   * Creates the tool for {@link SentimentSample} input and {@link EvalToolParams}.
+   */
+  public SentimentEvaluatorTool() {
+    super(SentimentSample.class, EvalToolParams.class);
+  }
+
+  /**
+   * Returns the short description of the tool.
+   *
+   * @return short description
+   */
+  @Override
+  public String getShortDescription() {
+    return "Measures the performance of the Sentiment model with the reference data";
+  }
+
+  /**
+   * Evaluates the loaded model on the sample stream and prints throughput and the
+   * F-measure. NOTE(review): listeners are copied into a SentimentEvaluationMonitor[],
+   * which throws ArrayStoreException for any listener not implementing that interface.
+   *
+   * @param format
+   *          the format to be used
+   * @param args
+   *          the arguments
+   * @throws TerminateToolException if reading the test data fails
+   */
+  @Override
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    SentimentModel model = new SentimentModelLoader().load(params.getModel());
+    // TODO: check EvalToolParams --> getNameTypes()
+
+    List<EvaluationMonitor<SentimentSample>> listeners = new LinkedList<>();
+    if (params.getMisclassified()) {
+      listeners.add(new SentimentEvaluationErrorListener());
+    }
+
+    if (params.getNameTypes() != null) {
+      String[] nameTypes = params.getNameTypes().split(",");
+      sampleStream = new SentimentSampleTypeFilter(nameTypes, sampleStream);
+    }
+
+    SentimentEvaluator evaluator = new SentimentEvaluator(new SentimentME(model),
+        listeners.toArray(new SentimentEvaluationMonitor[listeners.size()]));
+
+    final PerformanceMonitor monitor = new PerformanceMonitor("sent");
+
+    ObjectStream<SentimentSample> measuredSampleStream = new ObjectStream<SentimentSample>() {
+
+      @Override
+      public SentimentSample read() throws IOException {
+        SentimentSample sample = sampleStream.read();
+        if (sample != null) {
+          monitor.incrementCounter();
+        }
+        return sample;
+      }
+
+      @Override
+      public void reset() throws IOException {
+        sampleStream.reset();
+      }
+
+      @Override
+      public void close() throws IOException {
+        sampleStream.close();
+      }
+    };
+
+    monitor.startAndPrintThroughput();
+
+    try {
+      evaluator.evaluate(measuredSampleStream);
+    } catch (IOException e) {
+      System.err.println("failed");
+      throw new TerminateToolException(-1,
+          "IO error while reading test data: " + e.getMessage(), e);
+    } finally {
+      try {
+        measuredSampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    monitor.stopAndPrintFinalResult();
+
+    System.out.println();
+
+    System.out.println(evaluator.getFMeasure());
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentModelLoader.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentModelLoader.java
new file mode 100644
index 0000000000..e5e3e20cd3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentModelLoader.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.sentiment;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.cmdline.ModelLoader;
+import opennlp.tools.sentiment.SentimentModel;
+
+/**
+ * Loads a {@link SentimentModel} for the command line tools.
+ */
+public class SentimentModelLoader extends ModelLoader<SentimentModel> {
+
+  /**
+   * Creates a loader that identifies its model as "Sentiment".
+   */
+  public SentimentModelLoader() {
+    super("Sentiment");
+  }
+
+  /**
+   * Loads the sentiment model.
+   *
+   * @param modelIn the input stream to read the model from
+   * @return the model
+   * @throws IOException if constructing the model from the stream fails
+   */
+  @Override
+  protected SentimentModel loadModel(InputStream modelIn) throws IOException {
+    return new SentimentModel(modelIn);
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentTrainerTool.java
new file mode 100644
index 0000000000..a690f4cfed
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentTrainerTool.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.sentiment;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.AbstractTrainerTool;
+import opennlp.tools.cmdline.CLI;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.TrainingToolParams;
+import opennlp.tools.sentiment.SentimentFactory;
+import opennlp.tools.sentiment.SentimentME;
+import opennlp.tools.sentiment.SentimentModel;
+import opennlp.tools.sentiment.SentimentSample;
+import opennlp.tools.util.model.ModelUtil;
+
+/**
+ * Command line tool that trains a sentiment analysis model.
+ */
+public class SentimentTrainerTool
+    extends AbstractTrainerTool<SentimentSample, TrainingToolParams> {
+
+  /**
+   * Creates the tool for {@link SentimentSample} input and {@link TrainingToolParams}.
+   */
+  public SentimentTrainerTool() {
+    super(SentimentSample.class, TrainingToolParams.class);
+  }
+
+  /**
+   * Trains a model from the sample stream and writes it to the model output file.
+   *
+   * @param format
+   *          the format to be used
+   * @param args
+   *          the arguments; when empty only the help message is printed
+   */
+  @Override
+  public void run(String format, String[] args) {
+    super.run(format, args);
+    if (0 == args.length) {
+      System.out.println(getHelp());
+      return;
+    }
+
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (mlParams == null) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    File modelOutFile = params.getModel();
+
+    CmdLineUtil.checkOutputFile("sentiment analysis model", modelOutFile);
+
+    SentimentModel model;
+    try {
+      SentimentFactory factory = new SentimentFactory();
+      model = SentimentME.train(params.getLang(), sampleStream, mlParams, factory);
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "IO error while reading training data or indexing data: " + e.getMessage(), e);
+    } finally {
+      try {
+        sampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    CmdLineUtil.writeModel("sentiment analysis", modelOutFile, model);
+  }
+
+  /**
+   * Returns the help message.
+   *
+   * @return the usage line built from the command name and the tool name
+   */
+  @Override
+  public String getHelp() {
+    return "Usage: " + CLI.CMD + " " + getName() + " model < documents";
+  }
+
+  /**
+   * Returns the short description of the programme.
+   *
+   * @return the description
+   */
+  @Override
+  public String getShortDescription() {
+    return "learnable sentiment analysis";
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/SentimentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/SentimentSampleStreamFactory.java
new file mode 100644
index 0000000000..47ca8d7e73
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/SentimentSampleStreamFactory.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.sentiment.SentimentSample;
+import opennlp.tools.sentiment.SentimentSampleStream;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Factory that creates {@link SentimentSample} streams from line-based text data.
+ *
+ * @param <P> the parameter interface whose class is passed to the constructor
+ * @see SentimentSample
+ */
+public class SentimentSampleStreamFactory<P> extends AbstractSampleStreamFactory<SentimentSample> {
+
+  /**
+   * Instantiates a {@link SentimentSampleStreamFactory} object.
+   *
+   * @param params the parameter interface class handed to the superclass
+   */
+  protected SentimentSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Creates a sentiment sample stream that reads the data file line by line.
+   *
+   * @param args
+   *          the arguments, parsed as {@link BasicFormatParams}
+   * @return the stream of sentiment samples
+   */
+  @Override
+  public ObjectStream<SentimentSample> create(String[] args) {
+    BasicFormatParams params = ArgumentParser.parse(args,
+        BasicFormatParams.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+    InputStreamFactory sampleDataIn = CmdLineUtil
+        .createInputStreamFactory(params.getData());
+    ObjectStream<String> lineStream = null;
+    try {
+      lineStream = new PlainTextByLineStream(sampleDataIn,
+          params.getEncoding());
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new SentimentSampleStream(lineStream);
+  }
+
+  /**
+   * Registers this factory for {@link SentimentSample} under the default format.
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(SentimentSample.class,
+        StreamFactoryRegistry.DEFAULT_FORMAT,
+        new SentimentSampleStreamFactory<>(BasicFormatParams.class));
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentContextGenerator.java
new file mode 100644
index 0000000000..418574702a
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentContextGenerator.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentiment;
+
+import opennlp.tools.util.BeamSearchContextGenerator;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+/**
+ * Context generator for Sentiment Analysis.
+ */
+public class SentimentContextGenerator
+    implements BeamSearchContextGenerator<String> {
+
+  private final AdaptiveFeatureGenerator[] featureGenerators;
+
+  /** Creates a context generator without any adaptive feature generators. */
+  public SentimentContextGenerator() {
+    this(new AdaptiveFeatureGenerator[0]);
+  }
+
+  /** Creates a context generator that updates the given feature generators. */
+  public SentimentContextGenerator(AdaptiveFeatureGenerator[] featureGenerators) {
+    this.featureGenerators = featureGenerators;
+  }
+
+  /**
+   * Returns the context, which is the given text itself.
+   *
+   * @param text the given text to be returned as context
+   * @return the same array that was passed in
+   */
+  public String[] getContext(String[] text) {
+    return text;
+  }
+
+  /**
+   * Returns an empty context; every argument is ignored.
+   *
+   * @param index the index of the context
+   * @param sequence String sequence given
+   * @param priorDecisions decisions given earlier
+   * @param additionalContext any additional context
+   * @return an empty array
+   */
+  @Override
+  public String[] getContext(int index, String[] sequence,
+      String[] priorDecisions, Object[] additionalContext) {
+    return new String[] {};
+  }
+
+  /**
+   * Forwards the tokens and outcomes to every configured feature generator.
+   *
+   * @param tokens the tokens; may be null, in which case no size check is made
+   * @param outcomes the outcomes; may be null, in which case no size check is made
+   * @throws IllegalArgumentException if both arrays are non-null and differ in length
+   */
+  public void updateAdaptiveData(String[] tokens, String[] outcomes) {
+    if (tokens != null && outcomes != null && tokens.length != outcomes.length) {
+      throw new IllegalArgumentException(
+          "The tokens and outcome arrays MUST have the same size!");
+    }
+
+    for (AdaptiveFeatureGenerator featureGenerator : featureGenerators) {
+      featureGenerator.updateAdaptiveData(tokens, outcomes);
+    }
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentCrossValidator.java
new file mode 100644
index 0000000000..cc3b128889
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentCrossValidator.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentiment;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.eval.CrossValidationPartitioner;
+import opennlp.tools.util.eval.FMeasure;
+
+/**
+ * Class for performing cross validation on the Sentiment Analysis Parser.
+ */ +public class SentimentCrossValidator { + + /** + * Class for creating a document sample + */ + private static class DocumentSample { + + private final SentimentSample[] samples; + + /** + * Constructor + */ + DocumentSample(SentimentSample[] samples) { + this.samples = samples; + } + + /** + * Returns the short description of the tool + * + * @return the samples + */ + private SentimentSample[] getSamples() { + return samples; + } + } + + /** + * Reads Sentiment Samples to group them as a document based on the clear + * adaptive data flag. + */ + private static class SentimentToDocumentSampleStream + extends FilterObjectStream { + + private SentimentSample beginSample; + + /** + * Constructor + */ + protected SentimentToDocumentSampleStream(ObjectStream samples) { + super(samples); + } + + /** + * Reads Sentiment Samples to group them as a document + * + * @return the resulting DocumentSample + */ + @Override + public DocumentSample read() throws IOException { + + List document = new ArrayList<>(); + + if (beginSample == null) { + // Assume that the clear flag is set + beginSample = samples.read(); + } + + // Underlying stream is exhausted! + if (beginSample == null) { + return null; + } + + document.add(beginSample); + + SentimentSample sample; + while ((sample = samples.read()) != null) { + + if (sample.isClearAdaptiveDataSet()) { + beginSample = sample; + break; + } + + document.add(sample); + } + + // Underlying stream is exhausted, + // next call must return null + if (sample == null) { + beginSample = null; + } + + return new DocumentSample(document.toArray(new SentimentSample[0])); + } + + /** + * Performs a reset + */ + @Override + public void reset() throws IOException, UnsupportedOperationException { + super.reset(); + beginSample = null; + } + } + + /** + * Splits DocumentSample into SentimentSamples. 
+ */ + private static class DocumentToSentimentSampleStream + extends FilterObjectStream { + + /** + * Wraps the given stream of document samples + */ + protected DocumentToSentimentSampleStream( + ObjectStream samples) { + super(samples); + } + + private Iterator documentSamples = Collections.emptyIterator(); + + /** + * Returns the next SentimentSample, reading a new DocumentSample from the underlying stream whenever the current one is used up + * + * @return the next SentimentSample, or {@code null} once the underlying stream is exhausted + */ + @Override + public SentimentSample read() throws IOException { + + // Note: Empty document samples should be skipped + + if (documentSamples.hasNext()) { + return documentSamples.next(); + } else { + DocumentSample docSample = samples.read(); + + if (docSample != null) { + documentSamples = Arrays.asList(docSample.getSamples()).iterator(); + + return read(); + } else { + return null; + } + } + } + } + + private final String languageCode; + private final TrainingParameters params; + private final SentimentEvaluationMonitor[] listeners; + + private final SentimentFactory factory; + private final FMeasure fmeasure = new FMeasure(); + + /** + * Creates a cross validator for the given language code, training parameters, factory and evaluation monitors + */ + public SentimentCrossValidator(String lang, TrainingParameters params, + SentimentFactory factory, SentimentEvaluationMonitor[] monitors) { + + this.languageCode = lang; + this.factory = factory; + this.params = params; + this.listeners = monitors; + } + + /** + * Performs an n-fold cross validation, merging the F-measure of every fold into the overall F-measure + * + * @param samples + * stream of SentimentSamples + * @param nFolds + * the number of folds to be used in cross validation + */ + public void evaluate(ObjectStream samples, int nFolds) + throws IOException { + + // Note: The sentiment samples need to be grouped on a document basis.
+ + CrossValidationPartitioner partitioner = new CrossValidationPartitioner<>( + new SentimentToDocumentSampleStream(samples), nFolds); + + SentimentModel model = null; + + while (partitioner.hasNext()) { + + CrossValidationPartitioner.TrainingSampleStream trainingSampleStream = partitioner + .next(); + + if (factory != null) { + model = SentimentME.train(languageCode, + new DocumentToSentimentSampleStream(trainingSampleStream), params, + factory); + } + + // do testing; NOTE(review): model is still null here when factory is null, confirm a factory is always supplied + SentimentEvaluator evaluator = new SentimentEvaluator( + new SentimentME(model), listeners); + + evaluator.evaluate(new DocumentToSentimentSampleStream( + trainingSampleStream.getTestSampleStream())); + + fmeasure.mergeInto(evaluator.getFMeasure()); + } + } + + /** + * Returns the F-Measure merged from all folds evaluated so far + * + * @return the F-Measure + */ + public FMeasure getFMeasure() { + return fmeasure; + } + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluationMonitor.java new file mode 100644 index 0000000000..ab503f6f8d --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluationMonitor.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import opennlp.tools.util.eval.EvaluationMonitor; + +/** + * Evaluation Monitor to be used by the evaluator + */ +public interface SentimentEvaluationMonitor + extends EvaluationMonitor { + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluator.java new file mode 100644 index 0000000000..b50d791f55 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluator.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import opennlp.tools.util.eval.Evaluator; +import opennlp.tools.util.eval.FMeasure; + +/** + * Class for performing evaluation on the Sentiment Analysis Parser. + */ +public class SentimentEvaluator extends Evaluator { + + private final FMeasure fmeasure = new FMeasure(); + + private final SentimentME sentiment; + + /** + * Constructor + */ + public SentimentEvaluator(SentimentME sentiment, SentimentEvaluationMonitor... 
listeners) { + super(listeners); + this.sentiment = sentiment; + } + + /** + * Predicts the sentiment of the reference sentence and records the reference and predicted labels in the F-measure. + * + * @param reference + * the reference to the SentimentSample to be processed + * @return a new {@link SentimentSample} pairing the predicted sentiment with the reference sentence. + */ + @Override + protected SentimentSample processSample(SentimentSample reference) { + String prediction = sentiment.predict(reference.getSentence()); + String label = reference.getSentiment(); + + fmeasure.updateScores(new String[] { label }, new String[] { prediction }); + + return new SentimentSample(prediction, reference.getSentence()); + } + + /** + * @return Retrieves the {@link FMeasure}. + */ + public FMeasure getFMeasure() { + return fmeasure; + } + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEventStream.java new file mode 100644 index 0000000000..fe1078d05d --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEventStream.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package opennlp.tools.sentiment; + +import java.util.Iterator; + +import opennlp.tools.ml.model.Event; +import opennlp.tools.util.AbstractEventStream; +import opennlp.tools.util.ObjectStream; + +/** + * Class for creating events for Sentiment Analysis that is later sent to + * MaxEnt. + * + * @see SentimentSample + */ +public class SentimentEventStream extends AbstractEventStream { + + private final SentimentContextGenerator contextGenerator; + + /** + * Instantiates a {@link SentimentEventStream event stream}. + * + * @param samples + * the sentiment samples to be used + * @param createContextGenerator + * the context generator to be used + */ + public SentimentEventStream(ObjectStream samples, + SentimentContextGenerator createContextGenerator) { + super(samples); + contextGenerator = createContextGenerator; + } + + /** + * Creates events from {@link SentimentSample sentiment samples}. + * + * @param sample + * the sentiment sample to be used + * @return event iterator + */ + @Override + protected Iterator createEvents(final SentimentSample sample) { + + return new Iterator<>() { + + private boolean isVirgin = true; + + @Override + public boolean hasNext() { + return isVirgin; + } + + @Override + public Event next() { + + isVirgin = false; + + return new Event(sample.getSentiment(), + contextGenerator.getContext(sample.getSentence())); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentFactory.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentFactory.java new file mode 100644 index 0000000000..0d0c1c31ba --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentFactory.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import opennlp.tools.tokenize.Tokenizer; +import opennlp.tools.tokenize.WhitespaceTokenizer; +import opennlp.tools.util.BaseToolFactory; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.ext.ExtensionLoader; + +/** + * Class for creating sentiment factories for training. + */ +public class SentimentFactory extends BaseToolFactory { + + private static final String TOKENIZER_NAME = "sentiment.tokenizer"; + + private Tokenizer tokenizer; + + /** + * Validates the artifact map --> nothing to validate. + */ + @Override + public void validateArtifactMap() throws InvalidFormatException { + // nothing to validate + } + + /** + * Creates a new {@link SentimentContextGenerator context generator}. + * + * @return a context generator for Sentiment Analysis + */ + public SentimentContextGenerator createContextGenerator() { + return new SentimentContextGenerator(); + } + + /** + * + * + * @return Retrieves the {@link Tokenizer}. 
+ */ + public Tokenizer getTokenizer() { + if (this.tokenizer == null) { + if (artifactProvider != null) { + String className = artifactProvider.getManifestProperty(TOKENIZER_NAME); + if (className != null) { + this.tokenizer = ExtensionLoader.instantiateExtension(Tokenizer.class, className); + } + } + if (this.tokenizer == null) { // could not load using artifact provider + this.tokenizer = WhitespaceTokenizer.INSTANCE; + } + } + return tokenizer; + } + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentME.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentME.java new file mode 100644 index 0000000000..98cdc7d6ab --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentME.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.sentiment; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import opennlp.tools.ml.EventTrainer; +import opennlp.tools.ml.TrainerFactory; +import opennlp.tools.ml.model.Event; +import opennlp.tools.ml.model.MaxentModel; +import opennlp.tools.ml.model.SequenceClassificationModel; +import opennlp.tools.namefind.BioCodec; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.Sequence; +import opennlp.tools.util.SequenceCodec; +import opennlp.tools.util.SequenceValidator; +import opennlp.tools.util.Span; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator; +import opennlp.tools.util.featuregen.AdditionalContextFeatureGenerator; + +/** + * Class for creating a maximum-entropy-based Sentiment Analysis model. + */ +public class SentimentME { + + public static final String OTHER = "other"; + public static final String START = "start"; + public static final String CONTINUE = "cont"; + public static final int DEFAULT_BEAM_SIZE = 3; + + private static final String[][] EMPTY = new String[0][0]; + + protected SentimentContextGenerator contextGenerator; + + private final AdditionalContextFeatureGenerator additionalContextFeatureGenerator = + new AdditionalContextFeatureGenerator(); + + private Sequence bestSequence; + protected SequenceClassificationModel model; + private SequenceValidator sequenceValidator; + private final SentimentFactory factory; + private final MaxentModel maxentModel; + private final SequenceCodec seqCodec = new BioCodec(); + private AdaptiveFeatureGenerator[] featureGenerators; + + /** + * Constructor, initialises + * + * @param sentModel + * sentiment analysis model + */ + public SentimentME(SentimentModel sentModel) { + + this.model = sentModel.getSentimentModel(); + maxentModel = sentModel.getMaxentModel(); + + factory = sentModel.getFactory(); + + contextGenerator = 
factory.createContextGenerator(); + } + + /** + * Trains a Sentiment Analysis model. + * + * @param languageCode + * the code for the language of the text, e.g. "en" + * @param samples + * the sentiment samples to be used + * @param trainParams + * parameters for training + * @param factory + * a Sentiment Analysis factory + * @return a Sentiment Analysis model + */ + public static SentimentModel train(String languageCode, + ObjectStream samples, TrainingParameters trainParams, + SentimentFactory factory) throws IOException { + + Map entries = new HashMap<>(); + MaxentModel sentimentModel; + + ObjectStream eventStream = new SentimentEventStream(samples, factory.createContextGenerator()); + + EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, entries); + sentimentModel = trainer.train(eventStream); + + Map manifestInfoEntries = new HashMap<>(); + + return new SentimentModel(languageCode, sentimentModel, manifestInfoEntries, factory); + + } + + /** + * Makes a sentiment prediction + * + * @param sentence + * the text to be analysed for its sentiment + * @return the predicted sentiment + */ + public String predict(String sentence) { + String[] tokens = factory.getTokenizer().tokenize(sentence); + + return predict(tokens); + } + + public String predict(String[] tokens) { + + double[] prob = probabilities(tokens); + return getBestSentiment(prob); + } + + /** + * Returns the best chosen sentiment for the text predicted on + * + * @param outcome + * the outcome + * @return the best sentiment + */ + public String getBestSentiment(double[] outcome) { + return maxentModel.getBestOutcome(outcome); + } + + /** + * Returns the analysis probabilities + * + * @param text + * the text to categorize + */ + public double[] probabilities(String[] text) { + return maxentModel.eval(contextGenerator.getContext(text)); + } + + /** + * Returns an array of probabilities for each of the specified spans which is + * the arithmetic mean of the probabilities for each of the 
outcomes which + * make up the span. + * + * @param spans + * The spans of the sentiments for which probabilities are desired. + * @return an array of probabilities for each of the specified spans. + */ + public double[] probs(Span[] spans) { + + double[] sprobs = new double[spans.length]; + double[] probs = bestSequence.getProbs(); + + for (int si = 0; si < spans.length; si++) { + + double p = 0; + + for (int oi = spans[si].getStart(); oi < spans[si].getEnd(); oi++) { + p += probs[oi]; + } + + p /= spans[si].length(); + + sprobs[si] = p; + } + + return sprobs; + } + + /** + * Replaces each span in the given array with a copy that carries its probability as computed by {@link #probs(Span[])} + * + * @param spans + * the spans to be analysed + * @return the same array, now holding the spans with their probabilities + */ + private Span[] setProbs(Span[] spans) { + double[] probs = probs(spans); + if (probs != null) { + + for (int i = 0; i < probs.length; i++) { + double prob = probs[i]; + spans[i] = new Span(spans[i], prob); + } + } + return spans; + } + + /** + * Generates sentiment tags for the given sequence, typically a sentence, + * returning token spans for any identified sentiments. + * + * @param tokens + * an array of the tokens or words of the sequence, typically a + * sentence + * @return an array of spans for each of the sentiments identified. + */ + public Span[] find(String[] tokens) { + return find(tokens, EMPTY); + } + + /** + * Generates sentiment tags for the given sequence, typically a sentence, + * returning token spans for any identified sentiments. + * + * @param tokens + * an array of the tokens or words of the sequence, typically a + * sentence. + * @param additionalContext + * features which are based on context outside of the sentence but + * which should also be used. + * + * @return an array of spans for each of the sentiments identified.
+ */ + public Span[] find(String[] tokens, String[][] additionalContext) { + + additionalContextFeatureGenerator.setCurrentContext(additionalContext); + + bestSequence = model.bestSequence(tokens, additionalContext, + contextGenerator, sequenceValidator); + + List c = bestSequence.getOutcomes(); + + contextGenerator.updateAdaptiveData(tokens, c.toArray(new String[c.size()])); + Span[] spans = seqCodec.decode(c); + spans = setProbs(spans); + return spans; + } + + /** + * Makes a sentiment prediction by calling the helper method + * + * @param tokens + * the text to be analysed for its sentiment + * @return the prediction made by the helper method + */ + public Span[] predict2(String[] tokens) { + return predict2(tokens, EMPTY); + } + + /** + * Makes a sentiment prediction + * + * @param tokens + * the text to be analysed for its sentiment + * @param additionalContext + * any required additional context + * @return the predictions + */ + public Span[] predict2(String[] tokens, String[][] additionalContext) { + + additionalContextFeatureGenerator.setCurrentContext(additionalContext); + + bestSequence = model.bestSequence(tokens, additionalContext, + contextGenerator, sequenceValidator); + + List c = bestSequence.getOutcomes(); + + return seqCodec.decode(c); + } + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentModel.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentModel.java new file mode 100644 index 0000000000..ed016ee9f7 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentModel.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Map; +import java.util.Properties; + +import opennlp.tools.ml.BeamSearch; +import opennlp.tools.ml.model.MaxentModel; +import opennlp.tools.ml.model.SequenceClassificationModel; +import opennlp.tools.util.model.BaseModel; + +/** + * Class for the basis of the Sentiment Analysis model. + */ +public class SentimentModel extends BaseModel { + + private static final String COMPONENT_NAME = "SentimentME"; + private static final String SENTIMENT_MODEL_ENTRY_NAME = "sentiment.model"; + + /** + * Instantiates a {@link SentimentModel} model. + * + * @param languageCode + * The code for the language of the text, e.g. "en" + * @param sentimentModel + * A {@link MaxentModel} sentiment model + * @param manifestInfoEntries + * Additional information in the manifest + * @param factory + * A {@link SentimentFactory} instance + */ + public SentimentModel(String languageCode, MaxentModel sentimentModel, + Map manifestInfoEntries, SentimentFactory factory) { + super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory); + artifactMap.put(SENTIMENT_MODEL_ENTRY_NAME, sentimentModel); + checkArtifactMap(); + } + + /** + * Instantiates a {@link SentimentModel} model via a {@link URL} reference. + * + * @param modelURL + * The {@link URL} to a file required to load the model. + * + * @throws IOException Thrown if IO errors occurred. 
+ */ + public SentimentModel(URL modelURL) throws IOException { + super(COMPONENT_NAME, modelURL); + } + + /** + * Instantiates a {@link SentimentModel} model via a {@link File} reference. + * + * @param file + * The {@link File} required to load the model. + * + * @throws IOException Thrown if IO errors occurred. + */ + public SentimentModel(File file) throws IOException { + super(COMPONENT_NAME, file); + } + + /** + * Instantiates a {@link SentimentModel} model via a {@link InputStream} reference. + * + * @param modelIn + * The {@link InputStream} required to load the model. + * + * @throws IOException Thrown if IO errors occurred. + */ + public SentimentModel(InputStream modelIn) throws IOException { + super(COMPONENT_NAME, modelIn); + } + + /** + * @return Retrieves the {@link SequenceClassificationModel} model. + */ + @Deprecated + public SequenceClassificationModel getSentimentModel() { + Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY); + + String beamSizeString = manifest.getProperty(BeamSearch.BEAM_SIZE_PARAMETER); + + int beamSize = SentimentME.DEFAULT_BEAM_SIZE; + if (beamSizeString != null) { + beamSize = Integer.parseInt(beamSizeString); + } + + return new BeamSearch(beamSize, + (MaxentModel) artifactMap.get(SENTIMENT_MODEL_ENTRY_NAME)); + } + + /** + * @return Retrieves the {@link SentimentFactory} for the model. + */ + public SentimentFactory getFactory() { + return (SentimentFactory) this.toolFactory; + } + + /** + * @return Retrieves the {@link MaxentModel}. 
+ */ + public MaxentModel getMaxentModel() { + return (MaxentModel) artifactMap.get(SENTIMENT_MODEL_ENTRY_NAME); + } + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSample.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSample.java new file mode 100644 index 0000000000..44eabc4f62 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSample.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import java.util.List; + +/** + * Class for holding text used for sentiment analysis. + */ +public class SentimentSample { + + private final String sentiment; + private final List sentence; + private final boolean isClearAdaptiveData; + private final String id = null; + + /** + * Instantiates a {@link SentimentSample} object. 
+ * + * @param sentiment + * training sentiment + * @param sentence + * training sentence + */ + public SentimentSample(String sentiment, String[] sentence) { + this(sentiment, sentence, true); + } + + public SentimentSample(String sentiment, String[] sentence, + boolean clearAdaptiveData) { + if (sentiment == null) { + throw new IllegalArgumentException("sentiment must not be null"); + } + if (sentence == null) { + throw new IllegalArgumentException("sentence must not be null"); + } + + this.sentiment = sentiment; + this.sentence = List.of(sentence); + this.isClearAdaptiveData = clearAdaptiveData; + } + + /** + * @return Returns the sentiment. + */ + public String getSentiment() { + return sentiment; + } + + /** + * @return Returns the sentence. + */ + public String[] getSentence() { + return sentence.toArray(new String[0]); + } + + public String getId() { + return id; + } + + /** + * @return Returns the value of isClearAdaptiveData, {@code true} or {@code false}. + */ + public boolean isClearAdaptiveDataSet() { + return isClearAdaptiveData; + } + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleStream.java new file mode 100644 index 0000000000..9f5a6dee09 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleStream.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import java.io.IOException; + +import opennlp.tools.tokenize.WhitespaceTokenizer; +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; + +/** + * Class for converting Strings through Data Stream to SentimentSample using + * tokenised text. + */ +public class SentimentSampleStream + extends FilterObjectStream { + + /** + * Instantiates a {@link SentimentSampleStream} object. + * + * @param samples + * the sentiment samples to be used + */ + public SentimentSampleStream(ObjectStream samples) { + super(samples); + } + + /** + * Reads the text. + * + * @return A ready-to-be-trained {@link SentimentSample} object. 
+ */ + @Override + public SentimentSample read() throws IOException { + String sentence = samples.read(); + + if (sentence != null) { + + // Whitespace tokenize entire string + String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sentence); + + SentimentSample sample; + + if (tokens.length > 1) { + String sentiment = tokens[0]; + String[] sentTokens = new String[tokens.length - 1]; + System.arraycopy(tokens, 1, sentTokens, 0, tokens.length - 1); + + sample = new SentimentSample(sentiment, sentTokens); + } else { + throw new IOException( + "Empty lines, or lines with only a category string are not allowed!"); + } + + return sample; + } + + return null; + } + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleTypeFilter.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleTypeFilter.java new file mode 100644 index 0000000000..567721fec8 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleTypeFilter.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.sentiment; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; + +/** + * Class for creating a type filter. + * + * @see FilterObjectStream + */ +public class SentimentSampleTypeFilter + extends FilterObjectStream { + + private final Set types; + + /** + * Instantiates a {@link SentimentSampleTypeFilter} object from an array of types, which is stored as an unmodifiable set. + */ + public SentimentSampleTypeFilter(String[] types, ObjectStream samples) { + super(samples); + this.types = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(types))); + } + + /** + * Instantiates a {@link SentimentSampleTypeFilter} object. + * + * @param types + * the types to filter; an unmodifiable copy is stored. + * @param samples + * the sentiment samples to be used. + */ + public SentimentSampleTypeFilter(Set types, ObjectStream samples) { + super(samples); + this.types = Set.copyOf(types); + } + + /** + * @return Reads and returns the next {@link SentimentSample} of the underlying stream unchanged. NOTE(review): the configured types are not consulted here, confirm whether filtering is still to be implemented. + */ + @Override + public SentimentSample read() throws IOException { + return samples.read(); + + } + +}