diff --git a/de.unistuttgart.ims.drama.io.core/pom.xml b/de.unistuttgart.ims.drama.io.core/pom.xml index 6ad79e97..57177fc6 100644 --- a/de.unistuttgart.ims.drama.io.core/pom.xml +++ b/de.unistuttgart.ims.drama.io.core/pom.xml @@ -49,7 +49,14 @@ ${dkpro.version} - + + de.tudarmstadt.ukp.dkpro.core + + de.tudarmstadt.ukp.dkpro.core.api.ner-asl + + ${dkpro.version} + + de.unistuttgart.ims uimautil diff --git a/de.unistuttgart.ims.drama.io.core/src/main/java/de/unistuttgart/quadrama/io/core/CONLLVariant.java b/de.unistuttgart.ims.drama.io.core/src/main/java/de/unistuttgart/quadrama/io/core/CONLLVariant.java index bdbc72e6..652ff00b 100644 --- a/de.unistuttgart.ims.drama.io.core/src/main/java/de/unistuttgart/quadrama/io/core/CONLLVariant.java +++ b/de.unistuttgart.ims.drama.io.core/src/main/java/de/unistuttgart/quadrama/io/core/CONLLVariant.java @@ -4,9 +4,12 @@ import java.util.Collection; import java.util.Collections; import java.util.HashSet; +import java.util.List; import java.util.HashMap; import java.util.Map; import java.util.Set; +import java.util.regex.Pattern; +import java.util.regex.Matcher; import org.apache.commons.csv.CSVPrinter; import org.apache.uima.cas.FeatureStructure; @@ -17,6 +20,8 @@ import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme; import de.unistuttgart.ims.drama.api.Act; import de.unistuttgart.ims.drama.api.Author; import de.unistuttgart.ims.drama.api.CastFigure; @@ -80,6 +85,8 @@ private void convertDirndl(JCas jcas, CSVPrinter p) throws IOException { Utterance.class); Drama drama = JCasUtil.selectSingle(jcas, Drama.class); Set used = new HashSet(); + Pattern numberPattern = Pattern.compile("^.*number=(.+?)(\\|.*$|$)"); + Pattern genderPattern = Pattern.compile("^.*gender=(.+?)(\\|.*$|$)"); for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { Integer tokenId = 0; for (Token token : JCasUtil.selectCovered(Token.class, sentence)) { @@ -98,11 +105,23 @@ private void convertDirndl(JCas jcas, CSVPrinter p) throws IOException { p.print(tokenId); tokenId++; p.print(token.getCoveredText()); // Form - p.print("-"); // Tag + p.print(token.getPos().getPosValue()); // Tag p.print("*"); // CFG - p.print("-"); // Lemma - p.print("-"); // Number - p.print("-"); // Gender + p.print(token.getLemma().getValue()); // Lemma + List morph = JCasUtil.selectCovered(Morpheme.class, token); + String morphTag = morph.get(0).getMorphTag(); + Matcher numberMatcher = numberPattern.matcher(morphTag); + Matcher genderMatcher = genderPattern.matcher(morphTag); + if (numberMatcher.find()) { + p.print(numberMatcher.group(1)); // Number + } else { + p.print("-"); + } + if (genderMatcher.find()) { + p.print(genderMatcher.group(1)); // Gender + } else { + p.print("-"); + } if (speakerList.isEmpty()) { p.print("_stage"); } else { @@ -113,7 +132,7 @@ private void convertDirndl(JCas jcas, CSVPrinter p) throws IOException { p.print("-"); } } - p.print("-"); // NE + p.print(printNE(token)); // NE p.print("-"); // Tobi p.print("-"); // Tone Boundary p.print("-"); // Nucleus @@ -155,6 +174,8 @@ private void convertCONLL(JCas jcas, CSVPrinter p) throws IOException { Map> mentionMap = JCasUtil.indexCovering(jcas, Token.class, Mention.class); Drama drama = JCasUtil.selectSingle(jcas, Drama.class); Set used = new HashSet(); + Pattern numberPattern = Pattern.compile("^.*number=(.+?)(\\|.*$|$)"); + Pattern genderPattern = Pattern.compile("^.*gender=(.+?)(\\|.*$|$)"); for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { Integer tokenId = 0; for (Token token : JCasUtil.selectCovered(Token.class, sentence)) { @@ -168,12 +189,24 @@ private void convertCONLL(JCas jcas, CSVPrinter p) throws IOException { p.print(tokenId); tokenId++; p.print(token.getCoveredText()); // Form - p.print("-"); // Tag + p.print(token.getPos().getPosValue()); // Tag p.print("*"); // CFG - p.print("-"); // Lemma - p.print("-"); // Num - p.print("-"); // Gend - p.print("-"); // NE + p.print(token.getLemma().getValue()); // Lemma + List morph = JCasUtil.selectCovered(Morpheme.class, token); + String morphTag = morph.get(0).getMorphTag(); + Matcher numberMatcher = numberPattern.matcher(morphTag); + Matcher genderMatcher = genderPattern.matcher(morphTag); + if (numberMatcher.find()) { + p.print(numberMatcher.group(1)); // Number + } else { + p.print("-"); + } + if (genderMatcher.find()) { + p.print(genderMatcher.group(1)); // Gender + } else { + p.print("-"); + } + p.print(printNE(token)); // NE String printId = "-"; if (mentionMap.containsKey(token)) { Collection mList = mentionMap.get(token); @@ -246,4 +279,26 @@ private String createBrackets(String printId, Mention m, Token token) { } return printId; } + + /** + * This function creates the format for NE required by the CoNLL format. + */ + private String printNE(Token token) { + String str = null; + List ne = JCasUtil.selectCovered(NamedEntity.class, token); + if (!ne.isEmpty()) { + if (ne.get(0).getBegin() == token.getBegin() && ne.get(0).getEnd() == token.getEnd()) { + str = "(" + ne.get(0).getValue().replace("I-", "") + "*)"; + } else if (ne.get(0).getBegin() == token.getBegin()) { + str = "(" + ne.get(0).getValue().replace("I-", "") + "*"; + } else if (ne.get(0).getEnd() == token.getEnd()) { + str = "*)"; + } else { + str = "*"; + } + } else { + str = "-"; + } + return str; + } } diff --git a/de.unistuttgart.ims.drama.main/pom.xml b/de.unistuttgart.ims.drama.main/pom.xml index 96f59bfa..ef613a3d 100644 --- a/de.unistuttgart.ims.drama.main/pom.xml +++ b/de.unistuttgart.ims.drama.main/pom.xml @@ -41,6 +41,12 @@ de.tudarmstadt.ukp.dkpro.core.api.coref-asl + + de.tudarmstadt.ukp.dkpro.core + + de.tudarmstadt.ukp.dkpro.core.api.ner-asl + + de.tudarmstadt.ukp.dkpro.core diff --git a/de.unistuttgart.ims.drama.main/src/main/java/de/unistuttgart/ims/drama/main/TEI2XMI.java b/de.unistuttgart.ims.drama.main/src/main/java/de/unistuttgart/ims/drama/main/TEI2XMI.java index 6ea5e2ce..ae730355 100644 --- a/de.unistuttgart.ims.drama.main/src/main/java/de/unistuttgart/ims/drama/main/TEI2XMI.java +++ b/de.unistuttgart.ims.drama.main/src/main/java/de/unistuttgart/ims/drama/main/TEI2XMI.java @@ -16,6 +16,7 @@ import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiWriter; import de.tudarmstadt.ukp.dkpro.core.matetools.MateLemmatizer; +import de.tudarmstadt.ukp.dkpro.core.matetools.MateMorphTagger; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizer; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPosTagger; import de.tudarmstadt.ukp.dkpro.core.languagetool.LanguageToolSegmenter; @@ -108,6 +109,7 @@ public static void main(String[] args) throws Exception { } builder.add(createEngineDescription(StanfordPosTagger.class)); builder.add(createEngineDescription(MateLemmatizer.class)); + builder.add(createEngineDescription(MateMorphTagger.class)); if (!options.isSkipNER()) builder.add(createEngineDescription(StanfordNamedEntityRecognizer.class)); builder.add(createEngineDescription(FigureMentionDetection.class));