diff --git a/de.unistuttgart.ims.drama.io.core/pom.xml b/de.unistuttgart.ims.drama.io.core/pom.xml
index 6ad79e97..57177fc6 100644
--- a/de.unistuttgart.ims.drama.io.core/pom.xml
+++ b/de.unistuttgart.ims.drama.io.core/pom.xml
@@ -49,7 +49,14 @@
${dkpro.version}
-
+
+ de.tudarmstadt.ukp.dkpro.core
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner-asl
+
+ ${dkpro.version}
+
+
de.unistuttgart.ims
uimautil
diff --git a/de.unistuttgart.ims.drama.io.core/src/main/java/de/unistuttgart/quadrama/io/core/CONLLVariant.java b/de.unistuttgart.ims.drama.io.core/src/main/java/de/unistuttgart/quadrama/io/core/CONLLVariant.java
index bdbc72e6..652ff00b 100644
--- a/de.unistuttgart.ims.drama.io.core/src/main/java/de/unistuttgart/quadrama/io/core/CONLLVariant.java
+++ b/de.unistuttgart.ims.drama.io.core/src/main/java/de/unistuttgart/quadrama/io/core/CONLLVariant.java
@@ -4,9 +4,12 @@
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
+import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
import org.apache.commons.csv.CSVPrinter;
import org.apache.uima.cas.FeatureStructure;
@@ -17,6 +20,8 @@
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
+import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
+import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme;
import de.unistuttgart.ims.drama.api.Act;
import de.unistuttgart.ims.drama.api.Author;
import de.unistuttgart.ims.drama.api.CastFigure;
@@ -80,6 +85,8 @@ private void convertDirndl(JCas jcas, CSVPrinter p) throws IOException {
Utterance.class);
Drama drama = JCasUtil.selectSingle(jcas, Drama.class);
Set used = new HashSet();
+ Pattern numberPattern = Pattern.compile("^.*number=(.+?)(\\|.*$|$)");
+ Pattern genderPattern = Pattern.compile("^.*gender=(.+?)(\\|.*$|$)");
for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) {
Integer tokenId = 0;
for (Token token : JCasUtil.selectCovered(Token.class, sentence)) {
@@ -98,11 +105,23 @@ private void convertDirndl(JCas jcas, CSVPrinter p) throws IOException {
p.print(tokenId);
tokenId++;
p.print(token.getCoveredText()); // Form
- p.print("-"); // Tag
+ p.print(token.getPos().getPosValue()); // Tag
p.print("*"); // CFG
- p.print("-"); // Lemma
- p.print("-"); // Number
- p.print("-"); // Gender
+ p.print(token.getLemma().getValue()); // Lemma
+ List morph = JCasUtil.selectCovered(Morpheme.class, token);
+ String morphTag = morph.get(0).getMorphTag();
+ Matcher numberMatcher = numberPattern.matcher(morphTag);
+ Matcher genderMatcher = genderPattern.matcher(morphTag);
+ if (numberMatcher.find()) {
+ p.print(numberMatcher.group(1)); // Number
+ } else {
+ p.print("-");
+ }
+ if (genderMatcher.find()) {
+ p.print(genderMatcher.group(1)); // Gender
+ } else {
+ p.print("-");
+ }
if (speakerList.isEmpty()) {
p.print("_stage");
} else {
@@ -113,7 +132,7 @@ private void convertDirndl(JCas jcas, CSVPrinter p) throws IOException {
p.print("-");
}
}
- p.print("-"); // NE
+ p.print(printNE(token)); // NE
p.print("-"); // Tobi
p.print("-"); // Tone Boundary
p.print("-"); // Nucleus
@@ -155,6 +174,8 @@ private void convertCONLL(JCas jcas, CSVPrinter p) throws IOException {
Map> mentionMap = JCasUtil.indexCovering(jcas, Token.class, Mention.class);
Drama drama = JCasUtil.selectSingle(jcas, Drama.class);
Set used = new HashSet();
+ Pattern numberPattern = Pattern.compile("^.*number=(.+?)(\\|.*$|$)");
+ Pattern genderPattern = Pattern.compile("^.*gender=(.+?)(\\|.*$|$)");
for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) {
Integer tokenId = 0;
for (Token token : JCasUtil.selectCovered(Token.class, sentence)) {
@@ -168,12 +189,24 @@ private void convertCONLL(JCas jcas, CSVPrinter p) throws IOException {
p.print(tokenId);
tokenId++;
p.print(token.getCoveredText()); // Form
- p.print("-"); // Tag
+ p.print(token.getPos().getPosValue()); // Tag
p.print("*"); // CFG
- p.print("-"); // Lemma
- p.print("-"); // Num
- p.print("-"); // Gend
- p.print("-"); // NE
+ p.print(token.getLemma().getValue()); // Lemma
+ List morph = JCasUtil.selectCovered(Morpheme.class, token);
+ String morphTag = morph.get(0).getMorphTag();
+ Matcher numberMatcher = numberPattern.matcher(morphTag);
+ Matcher genderMatcher = genderPattern.matcher(morphTag);
+ if (numberMatcher.find()) {
+ p.print(numberMatcher.group(1)); // Number
+ } else {
+ p.print("-");
+ }
+ if (genderMatcher.find()) {
+ p.print(genderMatcher.group(1)); // Gender
+ } else {
+ p.print("-");
+ }
+ p.print(printNE(token)); // NE
String printId = "-";
if (mentionMap.containsKey(token)) {
Collection mList = mentionMap.get(token);
@@ -246,4 +279,26 @@ private String createBrackets(String printId, Mention m, Token token) {
}
return printId;
}
+
+ /**
+  * Renders the named-entity column of one token in CoNLL bracket notation.
+  * Uses the first NamedEntity covering the token and, with "I-" removed from
+  * its value, returns "(VALUE*)", "(VALUE*", "*)" or "*" depending on whether
+  * the token starts and/or ends that entity; returns "-" if none covers it.
+  */
+ private String printNE(Token token) {
+     // selectCovering, not selectCovered: an entity spanning several tokens is never inside one token
+     List<NamedEntity> covering = JCasUtil.selectCovering(NamedEntity.class, token);
+     if (covering.isEmpty()) {
+         return "-";
+     }
+     NamedEntity entity = covering.get(0);
+     boolean opens = entity.getBegin() == token.getBegin();
+     boolean closes = entity.getEnd() == token.getEnd();
+     if (!opens) {
+         return closes ? "*)" : "*";
+     }
+     String open = "(" + entity.getValue().replace("I-", "") + "*";
+     return closes ? open + ")" : open;
+ }
}
diff --git a/de.unistuttgart.ims.drama.main/pom.xml b/de.unistuttgart.ims.drama.main/pom.xml
index 96f59bfa..ef613a3d 100644
--- a/de.unistuttgart.ims.drama.main/pom.xml
+++ b/de.unistuttgart.ims.drama.main/pom.xml
@@ -41,6 +41,12 @@
de.tudarmstadt.ukp.dkpro.core.api.coref-asl
+
+ de.tudarmstadt.ukp.dkpro.core
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner-asl
+
+
de.tudarmstadt.ukp.dkpro.core
diff --git a/de.unistuttgart.ims.drama.main/src/main/java/de/unistuttgart/ims/drama/main/TEI2XMI.java b/de.unistuttgart.ims.drama.main/src/main/java/de/unistuttgart/ims/drama/main/TEI2XMI.java
index 6ea5e2ce..ae730355 100644
--- a/de.unistuttgart.ims.drama.main/src/main/java/de/unistuttgart/ims/drama/main/TEI2XMI.java
+++ b/de.unistuttgart.ims.drama.main/src/main/java/de/unistuttgart/ims/drama/main/TEI2XMI.java
@@ -16,6 +16,7 @@
import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiWriter;
import de.tudarmstadt.ukp.dkpro.core.matetools.MateLemmatizer;
+import de.tudarmstadt.ukp.dkpro.core.matetools.MateMorphTagger;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizer;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPosTagger;
import de.tudarmstadt.ukp.dkpro.core.languagetool.LanguageToolSegmenter;
@@ -108,6 +109,7 @@ public static void main(String[] args) throws Exception {
}
builder.add(createEngineDescription(StanfordPosTagger.class));
builder.add(createEngineDescription(MateLemmatizer.class));
+ builder.add(createEngineDescription(MateMorphTagger.class));
if (!options.isSkipNER())
builder.add(createEngineDescription(StanfordNamedEntityRecognizer.class));
builder.add(createEngineDescription(FigureMentionDetection.class));