Skip to content

Commit

Permalink
Add linguistic information to CoNLL output
Browse files Browse the repository at this point in the history
Add information about:

- lemmas
- parts-of-speech
- morphological features (number and gender)
- named entities
  • Loading branch information
pagelj committed Jun 8, 2019
1 parent 53d6a84 commit 747dd65
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 11 deletions.
9 changes: 8 additions & 1 deletion de.unistuttgart.ims.drama.io.core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,14 @@
</artifactId>
<version>${dkpro.version}</version>
</dependency>
<dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>
de.tudarmstadt.ukp.dkpro.core.api.ner-asl
</artifactId>
<version>${dkpro.version}</version>
</dependency>
<dependency>
<groupId>de.unistuttgart.ims</groupId>
<artifactId>uimautil</artifactId>
</dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import org.apache.commons.csv.CSVPrinter;
import org.apache.uima.cas.FeatureStructure;
Expand All @@ -17,6 +20,8 @@

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme;
import de.unistuttgart.ims.drama.api.Act;
import de.unistuttgart.ims.drama.api.Author;
import de.unistuttgart.ims.drama.api.CastFigure;
Expand Down Expand Up @@ -80,6 +85,8 @@ private void convertDirndl(JCas jcas, CSVPrinter p) throws IOException {
Utterance.class);
Drama drama = JCasUtil.selectSingle(jcas, Drama.class);
Set<Mention> used = new HashSet<Mention>();
Pattern numberPattern = Pattern.compile("^.*number=(.+?)(\\|.*$|$)");
Pattern genderPattern = Pattern.compile("^.*gender=(.+?)(\\|.*$|$)");
for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) {
Integer tokenId = 0;
for (Token token : JCasUtil.selectCovered(Token.class, sentence)) {
Expand All @@ -98,11 +105,23 @@ private void convertDirndl(JCas jcas, CSVPrinter p) throws IOException {
p.print(tokenId);
tokenId++;
p.print(token.getCoveredText()); // Form
p.print("-"); // Tag
p.print(token.getPos().getPosValue()); // Tag
p.print("*"); // CFG
p.print("-"); // Lemma
p.print("-"); // Number
p.print("-"); // Gender
p.print(token.getLemma().getValue()); // Lemma
List<Morpheme> morph = JCasUtil.selectCovered(Morpheme.class, token);
String morphTag = morph.get(0).getMorphTag();
Matcher numberMatcher = numberPattern.matcher(morphTag);
Matcher genderMatcher = genderPattern.matcher(morphTag);
if (numberMatcher.find()) {
p.print(numberMatcher.group(1)); // Number
} else {
p.print("-");
}
if (genderMatcher.find()) {
p.print(genderMatcher.group(1)); // Gender
} else {
p.print("-");
}
if (speakerList.isEmpty()) {
p.print("_stage");
} else {
Expand All @@ -113,7 +132,7 @@ private void convertDirndl(JCas jcas, CSVPrinter p) throws IOException {
p.print("-");
}
}
p.print("-"); // NE
p.print(printNE(token)); // NE
p.print("-"); // Tobi
p.print("-"); // Tone Boundary
p.print("-"); // Nucleus
Expand Down Expand Up @@ -155,6 +174,8 @@ private void convertCONLL(JCas jcas, CSVPrinter p) throws IOException {
Map<Token, Collection<Mention>> mentionMap = JCasUtil.indexCovering(jcas, Token.class, Mention.class);
Drama drama = JCasUtil.selectSingle(jcas, Drama.class);
Set<Mention> used = new HashSet<Mention>();
Pattern numberPattern = Pattern.compile("^.*number=(.+?)(\\|.*$|$)");
Pattern genderPattern = Pattern.compile("^.*gender=(.+?)(\\|.*$|$)");
for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) {
Integer tokenId = 0;
for (Token token : JCasUtil.selectCovered(Token.class, sentence)) {
Expand All @@ -168,12 +189,24 @@ private void convertCONLL(JCas jcas, CSVPrinter p) throws IOException {
p.print(tokenId);
tokenId++;
p.print(token.getCoveredText()); // Form
p.print("-"); // Tag
p.print(token.getPos().getPosValue()); // Tag
p.print("*"); // CFG
p.print("-"); // Lemma
p.print("-"); // Num
p.print("-"); // Gend
p.print("-"); // NE
p.print(token.getLemma().getValue()); // Lemma
List<Morpheme> morph = JCasUtil.selectCovered(Morpheme.class, token);
String morphTag = morph.get(0).getMorphTag();
Matcher numberMatcher = numberPattern.matcher(morphTag);
Matcher genderMatcher = genderPattern.matcher(morphTag);
if (numberMatcher.find()) {
p.print(numberMatcher.group(1)); // Number
} else {
p.print("-");
}
if (genderMatcher.find()) {
p.print(genderMatcher.group(1)); // Gender
} else {
p.print("-");
}
p.print(printNE(token)); // NE
String printId = "-";
if (mentionMap.containsKey(token)) {
Collection<Mention> mList = mentionMap.get(token);
Expand Down Expand Up @@ -246,4 +279,26 @@ private String createBrackets(String printId, Mention m, Token token) {
}
return printId;
}

/**
 * Builds the named-entity column value for a single token in CoNLL bracket
 * notation.
 *
 * <ul>
 * <li>{@code (LABEL*)} &ndash; the entity starts and ends on this token</li>
 * <li>{@code (LABEL*} &ndash; the entity starts on this token and continues</li>
 * <li>{@code *)} &ndash; the entity ends on this token</li>
 * <li>{@code *} &ndash; the token lies inside an entity</li>
 * <li>{@code -} &ndash; the token is not part of any named entity</li>
 * </ul>
 *
 * Any {@code "I-"} marker is removed from the entity value before it is used
 * as the label. If several entities span the token, only the first one
 * returned by the index is rendered.
 *
 * @param token the token whose named-entity column is to be rendered
 * @return the column value, never {@code null}
 */
private String printNE(Token token) {
    // A named entity usually spans one or more whole tokens, so we need the
    // entities that COVER the token. selectCovered() would only return
    // entities lying inside the token's own span, which never matches a
    // multi-token entity and made the open/close/inside cases unreachable.
    List<NamedEntity> entities = JCasUtil.selectCovering(NamedEntity.class, token);
    if (entities.isEmpty()) {
        return "-";
    }

    NamedEntity entity = entities.get(0);
    String value = entity.getValue();
    if (value == null) {
        // No label to print; treat every token of this entity uniformly as
        // "no entity" so that the bracket structure stays balanced.
        return "-";
    }

    String label = value.replace("I-", "");
    boolean opensHere = entity.getBegin() == token.getBegin();
    boolean closesHere = entity.getEnd() == token.getEnd();

    if (opensHere && closesHere) {
        return "(" + label + "*)";
    }
    if (opensHere) {
        return "(" + label + "*";
    }
    if (closesHere) {
        return "*)";
    }
    return "*";
}
}
6 changes: 6 additions & 0 deletions de.unistuttgart.ims.drama.main/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@
de.tudarmstadt.ukp.dkpro.core.api.coref-asl
</artifactId>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>
de.tudarmstadt.ukp.dkpro.core.api.ner-asl
</artifactId>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiWriter;
import de.tudarmstadt.ukp.dkpro.core.matetools.MateLemmatizer;
import de.tudarmstadt.ukp.dkpro.core.matetools.MateMorphTagger;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizer;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPosTagger;
import de.tudarmstadt.ukp.dkpro.core.languagetool.LanguageToolSegmenter;
Expand Down Expand Up @@ -108,6 +109,7 @@ public static void main(String[] args) throws Exception {
}
builder.add(createEngineDescription(StanfordPosTagger.class));
builder.add(createEngineDescription(MateLemmatizer.class));
builder.add(createEngineDescription(MateMorphTagger.class));
if (!options.isSkipNER())
builder.add(createEngineDescription(StanfordNamedEntityRecognizer.class));
builder.add(createEngineDescription(FigureMentionDetection.class));
Expand Down

0 comments on commit 747dd65

Please sign in to comment.