Data reformat #6

Open · wants to merge 11 commits into master
34 changes: 21 additions & 13 deletions crfsemeval/CRFNER.py
@@ -15,22 +15,30 @@
import pickle

from FeatureExtraction import sent2labels,sent2features
from PhraseEval import phrasesFromTestSenJustExtraction,phrase_extraction_report
from DataExtraction import convertCONLLFormJustExtractionSemEval
from PhraseEval import phrasesFromTestSen,phrasesFromTestSenJustExtraction,phrasesFromTestSenJustExtractionCoNLLBIO,phrase_extraction_report
from DataExtraction import convertCONLLFormJustExtraction

TRAINTOTESTRATIO = 0.75

def main():
    train_sents = convertCONLLFormJustExtractionSemEval("semeval-ner-train.txt")
    test_sents = convertCONLLFormJustExtractionSemEval("semeval-ner-test.txt")
    train_sents = convertCONLLFormJustExtraction("data/forcrf/nolabel/semeval-train-nolabel.txt")
    test_sents = convertCONLLFormJustExtraction("data/forcrf/nolabel/semeval-test-nolabel.txt")

    pprint(train_sents[0])
    pprint(test_sents[0])

    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    pprint(train_sents[0][0])
    pprint(X_train[0][0])
    pprint(y_train[0])

    #pprint(X_train[0][1])
    #pprint(X_train[0][2])

    crf = sklearn_crfsuite.CRF(\
        algorithm='lbfgs',\
        c1=0.1,\
@@ -46,7 +54,7 @@ def main():
    pickle.dump(crf,open("linear-chain-crf.model.pickle","wb"))
    y_pred = crf.predict(X_test)

    # Use this if you need to do grid search on training data for parameter optimization.
    # define fixed parameters and parameters to search
    '''
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
@@ -77,10 +85,8 @@ def main():
    sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
    print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

    # Use this if you want to see how the phrase extraction works. This does NOT produce the ann files.
    '''
    test_sents_phrases=[phrasesFromTestSenJustExtraction(x) for x in test_sents]
    pprint(test_sents_phrases[:10])
    test_sents_phrases=[phrasesFromTestSenJustExtractionCoNLLBIO(x) for x in test_sents]
    print "gold standard phrases for test sentences created"

    test_sents_pls = [] #test sentences with predicted labels
@@ -92,9 +98,11 @@ def main():
            sent.append(nt)
        test_sents_pls.append(sent)

    test_sents_pls_phrases=[phrasesFromTestSenJustExtraction(x) for x in test_sents_pls]
    test_sents_pls_phrases=[phrasesFromTestSenJustExtractionCoNLLBIO(x) for x in test_sents_pls]
    print "predicted phrases for test sentences created"

    #pprint (phrase_classification_report(test_sents_phrases,test_sents_pls_phrases))

    gps = []
    pps = []
    for sent in test_sents_phrases:
48 changes: 0 additions & 48 deletions crfsemeval/ClassifyCRFtoANN.py

This file was deleted.

47 changes: 47 additions & 0 deletions crfsemeval/ClassifyWithCRF.py
@@ -0,0 +1,47 @@
import pickle
import os,sys
from pprint import pprint

from sklearn_crfsuite import metrics

from DataExtraction import convertCONLLFormJustExtractionSemEvalPerfile
from FeatureExtraction import sent2labels,sent2features
from PhraseEval import phrasesFromTestSenJustExtractionWithIndex

fileinLoc = sys.argv[1]
CRFPREDICTIONRESULTSDIR = sys.argv[2]
fileoutLoc = os.path.join(CRFPREDICTIONRESULTSDIR,os.path.split(fileinLoc)[-1].split("-")[0]+"-crfprediction.txt")

crf = pickle.load(open("linear-chain-crf.model.pickle","rb"))
(test_sents,test_sents_indices) = convertCONLLFormJustExtractionSemEvalPerfile(fileinLoc)

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

y_pred = crf.predict(X_test)

labels = list(crf.classes_)
labels.remove('O')

print labels
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

test_sents_pls = [] #test sentences with predicted labels
for index,testsent in enumerate(test_sents):
    sent=[]
    pls = y_pred[index]
    for ((word,pos,chunk,glabel),pl) in zip(testsent,pls):
        nt=(word,pos,chunk,pl)
        sent.append(nt)
    test_sents_pls.append(sent)

with open(fileoutLoc,"w") as f:
    for (sen,senindex) in zip(test_sents_pls,test_sents_indices):
        for ((word,pos,chunk,plabel),index) in zip(sen,senindex):
            f.write("{0} {1} {2} {3} {4}\n".format(word,pos,chunk,plabel,str(index[0])+","+str(index[1])))
        f.write("\n")

print "classified file written at",fileoutLoc


41 changes: 41 additions & 0 deletions crfsemeval/ConvertCoNLLtoANN.py
@@ -0,0 +1,41 @@
import pickle
import os,sys
from pprint import pprint

from DataExtraction import convertCONLLFormJustExtractionSemEvalPerfile
from PhraseEval import phrasesFromTestSenJustExtractionWithIndexCoNLLBIO

def noLabelPhraseExtraction(fileinLoc):
    (sents,sents_indices) = convertCONLLFormJustExtractionSemEvalPerfile(fileinLoc)
    test_sents_pls_phrases=[x for x in [phrasesFromTestSenJustExtractionWithIndexCoNLLBIO(x,y)[-1]['phrases'] for (x,y) in zip(sents,sents_indices)] if x]
    phraseDict = {}
    for sen in test_sents_pls_phrases:
        for (phrase,pis,pie) in sen:
            pti = str(pis)+","+str(pie)
            if pti in phraseDict:
                phraseDict[pti].append((phrase,"KEYPHRASE_NOTYPES"))
            else:
                phraseDict[pti] = [(phrase,"KEYPHRASE_NOTYPES")]
    return phraseDict

def writeFile(fileoutLoc,phraseDict): # a key in phraseDict is the character span of a phrase, e.g. "120,235"; each value is a list of (phrase, phrase type) pairs
    i = 0
    with open(fileoutLoc,"w") as f:
        for tokenloc in phraseDict:
            pis = tokenloc.split(",")[0]
            pie = tokenloc.split(",")[1]
            for (phrase,phrasetype) in phraseDict[tokenloc]:
                f.write("T{0}\t{1} {2} {3}\t{4}\n".format(str(i),phrasetype,pis,pie,phrase))
                i+=1

def main():
    fileInLoc = sys.argv[1]
    PREDICTEDANNDIR = sys.argv[2]
    fileOutLoc = os.path.join(PREDICTEDANNDIR,os.path.split(fileInLoc)[-1].split("-")[0]+".ann")

    pdnolabel = noLabelPhraseExtraction(fileInLoc)
    writeFile(fileOutLoc,pdnolabel)
    print "file written at",fileOutLoc

if __name__ == "__main__":
    main()
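The writeFile format matches brat-style standoff .ann annotation; a hypothetical output line (id, offsets, and phrase illustrative):

T0	KEYPHRASE_NOTYPES 120 143	hidden markov model

where T0 is the annotation id, KEYPHRASE_NOTYPES the phrase type, and 120 143 the character span of the phrase in the source document.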
139 changes: 139 additions & 0 deletions crfsemeval/CreateOneTagandPOSCHunks.py
@@ -0,0 +1,139 @@
import random
from pprint import pprint
import os
import sys
import nltk.data
import nltk.chunk
from nltk import pos_tag
from copy import deepcopy

# needed because the code fails on a word like -LRB
def filterUnderscores(tokens):
    #return [x for x in tokens if "-" not in x[0]]
    return tokens

def changeBeginTags(tokens,p): # expects a list of tokens of the form (word,tags**), for example (word, POS-TAG, Chunk-Tag, Label),
    # and a position, 1, 2, etc. The position determines which labels to change.
    tags = set([x[p] for x in tokens])
    if len(tags) == 1 and list(tags)[0] == "O": #just one tag, O
        return tokens
    else:
        '''
        A token sequence
        [('An', 'DT', u'B-NP', 'O'), ('Experiment', 'NN', u'I-NP', 'O'), ('In', 'IN', u'B-PP', 'O'), ('Semantic', 'NNP', u'B-NP', 'B_D'), ('Tagging', 'NNP', u'I-NP', 'I_D'), ('Using', 'NNP', u'I-NP', 'O'), ('Hidden', 'NNP', u'I-NP', 'B_T'), ('Markov', 'NNP', u'I-NP', 'I_T'), ('Model', 'NNP', u'I-NP', 'I_T'), ('Tagging', 'VBG', u'B-VP', 'B_D')]
        should be changed to:
        [('An', 'DT', u'I-NP', 'O'), ('Experiment', 'NN', u'I-NP', 'O'), ('In', 'IN', u'B-PP', 'O'), ('Semantic', 'NNP', u'I-NP', 'I_D'), ('Tagging', 'NNP', u'I-NP', 'I_D'), ('Using', 'NNP', u'I-NP', 'O'), ('Hidden', 'NNP', u'I-NP', 'I_T'), ('Markov', 'NNP', u'I-NP', 'I_T'), ('Model', 'NNP', u'I-NP', 'I_T'), ('Tagging', 'VBG', u'I-VP', 'I_D')]
        following the CoNLL guideline: http://www.cnts.ua.ac.be/conll2003/ner/
        Only if two phrases of the same type immediately follow each other, the first word of the second phrase will have tag B-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase.
        '''
        bindices = [x for (x,y) in enumerate(tokens) if y[p].startswith("B")]
        ntokens = [list(x) for x in tokens]
        for bindex in bindices:
            if bindex != 0:
                leftTokenLabel = tokens[bindex-1][p]
                thisTokenLabel = tokens[bindex][p]
                if leftTokenLabel == "O" or leftTokenLabel.split("-")[1] != thisTokenLabel.split("-")[1]:
                    ntokens[bindex][p] = thisTokenLabel.replace("B-","I-")
            else:
                ntokens[0][p] = ntokens[0][p].replace("B-","I-")
        return filterUnderscores([tuple(x) for x in ntokens])
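To make the convention concrete, a minimal sketch of changeBeginTags on the chunk column (p=2); the two tokens are illustrative:

# a sentence-initial B-NP, with no same-type phrase to its left, is demoted to I-NP
tokens = [('An', 'DT', 'B-NP', 'O'), ('Experiment', 'NN', 'I-NP', 'O')]
print changeBeginTags(tokens, 2)
# -> [('An', 'DT', 'I-NP', 'O'), ('Experiment', 'NN', 'I-NP', 'O')]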



def randomOneLabelExtractionfromTwoLabels(ts_with_tags_chunks):
    i = 0
    tokens = []
    while i < len(ts_with_tags_chunks):
        t = ts_with_tags_chunks[i]
        i += 1
        tag = t[-2]
        if len(tag) == 4 and tag.startswith("B-"):
            r = random.random()
            chosentag = ""
            if r > 0.5 :
                chosentag = tag.split("-")[1][1]
            else:
                chosentag = tag.split("-")[1][0]
            tag = "B-"+chosentag
            tokens.append((t[0],t[1],t[2],tag,t[4]))
            while i < len(ts_with_tags_chunks) and ts_with_tags_chunks[i][-2].startswith("I-"):
                t = ts_with_tags_chunks[i]
                tag = "I-"+chosentag
                i += 1
                tokens.append((t[0],t[1],t[2],tag,t[4]))
        else:
            tokens.append((t[0],t[1],t[2],tag,t[4]))
    return changeBeginTags(changeBeginTags(tokens,2),3)


def NoLabelExtractionfromTwoLabels(ts_with_tags_chunks):
    tmp = []
    for x in changeBeginTags(changeBeginTags(ts_with_tags_chunks,2),3):
        if x[3] == "O":
            tmp.append(x)
        else:
            tmp.append((x[0],x[1],x[2],x[3][0]+"-KP",x[4]))
    return tmp

def createFile(sens,loc):
    with open(loc,"w") as f:
        for sen in sens:
            for wts in sen:
                f.write((" ".join([x.decode("utf-8") for x in wts])+"\n").encode("utf-8"))
            f.write("\n")


def main():
    loc = sys.argv[1]
    #loc = "malletformatfeaturesmultilabel/train/S0003491615000433__output.txt"

    base = loc.split("_")[0]

    chunker = nltk.data.load("chunkers/conll2000_ub.pickle")

    dT=open(loc).read().split("\n")[:-1]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
    sT1s = [x for x in [dT[sI[i]+1:sI[i+1]] for i in range(len(sI)-1)] if x]
    sensdict = {
        'nolabel-withouttokenindex':[],
        'nolabel-withtokenindex':[],
        'onelabel-withouttokenindex':[],
        'onelabel-withtokenindex':[],
        'multilabel-withouttokenindex':[],
        'multilabel-withtokenindex':[]
    }
    for s in sT1s:
        ts = [(x.split("\t")[0],x.split("\t")[1].split(" ")[0], ",".join(x.split("\t")[1].split(" ")[1:])[:-1]) for x in s] #if the tag contains _, replace with -
        ts_with_pos_tags = [(x[0],y[1],x[1],x[2]) for (x,y) in zip(ts,pos_tag([x[0] for x in ts])) ]

        ts_with_tags_chunks = [(x[0],x[1],y[2],x[2],x[3]) for (x,y) in\
            zip(ts_with_pos_tags,nltk.chunk.tree2conlltags(chunker.parse([(x[0],x[1]) for x in ts_with_pos_tags])))]

        multilabeltsc = changeBeginTags(changeBeginTags(ts_with_tags_chunks,2),3)

        sensdict['multilabel-withtokenindex'].append([(word,pos,chunk,label,index) for (word,pos,chunk,label,index) in multilabeltsc])
        sensdict['multilabel-withouttokenindex'].append([(word,pos,chunk,label) for (word,pos,chunk,label,index) in multilabeltsc])

        onelabeltsc = randomOneLabelExtractionfromTwoLabels(ts_with_tags_chunks)
        sensdict['onelabel-withtokenindex'].append([(word,pos,chunk,label,index) for (word,pos,chunk,label,index) in onelabeltsc])
        sensdict['onelabel-withouttokenindex'].append([(word,pos,chunk,label) for (word,pos,chunk,label,index) in onelabeltsc])

        nolabeltsc = NoLabelExtractionfromTwoLabels(multilabeltsc)
        sensdict['nolabel-withtokenindex'].append([(word,pos,chunk,label,index) for (word,pos,chunk,label,index) in nolabeltsc])
        sensdict['nolabel-withouttokenindex'].append([(word,pos,chunk,label) for (word,pos,chunk,label,index) in nolabeltsc])

    for k in sensdict:
        #print k
        #print "----------------"
        #pprint(sensdict[k][0])
        #print "----------------"
        createFile(sensdict[k],base+"-"+k+".txt")


if __name__ == "__main__":
    main()
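Assuming the commented-out path above is representative, a run looks like:

python CreateOneTagandPOSCHunks.py malletformatfeaturesmultilabel/train/S0003491615000433__output.txt

which takes base to be everything before the first underscore in the path and writes the six variants base-{nolabel,onelabel,multilabel}-{withtokenindex,withouttokenindex}.txt; the nolabel files appear to be the format CRFNER.py trains on.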
19 changes: 15 additions & 4 deletions crfsemeval/DataExtraction.py
@@ -1,5 +1,16 @@
from nltk import pos_tag

def convertCONLLFormJustExtraction(loc):
    dT=open(loc).read().split("\n")[:-2]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
    sT1s = [dT[sI[i]+1:sI[i+1]] for i in range(len(sI)-1)]
    sTs = []
    for s in sT1s:
        xp = [x.split(" ") for x in s]
        ts = [(x[0],x[1],x[2],x[3]) for x in xp]
        sTs.append(ts)
    return sTs

def convertCONLLFormJustExtractionSemEval(loc):
    dT=open(loc).read().split("\n")[:-2]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
@@ -13,16 +24,16 @@ def convertCONLLFormJustExtractionSemEval(loc):
    return sTs

def convertCONLLFormJustExtractionSemEvalPerfile(loc):
    # assumes the file has token indices of the form `x,y` at the end of each line
    dT=open(loc).read().split("\n")[:-2]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
    sT1s = [dT[sI[i]+1:sI[i+1]] for i in range(len(sI)-1)]
    sTs = []
    sTIs = []
    for s in sT1s:
        ts= [(x.split("\t")[0],x.split("\t")[1],x.split("\t")[2]) for x in s]
        tss = [(x[0],y[1],x[1],x[2]) for (x,y) in zip(ts,pos_tag([x[0] for x in ts])) ]
        tokens = [(x,y,z[0]) for (x,y,z,w) in tss]
        tokenindices = [w for (x,y,z,w) in tss]
        xp = [x.split(" ") for x in s]
        tokens = [(word,pos,chunk,label) for (word,pos,chunk,label,ti) in xp]
        tokenindices = [(int(ti.split(",")[0]),int(ti.split(",")[1])) for (word,pos,chunk,label,ti) in xp]
        sTs.append(tokens)
        sTIs.append(tokenindices)
    return (sTs,sTIs)
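For clarity, the per-file reader expects blank-line-separated sentences of five space-separated columns, word POS chunk label start,end; a hypothetical sentence block (words, tags, and offsets illustrative):

Hidden NNP B-NP B-KP 120,126
Markov NNP I-NP I-KP 127,133
model NN I-NP I-KP 134,140

The final column is split on the comma and returned as an (int,int) pair alongside the (word,pos,chunk,label) tuples.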