Data reformat #6

Open · wants to merge 11 commits into master
34 changes: 21 additions & 13 deletions crfsemeval/CRFNER.py
@@ -15,22 +15,30 @@
import pickle

from FeatureExtraction import sent2labels,sent2features
from PhraseEval import phrasesFromTestSenJustExtraction,phrase_extraction_report
from DataExtraction import convertCONLLFormJustExtractionSemEval
from PhraseEval import phrasesFromTestSen,phrasesFromTestSenJustExtraction,phrasesFromTestSenJustExtractionCoNLLBIO,phrase_extraction_report
from DataExtraction import convertCONLLFormJustExtraction

TRAINTOTESTRATIO = 0.75

def main():
    train_sents = convertCONLLFormJustExtractionSemEval("semeval-ner-train.txt")
    test_sents = convertCONLLFormJustExtractionSemEval("semeval-ner-test.txt")
    train_sents = convertCONLLFormJustExtraction("data/forcrf/nolabel/semeval-train-nolabel.txt")
    test_sents = convertCONLLFormJustExtraction("data/forcrf/nolabel/semeval-test-nolabel.txt")

    pprint(train_sents[0])
    pprint(test_sents[0])

    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    pprint(train_sents[0][0])
    pprint(X_train[0][0])
    pprint(y_train[0])

    #pprint(X_train[0][1])
    #pprint(X_train[0][2])

    crf = sklearn_crfsuite.CRF(\
        algorithm='lbfgs',\
        c1=0.1,\
@@ -46,7 +54,7 @@ def main():
    pickle.dump(crf,open("linear-chain-crf.model.pickle","wb"))
    y_pred = crf.predict(X_test)

    # Use this if you need to do grid search on training data for parameter optimization.
    # define fixed parameters and parameters to search
    '''
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
@@ -77,10 +85,8 @@ def main():
    sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
    print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

    # Use this if you want to see how the phrase extraction works. This does NOT produce the ann files.
    '''
    test_sents_phrases=[phrasesFromTestSenJustExtraction(x) for x in test_sents]
    pprint(test_sents_phrases[:10])
    test_sents_phrases=[phrasesFromTestSenJustExtractionCoNLLBIO(x) for x in test_sents]
    print "gold standard phrases for test sentences created"

    test_sents_pls = [] #test sentences with predicted labels
@@ -92,9 +98,11 @@ def main():
            sent.append(nt)
        test_sents_pls.append(sent)

    test_sents_pls_phrases=[phrasesFromTestSenJustExtraction(x) for x in test_sents_pls]
    test_sents_pls_phrases=[phrasesFromTestSenJustExtractionCoNLLBIO(x) for x in test_sents_pls]
    print "predicted phrases for test sentences created"

    #pprint (phrase_classification_report(test_sents_phrases,test_sents_pls_phrases))

    gps = []
    pps = []
    for sent in test_sents_phrases:
48 changes: 0 additions & 48 deletions crfsemeval/ClassifyCRFtoANN.py

This file was deleted.

47 changes: 47 additions & 0 deletions crfsemeval/ClassifyWithCRF.py
@@ -0,0 +1,47 @@
import pickle
import os,sys
from pprint import pprint

from sklearn_crfsuite import metrics

from DataExtraction import convertCONLLFormJustExtractionSemEvalPerfile
from FeatureExtraction import sent2labels,sent2features
from PhraseEval import phrasesFromTestSenJustExtractionWithIndex

fileinLoc = sys.argv[1]
CRFPREDICTIONRESULTSDIR = sys.argv[2]
fileoutLoc = os.path.join(CRFPREDICTIONRESULTSDIR,os.path.split(fileinLoc)[-1].split("-")[0]+"-crfprediction.txt")

crf = pickle.load(open("linear-chain-crf.model.pickle","rb"))
(test_sents,test_sents_indices) = convertCONLLFormJustExtractionSemEvalPerfile(fileinLoc)

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

y_pred = crf.predict(X_test)

labels = list(crf.classes_)
labels.remove('O')

print labels
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

test_sents_pls = [] #test sentences with predicted labels
for index,testsent in enumerate(test_sents):
    sent=[]
    pls = y_pred[index]
    for ((word,pos,chunk,glabel),pl) in zip(testsent,pls):
        nt=(word,pos,chunk,pl)
        sent.append(nt)
    test_sents_pls.append(sent)

with open(fileoutLoc,"w") as f:
    for (sen,senindex) in zip(test_sents_pls,test_sents_indices):
        for ((word,pos,chunk,plabel),index) in zip(sen,senindex):
            f.write("{0} {1} {2} {3} {4}\n".format(word,pos,chunk,plabel,str(index[0])+","+str(index[1])))
        f.write("\n")

print "classified file written at",fileoutLoc


41 changes: 41 additions & 0 deletions crfsemeval/ConvertCoNLLtoANN.py
@@ -0,0 +1,41 @@
import pickle
import os,sys
from pprint import pprint

from DataExtraction import convertCONLLFormJustExtractionSemEvalPerfile
from PhraseEval import phrasesFromTestSenJustExtractionWithIndexCoNLLBIO

def noLabelPhraseExtraction(fileinLoc):
    (sents,sents_indices) = convertCONLLFormJustExtractionSemEvalPerfile(fileinLoc)
    test_sents_pls_phrases=[x for x in [phrasesFromTestSenJustExtractionWithIndexCoNLLBIO(x,y)[-1]['phrases'] for (x,y) in zip(sents,sents_indices)] if x]
    phraseDict = {}
    for sen in test_sents_pls_phrases:
        for (phrase,pis,pie) in sen:
            pti = str(pis)+","+str(pie)
            if pti in phraseDict:
                phraseDict[pti].append((phrase,"KEYPHRASE_NOTYPES"))
            else:
                phraseDict[pti] = [(phrase,"KEYPHRASE_NOTYPES")]
    return phraseDict

def writeFile(fileoutLoc,phraseDict): # a key in phraseDict is the character span of a phrase, e.g. "120,235"; each value is a list of (phrase, phrase type) pairs
    i = 0
    with open(fileoutLoc,"w") as f:
        for tokenloc in phraseDict:
            pis = tokenloc.split(",")[0]
            pie = tokenloc.split(",")[1]
            for (phrase,phrasetype) in phraseDict[tokenloc]:
                f.write("T{0}\t{1} {2} {3}\t{4}\n".format(str(i),phrasetype,pis,pie,phrase))
                i+=1

def main():
    fileInLoc = sys.argv[1]
    PREDICTEDANNDIR = sys.argv[2]
    fileOutLoc = os.path.join(PREDICTEDANNDIR,os.path.split(fileInLoc)[-1].split("-")[0]+".ann")

    pdnolabel = noLabelPhraseExtraction(fileInLoc)
    writeFile(fileOutLoc,pdnolabel)
    print "file written at",fileOutLoc

if __name__ == "__main__":
    main()
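The writeFile format matches brat-style standoff .ann annotation; a hypothetical output line (id, offsets, and phrase illustrative):

T0	KEYPHRASE_NOTYPES 120 143	hidden markov model

where T0 is the annotation id, KEYPHRASE_NOTYPES the phrase type, and 120 143 the character span of the phrase in the source document.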
139 changes: 139 additions & 0 deletions crfsemeval/CreateOneTagandPOSCHunks.py
@@ -0,0 +1,139 @@
import random
from pprint import pprint
import os
import sys
import nltk.data
import nltk.chunk
from nltk import pos_tag
from copy import deepcopy

# needed because the code fails on a word like -LRB
def filterUnderscores(tokens):
    #return [x for x in tokens if "-" not in x[0]]
    return tokens

def changeBeginTags(tokens,p): # expects a list of tokens of the form (word,tags**), for example (word, POS-TAG, Chunk-Tag, Label),
    # and a position, 1, 2, etc. The position determines which labels to change.
    tags = set([x[p] for x in tokens])
    if len(tags) == 1 and list(tags)[0] == "O": #just one tag, O
        return tokens
    else:
        '''
        A token sequence
        [('An', 'DT', u'B-NP', 'O'), ('Experiment', 'NN', u'I-NP', 'O'), ('In', 'IN', u'B-PP', 'O'), ('Semantic', 'NNP', u'B-NP', 'B_D'), ('Tagging', 'NNP', u'I-NP', 'I_D'), ('Using', 'NNP', u'I-NP', 'O'), ('Hidden', 'NNP', u'I-NP', 'B_T'), ('Markov', 'NNP', u'I-NP', 'I_T'), ('Model', 'NNP', u'I-NP', 'I_T'), ('Tagging', 'VBG', u'B-VP', 'B_D')]
        should be changed to:
        [('An', 'DT', u'I-NP', 'O'), ('Experiment', 'NN', u'I-NP', 'O'), ('In', 'IN', u'B-PP', 'O'), ('Semantic', 'NNP', u'I-NP', 'I_D'), ('Tagging', 'NNP', u'I-NP', 'I_D'), ('Using', 'NNP', u'I-NP', 'O'), ('Hidden', 'NNP', u'I-NP', 'I_T'), ('Markov', 'NNP', u'I-NP', 'I_T'), ('Model', 'NNP', u'I-NP', 'I_T'), ('Tagging', 'VBG', u'I-VP', 'I_D')]
        following the CoNLL guideline: http://www.cnts.ua.ac.be/conll2003/ner/
        Only if two phrases of the same type immediately follow each other, the first word of the second phrase will have tag B-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase.
        '''
        bindices = [x for (x,y) in enumerate(tokens) if y[p].startswith("B")]
        ntokens = [list(x) for x in tokens]
        for bindex in bindices:
            if bindex != 0:
                leftTokenLabel = tokens[bindex-1][p]
                thisTokenLabel = tokens[bindex][p]
                if leftTokenLabel == "O" or leftTokenLabel.split("-")[1] != thisTokenLabel.split("-")[1]:
                    ntokens[bindex][p] = thisTokenLabel.replace("B-","I-")
            else:
                ntokens[0][p] = ntokens[0][p].replace("B-","I-")
        return filterUnderscores([tuple(x) for x in ntokens])
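To make the convention concrete, a minimal sketch of changeBeginTags on the chunk column (p=2); the two tokens are illustrative:

# a sentence-initial B-NP, with no same-type phrase to its left, is demoted to I-NP
tokens = [('An', 'DT', 'B-NP', 'O'), ('Experiment', 'NN', 'I-NP', 'O')]
print changeBeginTags(tokens, 2)
# -> [('An', 'DT', 'I-NP', 'O'), ('Experiment', 'NN', 'I-NP', 'O')]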



def randomOneLabelExtractionfromTwoLabels(ts_with_tags_chunks):
    i = 0
    tokens = []
    while i < len(ts_with_tags_chunks):
        t = ts_with_tags_chunks[i]
        i += 1
        tag = t[-2]
        if len(tag) == 4 and tag.startswith("B-"):
            r = random.random()
            chosentag = ""
            if r > 0.5 :
                chosentag = tag.split("-")[1][1]
            else:
                chosentag = tag.split("-")[1][0]
            tag = "B-"+chosentag
            tokens.append((t[0],t[1],t[2],tag,t[4]))
            while i < len(ts_with_tags_chunks) and ts_with_tags_chunks[i][-2].startswith("I-"):
                t = ts_with_tags_chunks[i]
                tag = "I-"+chosentag
                i += 1
                tokens.append((t[0],t[1],t[2],tag,t[4]))
        else:
            tokens.append((t[0],t[1],t[2],tag,t[4]))
    return changeBeginTags(changeBeginTags(tokens,2),3)


def NoLabelExtractionfromTwoLabels(ts_with_tags_chunks):
    tmp = []
    for x in changeBeginTags(changeBeginTags(ts_with_tags_chunks,2),3):
        if x[3] == "O":
            tmp.append(x)
        else:
            tmp.append((x[0],x[1],x[2],x[3][0]+"-KP",x[4]))
    return tmp

def createFile(sens,loc):
    with open(loc,"w") as f:
        for sen in sens:
            for wts in sen:
                f.write((" ".join([x.decode("utf-8") for x in wts])+"\n").encode("utf-8"))
            f.write("\n")


def main():
    loc = sys.argv[1]
    #loc = "malletformatfeaturesmultilabel/train/S0003491615000433__output.txt"

    base = loc.split("_")[0]

    chunker = nltk.data.load("chunkers/conll2000_ub.pickle")

    dT=open(loc).read().split("\n")[:-1]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
    sT1s = [x for x in [dT[sI[i]+1:sI[i+1]] for i in range(len(sI)-1)] if x]
    sensdict = {
        'nolabel-withouttokenindex':[],
        'nolabel-withtokenindex':[],
        'onelabel-withouttokenindex':[],
        'onelabel-withtokenindex':[],
        'multilabel-withouttokenindex':[],
        'multilabel-withtokenindex':[]
    }
    for s in sT1s:
        ts = [(x.split("\t")[0],x.split("\t")[1].split(" ")[0], ",".join(x.split("\t")[1].split(" ")[1:])[:-1]) for x in s] #if the tag contains _, replace with -
        ts_with_pos_tags = [(x[0],y[1],x[1],x[2]) for (x,y) in zip(ts,pos_tag([x[0] for x in ts])) ]

        ts_with_tags_chunks = [(x[0],x[1],y[2],x[2],x[3]) for (x,y) in\
            zip(ts_with_pos_tags,nltk.chunk.tree2conlltags(chunker.parse([(x[0],x[1]) for x in ts_with_pos_tags])))]

        multilabeltsc = changeBeginTags(changeBeginTags(ts_with_tags_chunks,2),3)

        sensdict['multilabel-withtokenindex'].append([(word,pos,chunk,label,index) for (word,pos,chunk,label,index) in multilabeltsc])
        sensdict['multilabel-withouttokenindex'].append([(word,pos,chunk,label) for (word,pos,chunk,label,index) in multilabeltsc])

        onelabeltsc = randomOneLabelExtractionfromTwoLabels(ts_with_tags_chunks)
        sensdict['onelabel-withtokenindex'].append([(word,pos,chunk,label,index) for (word,pos,chunk,label,index) in onelabeltsc])
        sensdict['onelabel-withouttokenindex'].append([(word,pos,chunk,label) for (word,pos,chunk,label,index) in onelabeltsc])

        nolabeltsc = NoLabelExtractionfromTwoLabels(multilabeltsc)
        sensdict['nolabel-withtokenindex'].append([(word,pos,chunk,label,index) for (word,pos,chunk,label,index) in nolabeltsc])
        sensdict['nolabel-withouttokenindex'].append([(word,pos,chunk,label) for (word,pos,chunk,label,index) in nolabeltsc])

    for k in sensdict:
        #print k
        #print "----------------"
        #pprint(sensdict[k][0])
        #print "----------------"
        createFile(sensdict[k],base+"-"+k+".txt")


if __name__ == "__main__":
    main()
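Assuming the commented-out path above is representative, a run looks like:

python CreateOneTagandPOSCHunks.py malletformatfeaturesmultilabel/train/S0003491615000433__output.txt

which takes base to be everything before the first underscore in the path and writes the six variants base-{nolabel,onelabel,multilabel}-{withtokenindex,withouttokenindex}.txt; the nolabel files appear to be the format CRFNER.py trains on.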
19 changes: 15 additions & 4 deletions crfsemeval/DataExtraction.py
@@ -1,5 +1,16 @@
from nltk import pos_tag

def convertCONLLFormJustExtraction(loc):
    dT=open(loc).read().split("\n")[:-2]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
    sT1s = [dT[sI[i]+1:sI[i+1]] for i in range(len(sI)-1)]
    sTs = []
    for s in sT1s:
        xp = [x.split(" ") for x in s]
        ts = [(x[0],x[1],x[2],x[3]) for x in xp]
        sTs.append(ts)
    return sTs

def convertCONLLFormJustExtractionSemEval(loc):
    dT=open(loc).read().split("\n")[:-2]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
@@ -13,16 +24,16 @@ def convertCONLLFormJustExtractionSemEval(loc):
    return sTs

def convertCONLLFormJustExtractionSemEvalPerfile(loc):
    # assumes the file has token indices of the form `x,y` at the end of each line
    dT=open(loc).read().split("\n")[:-2]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
    sT1s = [dT[sI[i]+1:sI[i+1]] for i in range(len(sI)-1)]
    sTs = []
    sTIs = []
    for s in sT1s:
        ts= [(x.split("\t")[0],x.split("\t")[1],x.split("\t")[2]) for x in s]
        tss = [(x[0],y[1],x[1],x[2]) for (x,y) in zip(ts,pos_tag([x[0] for x in ts])) ]
        tokens = [(x,y,z[0]) for (x,y,z,w) in tss]
        tokenindices = [w for (x,y,z,w) in tss]
        xp = [x.split(" ") for x in s]
        tokens = [(word,pos,chunk,label) for (word,pos,chunk,label,ti) in xp]
        tokenindices = [(int(ti.split(",")[0]),int(ti.split(",")[1])) for (word,pos,chunk,label,ti) in xp]
        sTs.append(tokens)
        sTIs.append(tokenindices)
    return (sTs,sTIs)
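For clarity, the per-file reader expects blank-line-separated sentences of five space-separated columns, word POS chunk label start,end; a hypothetical sentence block (words, tags, and offsets illustrative):

Hidden NNP B-NP B-KP 120,126
Markov NNP I-NP I-KP 127,133
model NN I-NP I-KP 134,140

The final column is split on the comma and returned as an (int,int) pair alongside the (word,pos,chunk,label) tuples.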