Skip to content

Commit

Permalink
Fixes #45: add cosine similarity; other minor fixes
Browse files Browse the repository at this point in the history
Performance is not bad.

Compare BL and LR models:

Precision means:  0.78567200746 0.954206911202
Recall    means:  0.734577387486 0.953594950604
F-scores  means:  0.746754321281 0.953675692143

Precision - Paired t-test: t=-70.4747052618, p=1.17856855978e-13
Recall    - Paired t-test: t=-88.6723043477, p=1.49536221115e-14
F-score   - Paired t-test: t=-83.3883810466, p=2.5979028457e-14

Compare BL and DT models:

Precision means:  0.78567200746 0.918097581057
Recall    means:  0.734577387486 0.917672886937
F-scores  means:  0.746754321281 0.917723419455

Precision - Paired t-test: t=-50.2089373038, p=2.47462759727e-12
Recall    - Paired t-test: t=-67.0952922136, p=1.83271094959e-13
F-score   - Paired t-test: t=-62.9563868503, p=3.24700452086e-13

Compare BL and SVM models:

Precision means:  0.78567200746 0.925258744902
Recall    means:  0.734577387486 0.921048298573
F-scores  means:  0.746754321281 0.921956946735

Precision - Paired t-test: t=-66.1487003956, p=2.08222850736e-13
Recall    - Paired t-test: t=-92.1417697406, p=1.05895988743e-14
F-score   - Paired t-test: t=-87.7745650205, p=1.63862062362e-14

Compare LR and DT models:

Precision means:  0.954206911202 0.918097581057
Recall    means:  0.953594950604 0.917672886937
F-scores  means:  0.953675692143 0.917723419455

Precision - Paired t-test: t=33.803769453, p=8.55550255774e-11
Recall    - Paired t-test: t=34.3247724956, p=7.46244372884e-11
F-score   - Paired t-test: t=34.1911739665, p=7.72713487038e-11

Compare LR and SVM models:

Precision means:  0.954206911202 0.925258744902
Recall    means:  0.953594950604 0.921048298573
F-scores  means:  0.953675692143 0.921956946735

Precision - Paired t-test: t=22.522948725, p=3.17684023966e-09
Recall    - Paired t-test: t=24.3538730377, p=1.58854216054e-09
F-score   - Paired t-test: t=23.9657073145, p=1.83202598243e-09

Compare DT and SVM models:

Precision means:  0.918097581057 0.925258744902
Recall    means:  0.917672886937 0.921048298573
F-scores  means:  0.917723419455 0.921956946735

Precision - Paired t-test: t=-5.8377699189, p=0.000247505046138
Recall    - Paired t-test: t=-2.87083479, p=0.0184515734061
F-score   - Paired t-test: t=-3.56313750104, p=0.00608929416352
  • Loading branch information
zachguo committed Apr 20, 2014
1 parent e54c2b8 commit 73ae11b
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 12 deletions.
10 changes: 7 additions & 3 deletions classification/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,19 @@

def fetch_data():
"""
Build and return the feature table handed to the classifiers.

Creates a ``Data`` model, appends the feature groups enabled below, and
returns the result of ``get_data()``. Progress messages are printed
along the way.

@return whatever ``Data.get_data()`` yields; it must expose a ``shape``
with at least two entries, since both are formatted into the final
message (presumably a pandas DataFrame - TODO confirm).
"""
print 'Preparing data for analysis...'
datamodel = Data()
datamodel.add_date_features()
datamodel.add_ocr_features()
# datamodel.add_ocr_features()
datamodel.add_nllr_features()
# datamodel.add_kld_features()
datamodel.add_cs_features()
data = datamodel.get_data()
print data
# shape is unpacked into the two placeholders: rows first, then columns.
print 'Data is ready: [{0} rows x {1} columns]'.format(*data.shape)
return data

# Built once when this module is loaded and kept at module level.
READYDATA = fetch_data()

def run(clf):
"""
Run a classifier.
Expand All @@ -31,7 +35,7 @@ def run(clf):
"""
if not issubclass(clf, Classifier):
raise TypeError("Argument should be an instance of Classifier class.")
model = clf(fetch_data())
model = clf(READYDATA)
print "Running %s..." % str(model)
model.repeat(100)
return model.evaluate()
Expand Down
2 changes: 1 addition & 1 deletion classification/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def __repr__(self):
def fit_and_predict(self, xtrain, ytrain, xtest, ytest):
"""
Predict one label per test row from its '-1st' indicator columns.

Nothing is fitted here: ``xtrain`` and ``ytrain`` are not used.

@param xtest, table whose columns ending in '-1st' are compared against
True (presumably a pandas DataFrame - TODO confirm).
@param ytest, true labels, returned unchanged.
@return a tuple (ytest, ypred), where ypred is a list holding one
predicted label per row of xtest.
"""
# Keep only the columns whose name ends with '-1st'.
xtest = xtest[[x for x in xtest.columns if x.endswith('-1st')]]
# For one row: find the first column equal to True and drop the trailing
# 4 characters (the '-1st' suffix) from its name. list.index raises
# ValueError when no column in the row is True.
true_colname = lambda x: xtest.columns[(x == True).tolist().index(True)][:-4]
ypred = (xtest.apply(true_colname, axis=1)).tolist()
ypred = xtest.apply(true_colname, axis=1).tolist()
return ytest, ypred

class LR(Classifier):
Expand Down
7 changes: 7 additions & 0 deletions classification/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,13 @@ def add_kld_features(self):
self.add_text_features(['kld_1', 'kld_2', 'kld_3'])


def add_cs_features(self):
    """
    Retrieve and append the Cosine-Similarity (CS) features.

    Delegates to ``add_text_features`` with the three CS names
    'cs_1', 'cs_2' and 'cs_3'.
    """
    cs_names = ['cs_1', 'cs_2', 'cs_3']
    self.add_text_features(cs_names)


def add_ocr_features(self):
"""
Retrieve and append OCR features
Expand Down
47 changes: 40 additions & 7 deletions text_processing/TLM.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@

from pymongo import MongoClient
from collections import defaultdict
from math import log, log10
from math import log, log10, sqrt
from utils import reshape
import pandas as pd


EPSILON = 0.00001
EPSILON = 0.0001
DATERANGES = ["pre-1839", "1840-1860", "1861-1876", "1877-1887",
"1888-1895", "1896-1901", "1902-1906", "1907-1910",
"1911-1914", "1915-1918", "1919-1922", "1923-present"]
Expand Down Expand Up @@ -75,6 +75,7 @@ def generate_rtmatrix(self):
[281271 rows x 12 columns]
"""

print "Generating term * chronon matrix..."
# read all doc IDs for each date range from mongoDB into a dictionary:
# {'pre-1839':['loc.ark+=13960=t9h42611g', ...], ...}
dr_docid_dict = {}
Expand All @@ -98,7 +99,7 @@ def generate_rtmatrix(self):
for term in tfdict:
dr_tf_dict[daterange][term] += tfdict[term]
else:
print "No term frequency for doc %s." % docid
print "Warning: no term frequency for doc %s." % docid

# Convert 2D dictionary into pandas dataframe (named matrix), with a simple
rtmatrix = pd.DataFrame(dr_tf_dict).fillna(EPSILON)
Expand Down Expand Up @@ -176,6 +177,7 @@ def compute_nllr(self, weighted=True):
@param weighted, whether or not weighted by temporal entropy.
@return a 2D dictionary of NLLRs in format {docid:{daterange: .. } .. }
"""
print 'Computing TEwNLLR...'
nllrdict = {}
llrdict = self.compute_llr(self.get_rtmatrix())
tedict = self.compute_te(self.get_rtmatrix()) if weighted else {}
Expand All @@ -193,8 +195,7 @@ def compute_nllr(self, weighted=True):

def run(self):
"""Run"""
nllrdict = self.compute_nllr()
self.nllrc.insert(reshape(nllrdict))
self.nllrc.insert(reshape(self.compute_nllr()))



Expand All @@ -220,6 +221,32 @@ def compute_cs(self):
@return a 2D dictionary of CSs in format {docid:{daterange: .. } .. }
"""
print 'Computing Cosine-similarity...'
csdict = {}
docids = self.get_docids()
rtmatrix = self.get_rtmatrix()
# Normalize each column from freq to prob: p(w|dr)
rtmatrix = rtmatrix.div(rtmatrix.sum(axis=0), axis=1)
# a vector of which each cell is the vector length for a chronon
rvlength = rtmatrix.applymap(lambda x: x*x).sum(axis=0).apply(sqrt)
rvlength = rvlength.to_dict()
rtmatrix = rtmatrix.to_dict()
for docid in docids:
tfdoc = self.tfc.find_one({u"_id":docid})
if tfdoc:
probs = tfdoc[u"prob"]
csdict[docid] = {}
# a vector of which each cell is the vector length for a doc
dvlength = sqrt(sum([x*x for x in probs.values()]))
for daterange in DATERANGES:
cossim = sum([probs[term] * rtmatrix[daterange][term] for term in probs]) / (dvlength * rvlength[daterange])
csdict[docid][daterange] = cossim if cossim >= -1 and cossim <= 1 else 0
return csdict


def run(self):
    """
    Compute the cosine similarities and store them.

    The dictionary returned by ``compute_cs`` is passed through
    ``reshape`` and the result is handed to ``self.csc.insert``.
    """
    # Resolve the insert method first, matching the original evaluation order.
    store = self.csc.insert
    similarities = self.compute_cs()
    store(reshape(similarities))



Expand All @@ -245,6 +272,7 @@ def compute_kld(self):
@return a 2D dictionary of KLDs in format {docid:{daterange: .. } .. }
"""
print 'Computing KL-Divergence...'
klddict = {}
docids = self.get_docids()
rtmatrix = self.get_rtmatrix()
Expand All @@ -262,8 +290,7 @@ def compute_kld(self):

def run(self):
"""Run"""
klddict = self.compute_kld()
self.kldc.insert(reshape(klddict))
self.kldc.insert(reshape(self.compute_kld()))



Expand Down Expand Up @@ -318,6 +345,11 @@ def run_kld(self):
for outc, tfc in zip(self.outcs, self.tfcs):
KLD(self.datec, tfc, outc).run()

def run_cs(self):
    """
    Run CS once for every (outc, tfc) pair.

    ``self.outcs`` and ``self.tfcs`` are walked in lockstep; each pair,
    together with ``self.datec``, builds a ``CS`` job that is run at once.
    """
    pairs = zip(self.outcs, self.tfcs)
    for out_target, tf_source in pairs:
        job = CS(self.datec, tf_source, out_target)
        job.run()

def run_ocr(self):
"""Run OCR (NLLR based on character language model)"""
NLLR(self.datec, self.cfc, self.outcs[0]).run()
Expand All @@ -327,5 +359,6 @@ def run_ocr(self):
# Script entry point: run the NLLR, KLD, CS and OCR jobs in that order.
# Each RunTLM gets its own list of names (presumably the output
# collection names - TODO confirm against RunTLM.__init__).
if __name__ == '__main__':
RunTLM(['nllr_1', 'nllr_2', 'nllr_3']).run_nllr()
RunTLM(['kld_1', 'kld_2', 'kld_3']).run_kld()
RunTLM(['cs_1', 'cs_2', 'cs_3']).run_cs()
RunTLM(['nllr_ocr']).run_ocr()

2 changes: 1 addition & 1 deletion text_processing/importDate.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def import_date1st(fp_date1st, db):
for line in fin:
if line:
doc_id, date = line.strip('\n').split('\t')
db.date.update({u"_id":unicode(doc_id)},{'$set':{"firstraw":date, "firstrange":date2daterange(date)}})
db.date.update({u"_id":unicode(doc_id)},{'$set':{"firstraw":date, "firstrange":date2daterange(int(date))}})
print "Finish importing 1st-date-in-texts."

if __name__ == '__main__':
Expand Down

0 comments on commit 73ae11b

Please sign in to comment.