Fixes #45: add cosine similarity features; other minor fixes

Performance is not bad.

Compare BL and LR models:
Precision means: 0.78567200746 0.954206911202
Recall means: 0.734577387486 0.953594950604
F-score means: 0.746754321281 0.953675692143
Precision - Paired t-test: t=-70.4747052618, p=1.17856855978e-13
Recall - Paired t-test: t=-88.6723043477, p=1.49536221115e-14
F-score - Paired t-test: t=-83.3883810466, p=2.5979028457e-14

Compare BL and DT models:
Precision means: 0.78567200746 0.918097581057
Recall means: 0.734577387486 0.917672886937
F-score means: 0.746754321281 0.917723419455
Precision - Paired t-test: t=-50.2089373038, p=2.47462759727e-12
Recall - Paired t-test: t=-67.0952922136, p=1.83271094959e-13
F-score - Paired t-test: t=-62.9563868503, p=3.24700452086e-13

Compare BL and SVM models:
Precision means: 0.78567200746 0.925258744902
Recall means: 0.734577387486 0.921048298573
F-score means: 0.746754321281 0.921956946735
Precision - Paired t-test: t=-66.1487003956, p=2.08222850736e-13
Recall - Paired t-test: t=-92.1417697406, p=1.05895988743e-14
F-score - Paired t-test: t=-87.7745650205, p=1.63862062362e-14

Compare LR and DT models:
Precision means: 0.954206911202 0.918097581057
Recall means: 0.953594950604 0.917672886937
F-score means: 0.953675692143 0.917723419455
Precision - Paired t-test: t=33.803769453, p=8.55550255774e-11
Recall - Paired t-test: t=34.3247724956, p=7.46244372884e-11
F-score - Paired t-test: t=34.1911739665, p=7.72713487038e-11

Compare LR and SVM models:
Precision means: 0.954206911202 0.925258744902
Recall means: 0.953594950604 0.921048298573
F-score means: 0.953675692143 0.921956946735
Precision - Paired t-test: t=22.522948725, p=3.17684023966e-09
Recall - Paired t-test: t=24.3538730377, p=1.58854216054e-09
F-score - Paired t-test: t=23.9657073145, p=1.83202598243e-09

Compare DT and SVM models:
Precision means: 0.918097581057 0.925258744902
Recall means: 0.917672886937 0.921048298573
F-score means: 0.917723419455 0.921956946735
Precision - Paired t-test: t=-5.8377699189, p=0.000247505046138
Recall - Paired t-test: t=-2.87083479, p=0.0184515734061
F-score - Paired t-test: t=-3.56313750104, p=0.00608929416352
zachguo · Apr 20, 2014 · 73ae11b · 73ae11b
1 parent e54c2b8
commit 73ae11b
Show file tree

Hide file tree

Showing 5 changed files with 56 additions and 12 deletions.
diff --git a/classification/compare.py b/classification/compare.py
@@ -14,15 +14,19 @@
 
 def fetch_data():
 	"""Fetch data"""
+	print 'Preparing data for analysis...'
 	datamodel = Data()
 	datamodel.add_date_features()
-	datamodel.add_ocr_features()
+	# datamodel.add_ocr_features()
 	datamodel.add_nllr_features()
 	# datamodel.add_kld_features()
+	datamodel.add_cs_features()
 	data = datamodel.get_data()
-	print data
+	print 'Data is ready: [{0} rows x {1} columns]'.format(*data.shape)
 	return data
 
+READYDATA = fetch_data()
+
 def run(clf):
 	"""
 	Run a classifier.
@@ -31,7 +35,7 @@ def run(clf):
 	"""
 	if not issubclass(clf, Classifier):
 		raise TypeError("Argument should be an instance of Classifier class.")
-	model = clf(fetch_data())
+	model = clf(READYDATA)
 	print "Running %s..." % str(model)
 	model.repeat(100)
 	return model.evaluate()

diff --git a/classification/model.py b/classification/model.py
@@ -102,7 +102,7 @@ def __repr__(self):
 	def fit_and_predict(self, xtrain, ytrain, xtest, ytest):
 		xtest = xtest[[x for x in xtest.columns if x.endswith('-1st')]]
 		true_colname = lambda x: xtest.columns[(x == True).tolist().index(True)][:-4]
-		ypred = (xtest.apply(true_colname, axis=1)).tolist()
+		ypred = xtest.apply(true_colname, axis=1).tolist()
 		return ytest, ypred
 
 class LR(Classifier):

diff --git a/classification/prepare.py b/classification/prepare.py
@@ -121,6 +121,13 @@ def add_kld_features(self):
 		self.add_text_features(['kld_1', 'kld_2', 'kld_3'])
 
 
+	def add_cs_features(self):
+		"""
+		Retrieve and append Cosine-Similarity features
+		"""
+		self.add_text_features(['cs_1', 'cs_2', 'cs_3'])
+
+
 	def add_ocr_features(self):
 		"""
 		Retrieve and append OCR features

diff --git a/text_processing/TLM.py b/text_processing/TLM.py
@@ -8,12 +8,12 @@
 
 from pymongo import MongoClient
 from collections import defaultdict
-from math import log, log10
+from math import log, log10, sqrt
 from utils import reshape
 import pandas as pd
 
 
-EPSILON = 0.00001
+EPSILON = 0.0001
 DATERANGES = ["pre-1839", "1840-1860", "1861-1876", "1877-1887", 
 			  "1888-1895", "1896-1901", "1902-1906", "1907-1910", 
 			  "1911-1914", "1915-1918", "1919-1922", "1923-present"]
@@ -75,6 +75,7 @@ def generate_rtmatrix(self):
 		[281271 rows x 12 columns]
 		"""
 
+		print "Generating term * chronon matrix..."
 		# read all doc IDs for each date range from mongoDB into a dictionary:
 		# {'pre-1839':['loc.ark+=13960=t9h42611g', ...], ...}
 		dr_docid_dict = {}
@@ -98,7 +99,7 @@ def generate_rtmatrix(self):
 					for term in tfdict:
 						dr_tf_dict[daterange][term] += tfdict[term]
 				else:
-					print "No term frequency for doc %s." % docid
+					print "Warning: no term frequency for doc %s." % docid
 
 		# Convert 2D dictionary into pandas dataframe (named matrix), with a simple
 		rtmatrix = pd.DataFrame(dr_tf_dict).fillna(EPSILON)
@@ -176,6 +177,7 @@ def compute_nllr(self, weighted=True):
 		@param weighted, whether or not weighted by temporal entropy.
 		@return a 2D dictionary of NLLRs in format {docid:{daterange: .. } .. }
 		"""
+		print 'Computing TEwNLLR...'
 		nllrdict = {}
 		llrdict = self.compute_llr(self.get_rtmatrix())
 		tedict = self.compute_te(self.get_rtmatrix()) if weighted else {}
@@ -193,8 +195,7 @@ def compute_nllr(self, weighted=True):
 
 	def run(self):
 		"""Run"""
-		nllrdict = self.compute_nllr()
-		self.nllrc.insert(reshape(nllrdict))
+		self.nllrc.insert(reshape(self.compute_nllr()))
 
 
 
@@ -220,6 +221,32 @@ def compute_cs(self):
 
 		@return a 2D dictionary of CSs in format {docid:{daterange: .. } .. }
 		"""
+		print 'Computing Cosine-similarity...'
+		csdict = {}
+		docids = self.get_docids()
+		rtmatrix = self.get_rtmatrix()
+		# Normalize each column from freq to prob: p(w|dr)
+		rtmatrix = rtmatrix.div(rtmatrix.sum(axis=0), axis=1)
+		# Euclidean length (L2 norm) of each chronon's term-probability vector, one entry per chronon
+		rvlength = rtmatrix.applymap(lambda x: x*x).sum(axis=0).apply(sqrt)
+		rvlength = rvlength.to_dict()
+		rtmatrix = rtmatrix.to_dict()
+		for docid in docids:
+			tfdoc = self.tfc.find_one({u"_id":docid})
+			if tfdoc:
+				probs = tfdoc[u"prob"]
+				csdict[docid] = {}
+				# Euclidean length (L2 norm) of this doc's term-probability vector (a single scalar)
+				dvlength = sqrt(sum([x*x for x in probs.values()]))
+				for daterange in DATERANGES:
+					cossim = sum([probs[term] * rtmatrix[daterange][term] for term in probs]) / (dvlength * rvlength[daterange])
+					csdict[docid][daterange] = cossim if cossim >= -1 and cossim <= 1 else 0
+		return csdict
+
+
+	def run(self):
+		"""Run"""
+		self.csc.insert(reshape(self.compute_cs()))
 
 
 
@@ -245,6 +272,7 @@ def compute_kld(self):
 
 		@return a 2D dictionary of KLDs in format {docid:{daterange: .. } .. }
 		"""
+		print 'Computing KL-Divergence...'
 		klddict = {}
 		docids = self.get_docids()
 		rtmatrix = self.get_rtmatrix()
@@ -262,8 +290,7 @@ def compute_kld(self):
 
 	def run(self):
 		"""Run"""
-		klddict = self.compute_kld()
-		self.kldc.insert(reshape(klddict))
+		self.kldc.insert(reshape(self.compute_kld()))
 
 
 
@@ -318,6 +345,11 @@ def run_kld(self):
 		for outc, tfc in zip(self.outcs, self.tfcs):
 			KLD(self.datec, tfc, outc).run()
 
+	def run_cs(self):
+		"""Run CS"""
+		for outc, tfc in zip(self.outcs, self.tfcs):
+			CS(self.datec, tfc, outc).run()
+
 	def run_ocr(self):
 		"""Run OCR (NLLR based on character language model)"""
 		NLLR(self.datec, self.cfc, self.outcs[0]).run()
@@ -327,5 +359,6 @@ def run_ocr(self):
 if __name__ == '__main__':
 	RunTLM(['nllr_1', 'nllr_2', 'nllr_3']).run_nllr()
 	RunTLM(['kld_1', 'kld_2', 'kld_3']).run_kld()
+	RunTLM(['cs_1', 'cs_2', 'cs_3']).run_cs()
 	RunTLM(['nllr_ocr']).run_ocr()
 
diff --git a/text_processing/importDate.py b/text_processing/importDate.py
@@ -55,7 +55,7 @@ def import_date1st(fp_date1st, db):
 		for line in fin:
 			if line:
 				doc_id, date = line.strip('\n').split('\t')
-				db.date.update({u"_id":unicode(doc_id)},{'$set':{"firstraw":date, "firstrange":date2daterange(date)}})
+				db.date.update({u"_id":unicode(doc_id)},{'$set':{"firstraw":date, "firstrange":date2daterange(int(date))}})
 	print "Finish importing 1st-date-in-texts."
 
 if __name__ == '__main__':