Skip to content

Commit

Permalink
separate get and import
Browse files Browse the repository at this point in the history
  • Loading branch information
zachguo committed Apr 17, 2014
1 parent 18f60ea commit dc2f868
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 95 deletions.
46 changes: 27 additions & 19 deletions text_processing/getTLM.py → text_processing/TLM.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pandas as pd


EPSILON = 0.00001
DATERANGES = ["pre-1839", "1840-1860", "1861-1876", "1877-1887",
"1888-1895", "1896-1901", "1902-1906", "1907-1910",
"1911-1914", "1915-1918", "1919-1922", "1923-present"]
Expand All @@ -22,8 +23,8 @@ class TLM(object):
"""
Temporal Language Model.
Note it's more like a bag-of-ngrams model than a true generative language
model. So each document and chronon is represented as a bag of ngrams.
Note, it's more like a bag-of-ngrams model than a true generative language
model. Each document and chronon is represented as a bag of ngrams.
@param datec, connection to date collection in HTRC mongo database.
@param tfc, connection to one of 'tf_1', 'tf_2' and 'tf_3' collections in
Expand All @@ -36,7 +37,6 @@ def __init__(self, datec, tfc):
self.rtmatrix = pd.DataFrame()
self.docids = []
self.generate_rtmatrix()
self.smooth_rtmatrix()


def get_rtmatrix(self):
Expand Down Expand Up @@ -101,19 +101,13 @@ def generate_rtmatrix(self):
print "No term frequency for doc %s." % docid

# Convert 2D dictionary into pandas dataframe (named matrix), with a simple
rtmatrix = pd.DataFrame(dr_tf_dict)
rtmatrix = pd.DataFrame(dr_tf_dict).fillna(EPSILON)
# Reorder columns of range * term matrix
rtmatrix = rtmatrix[DATERANGES]
self.set_rtmatrix(rtmatrix)
self.set_docids(reduce(lambda x, y: x+y, dr_docid_dict.values()))


def smooth_rtmatrix(self):
"""
Smooth rtmatrix using Good-Turing Method.
"""
pass


class NLLR(TLM):
"""
Expand Down Expand Up @@ -147,8 +141,7 @@ def compute_llr(rtmatrix):
tfcorpora = rtmatrix.sum(axis=1)
tfcorpora = tfcorpora.div(tfcorpora.sum(axis=0))
# Compute log likelihood ratio
llrmatrix = tfdaterange.div(tfcorpora, axis=0)
llrmatrix = llrmatrix.applymap(log)
llrmatrix = tfdaterange.div(tfcorpora, axis=0).applymap(log)
return llrmatrix.to_dict()


Expand Down Expand Up @@ -194,10 +187,6 @@ def compute_nllr(self, weighted=True):
probs = tfdoc[u"prob"]
nllrdict[docid] = {}
for daterange in DATERANGES:
# note that there's no smoothing for document LM, and I think it's
# not necessary to smooth document LM, because the score of each
# date range were added a same amount of value after smoothing.
# ('for term in probs' means that I simply disgard unseen words)
nllrdict[docid][daterange] = sum([tedict[term] * probs[term] * llrdict[daterange][term] for term in probs])
return nllrdict

Expand All @@ -210,8 +199,27 @@ def run(self):


class CS(TLM):
"""Cosine similarity"""
pass
"""
Cosine similarity
@param datec, connection to date collection in HTRC mongo database.
@param tfc, connection to one of 'tf_1', 'tf_2' and 'tf_3' collections in
HTRC mongo database.
@param csc, connection to one of 'csc_1', 'csc_2' and 'csc_3' collections
to store Cos-Sim results.
"""

def __init__(self, datec, tfc, csc):
TLM.__init__(self, datec, tfc)
self.csc = csc


def compute_cs(self):
"""
Compute cosine similarity between each pair of term & chronon
@return a 2D dictionary of CSs in format {docid:{daterange: .. } .. }
"""



Expand All @@ -223,7 +231,7 @@ class KLD(TLM):
@param tfc, connection to one of 'tf_1', 'tf_2' and 'tf_3' collections in
HTRC mongo database.
@param kldc, connection to one of 'kld_1', 'kld_2' and 'kld_3' collections
to store NLLR results.
to store KL Divergence results.
"""

def __init__(self, datec, tfc, kldc):
Expand Down
47 changes: 0 additions & 47 deletions text_processing/getDateProb.py

This file was deleted.

56 changes: 28 additions & 28 deletions text_processing/getFirstDateInText.py
Original file line number Diff line number Diff line change
@@ -1,61 +1,61 @@
#! /usr/bin/env python

# Created by Siyuan Guo, Mar 2014.
"""
Scan and print first date-in-text.
This script is intended to run locally, to run:
python getFirstDateInText.py path/to/aa > date1st_aa.txt
import re,glob,sys
Siyuan Guo, Mar 2014.
"""

import re, glob, sys
from string import maketrans
from pymongo import MongoClient
from utils import date2daterange

# Pattern matching any single digit character.
DIGITS = re.compile(r'\d')

def has_digit(word):
    """Return True if word contains at least one digit character."""
    return DIGITS.search(word) is not None

# Precompiled pattern for a standalone run of exactly 4 digits
# ('18888' does not match).
SC4D = re.compile(r'(^|\D+)(\d{4})(\D+|$)')
# Translation table greedily fixing letters commonly OCR-misread for digits.
TYPOTABLE = maketrans('lJQOo', '11000')

def get_date(word):
    """Extract a plausible year from a potential date string.

    Returns the year as an int when word contains a standalone 4-digit
    number strictly between 1400 and 2000 (after OCR-typo correction,
    to filter noise such as street addresses); otherwise returns None.
    """
    if not has_digit(word):
        return None
    # greedily fix potential OCR typos before matching
    cleaned = word.translate(TYPOTABLE)
    match = SC4D.search(cleaned)
    if match is None:
        return None
    year = int(match.groups()[1])
    if 1400 < year < 2000:
        return year
    return None

def main(filepath):
client = MongoClient('localhost', 27017)
db = client.HTRC
collections = db.collection_names()
if "date" not in collections:
print "Collection 'date' is required. \
Please run metadata_processing/get_dependent_variable/getDV_HTRC.py first."

# scan first date-in-text
"""Scan and print first date-in-text"""
allfilenames = glob.glob(filepath.rstrip('/')+'/*.txt')
for fn in allfilenames:
fn_short = fn.split('/')[-1]
if fn_short.endswith('.txt'):
doc_id = fn_short.split('.txt')[0]
for fname in allfilenames:
fname_short = fname.split('/')[-1]
if fname_short.endswith('.txt'):
doc_id = fname_short.split('.txt')[0]
seen_date = False
fin = open(fn)
fin = open(fname)
line = fin.readline()
while not seen_date and line:
words = line.strip().split(' ')
while words and not seen_date:
word = words.pop(0)
date = getDate(word)
date = get_date(word)
if date:
seen_date = True
db.date.update({u"_id":unicode(doc_id)},{'$set':{"firstraw":date, "firstrange":date2daterange(date)}})
print "{0}\t{1}".format(doc_id, date)
line = fin.readline()
fin.close()

# Command-line entry point: expects the path of the text corpora folder.
if __name__ == '__main__':
    if len(sys.argv) != 2:
        raise IOError("Please provide the path of text corpora.")
    main(sys.argv[1])
70 changes: 70 additions & 0 deletions text_processing/importDate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#! /usr/bin/env python

"""
generate date frequencies from MapReduce date output
then import generated date freqs into MongoDB as a new field in 'date' collection
To run:
python importDate.py /path/to/date_aa.txt /path/to/date1st_aa.txt
Created by Bin Dai & Siyuan Guo, Mar 2014.
"""

from pymongo import MongoClient
from collections import defaultdict
from utils import date2daterange, freq2prob
import sys

def main(fp_date, fp_date1st):
"""Run"""
# connect to mongoDB and check date collection
client = MongoClient('localhost', 27017)
db = client.HTRC
collections = db.collection_names()
if "date" not in collections:
print "Collection 'date' is required. \
Please run metadata_processing/get_dependent_variable/getDV_HTRC.py first."

# import date related data into MongoDB
import_date(fp_date, db)
import_date1st(fp_date1st, db)

def import_date(fp_date, db):
"""Import date distribution into MongoDB"""
# assume that 'date' collection is small enough to put in memory
with open(fp_date) as fin:
old_key = None # tracking doc_id
tfdict = defaultdict(float) # date term frequency dictionary for each document
for line in fin:
if line:
this_key, date, tf = line.split('\t')
if this_key != old_key and old_key:
# to successfully update, use unicode
db.date.update({u"_id":unicode(old_key)}, {'$set':{"distribution":freq2prob(tfdict)}})
tfdict = defaultdict(float)
old_key = this_key
# update date tf
tfdict[date2daterange(int(date))] += float(tf)
# dont forget last doc
db.date.update({u"_id":unicode(old_key)}, {'$set':{"distribution":freq2prob(tfdict)}})
print "Finish importing date distributions."

def import_date1st(fp_date1st, db):
"""Import 1st-date-in-text into MongoDB"""
with open(fp_date1st) as fin:
for line in fin:
if line:
doc_id, date = line.split('\t')
db.date.update({u"_id":unicode(doc_id)},{'$set':{"firstraw":date, "firstrange":date2daterange(date)}})
print "Finish importing 1st-date-in-texts."

# Command-line entry point: expects the MapReduce date output file and
# the getFirstDateInText.py output file, in that order.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        raise IOError("Please provide \
1) MapReduce date output file and \
2) output file from getFirstDateInText.py. \
(Two arguments should be given)")
    if sys.argv[2].find('1st') < 0:
        raise IOError("Second argument must contain '1st'.")
    main(sys.argv[1], sys.argv[2])
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
'tf_1', 'tf_2', 'tf_3', 'df_1', 'df_2', 'df_3', 'cf'
To run:
python getTF.py /path/to/tf_aa.txt
python importTFDF.py /path/to/tf_aa.txt
Created by Bin Dai, Siyuan Guo, Mar 2014.
"""
Expand Down

0 comments on commit dc2f868

Please sign in to comment.