Skip to content

Commit

Permalink
separate get and import
Browse files Browse the repository at this point in the history
  • Loading branch information
zachguo committed Apr 17, 2014
1 parent 18f60ea commit dc2f868
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 95 deletions.
46 changes: 27 additions & 19 deletions text_processing/getTLM.py → text_processing/TLM.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pandas as pd


EPSILON = 0.00001
DATERANGES = ["pre-1839", "1840-1860", "1861-1876", "1877-1887",
"1888-1895", "1896-1901", "1902-1906", "1907-1910",
"1911-1914", "1915-1918", "1919-1922", "1923-present"]
Expand All @@ -22,8 +23,8 @@ class TLM(object):
"""
Temporal Language Model.
Note it's more like a bag-of-ngrams model than a true generative language
model. So each document and chronon is represented as a bag of ngrams.
Note, it's more like a bag-of-ngrams model than a true generative language
model. Each document and chronon is represented as a bag of ngrams.
@param datec, connection to date collection in HTRC mongo database.
@param tfc, connection to one of 'tf_1', 'tf_2' and 'tf_3' collections in
Expand All @@ -36,7 +37,6 @@ def __init__(self, datec, tfc):
self.rtmatrix = pd.DataFrame()
self.docids = []
self.generate_rtmatrix()
self.smooth_rtmatrix()


def get_rtmatrix(self):
Expand Down Expand Up @@ -101,19 +101,13 @@ def generate_rtmatrix(self):
print "No term frequency for doc %s." % docid

# Convert 2D dictionary into pandas dataframe (named matrix), with a simple
rtmatrix = pd.DataFrame(dr_tf_dict)
rtmatrix = pd.DataFrame(dr_tf_dict).fillna(EPSILON)
# Reorder columns of range * term matrix
rtmatrix = rtmatrix[DATERANGES]
self.set_rtmatrix(rtmatrix)
self.set_docids(reduce(lambda x, y: x+y, dr_docid_dict.values()))


def smooth_rtmatrix(self):
"""
Smooth rtmatrix using Good-Turing Method.
"""
pass


class NLLR(TLM):
"""
Expand Down Expand Up @@ -147,8 +141,7 @@ def compute_llr(rtmatrix):
tfcorpora = rtmatrix.sum(axis=1)
tfcorpora = tfcorpora.div(tfcorpora.sum(axis=0))
# Compute log likelihood ratio
llrmatrix = tfdaterange.div(tfcorpora, axis=0)
llrmatrix = llrmatrix.applymap(log)
llrmatrix = tfdaterange.div(tfcorpora, axis=0).applymap(log)
return llrmatrix.to_dict()


Expand Down Expand Up @@ -194,10 +187,6 @@ def compute_nllr(self, weighted=True):
probs = tfdoc[u"prob"]
nllrdict[docid] = {}
for daterange in DATERANGES:
# note that there's no smoothing for document LM, and I think it's
# not necessary to smooth document LM, because the score of each
# date range were added a same amount of value after smoothing.
# ('for term in probs' means that I simply disgard unseen words)
nllrdict[docid][daterange] = sum([tedict[term] * probs[term] * llrdict[daterange][term] for term in probs])
return nllrdict

Expand All @@ -210,8 +199,27 @@ def run(self):


class CS(TLM):
"""Cosine similarity"""
pass
"""
Cosine similarity
@param datec, connection to date collection in HTRC mongo database.
@param tfc, connection to one of 'tf_1', 'tf_2' and 'tf_3' collections in
HTRC mongo database.
@param csc, connection to one of 'csc_1', 'csc_2' and 'csc_3' collections
to store Cos-Sim results.
"""

def __init__(self, datec, tfc, csc):
TLM.__init__(self, datec, tfc)
self.csc = csc


def compute_cs(self):
"""
Compute cosine similarity between each pair of term & chronon
@return a 2D dictionary of CSs in format {docid:{daterange: .. } .. }
"""



Expand All @@ -223,7 +231,7 @@ class KLD(TLM):
@param tfc, connection to one of 'tf_1', 'tf_2' and 'tf_3' collections in
HTRC mongo database.
@param kldc, connection to one of 'kld_1', 'kld_2' and 'kld_3' collections
to store NLLR results.
to store KL Divergence results.
"""

def __init__(self, datec, tfc, kldc):
Expand Down
47 changes: 0 additions & 47 deletions text_processing/getDateProb.py

This file was deleted.

56 changes: 28 additions & 28 deletions text_processing/getFirstDateInText.py
Original file line number Diff line number Diff line change
@@ -1,61 +1,61 @@
#! /usr/bin/env python

# Created by Siyuan Guo, Mar 2014.
"""
Scan and print first date-in-text.
This script is intended to run locally, to run:
python getFirstDateInText.py path/to/aa > date1st_aa.txt
import re,glob,sys
Siyuan Guo, Mar 2014.
"""

import re, glob, sys
from string import maketrans
from pymongo import MongoClient
from utils import date2daterange

# Pattern matching any single digit character.
DIGITS = re.compile(r'\d')

def has_digit(word):
    """Return True if word contains at least one digit character."""
    return DIGITS.search(word) is not None

# Precompiled pattern for a standalone run of exactly 4 digits
# ('18888' does not match).
SC4D = re.compile(r'(^|\D+)(\d{4})(\D+|$)')
# Translation table greedily fixing letters commonly OCR-misread for digits.
TYPOTABLE = maketrans('lJQOo', '11000')

def get_date(word):
    """Extract a plausible year from a potential date string.

    Returns the year as an int when word contains a standalone 4-digit
    number strictly between 1400 and 2000 (after OCR-typo correction,
    to filter noise such as street addresses); otherwise returns None.
    """
    if not has_digit(word):
        return None
    # greedily fix potential OCR typos before matching
    cleaned = word.translate(TYPOTABLE)
    match = SC4D.search(cleaned)
    if match is None:
        return None
    year = int(match.groups()[1])
    if 1400 < year < 2000:
        return year
    return None

def main(filepath):
client = MongoClient('localhost', 27017)
db = client.HTRC
collections = db.collection_names()
if "date" not in collections:
print "Collection 'date' is required. \
Please run metadata_processing/get_dependent_variable/getDV_HTRC.py first."

# scan first date-in-text
"""Scan and print first date-in-text"""
allfilenames = glob.glob(filepath.rstrip('/')+'/*.txt')
for fn in allfilenames:
fn_short = fn.split('/')[-1]
if fn_short.endswith('.txt'):
doc_id = fn_short.split('.txt')[0]
for fname in allfilenames:
fname_short = fname.split('/')[-1]
if fname_short.endswith('.txt'):
doc_id = fname_short.split('.txt')[0]
seen_date = False
fin = open(fn)
fin = open(fname)
line = fin.readline()
while not seen_date and line:
words = line.strip().split(' ')
while words and not seen_date:
word = words.pop(0)
date = getDate(word)
date = get_date(word)
if date:
seen_date = True
db.date.update({u"_id":unicode(doc_id)},{'$set':{"firstraw":date, "firstrange":date2daterange(date)}})
print "{0}\t{1}".format(doc_id, date)
line = fin.readline()
fin.close()

# Command-line entry point: expects the path of the text corpora folder.
if __name__ == '__main__':
    if len(sys.argv) != 2:
        raise IOError("Please provide the path of text corpora.")
    main(sys.argv[1])
70 changes: 70 additions & 0 deletions text_processing/importDate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#! /usr/bin/env python

"""
generate date frequencies from MapReduce date output
then import generated date freqs into MongoDB as a new field in 'date' collection
To run:
python importDate.py /path/to/date_aa.txt /path/to/date1st_aa.txt
Created by Bin Dai & Siyuan Guo, Mar 2014.
"""

from pymongo import MongoClient
from collections import defaultdict
from utils import date2daterange, freq2prob
import sys

def main(fp_date, fp_date1st):
"""Run"""
# connect to mongoDB and check date collection
client = MongoClient('localhost', 27017)
db = client.HTRC
collections = db.collection_names()
if "date" not in collections:
print "Collection 'date' is required. \
Please run metadata_processing/get_dependent_variable/getDV_HTRC.py first."

# import date related data into MongoDB
import_date(fp_date, db)
import_date1st(fp_date1st, db)

def import_date(fp_date, db):
"""Import date distribution into MongoDB"""
# assume that 'date' collection is small enough to put in memory
with open(fp_date) as fin:
old_key = None # tracking doc_id
tfdict = defaultdict(float) # date term frequency dictionary for each document
for line in fin:
if line:
this_key, date, tf = line.split('\t')
if this_key != old_key and old_key:
# to successfully update, use unicode
db.date.update({u"_id":unicode(old_key)}, {'$set':{"distribution":freq2prob(tfdict)}})
tfdict = defaultdict(float)
old_key = this_key
# update date tf
tfdict[date2daterange(int(date))] += float(tf)
# dont forget last doc
db.date.update({u"_id":unicode(old_key)}, {'$set':{"distribution":freq2prob(tfdict)}})
print "Finish importing date distributions."

def import_date1st(fp_date1st, db):
"""Import 1st-date-in-text into MongoDB"""
with open(fp_date1st) as fin:
for line in fin:
if line:
doc_id, date = line.split('\t')
db.date.update({u"_id":unicode(doc_id)},{'$set':{"firstraw":date, "firstrange":date2daterange(date)}})
print "Finish importing 1st-date-in-texts."

# Command-line entry point: expects the MapReduce date output file and
# the getFirstDateInText.py output file, in that order.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        raise IOError("Please provide \
1) MapReduce date output file and \
2) output file from getFirstDateInText.py. \
(Two arguments should be given)")
    if sys.argv[2].find('1st') < 0:
        raise IOError("Second argument must contain '1st'.")
    main(sys.argv[1], sys.argv[2])
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
'tf_1', 'tf_2', 'tf_3', 'df_1', 'df_2', 'df_3', 'cf'
To run:
python getTF.py /path/to/tf_aa.txt
python importTFDF.py /path/to/tf_aa.txt
Created by Bin Dai, Siyuan Guo, Mar 2014.
"""
Expand Down

0 comments on commit dc2f868

Please sign in to comment.