Skip to content

Commit

Permalink
more consistent collection name for char-freq
Browse files Browse the repository at this point in the history
  • Loading branch information
zachguo committed Apr 27, 2014
1 parent f70f938 commit 49c7f2b
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 9 deletions.
9 changes: 4 additions & 5 deletions text_processing/TLM.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ def run(self, outc):
class RunTLM(object):
"""
Run various computations based on TLM and save the results to MongoDB.
Collections 'date','tf_1','tf_2','tf_3','cf' must exist in mongoDB
Collections 'date','tf_1','tf_2','tf_3','tf_ocr' must exist in mongoDB
before execution.
@param outcollections, a list of names of output collections.
Expand All @@ -289,8 +289,7 @@ class RunTLM(object):
def __init__(self, outcollections):
db, outcs = self.connect_mongo(outcollections)
self.datec = db.date
self.tfcs = [db.tf_1, db.tf_2, db.tf_3]
self.cfc = db.cf
self.tfcs = [db.tf_1, db.tf_2, db.tf_3, db.tf_ocr]
self.outcs = [db[outc] for outc in outcs] if outcs else []


Expand All @@ -306,7 +305,7 @@ def connect_mongo(outcollections):
client = MongoClient('localhost', 27017)
db = client.HTRC
collections = db.collection_names()
musthave = ['date', 'tf_1', 'tf_2', 'tf_3', 'cf']
musthave = ['date', 'tf_1', 'tf_2', 'tf_3', 'tf_ocr']
missing = set(musthave) - set(collections)
if missing:
raise IOError("Collections '%s' doesn't exist in 'HTRC' database. \
Expand Down Expand Up @@ -336,7 +335,7 @@ def run(self, weighted=True):
elif postfix == '3':
model = TLM(self.datec, self.tfcs[2], weighted)
elif postfix == 'ocr':
model = TLM(self.datec, self.cfc, weighted)
model = TLM(self.datec, self.tfcs[-1], weighted)
else:
raise ValueError('Invalid output collection names.')
for outc in outcs:
Expand Down
8 changes: 4 additions & 4 deletions text_processing/importTFDF.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
This module generates term-frequencies, document-frequencies and
character-frequencies from MapReduce TF output, then imports the generated
TF, DF & CF into MongoDB as seven collections:
'tf_1', 'tf_2', 'tf_3', 'df_1', 'df_2', 'df_3', 'cf'
'tf_1', 'tf_2', 'tf_3', 'df_1', 'df_2', 'df_3', 'tf_ocr'
To run:
python importTFDF.py /path/to/tf_aa.txt
Expand All @@ -29,7 +29,7 @@ def import2mongo(filepath):
client = MongoClient('localhost', 27017)
db = client.HTRC
collections = db.collection_names()
for c in ['tf_1', 'tf_2', 'tf_3', 'df_1', 'df_2', 'df_3', 'cf']:
for c in ['tf_1', 'tf_2', 'tf_3', 'df_1', 'df_2', 'df_3', 'tf_ocr']:
if c in collections:
print "Collection %s already exists in 'HTRC' database. Drop it." % c
db.drop_collection(c)
Expand Down Expand Up @@ -58,7 +58,7 @@ def import2mongo(filepath):
db.tf_1.insert(tf_uni)
db.tf_2.insert(tf_bi)
db.tf_3.insert(tf_tri)
db.cf.insert(chardoclist)
db.tf_ocr.insert(chardoclist)
# clear memory & count
tf_uni, tf_bi, tf_tri = [], [], []
chardoclist = []
Expand Down Expand Up @@ -87,7 +87,7 @@ def import2mongo(filepath):
db.tf_1.insert(tf_uni)
db.tf_2.insert(tf_bi)
db.tf_3.insert(tf_tri)
db.cf.insert(chardoclist)
db.tf_ocr.insert(chardoclist)

# save df (document frequencies) to collections ('df_1','df_2','df_3')
db.df_1.insert(reshape(dfdict_uni))
Expand Down

0 comments on commit 49c7f2b

Please sign in to comment.