-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
126 additions
and
95 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,61 +1,61 @@ | ||
#! /usr/bin/env python | ||
|
||
# Created by Siyuan Guo, Mar 2014. | ||
""" | ||
Scan and print first date-in-text. | ||
This script is intended to run locally, to run: | ||
python getFirstDateInText.py path/to/aa > date1st_aa.txt | ||
import re,glob,sys | ||
Siyuan Guo, Mar 2014. | ||
""" | ||
|
||
import re, glob, sys | ||
from string import maketrans | ||
from pymongo import MongoClient | ||
from utils import date2daterange | ||
|
||
digits = re.compile(r'\d') | ||
def hasDigit(word): | ||
return bool(digits.search(word)) | ||
DIGITS = re.compile(r'\d') | ||
def has_digit(word): | ||
"""Check whether word contains digits""" | ||
return bool(DIGITS.search(word)) | ||
|
||
SC4D = re.compile(r'(^|\D+)(\d{4})(\D+|$)') # precompiled pattern for standalone consecutive 4 digits | ||
TYPOTABLE = maketrans('lJQOo','11000') | ||
def getDate(word): | ||
if hasDigit(word): | ||
# precompiled pattern for standalone consecutive 4 digits | ||
SC4D = re.compile(r'(^|\D+)(\d{4})(\D+|$)') | ||
TYPOTABLE = maketrans('lJQOo', '11000') | ||
def get_date(word): | ||
"""Get date from potential date string""" | ||
if has_digit(word): | ||
# greedily fix potential OCR typos | ||
word = word.translate(TYPOTABLE) | ||
# find standalone consecutive 4 digits, '18888' doesn't count | ||
match = SC4D.search(word) | ||
if match: | ||
word = int(match.groups()[1]) | ||
# assume all dates fall between 1400 and 2000 (exclusive), to filter noise like address numbers | ||
if word>1400 and word<2000: | ||
if word > 1400 and word < 2000: | ||
return word | ||
return None | ||
|
||
def main(filepath): | ||
client = MongoClient('localhost', 27017) | ||
db = client.HTRC | ||
collections = db.collection_names() | ||
if "date" not in collections: | ||
print "Collection 'date' is required. \ | ||
Please run metadata_processing/get_dependent_variable/getDV_HTRC.py first." | ||
|
||
# scan first date-in-text | ||
"""Scan and print first date-in-text""" | ||
allfilenames = glob.glob(filepath.rstrip('/')+'/*.txt') | ||
for fn in allfilenames: | ||
fn_short = fn.split('/')[-1] | ||
if fn_short.endswith('.txt'): | ||
doc_id = fn_short.split('.txt')[0] | ||
for fname in allfilenames: | ||
fname_short = fname.split('/')[-1] | ||
if fname_short.endswith('.txt'): | ||
doc_id = fname_short.split('.txt')[0] | ||
seen_date = False | ||
fin = open(fn) | ||
fin = open(fname) | ||
line = fin.readline() | ||
while not seen_date and line: | ||
words = line.strip().split(' ') | ||
while words and not seen_date: | ||
word = words.pop(0) | ||
date = getDate(word) | ||
date = get_date(word) | ||
if date: | ||
seen_date = True | ||
db.date.update({u"_id":unicode(doc_id)},{'$set':{"firstraw":date, "firstrange":date2daterange(date)}}) | ||
print "{0}\t{1}".format(doc_id, date) | ||
line = fin.readline() | ||
fin.close() | ||
|
||
if __name__ == '__main__': | ||
if len(sys.argv) != 2: | ||
print "Please provide the path of text corpora. Yes, that folder cantaining 250k documents." | ||
raise IOError("Please provide the path of text corpora.") | ||
else: | ||
main(sys.argv[1]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#! /usr/bin/env python | ||
|
||
""" | ||
generate date frequencies from MapReduce date output | ||
then import generated date freqs into MongoDB as a new field in 'date' collection | ||
To run: | ||
python importDate.py /path/to/date_aa.txt /path/to/date1st_aa.txt | ||
Created by Bin Dai & Siyuan Guo, Mar 2014. | ||
""" | ||
|
||
from pymongo import MongoClient | ||
from collections import defaultdict | ||
from utils import date2daterange, freq2prob | ||
import sys | ||
|
||
def main(fp_date, fp_date1st):
    """Import date distributions and first dates-in-text into MongoDB.

    Connects to the MongoDB server on localhost:27017, selects the ``HTRC``
    database and, if it already contains a ``date`` collection, runs
    ``import_date`` followed by ``import_date1st`` against it.

    Args:
        fp_date: path of the MapReduce date output file.
        fp_date1st: path of the output file from getFirstDateInText.py.
    """
    client = MongoClient('localhost', 27017)
    db = client.HTRC

    # Both import steps only issue updates against existing documents of the
    # 'date' collection, so without that collection there is nothing useful
    # to do: report the problem and stop instead of carrying on.
    if "date" not in db.collection_names():
        print("Collection 'date' is required. "
              "Please run metadata_processing/get_dependent_variable/"
              "getDV_HTRC.py first.")
        return

    # import date related data into MongoDB
    import_date(fp_date, db)
    import_date1st(fp_date1st, db)
|
||
def import_date(fp_date, db):
    """Import per-document date distributions into MongoDB.

    Reads ``fp_date`` line by line. Every non-blank line must hold three
    tab-separated fields: document id, date and term frequency. Frequencies
    are summed per date range (``date2daterange``) for the current document.
    Each time the document id changes, and once more at the end of the file,
    the sums are converted with ``freq2prob`` and written to the
    ``distribution`` field of the matching document in ``db.date``.

    Only one document's frequency table is held in memory at a time, so all
    lines of a document must be contiguous in the input (presumably
    guaranteed by the MapReduce output ordering -- TODO confirm).

    Args:
        fp_date: path of the tab-separated date frequency file.
        db: database handle exposing a ``date`` collection.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            tab-separated fields, or its date / frequency is not numeric.
    """
    def store(doc_id, tfdict):
        # Original note: the id has to be unicode for the update to succeed.
        db.date.update({u"_id": unicode(doc_id)},
                       {'$set': {"distribution": freq2prob(tfdict)}})

    old_key = None  # doc_id of the document currently being accumulated
    tfdict = defaultdict(float)  # date-range -> summed term frequency
    with open(fp_date) as fin:
        for line in fin:
            # Lines read from a file keep their newline, so they are never
            # empty strings; strip it first so blank lines can be skipped.
            line = line.rstrip('\r\n')
            if not line:
                continue
            this_key, date, tf = line.split('\t')
            if this_key != old_key:
                if old_key is not None:
                    store(old_key, tfdict)
                    tfdict = defaultdict(float)
                old_key = this_key
            # update date tf
            tfdict[date2daterange(int(date))] += float(tf)

    # Don't forget the last document -- but only if the file had any data,
    # otherwise we would update a bogus u'None' id.
    if old_key is not None:
        store(old_key, tfdict)
    print("Finish importing date distributions.")
|
||
def import_date1st(fp_date1st, db):
    """Import the first date-in-text of every document into MongoDB.

    Every non-blank line of ``fp_date1st`` must hold two tab-separated
    fields: document id and date (the format printed by
    getFirstDateInText.py). For each line the matching document in
    ``db.date`` gets ``firstraw`` set to the date as an integer and
    ``firstrange`` set to ``date2daterange`` of that integer.

    Args:
        fp_date1st: path of the tab-separated first-date file.
        db: database handle exposing a ``date`` collection.

    Raises:
        ValueError: if a non-blank line does not have exactly two
            tab-separated fields, or its date is not an integer.
    """
    with open(fp_date1st) as fin:
        for line in fin:
            # Lines read from a file keep their newline, so strip it before
            # testing for blank lines.
            line = line.rstrip('\r\n')
            if not line:
                continue
            doc_id, raw_date = line.split('\t')
            # Convert to int: the raw field is text (it used to carry the
            # trailing newline too), while date2daterange is fed integers
            # everywhere else in this code base.
            date = int(raw_date)
            db.date.update(
                {u"_id": unicode(doc_id)},
                {'$set': {"firstraw": date,
                          "firstrange": date2daterange(date)}})
    print("Finish importing 1st-date-in-texts.")
|
||
if __name__ == '__main__':
    # Command line: importDate.py <MapReduce date output> <date1st file>
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        raise IOError("Please provide "
                      "1) MapReduce date output file and "
                      "2) output file from getFirstDateInText.py. "
                      "(Two arguments should be given)")
    # Cheap sanity check that the two files were not passed in swapped order.
    if '1st' not in cli_args[1]:
        raise IOError("Second argument must contain '1st'.")
    main(cli_args[0], cli_args[1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters