Commit

switched to c implementation

ewulczyn committed Sep 29, 2016
1 parent 3d2c99d commit dc339f4
Showing 4 changed files with 193 additions and 49 deletions.
133 changes: 133 additions & 0 deletions src/Wikipedia Navigation Vectors - Getting Started.ipynb
@@ -518,6 +518,139 @@
"#### Link Recommendations\n",
"If articles are frequently read within the same session, you might be able to make Wikipedia easier to navigate if you were to create a link between them. For a given article you could generate recommendations for links to add by finding the nearest neighbors that are not already linked and adding a link if the original article has a suitable anchor text. Again, the Wikidata embedding would allow you to build a model for all languages."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('Deborah_Orr', 0.91539459281436186),\n",
" ('Jonathan_Self', 0.91529695359415975),\n",
" ('Umbrella_(novel)', 0.88568083130588349),\n",
" ('Shark_(novel)', 0.88144038836349925),\n",
" ('Peter_Self', 0.88066891173737261),\n",
" ('Cock_and_Bull', 0.8700406005395005),\n",
" ('Dorian,_an_Imitation', 0.85718259988297951),\n",
" ('Jonathan_Coe', 0.8134598307405887),\n",
" ('What_a_Carve_Up!_(novel)', 0.80775135057346681),\n",
" ('The_Book_of_Dave', 0.79721259734964156)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"en_embedding.most_similar('Will_Self')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('Kissing_traditions', 0.72906072754159257),\n",
" ('French_kiss', 0.72400639405281908),\n",
" ('Kissing,_Bavaria', 0.68524468918114323),\n",
" ('Air_kiss', 0.66284194730918222),\n",
" ('Hug', 0.6409782198490559),\n",
" ('Kiss_(band)', 0.62808919174690359),\n",
" ('Foreplay', 0.62006375083990484),\n",
" ('Cheek_kissing', 0.5881288871976873),\n",
" ('Making_out', 0.58465149766396551),\n",
" ('Post_Office_(game)', 0.5804142777106831)]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"en_embedding.most_similar('Kiss')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('Phyllis_Newman', 0.87788281050268258),\n",
" ('Adolph_Green', 0.86793390643667045),\n",
" ('Bambi_Linn', 0.83127314845549538),\n",
" ('List_of_Broadway_musicals_stars', 0.82739932678976524),\n",
" ('Plain_and_Fancy', 0.823583427500724),\n",
" ('Betty_Comden', 0.82328068158088841),\n",
" ('Bells_Are_Ringing_(musical)', 0.82092651923716164),\n",
" ('Marin_Mazzie', 0.82055069146017445),\n",
" ('I_Love_My_Wife', 0.8199105465494283),\n",
" ('The_Most_Happy_Fella', 0.81986287660477519)]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"en_embedding.most_similar('Amanda_Green')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('Lucknow', 0.729475983488725),\n",
" ('List_of_districts_of_Uttar_Pradesh', 0.71982895178235284),\n",
" ('Madhya_Pradesh', 0.70149555349502302),\n",
" ('Varanasi_district', 0.70075913022256842),\n",
" ('History_of_Uttar_Pradesh', 0.69928461973048661),\n",
" ('Pradesh', 0.6957367273967574),\n",
" ('Western_Uttar_Pradesh', 0.68628632668005041),\n",
" ('Moradabad_district', 0.68589711236395601),\n",
" ('Varanasi_division', 0.6843656500536035),\n",
" ('Purvanchal', 0.68426054961414429)]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"en_embedding.most_similar('Uttar_Pradesh')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
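The new notebook cells above call most_similar on an en_embedding object whose construction is not shown in this hunk. A minimal sketch of how such an object could be loaded from the text-format vectors that get_vectors.py now writes, assuming a recent gensim and a hypothetical release/dimension in the path, plus a toy helper for the link-recommendation idea from the markdown cell:

# Sketch only: load the text-format vectors produced by get_vectors.py below.
# The path follows that script's naming (release=test, lang=en, dim=100) and is an assumption.
from gensim.models import KeyedVectors

en_embedding = KeyedVectors.load_word2vec_format(
    '/home/ellery/a2v/data/test/test_en_100', binary=False)

print(en_embedding.most_similar('Will_Self'))

# Hypothetical helper for the "Link Recommendations" idea: suggest nearest
# neighbors that the article does not already link to.
def recommend_links(embedding, title, existing_links, topn=10):
    candidates = embedding.most_similar(title, topn=topn * 3)
    return [(t, s) for t, s in candidates if t not in existing_links][:topn]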
42 changes: 35 additions & 7 deletions src/get_sessions.py
@@ -3,6 +3,8 @@
import argparse
import datetime
import os
import time
import subprocess


"""
@@ -25,7 +27,7 @@
--queue priority \
get_sessions.py \
--release test \
--lang en \
--lang en
"""

def parse_requests(requests):
@@ -121,20 +123,41 @@ def scrub_dates(requests):

args['table'] = args['release'].replace('-', '_') + '_requests'

input_dir = '/user/hive/warehouse/%(request_db)s.db/%(table)s' % args
output_dir ='/user/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
print (os.system('hadoop fs -rm -r %s' % output_dir))
# create base dirs
base_dir = '/user/ellery/a2v/data/%(release)s' % args
print(os.system('hadoop fs -mkdir ' + base_dir) )
local_base_dir = '/home/ellery/a2v/data/%(release)s' % args
print(os.system('mkdir ' + local_base_dir) )


# define io paths
args['input_dir'] = '/user/hive/warehouse/%(request_db)s.db/%(table)s' % args
args['output_dir'] = '/user/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
args['local_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
args['local_output_dir'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s_dir' % args

# clean up old data
print (os.system('hadoop fs -rm -r %(output_dir)s' % args))
print(os.system('rm -rf %(local_output_file)s' % args))
print(os.system('rm -rf %(local_output_dir)s' % args))

conf = SparkConf()
conf.set("spark.app.name", 'a2v preprocess')
sc = SparkContext(conf=conf, pyFiles=[])

requests = sc.textFile(input_dir) \

requests = sc.textFile(args['input_dir']) \
.map(parse_requests)

if args['lang'] != 'wikidata':
requests = requests.map(lambda rs: [r for r in rs if r['lang'] == args['lang']])


if args['lang'] == 'wikidata':
to_str = lambda x: ' '.join([e['id'] for e in x])
else:
to_str = lambda x: ' '.join([e['title'] for e in x])

requests \
.filter(filter_blacklist) \
.filter(lambda x: len(x) > 1) \
@@ -145,5 +168,10 @@ def scrub_dates(requests):
.filter(lambda x: len(x) > 1) \
.filter(lambda x: len(x) < 30) \
.map(scrub_dates) \
.map(lambda x: json.dumps(x)) \
.saveAsTextFile (output_dir, compressionCodecClass= "org.apache.hadoop.io.compress.GzipCodec")
.map(to_str) \
.saveAsTextFile (args['output_dir'], compressionCodecClass = "org.apache.hadoop.io.compress.GzipCodec")

# transfer data to local
os.system('hadoop fs -copyToLocal %(output_dir)s %(local_output_dir)s' % args)
os.system('cat %(local_output_dir)s/* | gunzip > %(local_output_file)s' % args)
os.system('rm -rf %(local_output_dir)s' % args)
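
The transfer step added above pulls the gzipped session files out of HDFS and concatenates them into a single plain-text file (one session per line, space-separated titles or Wikidata ids) for the C word2vec to read. A minimal sketch of the same step using the subprocess module the commit imports, with return codes checked instead of discarded (a sketch under those assumptions, not the script's actual code):

# Sketch: copyToLocal / gunzip-concatenate / cleanup with explicit error checking.
import subprocess

def pull_sessions_to_local(args):
    # copy the gzipped part files out of HDFS
    subprocess.check_call(['hadoop', 'fs', '-copyToLocal',
                           args['output_dir'], args['local_output_dir']])
    # decompress and concatenate into a single sessions file
    with open(args['local_output_file'], 'wb') as out:
        subprocess.check_call('cat %(local_output_dir)s/* | gunzip' % args,
                              shell=True, stdout=out)
    # remove the intermediate part files
    subprocess.check_call(['rm', '-rf', args['local_output_dir']])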
58 changes: 23 additions & 35 deletions src/get_vectors.py
@@ -1,4 +1,3 @@
import gensim
import scipy
import multiprocessing as mp
import time
@@ -17,56 +16,45 @@
python /home/ellery/a2v/src/get_vectors.py \
--release test \
--lang en \
--field id \
--dims 10
--dims 100
"""


class HDFSSentenceReader(object):
def __init__(self, fname, field):
self.fname = fname + '/*'
self.field = field
def __iter__(self):
print('##### NEW CALL TO ITERATOR #####')
cat = subprocess.Popen(["hadoop", "fs", "-text",self.fname ], stdout=subprocess.PIPE)
for line in cat.stdout:
rs = json.loads(line.strip())
yield [r[self.field] for r in rs]


if __name__ == '__main__':

parser = argparse.ArgumentParser()
parser.add_argument('--release', required=True)
parser.add_argument('--lang', required=True)
parser.add_argument('--field', required=True)
parser.add_argument('--dims', required=True)

args = vars(parser.parse_args())


release_dir = '/home/ellery/a2v/data/%(release)s' % args
print(os.system('mkdir ' + release_dir) )


for dim in args['dims'].split(','):
args['dim'] = dim
input_dir = '/user/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
m_output_dir = '/home/ellery/a2v/data/%(release)s/%(release)s_model_%(lang)s_%(dim)s' % args
v_output_dir = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s' % args
args['input_dir'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
args['vectors_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s' % args
args['binary_vectors_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s.bin' % args

sentences = HDFSSentenceReader(input_dir, args['field'])

t1= time.time()

model = gensim.models.Word2Vec( \
sentences, \
workers=10, \
min_count=50, \
size=int(args['dim'])
)
t2= time.time()
print(t2-t1)
cmd = """
nice ~/word2vec/word2vec \
-train %(input_dir)s \
-output %(vectors_output_file)s \
-size %(dim)s \
-threads 18 \
-min-count 50 \
-binary 0 \
-cbow 1
"""
os.system(cmd % args)

cmd = """
~/convertvec/convertvec txt2bin %(vectors_output_file)s %(binary_vectors_output_file)s
"""
os.system(cmd % args)

model.save(m_output_dir)
model.save_word2vec_format(v_output_dir)

t2= time.time()
print(t2-t1)
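
With -binary 0, the C word2vec writes vectors in the standard text format (a header line with vocabulary size and dimensionality, then one token per line followed by its floats), which convertvec then mirrors into a binary copy. A minimal sketch of reading the text file directly, assuming only that layout (gensim's load_word2vec_format parses the same format):

# Sketch: parse the text-format vectors written by the word2vec command above.
import numpy as np

def load_text_vectors(path):
    vectors = {}
    with open(path) as f:
        vocab_size, dim = map(int, f.readline().split())
        for line in f:
            parts = line.rstrip().split(' ')
            token, values = parts[0], parts[1:]
            if len(values) == dim:  # skip any malformed lines
                vectors[token] = np.array(values, dtype=np.float32)
    return vectors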
9 changes: 2 additions & 7 deletions src/run.py
@@ -103,22 +103,17 @@
--release %(release)s \
--lang %(lang)s \
--dims %(dim)s \
--field %(field)s
"""

for lang in args['langs'].split(','):
args['lang'] = lang
if lang == 'wikidata':
args['field'] = 'id'
else:
args['field'] = 'title'

for dim in args['dims'].split(','):
args['dim'] = dim
cmds.append(cmd % args)

for c in cmds:
Popen([c,], shell=True)
#Popen([c,], shell=True)
os.system(c)

else:
print('need langs and dims to get models')
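Switching from Popen to os.system in run.py means the per-language training commands now run one at a time and block until each finishes, rather than being launched concurrently. A short sketch of the same sequential loop with failures surfaced (subprocess.call here is an assumption, not what run.py uses):

# Sketch: run the generated commands sequentially, stopping on the first failure.
import subprocess

for c in cmds:
    ret = subprocess.call(c, shell=True)
    if ret != 0:
        raise RuntimeError('command failed with exit code %d:\n%s' % (ret, c))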
