From dc339f42ac4fdb9359aebe593fadeaa9644693cc Mon Sep 17 00:00:00 2001 From: ewulczyn Date: Wed, 28 Sep 2016 23:54:38 -0700 Subject: [PATCH] switched to c implementation --- ...Navigation Vectors - Getting Started.ipynb | 133 ++++++++++++++++++ src/get_sessions.py | 42 +++++- src/get_vectors.py | 58 +++----- src/run.py | 9 +- 4 files changed, 193 insertions(+), 49 deletions(-) diff --git a/src/Wikipedia Navigation Vectors - Getting Started.ipynb b/src/Wikipedia Navigation Vectors - Getting Started.ipynb index e889535..a8feabd 100644 --- a/src/Wikipedia Navigation Vectors - Getting Started.ipynb +++ b/src/Wikipedia Navigation Vectors - Getting Started.ipynb @@ -518,6 +518,139 @@ "#### Link Recommendations\n", "If articles are frequently read within the same session, you might be able to make Wikipedia easier to navigate if you were to create a link between them. For a given article you could generate recommendations for links to add by finding the nearest neighbors that are not already linked and adding a link if the original article has a suitable anchor text. Again, the Wikidata embedding would allow you to build a model for all languages." ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Deborah_Orr', 0.91539459281436186),\n", + " ('Jonathan_Self', 0.91529695359415975),\n", + " ('Umbrella_(novel)', 0.88568083130588349),\n", + " ('Shark_(novel)', 0.88144038836349925),\n", + " ('Peter_Self', 0.88066891173737261),\n", + " ('Cock_and_Bull', 0.8700406005395005),\n", + " ('Dorian,_an_Imitation', 0.85718259988297951),\n", + " ('Jonathan_Coe', 0.8134598307405887),\n", + " ('What_a_Carve_Up!_(novel)', 0.80775135057346681),\n", + " ('The_Book_of_Dave', 0.79721259734964156)]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_embedding.most_similar('Will_Self')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Kissing_traditions', 0.72906072754159257),\n", + " ('French_kiss', 0.72400639405281908),\n", + " ('Kissing,_Bavaria', 0.68524468918114323),\n", + " ('Air_kiss', 0.66284194730918222),\n", + " ('Hug', 0.6409782198490559),\n", + " ('Kiss_(band)', 0.62808919174690359),\n", + " ('Foreplay', 0.62006375083990484),\n", + " ('Cheek_kissing', 0.5881288871976873),\n", + " ('Making_out', 0.58465149766396551),\n", + " ('Post_Office_(game)', 0.5804142777106831)]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_embedding.most_similar('Kiss')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Phyllis_Newman', 0.87788281050268258),\n", + " ('Adolph_Green', 0.86793390643667045),\n", + " ('Bambi_Linn', 0.83127314845549538),\n", + " ('List_of_Broadway_musicals_stars', 0.82739932678976524),\n", + " ('Plain_and_Fancy', 0.823583427500724),\n", + " ('Betty_Comden', 0.82328068158088841),\n", + " ('Bells_Are_Ringing_(musical)', 0.82092651923716164),\n", + " ('Marin_Mazzie', 0.82055069146017445),\n", + " ('I_Love_My_Wife', 0.8199105465494283),\n", + " ('The_Most_Happy_Fella', 0.81986287660477519)]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_embedding.most_similar('Amanda_Green')" + ] + }, + { + 
"cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Lucknow', 0.729475983488725),\n", + " ('List_of_districts_of_Uttar_Pradesh', 0.71982895178235284),\n", + " ('Madhya_Pradesh', 0.70149555349502302),\n", + " ('Varanasi_district', 0.70075913022256842),\n", + " ('History_of_Uttar_Pradesh', 0.69928461973048661),\n", + " ('Pradesh', 0.6957367273967574),\n", + " ('Western_Uttar_Pradesh', 0.68628632668005041),\n", + " ('Moradabad_district', 0.68589711236395601),\n", + " ('Varanasi_division', 0.6843656500536035),\n", + " ('Purvanchal', 0.68426054961414429)]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_embedding.most_similar('Uttar_Pradesh')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/get_sessions.py b/src/get_sessions.py index 123c75d..4927389 100644 --- a/src/get_sessions.py +++ b/src/get_sessions.py @@ -3,6 +3,8 @@ import argparse import datetime import os +import time +import subprocess """ @@ -25,7 +27,7 @@ --queue priority \ get_sessions.py \ --release test \ - --lang en \ + --lang en """ def parse_requests(requests): @@ -121,20 +123,41 @@ def scrub_dates(requests): args['table'] = args['release'].replace('-', '_') + '_requests' - input_dir = '/user/hive/warehouse/%(request_db)s.db/%(table)s' % args - output_dir ='/user/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args - print (os.system('hadoop fs -rm -r %s' % output_dir)) + # create base dirs + base_dir = '/user/ellery/a2v/data/%(release)s' % args + print(os.system('hadoop fs -mkdir ' + base_dir) ) + local_base_dir = '/home/ellery/a2v/data/%(release)s' % args + print(os.system('mkdir ' + local_base_dir) ) + + + # define io paths + args['input_dir'] = '/user/hive/warehouse/%(request_db)s.db/%(table)s' % args + args['output_dir'] = '/user/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args + args['local_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args + args['local_output_dir'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s_dir' % args + + # clean up old data + print (os.system('hadoop fs -rm -r %(output_dir)s' % args)) + print(os.system('rm -rf %(local_output_file)s' % args)) + print(os.system('rm -rf %(local_output_dir)s' % args)) conf = SparkConf() conf.set("spark.app.name", 'a2v preprocess') sc = SparkContext(conf=conf, pyFiles=[]) - requests = sc.textFile(input_dir) \ + + requests = sc.textFile(args['input_dir']) \ .map(parse_requests) if args['lang'] != 'wikidata': requests = requests.map(lambda rs: [r for r in rs if r['lang'] == args['lang']]) + + if args['lang'] == 'wikidata': + to_str = lambda x: ' '.join([e['id'] for e in x]) + else: + to_str = lambda x: ' '.join([e['title'] for e in x]) + requests \ .filter(filter_blacklist) \ .filter(lambda x: len(x) > 1) \ @@ -145,5 +168,10 @@ def scrub_dates(requests): .filter(lambda x: len(x) > 1) \ .filter(lambda x: len(x) < 30) \ .map(scrub_dates) \ - .map(lambda x: json.dumps(x)) \ - .saveAsTextFile (output_dir, compressionCodecClass= "org.apache.hadoop.io.compress.GzipCodec") + .map(to_str) \ + .saveAsTextFile (args['output_dir'], compressionCodecClass = "org.apache.hadoop.io.compress.GzipCodec") + + # transfer data to local + os.system('hadoop fs -copyToLocal %(output_dir)s %(local_output_dir)s' % args) + 
os.system('cat %(local_output_dir)s/* | gunzip > %(local_output_file)s' % args)
+    os.system('rm -rf %(local_output_dir)s' % args)
\ No newline at end of file
diff --git a/src/get_vectors.py b/src/get_vectors.py
index ef4808e..d245e23 100644
--- a/src/get_vectors.py
+++ b/src/get_vectors.py
@@ -1,4 +1,3 @@
-import gensim
 import scipy
 import multiprocessing as mp
 import time
@@ -17,56 +16,45 @@
 python /home/ellery/a2v/src/get_vectors.py \
     --release test \
     --lang en \
-    --field id \
-    --dims 10
+    --dims 100
 """
 
-class HDFSSentenceReader(object):
-    def __init__(self, fname, field):
-        self.fname = fname + '/*'
-        self.field = field
-    def __iter__(self):
-        print('##### NEW CALL TO ITERATOR #####')
-        cat = subprocess.Popen(["hadoop", "fs", "-text",self.fname ], stdout=subprocess.PIPE)
-        for line in cat.stdout:
-            rs = json.loads(line.strip())
-            yield [r[self.field] for r in rs]
-
-
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--release', required=True)
     parser.add_argument('--lang', required=True)
-    parser.add_argument('--field', required=True)
     parser.add_argument('--dims', required=True)
     args = vars(parser.parse_args())
-
-    release_dir = '/home/ellery/a2v/data/%(release)s' % args
-    print(os.system('mkdir ' + release_dir) )
-
-
     for dim in args['dims'].split(','):
         args['dim'] = dim
-        input_dir = '/user/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
-        m_output_dir = '/home/ellery/a2v/data/%(release)s/%(release)s_model_%(lang)s_%(dim)s' % args
-        v_output_dir = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s' % args
+        args['input_dir'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
+        args['vectors_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s' % args
+        args['binary_vectors_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s.bin' % args
 
-        sentences = HDFSSentenceReader(input_dir, args['field'])
         t1= time.time()
-        model = gensim.models.Word2Vec( \
-            sentences, \
-            workers=10, \
-            min_count=50, \
-            size=int(args['dim'])
-        )
-        t2= time.time()
-        print(t2-t1)
+        cmd = """
+        nice ~/word2vec/word2vec \
+        -train %(input_dir)s \
+        -output %(vectors_output_file)s \
+        -size %(dim)s \
+        -threads 18 \
+        -min-count 50 \
+        -binary 0 \
+        -cbow 1
+        """
+        os.system(cmd % args)
+
+        cmd = """
+        ~/convertvec/convertvec txt2bin %(vectors_output_file)s %(binary_vectors_output_file)s
+        """
+        os.system(cmd % args)
 
-        model.save(m_output_dir)
-        model.save_word2vec_format(v_output_dir)
\ No newline at end of file
+
+        t2 = time.time()
+        print(t2 - t1)
\ No newline at end of file
diff --git a/src/run.py b/src/run.py
index c99ca43..556cc63 100644
--- a/src/run.py
+++ b/src/run.py
@@ -103,22 +103,17 @@
         --release %(release)s \
         --lang %(lang)s \
         --dims %(dim)s \
-        --field %(field)s
     """
 
     for lang in args['langs'].split(','):
         args['lang'] = lang
-        if lang == 'wikidata':
-            args['field'] = 'id'
-        else:
-            args['field'] = 'title'
-
         for dim in args['dims'].split(','):
             args['dim'] = dim
             cmds.append(cmd % args)
 
     for c in cmds:
-        Popen([c,], shell=True)
+        # os.system blocks, so the training jobs now run one at a time
+        os.system(c)
 else:
     print('need langs and dims to get models')
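
Notes on using the new pipeline (reviewer sketches, not part of the commit):

1. get_sessions.py now emits plain text instead of JSON: each output line is
   one reading session, with space-separated page titles (or Wikidata IDs when
   --lang is wikidata). Newline-delimited token sequences are exactly the
   corpus format the C word2vec binary consumes via -train. A minimal way to
   eyeball the concatenated local file; the path is illustrative, following
   the %(release)s_sessions_%(lang)s template with release=test and lang=en:

       # print the first three sessions from the local sessions file
       # (illustrative path; see local_output_file in get_sessions.py)
       with open('/home/ellery/a2v/data/test/test_sessions_en') as f:
           for _, line in zip(range(3), f):
               print(line.split())  # one session, e.g. ['Will_Self', 'Umbrella_(novel)']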
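
2. Because get_vectors.py invokes word2vec with -binary 0, the vectors file is
   the standard word2vec text format: a "vocab_size n_dims" header line, then
   one "token v1 ... v_n" row per vocabulary item; convertvec txt2bin only
   re-encodes that file as binary. A quick sanity check, reusing the same
   illustrative values (release=test, lang=en, dim=100):

       # confirm the text vectors file matches the requested dimensionality
       with open('/home/ellery/a2v/data/test/test_en_100') as f:
           n_tokens, n_dims = (int(x) for x in f.readline().split())
           assert n_dims == 100
           row = f.readline().split()
           print(n_tokens, row[0], len(row) - 1)  # vocab size, first token, dims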
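
3. The notebook cells above assume an en_embedding object exposing
   most_similar. One way to build it from these files is gensim's word2vec
   loader (Word2Vec.load_word2vec_format before gensim 1.0, KeyedVectors
   afterwards; adjust to your version, and note the paths are illustrative):

       from gensim.models import KeyedVectors

       # text vectors written by the C tool (-binary 0) ...
       en_embedding = KeyedVectors.load_word2vec_format(
           '/home/ellery/a2v/data/test/test_en_100', binary=False)
       # ... or the binary file produced by convertvec txt2bin:
       # en_embedding = KeyedVectors.load_word2vec_format(
       #     '/home/ellery/a2v/data/test/test_en_100.bin', binary=True)

       print(en_embedding.most_similar('Will_Self')[:3])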