From dc339f42ac4fdb9359aebe593fadeaa9644693cc Mon Sep 17 00:00:00 2001 From: ewulczyn Date: Wed, 28 Sep 2016 23:54:38 -0700 Subject: [PATCH] switched to c implementation --- ...Navigation Vectors - Getting Started.ipynb | 133 ++++++++++++++++++ src/get_sessions.py | 42 +++++- src/get_vectors.py | 58 +++----- src/run.py | 9 +- 4 files changed, 193 insertions(+), 49 deletions(-) diff --git a/src/Wikipedia Navigation Vectors - Getting Started.ipynb b/src/Wikipedia Navigation Vectors - Getting Started.ipynb index e889535..a8feabd 100644 --- a/src/Wikipedia Navigation Vectors - Getting Started.ipynb +++ b/src/Wikipedia Navigation Vectors - Getting Started.ipynb @@ -518,6 +518,139 @@ "#### Link Recommendations\n", "If articles are frequently read within the same session, you might be able to make Wikipedia easier to navigate if you were to create a link between them. For a given article you could generate recommendations for links to add by finding the nearest neighbors that are not already linked and adding a link if the original article has a suitable anchor text. Again, the Wikidata embedding would allow you to build a model for all languages." ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Deborah_Orr', 0.91539459281436186),\n", + " ('Jonathan_Self', 0.91529695359415975),\n", + " ('Umbrella_(novel)', 0.88568083130588349),\n", + " ('Shark_(novel)', 0.88144038836349925),\n", + " ('Peter_Self', 0.88066891173737261),\n", + " ('Cock_and_Bull', 0.8700406005395005),\n", + " ('Dorian,_an_Imitation', 0.85718259988297951),\n", + " ('Jonathan_Coe', 0.8134598307405887),\n", + " ('What_a_Carve_Up!_(novel)', 0.80775135057346681),\n", + " ('The_Book_of_Dave', 0.79721259734964156)]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_embedding.most_similar('Will_Self')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Kissing_traditions', 0.72906072754159257),\n", + " ('French_kiss', 0.72400639405281908),\n", + " ('Kissing,_Bavaria', 0.68524468918114323),\n", + " ('Air_kiss', 0.66284194730918222),\n", + " ('Hug', 0.6409782198490559),\n", + " ('Kiss_(band)', 0.62808919174690359),\n", + " ('Foreplay', 0.62006375083990484),\n", + " ('Cheek_kissing', 0.5881288871976873),\n", + " ('Making_out', 0.58465149766396551),\n", + " ('Post_Office_(game)', 0.5804142777106831)]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_embedding.most_similar('Kiss')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Phyllis_Newman', 0.87788281050268258),\n", + " ('Adolph_Green', 0.86793390643667045),\n", + " ('Bambi_Linn', 0.83127314845549538),\n", + " ('List_of_Broadway_musicals_stars', 0.82739932678976524),\n", + " ('Plain_and_Fancy', 0.823583427500724),\n", + " ('Betty_Comden', 0.82328068158088841),\n", + " ('Bells_Are_Ringing_(musical)', 0.82092651923716164),\n", + " ('Marin_Mazzie', 0.82055069146017445),\n", + " ('I_Love_My_Wife', 0.8199105465494283),\n", + " ('The_Most_Happy_Fella', 0.81986287660477519)]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_embedding.most_similar('Amanda_Green')" + ] + }, + { + 
"cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Lucknow', 0.729475983488725),\n", + " ('List_of_districts_of_Uttar_Pradesh', 0.71982895178235284),\n", + " ('Madhya_Pradesh', 0.70149555349502302),\n", + " ('Varanasi_district', 0.70075913022256842),\n", + " ('History_of_Uttar_Pradesh', 0.69928461973048661),\n", + " ('Pradesh', 0.6957367273967574),\n", + " ('Western_Uttar_Pradesh', 0.68628632668005041),\n", + " ('Moradabad_district', 0.68589711236395601),\n", + " ('Varanasi_division', 0.6843656500536035),\n", + " ('Purvanchal', 0.68426054961414429)]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_embedding.most_similar('Uttar_Pradesh')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/get_sessions.py b/src/get_sessions.py index 123c75d..4927389 100644 --- a/src/get_sessions.py +++ b/src/get_sessions.py @@ -3,6 +3,8 @@ import argparse import datetime import os +import time +import subprocess """ @@ -25,7 +27,7 @@ --queue priority \ get_sessions.py \ --release test \ - --lang en \ + --lang en """ def parse_requests(requests): @@ -121,20 +123,41 @@ def scrub_dates(requests): args['table'] = args['release'].replace('-', '_') + '_requests' - input_dir = '/user/hive/warehouse/%(request_db)s.db/%(table)s' % args - output_dir ='/user/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args - print (os.system('hadoop fs -rm -r %s' % output_dir)) + # create base dirs + base_dir = '/user/ellery/a2v/data/%(release)s' % args + print(os.system('hadoop fs -mkdir ' + base_dir) ) + local_base_dir = '/home/ellery/a2v/data/%(release)s' % args + print(os.system('mkdir ' + local_base_dir) ) + + + # define io paths + args['input_dir'] = '/user/hive/warehouse/%(request_db)s.db/%(table)s' % args + args['output_dir'] = '/user/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args + args['local_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args + args['local_output_dir'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s_dir' % args + + # clean up old data + print (os.system('hadoop fs -rm -r %(output_dir)s' % args)) + print(os.system('rm -rf %(local_output_file)s' % args)) + print(os.system('rm -rf %(local_output_dir)s' % args)) conf = SparkConf() conf.set("spark.app.name", 'a2v preprocess') sc = SparkContext(conf=conf, pyFiles=[]) - requests = sc.textFile(input_dir) \ + + requests = sc.textFile(args['input_dir']) \ .map(parse_requests) if args['lang'] != 'wikidata': requests = requests.map(lambda rs: [r for r in rs if r['lang'] == args['lang']]) + + if args['lang'] == 'wikidata': + to_str = lambda x: ' '.join([e['id'] for e in x]) + else: + to_str = lambda x: ' '.join([e['title'] for e in x]) + requests \ .filter(filter_blacklist) \ .filter(lambda x: len(x) > 1) \ @@ -145,5 +168,10 @@ def scrub_dates(requests): .filter(lambda x: len(x) > 1) \ .filter(lambda x: len(x) < 30) \ .map(scrub_dates) \ - .map(lambda x: json.dumps(x)) \ - .saveAsTextFile (output_dir, compressionCodecClass= "org.apache.hadoop.io.compress.GzipCodec") + .map(to_str) \ + .saveAsTextFile (args['output_dir'], compressionCodecClass = "org.apache.hadoop.io.compress.GzipCodec") + + # transfer data to local + os.system('hadoop fs -copyToLocal %(output_dir)s %(local_output_dir)s' % args) + 
os.system('cat %(local_output_dir)s/* | gunzip > %(local_output_file)s' % args)
+    os.system('rm -rf %(local_output_dir)s' % args)
\ No newline at end of file
diff --git a/src/get_vectors.py b/src/get_vectors.py
index ef4808e..d245e23 100644
--- a/src/get_vectors.py
+++ b/src/get_vectors.py
@@ -1,4 +1,3 @@
-import gensim
 import scipy
 import multiprocessing as mp
 import time
@@ -17,56 +16,45 @@
 python /home/ellery/a2v/src/get_vectors.py \
     --release test \
     --lang en \
-    --field id \
-    --dims 10
+    --dims 100
 """
 
-class HDFSSentenceReader(object):
-    def __init__(self, fname, field):
-        self.fname = fname + '/*'
-        self.field = field
-    def __iter__(self):
-        print('##### NEW CALL TO ITERATOR #####')
-        cat = subprocess.Popen(["hadoop", "fs", "-text",self.fname ], stdout=subprocess.PIPE)
-        for line in cat.stdout:
-            rs = json.loads(line.strip())
-            yield [r[self.field] for r in rs]
-
-
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--release', required=True)
     parser.add_argument('--lang', required=True)
-    parser.add_argument('--field', required=True)
     parser.add_argument('--dims', required=True)
     args = vars(parser.parse_args())
-
-    release_dir = '/home/ellery/a2v/data/%(release)s' % args
-    print(os.system('mkdir ' + release_dir) )
-
-
     for dim in args['dims'].split(','):
         args['dim'] = dim
-        input_dir = '/user/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
-        m_output_dir = '/home/ellery/a2v/data/%(release)s/%(release)s_model_%(lang)s_%(dim)s' % args
-        v_output_dir = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s' % args
+        args['input_dir'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
+        args['vectors_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s' % args
+        args['binary_vectors_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s.bin' % args
 
-        sentences = HDFSSentenceReader(input_dir, args['field'])
         t1= time.time()
-        model = gensim.models.Word2Vec( \
-            sentences, \
-            workers=10, \
-            min_count=50, \
-            size=int(args['dim'])
-        )
-        t2= time.time()
-        print(t2-t1)
+        cmd = """
+        nice ~/word2vec/word2vec \
+        -train %(input_dir)s \
+        -output %(vectors_output_file)s \
+        -size %(dim)s \
+        -threads 18 \
+        -min-count 50 \
+        -binary 0 \
+        -cbow 1
+        """
+        os.system(cmd % args)
+
+        cmd = """
+        ~/convertvec/convertvec txt2bin %(vectors_output_file)s %(binary_vectors_output_file)s
+        """
+        os.system(cmd % args)
 
-        model.save(m_output_dir)
-        model.save_word2vec_format(v_output_dir)
\ No newline at end of file
+
+        t2 = time.time()
+        print(t2 - t1)
\ No newline at end of file
diff --git a/src/run.py b/src/run.py
index c99ca43..556cc63 100644
--- a/src/run.py
+++ b/src/run.py
@@ -103,22 +103,17 @@
         --release %(release)s \
         --lang %(lang)s \
         --dims %(dim)s \
-        --field %(field)s
     """
 
     for lang in args['langs'].split(','):
         args['lang'] = lang
-        if lang == 'wikidata':
-            args['field'] = 'id'
-        else:
-            args['field'] = 'title'
-
         for dim in args['dims'].split(','):
             args['dim'] = dim
             cmds.append(cmd % args)
 
     for c in cmds:
-        Popen([c,], shell=True)
+        # os.system blocks, so the training jobs now run one at a time
+        os.system(c)
 else:
     print('need langs and dims to get models')
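
Notes on using the new pipeline (reviewer sketches, not part of the commit):

1. get_sessions.py now emits plain text instead of JSON: each output line is
   one reading session, with space-separated page titles (or Wikidata IDs when
   --lang is wikidata). Newline-delimited token sequences are exactly the
   corpus format the C word2vec binary consumes via -train. A minimal way to
   eyeball the concatenated local file; the path is illustrative, following
   the %(release)s_sessions_%(lang)s template with release=test and lang=en:

       # print the first three sessions from the local sessions file
       # (illustrative path; see local_output_file in get_sessions.py)
       with open('/home/ellery/a2v/data/test/test_sessions_en') as f:
           for _, line in zip(range(3), f):
               print(line.split())  # one session, e.g. ['Will_Self', 'Umbrella_(novel)']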
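
2. Because get_vectors.py invokes word2vec with -binary 0, the vectors file is
   the standard word2vec text format: a "vocab_size n_dims" header line, then
   one "token v1 ... v_n" row per vocabulary item; convertvec txt2bin only
   re-encodes that file as binary. A quick sanity check, reusing the same
   illustrative values (release=test, lang=en, dim=100):

       # confirm the text vectors file matches the requested dimensionality
       with open('/home/ellery/a2v/data/test/test_en_100') as f:
           n_tokens, n_dims = (int(x) for x in f.readline().split())
           assert n_dims == 100
           row = f.readline().split()
           print(n_tokens, row[0], len(row) - 1)  # vocab size, first token, dims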
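
3. The notebook cells above assume an en_embedding object exposing
   most_similar. One way to build it from these files is gensim's word2vec
   loader (Word2Vec.load_word2vec_format before gensim 1.0, KeyedVectors
   afterwards; adjust to your version, and note the paths are illustrative):

       from gensim.models import KeyedVectors

       # text vectors written by the C tool (-binary 0) ...
       en_embedding = KeyedVectors.load_word2vec_format(
           '/home/ellery/a2v/data/test/test_en_100', binary=False)
       # ... or the binary file produced by convertvec txt2bin:
       # en_embedding = KeyedVectors.load_word2vec_format(
       #     '/home/ellery/a2v/data/test/test_en_100.bin', binary=True)

       print(en_embedding.most_similar('Will_Self')[:3])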