Skip to content

Commit

Permalink
final commit before WMF departure
Browse files: browse the repository at this point in the history
  • Loading branch information
ewulczyn committed Feb 23, 2017
1 parent de91b69 commit 05152b1
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 20 deletions.
6 changes: 3 additions & 3 deletions src/etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

"""
python etl.py \
--langs it
--day 20160926 \
--langs en,ja,de,es,ru,fr,it,zh,pt,pl,tr,ar,nl,id,sv,ko,cs,fa,fi,vi
"""

if __name__ == '__main__':
Expand All @@ -18,11 +19,10 @@


if args['day']:
sys.exit()
cmd = """
python /home/ellery/wmf/util/wikidata_utils.py \
--day %(day)s \
--dowload_dump
--download_dump
"""
os.system(cmd % args)

Expand Down
27 changes: 17 additions & 10 deletions src/get_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def get_requests(start, stop, table, trace_db = 'a2v', prod_db = 'prod', priori

query = """
SET mapreduce.input.fileinputformat.split.maxsize=200000000;
SET hive.mapred.mode=nonstrict;
-- get pageviews, resolve redirects, add wikidata ids
Expand All @@ -44,8 +44,8 @@ def get_requests(start, stop, table, trace_db = 'a2v', prod_db = 'prod', priori
ts,
pv1.lang,
CASE
WHEN rd_to IS NULL THEN raw_title
ELSE rd_to
WHEN rd_to_page_title IS NULL THEN raw_title
ELSE rd_to_page_title
END AS title
FROM
(SELECT
Expand All @@ -67,14 +67,20 @@ def get_requests(start, stop, table, trace_db = 'a2v', prod_db = 'prod', priori
AND LENGTH(REGEXP_EXTRACT(reflect('java.net.URLDecoder', 'decode', uri_path), '/wiki/(.*)', 1)) > 0
) pv1
LEFT JOIN
%(prod_db)s.redirect r
ON pv1.raw_title = r.rd_from
AND pv1.lang = r.lang
(SELECT
*
FROM
prod.redirect
WHERE
rd_from_page_namespace = 0
AND rd_to_page_namespace = 0
AND lang RLIKE '.*'
) r
ON
pv1.raw_title = r.rd_from_page_title AND pv1.lang = r.lang
) pv2
INNER JOIN
%(prod_db)s.wikidata_will w
ON pv2.title = w.title
AND pv2.lang = w.lang;
%(prod_db)s.wikidata_will w ON pv2.title = w.title AND pv2.lang = w.lang;
DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_editors;
Expand Down Expand Up @@ -135,7 +141,7 @@ def get_requests(start, stop, table, trace_db = 'a2v', prod_db = 'prod', priori
GROUP BY
id;
-- remove disambiguation pages and pages with colon in title
DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_eligible_reader_pageviews;
CREATE TABLE %(trace_db)s.%(trace_table)s_eligible_reader_pageviews AS
SELECT pv.*
Expand All @@ -159,6 +165,7 @@ def get_requests(start, stop, table, trace_db = 'a2v', prod_db = 'prod', priori
WHERE
propname = 'disambiguation'
AND lang RLIKE '.*'
AND page_namespace = 0
GROUP BY
lang,
page_title
Expand Down
6 changes: 3 additions & 3 deletions src/get_sessions.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,15 +126,15 @@ def scrub_dates(requests):
# create base dirs
base_dir = '/user/ellery/a2v/data/%(release)s' % args
print(os.system('hadoop fs -mkdir ' + base_dir) )
local_base_dir = '/home/ellery/a2v/data/%(release)s' % args
local_base_dir = '/a/ellery/a2v/data/%(release)s' % args
print(os.system('mkdir ' + local_base_dir) )


# define io paths
args['input_dir'] = '/user/hive/warehouse/%(request_db)s.db/%(table)s' % args
args['output_dir'] = '/user/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
args['local_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
args['local_output_dir'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s_dir' % args
args['local_output_file'] = '/a/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
args['local_output_dir'] = '/a/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s_dir' % args

# clean up old data
print (os.system('hadoop fs -rm -r %(output_dir)s' % args))
Expand Down
14 changes: 10 additions & 4 deletions src/get_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@

for dim in args['dims'].split(','):
args['dim'] = dim
args['input_dir'] = '/home/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
args['vectors_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s' % args
args['binary_vectors_output_file'] = '/home/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s.bin' % args
args['input_dir'] = '/a/ellery/a2v/data/%(release)s/%(release)s_sessions_%(lang)s' % args
args['vectors_output_file'] = '/a/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s' % args
args['binary_vectors_output_file'] = '/a/ellery/a2v/data/%(release)s/%(release)s_%(lang)s_%(dim)s.bin' % args


t1= time.time()
Expand All @@ -46,8 +46,14 @@
-threads 18 \
-min-count 50 \
-binary 0 \
-cbow 1
-cbow 0 \
-iter 10 \
-negative 3 \
-sample 0.001 \
-window 6
"""

print(cmd % args)
os.system(cmd % args)

cmd = """
Expand Down

0 comments on commit 05152b1

Please sign in to comment.