-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnumber_count_cc.pig
43 lines (26 loc) · 1.74 KB
/
number_count_cc.pig
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
REGISTER 'lib/httpcore-4.2.1.jar';
REGISTER 'lib/jsoup-1.7.2-SNAPSHOT.jar';
REGISTER 'dist/lib/commoncrawl-examples-1.0.1.jar';
-- Counts how often each numeric value occurs in the text extracted from crawled
-- HTML pages and stores (count, number) records ordered by descending count.
--
-- Parameters (passed with -param on the pig command line):
--   INPUT  - location of the ARC files to load
--   OUTPUT - location where the ordered counts are stored
--
-- Example of a concrete input location (kept from the original script):
-- pages = LOAD '/data/public/common-crawl/parse-output/segment/*/*arc*' USING org.commoncrawl.pig.ArcLoader() AS (date, length, type, statuscode, ipaddress, url, html);
pages = LOAD '$INPUT' USING org.commoncrawl.pig.ArcLoader() AS (date, length, type, statuscode, ipaddress, url, html);

-- Keep only records whose content type is exactly text/html.
webpages = FILTER pages BY type == 'text/html';

-- Run the HTML parsing UDF on every page and drop pages for which it returned null.
parsed = FOREACH webpages GENERATE url, nl.vu.few.ParseHTML(html) AS page_info;
non_empty = FILTER parsed BY page_info IS NOT NULL;

-- Use the second field (index 1) of the parse result as the text to scan.
-- NOTE(review): presumably this is the page's text content - confirm against the ParseHTML UDF.
input_lines = FOREACH non_empty GENERATE page_info.$1 AS line;

-- Lower-case and trim the text, then split it on whitespace and punctuation.
-- Lower-casing is what lets the number pattern below match only a lowercase 'e' exponent.
tuples = FOREACH input_lines GENERATE TOKENIZE(LOWER(TRIM(line)), ' ",()*\'\t\n\r\f_#\\/!#') AS tokens;

-- Put each word in its own tuple, put each tuple on a separate row
words = FOREACH tuples GENERATE FLATTEN(tokens) AS word;

-- Tokens of the form <digits>-<digits> (e.g. 1990-2000) are additionally split on '-'
-- so that both endpoints of the range are counted as individual numbers.
ranges = FILTER words BY word MATCHES '^[0-9]+-[0-9]+$';
years = FOREACH ranges GENERATE TOKENIZE(word,'-') AS tokens;
years2 = FOREACH years GENERATE FLATTEN(tokens) AS word;
numbers = UNION words, years2;

-- Naive number detection: optional sign, optional integer part, optional decimal point,
-- digits, optional exponent, and an optional trailing '.' (e.g. a sentence-ending period).
filtered_numbers = FILTER numbers BY $0 MATCHES '^([-+]?[0-9]*[.]?[0-9]+(e[-+]?[0-9]+)?)([.]?)$';

-- Capture group 1 is the number without the optional trailing '.'.
-- The value is cast to double rather than float: single precision cannot distinguish
-- integers above 2^24, which would merge distinct large numbers into one group and
-- corrupt their counts.
floats = FOREACH filtered_numbers GENERATE (double) REGEX_EXTRACT($0, '^([-+]?[0-9]*[.]?[0-9]+(e[-+]?[0-9]+)?)([.]?)$', 1) AS word;

-- create a group for each distinct numeric value
word_groups = GROUP floats BY word;

-- count the entries in each group
word_count = FOREACH word_groups GENERATE COUNT(floats) AS count, group AS word;

-- order the records by count, most frequent first
ordered_word_count = ORDER word_count BY count DESC;

STORE ordered_word_count INTO '$OUTPUT';