Skip to content

Commit

Permalink
Merge pull request #116 from censoredplanet/flatten4
Browse files Browse the repository at this point in the history
flatten measurements for satellite
  • Loading branch information
ohnorobo authored Feb 8, 2022
2 parents 777f21b + 8de0daa commit 13a163a
Show file tree
Hide file tree
Showing 9 changed files with 1,487 additions and 547 deletions.
27 changes: 16 additions & 11 deletions pipeline/manual_e2e_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,14 +401,16 @@ def test_satellite_v2p1_pipeline_e2e(self) -> None:

written_rows = get_bq_rows(client,
[get_bq_base_table_name(SATELLITE_SCAN_TYPE)])
self.assertEqual(len(written_rows), 8)
self.assertEqual(len(written_rows), 16)

all_expected_domains = [
'www.americorps.gov', 'www.americorps.gov', 'custhelp.com',
'custhelp.com', 'www.mainichi.co.jp', 'www.mainichi.co.jp',
'www.unwatch.org', 'www.unwatch.org'
expected_double_domains = ['custhelp.com', 'www.unwatch.org']
expected_quad_domains = [
'a.root-servers.net', 'www.americorps.gov', 'www.mainichi.co.jp'
]

all_expected_domains = (
expected_double_domains * 2 + expected_quad_domains * 4)

written_domains = [row[0] for row in written_rows]
self.assertListEqual(
sorted(written_domains), sorted(all_expected_domains))
Expand All @@ -427,13 +429,16 @@ def test_satellite_v2p2_pipeline_e2e(self) -> None:

written_rows = get_bq_rows(client,
[get_bq_base_table_name(SATELLITE_SCAN_TYPE)])
self.assertEqual(len(written_rows), 10)
self.assertEqual(len(written_rows), 18)

all_expected_domains = [
'11st.co.kr', '1688.com', '11st.co.kr', '11st.co.kr', '1922.gov.tw',
'1922.gov.tw', 'ajax.aspnetcdn.com', 'alipay.com', '1337x.to',
'104.com.tw'
]
expected_single_domains = ['1688.com', '1337x.to', '104.com.tw']
expected_double_domains = ['a.root-servers.net', '1922.gov.tw']
expected_triple_domains = ['11st.co.kr']
expected_quad_domains = ['ajax.aspnetcdn.com', 'alipay.com']

all_expected_domains = (
expected_single_domains + expected_double_domains * 2 +
expected_triple_domains * 3 + expected_quad_domains * 4)

written_domains = [row[0] for row in written_rows]

Expand Down
10 changes: 6 additions & 4 deletions pipeline/metadata/beam_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,12 @@ def merge_metadata_with_rows( # pylint: disable=unused-argument
new_row.update(row)
if field == 'received':
if new_row['received']:
new_row['received'].update(ip_metadata)
new_row['received'].pop('date', None)
new_row['received'].pop('name', None)
new_row['received'].pop('country', None)
# Double-flattened rows each carry exactly one received IP in their
# 'received' list; the full list is reconstructed later.
new_row['received'][0].update(ip_metadata)
new_row['received'][0].pop('date', None)
new_row['received'][0].pop('name', None)
new_row['received'][0].pop('country', None)
else:
new_row.update(ip_metadata)
yield new_row
2 changes: 1 addition & 1 deletion pipeline/metadata/flatten_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# Control domains; the trailing comment on each entry names the scan type
# it is associated with.
#
# Every entry must end with a comma: Python silently concatenates adjacent
# string literals, so a missing comma merges two domains into one bogus
# entry instead of raising an error.
CONTROL_URLS = [
    'example5718349450314.com',  # echo/discard
    'rtyutgyhefdafioasfjhjhi.com',  # HTTP/S
    'a.root-servers.net',  # Satellite
    'www.example.com',  # Satellite
]

Expand Down
Loading

0 comments on commit 13a163a

Please sign in to comment.