diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml index f7d404f02..bd84abdf9 100644 --- a/.github/workflows/run-tests.yaml +++ b/.github/workflows/run-tests.yaml @@ -48,7 +48,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - test-groups: ["test_[a-e]*", "test_[f-h]*", "test_[i-r,t-z]*", "test_[s]*"] + test-groups: ["test/test_[a-e]*", "test/test_[f-h]*", "test/test_[i-r,t-z]*", "test/test_[s]*", "test/storage/*"] fail-fast: false steps: # All of these steps are just setup, maybe we should wrap them in an action diff --git a/.gitignore b/.gitignore index f09fbebd4..7eaa3a8dd 100644 --- a/.gitignore +++ b/.gitignore @@ -94,3 +94,5 @@ openwpm/Extension/firefox/dist openwpm/Extension/firefox/openwpm.xpi openwpm/Extension/firefox/src/content.js openwpm/Extension/firefox/src/feature.js + +datadir diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9c13abd3e..828c50433 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,16 @@ repos: hooks: - id: isort - repo: https://github.com/psf/black - rev: 20.8b1 # Replace by any tag/version: https://github.com/psf/black/tags + rev: 20.8b1 hooks: - id: black - language_version: python3 # Should be a command that runs python3.6+ + language_version: python3 + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.790 + hooks: + - id: mypy + additional_dependencies: [pytest] + # We may need to add more and more dependencies here, as pre-commit + # runs in an environment without our dependencies + + diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ed334c6c2..c1612a6e9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,7 +85,6 @@ OpenWPM's tests are build on [pytest](https://docs.pytest.org/en/latest/). 
Execu in the test directory to run all tests: $ conda activate openwpm - $ cd test $ py.test -vv See the [pytest docs](https://docs.pytest.org/en/latest/) for more information on selecting diff --git a/README.md b/README.md index c63d694c3..b83715e19 100644 --- a/README.md +++ b/README.md @@ -12,25 +12,25 @@ the instrumentation section below for more details. Table of Contents ------------------ -* [Installation](#installation) - * [Pre-requisites](#pre-requisites) - * [Install](#install) - * [Mac OSX](#mac-osx) -* [Quick Start](#quick-start) -* [Troubleshooting](#troubleshooting) -* [Advice for Measurement Researchers](#advice-for-measurement-researchers) -* [Developer instructions](#developer-instructions) -* [Instrumentation and Configuration](#instrumentation-and-configuration) -* [Persistence Types](#persistence-types) - * [Local Databases](#local-databases) - * [Parquet on Amazon S3](#parquet-on-amazon-s3) -* [Docker Deployment for OpenWPM](#docker-deployment-for-openwpm) - * [Building the Docker Container](#building-the-docker-container) - * [Running Measurements from inside the Container](#running-measurements-from-inside-the-container) - * [MacOS GUI applications in Docker](#macos-gui-applications-in-docker) -* [Citation](#citation) -* [License](#license) +- [Installation](#installation) + - [Pre-requisites](#pre-requisites) + - [Install](#install) + - [Mac OSX](#mac-osx) +- [Quick Start](#quick-start) +- [Troubleshooting](#troubleshooting) +- [Advice for Measurement Researchers](#advice-for-measurement-researchers) +- [Developer instructions](#developer-instructions) +- [Instrumentation and Configuration](#instrumentation-and-configuration) +- [Storage](#storage) + - [Local Storage](#local-storage) + - [Remote storage](#remote-storage) +- [Docker Deployment for OpenWPM](#docker-deployment-for-openwpm) + - [Building the Docker Container](#building-the-docker-container) + - [Running Measurements from inside the 
Container](#running-measurements-from-inside-the-container) + - [MacOS GUI applications in Docker](#macos-gui-applications-in-docker) +- [Citation](#citation) +- [License](#license) Installation ------------ @@ -178,40 +178,52 @@ If you want to contribute to OpenWPM have a look at our [CONTRIBUTING.md](./CONT Instrumentation and Configuration ------------------------------- + OpenWPM provides a breadth of configuration options which can be found in [Configuration.md](docs/Configuration.md) More detail on the output is available [below](#persistence-types). -Persistence Types +Storage ------------ -#### Local Databases -By default OpenWPM saves all data locally on disk in a variety of formats. -Most of the instrumentation saves to a SQLite database specified -by `manager_params.database_name` in the main output directory. Response -bodies are saved in a LevelDB database named `content.ldb`, and are keyed by -the hash of the content. In addition, the browser commands that dump page -source and save screenshots save them in the `sources` and `screenshots` -subdirectories of the main output directory. The SQLite schema -specified by: `openwpm/DataAggregator/schema.sql`. You can specify additional tables -inline by sending a `create_table` message to the data aggregator. - -#### Parquet on Amazon S3 -As an option, OpenWPM can save data directly to an Amazon S3 bucket as a -Parquet Dataset. This is currently experimental and hasn't been thoroughly -tested. Screenshots, and page source saving is not currently supported and -will still be stored in local databases and directories. To enable S3 -saving specify the following configuration parameters in `manager_params`: -* Persistence Type: `manager_params.output_format = 's3'` -* S3 bucket name: `manager_params.s3_bucket = 'openwpm-test-crawl'` -* Directory within S3 bucket: `manager_params.s3_directory = '2018-09-09_test-crawl-new'` - -In order to save to S3 you must have valid access credentials stored in -`~/.aws`. 
We do not currently allow you to specify an alternate storage -location. - -**NOTE:** The schemas should be kept in sync with the exception of -output-specific columns (e.g., `instance_id` in the S3 output). You can compare +OpenWPM distinguishes between two types of data, structured and unstructured. +Structured data is all data captured by the instrumentation or emitted by the platform. +Generally speaking all data you download is unstructured data. + +For each of the data classes we offer a variety of storage providers, and you are encouraged +to implement your own, should the provided backends not be enough for you. + +We have an outstanding issue to enable saving content generated by commands, such as +screenshots and page dumps to unstructured storage (see [#232](https://github.com/mozilla/OpenWPM/issues/232)). +For now, they get saved to `manager_params.data_directory`. + +### Local Storage + +For storing structured data locally we offer two StorageProviders: + +- The SQLiteStorageProvider which writes all data into a SQLite database + - This is the recommended approach for getting started as the data is easily explorable +- The LocalArrowProvider which stores the data into Parquet files. + - This method integrates well with NumPy/Pandas + - It might be harder to ad-hoc process + +For storing unstructured data locally we also offer two solutions: + +- The LevelDBProvider which stores all data into a LevelDB + - This is the recommended approach +- The LocalGzipProvider that gzips and stores the files individually on disk + - Please note that file systems usually don't like thousands of files in one folder + - Use with care or for single site visits + +### Remote storage + +When running in the cloud, saving records to disk is not a reasonable thing to do. +So we offer remote StorageProviders for S3 (See [#823](https://github.com/mozilla/OpenWPM/issues/823)) and GCP. 
+Currently, all remote StorageProviders write to the respective object storage service (S3/GCS). +The structured providers use the Parquet format. + +**NOTE:** The Parquet and SQL schemas should be kept in sync except for +output-specific columns (e.g., `instance_id` in the Parquet output). You can compare the two schemas by running `diff -y openwpm/DataAggregator/schema.sql openwpm/DataAggregator/parquet_schema.py`. @@ -238,7 +250,7 @@ Docker service. __Step 2:__ to build the image, run the following command from a terminal within the root OpenWPM directory: -``` +```bash docker build -f Dockerfile -t openwpm . ``` @@ -253,7 +265,7 @@ X-server. You can do this by running: `xhost +local:docker` Then you can run the demo script using: -``` +```bash mkdir -p docker-volume && docker run -v $PWD/docker-volume:/opt/Desktop \ -e DISPLAY=$DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix --shm-size=2g \ -it openwpm python3 /opt/OpenWPM/demo.py diff --git a/crawler.py b/crawler.py index 77f1ec6d4..4b156531a 100644 --- a/crawler.py +++ b/crawler.py @@ -4,42 +4,53 @@ import signal import sys import time +from pathlib import Path from threading import Lock from typing import Any, Callable, List -import boto3 import sentry_sdk +from openwpm import mp_logger from openwpm.command_sequence import CommandSequence from openwpm.config import BrowserParams, ManagerParams -from openwpm.mp_logger import parse_config_from_env +from openwpm.storage.cloud_storage.gcp_storage import ( + GcsStructuredProvider, + GcsUnstructuredProvider, +) from openwpm.task_manager import TaskManager from openwpm.utilities import rediswq -from test.utilities import LocalS3Session, local_s3_bucket # Configuration via environment variables +# Crawler specific config REDIS_HOST = os.getenv("REDIS_HOST", "redis-box") REDIS_QUEUE_NAME = os.getenv("REDIS_QUEUE_NAME", "crawl-queue") +MAX_JOB_RETRIES = int(os.getenv("MAX_JOB_RETRIES", "2")) +DWELL_TIME = int(os.getenv("DWELL_TIME", "10")) +TIMEOUT = int(os.getenv("TIMEOUT", 
"60")) + +# Storage Provider Params CRAWL_DIRECTORY = os.getenv("CRAWL_DIRECTORY", "crawl-data") -S3_BUCKET = os.getenv("S3_BUCKET", "openwpm-crawls") +GCS_BUCKET = os.getenv("GCS_BUCKET", "openwpm-crawls") +GCP_PROJECT = os.getenv("GCP_PROJECT", "") +AUTH_TOKEN = os.getenv("GCP_AUTH_TOKEN", "cloud") + +# Browser Params DISPLAY_MODE = os.getenv("DISPLAY_MODE", "headless") HTTP_INSTRUMENT = os.getenv("HTTP_INSTRUMENT", "1") == "1" COOKIE_INSTRUMENT = os.getenv("COOKIE_INSTRUMENT", "1") == "1" NAVIGATION_INSTRUMENT = os.getenv("NAVIGATION_INSTRUMENT", "1") == "1" JS_INSTRUMENT = os.getenv("JS_INSTRUMENT", "1") == "1" CALLSTACK_INSTRUMENT = os.getenv("CALLSTACK_INSTRUMENT", "1") == "1" -JS_INSTRUMENT_SETTINGS = os.getenv( - "JS_INSTRUMENT_SETTINGS", '["collection_fingerprinting"]' +JS_INSTRUMENT_SETTINGS = json.loads( + os.getenv("JS_INSTRUMENT_SETTINGS", '["collection_fingerprinting"]') ) + SAVE_CONTENT = os.getenv("SAVE_CONTENT", "") PREFS = os.getenv("PREFS", None) -DWELL_TIME = int(os.getenv("DWELL_TIME", "10")) -TIMEOUT = int(os.getenv("TIMEOUT", "60")) -SENTRY_DSN = os.getenv("SENTRY_DSN", None) -LOGGER_SETTINGS = parse_config_from_env() -MAX_JOB_RETRIES = int(os.getenv("MAX_JOB_RETRIES", "2")) -JS_INSTRUMENT_SETTINGS = json.loads(JS_INSTRUMENT_SETTINGS) + +SENTRY_DSN = os.getenv("SENTRY_DSN", None) +LOGGER_SETTINGS = mp_logger.parse_config_from_env() if CALLSTACK_INSTRUMENT is True: # Must have JS_INSTRUMENT True for CALLSTACK_INSTRUMENT to work @@ -74,21 +85,30 @@ browser_params[i].prefs = json.loads(PREFS) # Manager configuration -manager_params.data_directory = "~/Desktop/%s/" % CRAWL_DIRECTORY -manager_params.log_directory = "~/Desktop/%s/" % CRAWL_DIRECTORY -manager_params.output_format = "s3" -manager_params.s3_bucket = S3_BUCKET -manager_params.s3_directory = CRAWL_DIRECTORY - -# Allow the use of localstack's mock s3 service -S3_ENDPOINT = os.getenv("S3_ENDPOINT") -if S3_ENDPOINT: - boto3.DEFAULT_SESSION = LocalS3Session(endpoint_url=S3_ENDPOINT) - 
manager_params.s3_bucket = local_s3_bucket(boto3.resource("s3"), name=S3_BUCKET) - +manager_params.data_directory = Path("~/Desktop/") / CRAWL_DIRECTORY +manager_params.log_directory = Path("~/Desktop/") / CRAWL_DIRECTORY + +structured = GcsStructuredProvider( + project=GCP_PROJECT, + bucket_name=GCS_BUCKET, + base_path=CRAWL_DIRECTORY, + token=AUTH_TOKEN, +) +unstructured = GcsUnstructuredProvider( + project=GCP_PROJECT, + bucket_name=GCS_BUCKET, + base_path=CRAWL_DIRECTORY + "/data", + token=AUTH_TOKEN, +) # Instantiates the measurement platform # Commands time out by default after 60 seconds -manager = TaskManager(manager_params, browser_params, logger_kwargs=LOGGER_SETTINGS) +manager = TaskManager( + manager_params, + browser_params, + structured, + unstructured, + logger_kwargs=LOGGER_SETTINGS, +) # At this point, Sentry should be initiated if SENTRY_DSN: @@ -96,7 +116,7 @@ with sentry_sdk.configure_scope() as scope: # tags generate breakdown charts and search filters scope.set_tag("CRAWL_DIRECTORY", CRAWL_DIRECTORY) - scope.set_tag("S3_BUCKET", S3_BUCKET) + scope.set_tag("GCS_BUCKET", GCS_BUCKET) scope.set_tag("DISPLAY_MODE", DISPLAY_MODE) scope.set_tag("HTTP_INSTRUMENT", HTTP_INSTRUMENT) scope.set_tag("COOKIE_INSTRUMENT", COOKIE_INSTRUMENT) @@ -108,9 +128,10 @@ scope.set_tag("DWELL_TIME", DWELL_TIME) scope.set_tag("TIMEOUT", TIMEOUT) scope.set_tag("MAX_JOB_RETRIES", MAX_JOB_RETRIES) - scope.set_tag("CRAWL_REFERENCE", "%s/%s" % (S3_BUCKET, CRAWL_DIRECTORY)) + scope.set_tag("CRAWL_REFERENCE", "%s/%s" % (GCS_BUCKET, CRAWL_DIRECTORY)) # context adds addition information that may be of interest - scope.set_context("PREFS", PREFS) + if PREFS: + scope.set_context("PREFS", json.loads(PREFS)) scope.set_context( "crawl_config", { @@ -159,9 +180,9 @@ def get_job_completion_callback( job_queue: rediswq.RedisWQ, job: bytes, ) -> Callable[[bool], None]: - def callback(sucess: bool) -> None: + def callback(success: bool) -> None: with unsaved_jobs_lock: - if sucess: + if 
success: logger.info("Job %r is done", job) job_queue.complete(job) else: diff --git a/demo.py b/demo.py index 0e3edf88c..4267faf43 100644 --- a/demo.py +++ b/demo.py @@ -1,7 +1,10 @@ +from pathlib import Path + from custom_command import LinkCountingCommand from openwpm.command_sequence import CommandSequence from openwpm.commands.browser_commands import GetCommand from openwpm.config import BrowserParams, ManagerParams +from openwpm.storage.sql_provider import SQLiteStorageProvider from openwpm.task_manager import TaskManager # The list of sites that we wish to crawl @@ -12,13 +15,10 @@ "http://citp.princeton.edu/", ] - # Loads the default ManagerParams # and NUM_BROWSERS copies of the default BrowserParams -manager_params = ManagerParams( - num_browsers=NUM_BROWSERS -) # num_browsers is necessary to let TaskManager know how many browsers to spawn +manager_params = ManagerParams(num_browsers=NUM_BROWSERS) browser_params = [BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)] # Update browser configuration (use this for per-browser settings) @@ -37,35 +37,42 @@ browser_params[i].dns_instrument = True # Update TaskManager configuration (use this for crawl-wide settings) -manager_params.data_directory = "~/Desktop/" -manager_params.log_directory = "~/Desktop/" +manager_params.data_directory = Path("./datadir/") +manager_params.log_directory = Path("./datadir/") # memory_watchdog and process_watchdog are useful for large scale cloud crawls. 
# Please refer to docs/Configuration.md#platform-configuration-options for more information # manager_params.memory_watchdog = True # manager_params.process_watchdog = True -# Instantiates the measurement platform -# Commands time out by default after 60 seconds -manager = TaskManager(manager_params, browser_params) -# Visits the sites -for site in sites: +# Commands time out by default after 60 seconds +with TaskManager( + manager_params, + browser_params, + SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")), + None, +) as manager: + # Visits the sites + for index, site in enumerate(sites): - # Parallelize sites over all number of browsers set above. - command_sequence = CommandSequence( - site, - reset=True, - callback=lambda success, val=site: print("CommandSequence {} done".format(val)), - ) + def callback(success: bool, val: str = site) -> None: + print( + f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}" + ) - # Start by visiting the page - command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60) - # Have a look at custom_command.py to see how to implement your own command - command_sequence.append_command(LinkCountingCommand()) + # Parallelize sites over all number of browsers set above. 
+ command_sequence = CommandSequence( + site, + site_rank=index, + reset=True, + callback=callback, + ) - # Run commands across the three browsers (simple parallelization) - manager.execute_command_sequence(command_sequence) + # Start by visiting the page + command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60) + # Have a look at custom_command.py to see how to implement your own command + command_sequence.append_command(LinkCountingCommand()) -# Shuts down the browsers and waits for the data to finish logging -manager.close() + # Run commands across the three browsers (simple parallelization) + manager.execute_command_sequence(command_sequence) diff --git a/docs/Configuration.md b/docs/Configuration.md index c9505ac72..f6c954c1d 100644 --- a/docs/Configuration.md +++ b/docs/Configuration.md @@ -58,15 +58,13 @@ validate_crawl_configs(manager_params, browser_params) # Platform Configuration Options * `data_directory` - * The directory in which to output the crawl database and related files. The - directory given will be created if it does not exist. + * The directory into which screenshots and page dumps will be saved + * [Intended to be removed by #232](https://github.com/mozilla/OpenWPM/issues/232) * `log_directory` * The directory in which to output platform logs. The directory given will be created if it does not exist. * `log_file` -> supported file extensions are `.log` * The name of the log file to be written to `log_directory`. -* `database_name` -> supported file extensions are `.db`, `.sqlite` - * The name of the database file to be written to `data_directory` * `failure_limit` -> has to be either of type `int` or `None` * The number of command failures the platform will tolerate before raising a `CommandExecutionError` exception. 
Otherwise the default is set to 2 x the diff --git a/docs/Platform-Architecture.md b/docs/Platform-Architecture.md index c6b0c541c..1cc2938d1 100644 --- a/docs/Platform-Architecture.md +++ b/docs/Platform-Architecture.md @@ -2,48 +2,21 @@ ## Overview -The user-facing component of the OpenWPM platform is the Task Manager. The Task Manager oversees multiple browser instances and passes them commands. The Task Manager also ensures that crawls continue despite browser crashes for freezes. In particular, it checks whether a given browser fails to complete a command within a given timeout (or has died) and kills/restarts this browser as necessary. - -## Instantiating a Task Manager - -All automation code is contained within the `openwpm` folder; the Task Manager code is contained in `openwpm/task_manager.py`. - -Task Managers can be instantiated in the following way: -```python -from opemwpm.task_manager import TaskManager -from openwpm.config import ( - BrowserParams, - ManagerParams, -) - -number_of_browser = 5 # Number of browsers to spawn - -# Instantiating Browser and Manager Params with default values. -manager_params = ManagerParams(num_browsers = number_of_browsers) -browser_params = [BrowserParams() for bp in range(manager_params.num_browsers)] - -# These instances can be used to modify default values of both browser and manager params. -manager_params.data_directory = '~/Documents' -manager_params.database_name = 'custom_name.sqlite' - -for i in range(len(browser_params)): - browser_params[i].display_mode = 'headless' # all 5 browsers will spawn in headless mode - -# Instantiating TaskManager -manager = TaskManager(manager_params, browser_params) - -``` - -To learn more about the `manager_params` and `browser_params` have a look at [Configuration.md](Configuration.md) +The user-facing component of the OpenWPM platform is the Task Manager. +The Task Manager oversees multiple browser instances and passes them commands. 
+The Task Manager also ensures that crawls continue despite browser crashes or freezes. +In particular, it checks whether a given browser fails to complete a command within a given timeout (or has died) and +kills/restarts this browser as necessary. ## Watchdogs -In OpenWPM we have a so called watchdog that tries to ensure two things. +In OpenWPM we have a watchdog thread that tries to ensure two things. - `process_watchdog` * It is part of default manager_params. It is set to false by default which can manually be set to true. - * It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that aren't currently controlled by OpenWPM. (GeckoDriver is used by Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server). + * It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that aren't currently controlled by OpenWPM. + (GeckoDriver is used by Selenium to control Firefox and Xvfb is a "virtual display" we use to simulate having graphics when running on a server). - `memory_watchdog` * It is part of default manager_params. It is set to false by default which can manually be set to true. - * It is a watchdog that tries to ensure that no Firefox instance takes up to much memory. + * It is a watchdog that tries to ensure that no Firefox instance takes up too much memory. * It is mostly useful for long running cloud crawls. ## Issuing commands @@ -60,8 +33,8 @@ For example you could wire up a `CommandSequence` to go to a given url and take command_sequence.save_screenshot() ``` -But this on it's own would do nothing, because `CommandSequence`s are not automatically scheduled. 
+Instead, you need to submit them to a `TaskManager` by calling: ```python manager.execute_command_sequence(command_sequence) manager.close() @@ -76,9 +49,7 @@ Please note that you need to close the manager, because by default CommandSequen ## Adding new commands -Currently the easiest way to execute a user defined function as part of a CommandSequence is to use the -`run_custom_function` method on the CommandSequence, however we hope to significantly improve this process -with https://github.com/mozilla/OpenWPM/issues/743. +Have a look at [`custom_command.py`](../custom_command.py) # Browser Manager @@ -94,7 +65,9 @@ The Browser class, contained in the same file, is the Task Manager's wrapper aro ## Browser Information Logging -Throughout the course of a measurement, the Browser Managers' commands (along with timestamps and the status of the commands) are logged by the Task Manager, which contributes the the reproducibility of individual experiments. The data are sent to the Data Aggregator process, which provides stability in logging data despite the possibility of individual browser crashes. +Throughout the course of a measurement, the Browser Managers' commands (along with timestamps and the status of the commands) +are logged by the Task Manager, which contributes to the reproducibility of individual experiments. +The data is sent to the Storage Controller process, which provides stability in logging data despite the possibility of individual browser crashes. 
# The WebExtension diff --git a/environment.yaml b/environment.yaml index f648e22cf..d55d90c7c 100644 --- a/environment.yaml +++ b/environment.yaml @@ -7,11 +7,12 @@ dependencies: - click=7.1.2 - codecov=2.1.11 - dill=0.3.3 +- gcsfs=0.7.2 - geckodriver=0.29.0 - ipython=7.20.0 - leveldb=1.22 -- localstack=0.11.1.1 - multiprocess=0.70.11.1 +- mypy=0.800 - nodejs=14.15.1 - pandas=1.2.2 - pillow=8.1.0 @@ -19,25 +20,21 @@ dependencies: - pre-commit=2.10.1 - psutil=5.8.0 - pyarrow=3.0.0 +- pytest-asyncio=0.14.0 - pytest-cov=2.11.1 - pytest=6.2.2 -- python=3.8.6 +- python=3.9.1 - pyvirtualdisplay=1.3.2 - redis-py=3.5.3 -- s3fs=0.4.0 +- s3fs=0.5.2 - selenium=3.141.0 - sentry-sdk=0.20.2 - tabulate=0.8.7 - tblib=1.6.0 - wget=1.20.1 - pip: - - amazon-kclpy==2.0.1 - - crontab==0.22.9 - dataclasses-json==0.5.2 - domain-utils==0.7.1 - - flask-cors==3.0.10 - jsonschema==3.2.0 - - moto-ext==1.3.15.15 - plyvel==1.3.0 - - subprocess32==3.5.4 name: openwpm diff --git a/openwpm/DataAggregator/S3_aggregator.py b/openwpm/DataAggregator/S3_aggregator.py deleted file mode 100644 index 9dcae0f12..000000000 --- a/openwpm/DataAggregator/S3_aggregator.py +++ /dev/null @@ -1,416 +0,0 @@ -import base64 -import gzip -import hashlib -import io -import json -import queue -import random -import time -from collections import defaultdict -from typing import Any, DefaultDict, Dict, List, MutableSet, Optional - -import boto3 -import pandas as pd -import pyarrow as pa -import pyarrow.parquet as pq -import s3fs -from botocore.client import Config -from botocore.exceptions import ClientError, EndpointConnectionError -from pyarrow.filesystem import S3FSWrapper # noqa - -from openwpm.config import ManagerParamsInternal - -from .base_aggregator import ( - RECORD_TYPE_CONTENT, - RECORD_TYPE_CREATE, - RECORD_TYPE_SPECIAL, - BaseAggregator, - BaseListener, - BaseParams, -) -from .parquet_schema import PQ_SCHEMAS - -CACHE_SIZE = 500 -SITE_VISITS_INDEX = "_site_visits_index" -CONTENT_DIRECTORY = "content" 
-CONFIG_DIR = "config" -BATCH_COMMIT_TIMEOUT = 30 # commit a batch if no new records for N seconds -S3_CONFIG_KWARGS = {"retries": {"max_attempts": 20}} -S3_CONFIG = Config(**S3_CONFIG_KWARGS) - - -def listener_process_runner( - base_params: BaseParams, manager_params: ManagerParamsInternal, instance_id: int -) -> None: - """S3Listener runner. Pass to new process""" - listener = S3Listener(base_params, manager_params, instance_id) - listener.startup() - - while True: - listener.update_status_queue() - listener.save_batch_if_past_timeout() - if listener.should_shutdown(): - break - try: - record = listener.record_queue.get(block=True, timeout=5) - listener.process_record(record) - except queue.Empty: - pass - - listener.drain_queue() - listener.shutdown() - - -class S3Listener(BaseListener): - """Listener that pushes aggregated records to S3. - - Records for each page visit are stored in memory during a page visit. Once - the browser moves to another page, the data is written to S3 as part of - a parquet dataset. 
The schema for this dataset is given in - ./parquet_schema.py - """ - - def __init__( - self, - base_params: BaseParams, - manager_params: ManagerParamsInternal, - instance_id: int, - ) -> None: - self.dir = manager_params.s3_directory - - def factory_function(): - return defaultdict(list) - - self._records: Dict[int, DefaultDict[str, List[Any]]] = defaultdict( - factory_function - ) # maps visit_id and table to records - self._batches: DefaultDict[str, List[pa.RecordBatch]] = defaultdict( - list - ) # maps table_name to a list of batches - self._unsaved_visit_ids: MutableSet[int] = set() - - self._instance_id = instance_id - self._bucket = manager_params.s3_bucket - self._s3_content_cache: MutableSet[ - str - ] = set() # cache of filenames already uploaded - self._s3 = boto3.client("s3", config=S3_CONFIG) - self._s3_resource = boto3.resource("s3", config=S3_CONFIG) - self._fs = s3fs.S3FileSystem( - session=boto3.DEFAULT_SESSION, config_kwargs=S3_CONFIG_KWARGS - ) - self._s3_bucket_uri = "s3://%s/%s/visits/%%s" % (self._bucket, self.dir) - # time last record was received - self._last_record_received: Optional[float] = None - super(S3Listener, self).__init__(*base_params) - - def _write_record(self, table, data, visit_id): - """Insert data into a RecordBatch""" - records = self._records[visit_id] - # Add nulls - for item in PQ_SCHEMAS[table].names: - if item not in data: - data[item] = None - # Add instance_id (for partitioning) - data["instance_id"] = self._instance_id - records[table].append(data) - - def _create_batch(self, visit_id: int) -> None: - """Create record batches for all records from `visit_id`""" - if visit_id not in self._records: - # The batch for this `visit_id` was already created, skip - return - for table_name, data in self._records[visit_id].items(): - try: - df = pd.DataFrame(data) - batch = pa.RecordBatch.from_pandas( - df, schema=PQ_SCHEMAS[table_name], preserve_index=False - ) - self._batches[table_name].append(batch) - self.logger.debug( - 
"Successfully created batch for table %s and " - "visit_id %s" % (table_name, visit_id) - ) - except pa.lib.ArrowInvalid: - self.logger.error( - "Error while creating record batch for table %s\n" % table_name, - exc_info=True, - ) - pass - # We construct a special index file from the site_visits data - # to make it easier to query the dataset - if table_name == "site_visits": - if SITE_VISITS_INDEX not in self._batches: - self._batches[SITE_VISITS_INDEX] = list() - for item in data: - self._batches[SITE_VISITS_INDEX].append(item) - - del self._records[visit_id] - self._unsaved_visit_ids.add(visit_id) - - def _exists_on_s3(self, filename: str) -> bool: - """Check if `filename` already exists on S3""" - # Check local filename cache - if filename.split("/", 1)[1] in self._s3_content_cache: - self.logger.debug("File `%s` found in content cache." % filename) - return True - - # Check S3 - try: - self._s3_resource.Object(self._bucket, filename).load() - except ClientError as e: - if e.response["Error"]["Code"] == "404": - return False - else: - raise - except EndpointConnectionError: - self.logger.error( - "Exception while checking if file exists %s" % filename, exc_info=True - ) - return False - - # Add filename to local cache to avoid remote lookups on next request - # We strip the bucket name as its the same for all files - self._s3_content_cache.add(filename.split("/", 1)[1]) - return True - - def _write_str_to_s3(self, string, filename, compressed=True, skip_if_exists=True): - """Write `string` data to S3 with name `filename`""" - if skip_if_exists and self._exists_on_s3(filename): - self.logger.debug("File `%s` already exists on s3, skipping..." 
% filename) - return - if not isinstance(string, bytes): - string = string.encode("utf-8") - if compressed: - out_f = io.BytesIO() - with gzip.GzipFile(fileobj=out_f, mode="w") as writer: - writer.write(string) - out_f.seek(0) - else: - out_f = io.BytesIO(string) - - # Upload to S3 - try: - self._s3.upload_fileobj(out_f, self._bucket, filename) - self.logger.debug("Successfully uploaded file `%s` to S3." % filename) - # Cache the filenames that are already on S3 - # We strip the bucket name as its the same for all files - if skip_if_exists: - self._s3_content_cache.add(filename.split("/", 1)[1]) - except Exception: - self.logger.error("Exception while uploading %s" % filename, exc_info=True) - pass - - def _send_to_s3(self, force=False): - """Copy in-memory batches to s3""" - should_send = force - for batches in self._batches.values(): - if len(batches) > CACHE_SIZE: - should_send = True - if not should_send: - return - - for table_name, batches in self._batches.items(): - if table_name == SITE_VISITS_INDEX: - out_str = "\n".join([json.dumps(x) for x in batches]) - out_str = out_str.encode("utf-8") - fname = "%s/site_index/instance-%s-%s.json.gz" % ( - self.dir, - self._instance_id, - hashlib.md5(out_str).hexdigest(), - ) - self._write_str_to_s3(out_str, fname) - else: - if len(batches) == 0: - continue - try: - table = pa.Table.from_batches(batches) - pq.write_to_dataset( - table, - self._s3_bucket_uri % table_name, - filesystem=self._fs, - partition_cols=["instance_id"], - compression="snappy", - flavor="spark", - ) - except (pa.lib.ArrowInvalid, EndpointConnectionError): - self.logger.error( - "Error while sending records for: %s" % table_name, - exc_info=True, - ) - pass - # can't del here because that would modify batches - self._batches[table_name] = list() - for visit_id in self._unsaved_visit_ids: - self.mark_visit_complete(visit_id) - self._unsaved_visit_ids = set() - - def save_batch_if_past_timeout(self): - """Save the current batch of records if no new 
data has been received. - - If we aren't receiving new data for this batch we commit early - regardless of the current batch size.""" - if self._last_record_received is None: - return - if time.time() - self._last_record_received < BATCH_COMMIT_TIMEOUT: - return - self.logger.debug( - "Saving current record batches to S3 since no new data has " - "been written for %d seconds." % (time.time() - self._last_record_received) - ) - self.drain_queue() - self._last_record_received = None - - def process_record(self, record): - """Add `record` to database""" - if len(record) != 2: - self.logger.error("Query is not the correct length %s", repr(record)) - return - self._last_record_received = time.time() - table, data = record - if table == RECORD_TYPE_CREATE: # drop these statements - return - if table == RECORD_TYPE_CONTENT: - self.process_content(record) - return - if table == RECORD_TYPE_SPECIAL: - self.handle_special(data) - return - - # Convert data to text type - for k, v in data.items(): - if isinstance(v, bytes): - data[k] = str(v, errors="ignore") - elif callable(v): - data[k] = str(v) - # TODO: Can we fix this in the extension? - elif type(v) == dict: - data[k] = json.dumps(v) - - # Save record to disk - self._write_record(table, data, data["visit_id"]) - - def process_content(self, record): - """Upload page content `record` to S3""" - if record[0] != RECORD_TYPE_CONTENT: - raise ValueError( - "Incorrect record type passed to `process_content`. Expected " - "record of type `%s`, received `%s`." 
% (RECORD_TYPE_CONTENT, record[0]) - ) - content, content_hash = record[1] - content = base64.b64decode(content) - fname = "%s/%s/%s.gz" % (self.dir, CONTENT_DIRECTORY, content_hash) - self._write_str_to_s3(content, fname) - - def drain_queue(self): - """Process remaining records in queue and sync final files to S3""" - super(S3Listener, self).drain_queue() - self._send_to_s3(force=True) - - def run_visit_completion_tasks(self, visit_id: int, interrupted: bool = False): - if interrupted: - self.logger.error("Visit with visit_id %d got interrupted", visit_id) - self._write_record("incomplete_visits", {"visit_id": visit_id}, visit_id) - self._create_batch(visit_id) - self.mark_visit_incomplete(visit_id) - return - self._create_batch(visit_id) - self._send_to_s3() - - def shutdown(self): - # We should only have unsaved records if we are in forced shutdown - if self._relaxed and self._records: - self.logger.error("Had unfinished records during relaxed shutdown") - super(S3Listener, self).shutdown() - self._send_to_s3(force=True) - - -class S3Aggregator(BaseAggregator): - """ - Receives data records from other processes and aggregates them locally - per-site before pushing them to a remote S3 bucket. The remote files are - saved in a Paquet Dataset partitioned by the browser_id and visit_id of - each record. - - The visit and task ids are randomly generated to allow multiple writers - to write to the same S3 bucket. Every record should have a `visit_id` - (which identifies the site visit) and a `browser_id` (which identifies the - browser instance) so we can associate records with the appropriate meta - data. Any records which lack this information will be dropped by the - writer. - - Note: Parquet's partitioned dataset reader only supports integer partition - columns up to 32 bits. Currently, `instance_id` is the only partition - column, and thus can be no larger than 32 bits. 
- """ - - def __init__(self, manager_params, browser_params): - super(S3Aggregator, self).__init__(manager_params, browser_params) - self.dir = manager_params.s3_directory - self.bucket = manager_params.s3_bucket - self.s3 = boto3.client("s3") - self._instance_id = random.getrandbits(32) - self._create_bucket() - - def _create_bucket(self): - """Create remote S3 bucket if it doesn't exist""" - resource = boto3.resource("s3") - try: - resource.meta.client.head_bucket(Bucket=self.bucket) - except ClientError as e: - error_code = int(e.response["Error"]["Code"]) - if error_code == 404: - resource.create_bucket(Bucket=self.bucket) - else: - raise - - def save_configuration(self, openwpm_version, browser_version): - """Save configuration details for this crawl to the database""" - - # Save config keyed by task id - fname = "%s/%s/instance-%s_configuration.json" % ( - self.dir, - CONFIG_DIR, - self._instance_id, - ) - - # Config parameters for update - out = dict() - out["manager_params"] = self.manager_params - out["openwpm_version"] = str(openwpm_version) - out["browser_version"] = str(browser_version) - out["browser_params"] = self.browser_params - out_str = json.dumps(out) - out_bytes = out_str.encode("utf-8") - out_f = io.BytesIO(out_bytes) - - # Upload to S3 and delete local copy - try: - self.s3.upload_fileobj(out_f, self.bucket, fname) - except Exception: - self.logger.error("Exception while uploading %s" % fname) - raise - - def get_next_visit_id(self): - """Generate visit id as randomly generated positive integer less than 2^53. - - Parquet can support integers up to 64 bits, but Javascript can only - represent integers up to 53 bits: - https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Number/MAX_SAFE_INTEGER - Thus, we cap these values at 53 bits. 
- """ - return random.getrandbits(53) - - def get_next_browser_id(self): - """Generate crawl id as randomly generated positive 32bit integer - - Note: Parquet's partitioned dataset reader only supports integer - partition columns up to 32 bits. - """ - return random.getrandbits(32) - - def launch(self): - """Launch the aggregator listener process""" - super(S3Aggregator, self).launch( - listener_process_runner, self.manager_params, self._instance_id - ) diff --git a/openwpm/DataAggregator/base_aggregator.py b/openwpm/DataAggregator/base_aggregator.py deleted file mode 100644 index 68fa8d531..000000000 --- a/openwpm/DataAggregator/base_aggregator.py +++ /dev/null @@ -1,304 +0,0 @@ -import abc -import logging -import queue -import threading -import time -from typing import Any, Dict, List, Optional, Tuple - -from multiprocess import Queue - -from openwpm.config import BrowserParamsInternal, ManagerParamsInternal - -from ..socket_interface import ServerSocket -from ..utilities.multiprocess_utils import Process - -RECORD_TYPE_CONTENT = "page_content" -RECORD_TYPE_SPECIAL = "meta_information" -ACTION_TYPE_FINALIZE = "Finalize" -ACTION_TYPE_INITIALIZE = "Initialize" -RECORD_TYPE_CREATE = "create_table" -STATUS_TIMEOUT = 120 # seconds -SHUTDOWN_SIGNAL = "SHUTDOWN" - -STATUS_UPDATE_INTERVAL = 5 # seconds - -BaseParams = Tuple[Queue, Queue, Queue] - - -class BaseListener: - """Base class for the data aggregator listener process. This class is used - alongside the BaseAggregator class to spawn an aggregator process that - combines data collected in multiple crawl processes and stores it - persistently as specified in the child class. The BaseListener class - is instantiated in the remote process, and sets up a listening socket to - receive data. Classes which inherit from this base class define - how that data is written to disk. 
- """ - - __metaclass = abc.ABCMeta - - def __init__( - self, status_queue: Queue, completion_queue: Queue, shutdown_queue: Queue - ) -> None: - """ - Creates a BaseListener instance - - Parameters - ---------- - status_queue - queue that the current amount of records to be processed will - be sent to - also used for initialization - completion_queue - queue containing the visitIDs of saved records - shutdown_queue - queue that the main process can use to shut down the listener - """ - self.status_queue = status_queue - self.completion_queue = completion_queue - self.shutdown_queue = shutdown_queue - self._shutdown_flag = False - self._relaxed = False - self._last_update = time.time() # last status update time - self.record_queue: Queue = None # Initialized on `startup` - self.logger = logging.getLogger("openwpm") - self.curent_visit_ids: List[int] = list() # All visit_ids in flight - self.sock: Optional[ServerSocket] = None - - @abc.abstractmethod - def process_record(self, record): - """Parse and save `record` to persistent storage. - - Parameters - ---------- - record : tuple - 2-tuple in format (table_name, data). `data` is a dict which maps - column name to the record for that column""" - - @abc.abstractmethod - def process_content(self, record): - """Parse and save page content `record` to persistent storage. - - Parameters - ---------- - record : tuple - 2-tuple in format (table_name, data). 
`data` is a 2-tuple of the - for (content, content_hash)""" - - @abc.abstractmethod - def run_visit_completion_tasks(self, visit_id: int, interrupted: bool = False): - """Will be called once a visit_id will receive no new records - - Parameters - ---------- - visit_id - the id that will receive no more updates - interrupted - whether a visit is unfinished""" - - def startup(self): - """Run listener startup tasks - - Note: Child classes should call this method""" - self.sock = ServerSocket(name=type(self).__name__) - self.status_queue.put(self.sock.sock.getsockname()) - self.sock.start_accepting() - self.record_queue = self.sock.queue - - def should_shutdown(self): - """Return `True` if the listener has received a shutdown signal - Sets `self._relaxed` and `self.shutdown_flag` - `self._relaxed means this shutdown is - happening after all visits have completed and - all data can be seen as complete - """ - if not self.shutdown_queue.empty(): - _, relaxed = self.shutdown_queue.get() - self._relaxed = relaxed - self._shutdown_flag = True - self.logger.info("Received shutdown signal!") - return True - return False - - def update_status_queue(self): - """Send manager process a status update.""" - if (time.time() - self._last_update) < STATUS_UPDATE_INTERVAL: - return - qsize = self.record_queue.qsize() - self.status_queue.put(qsize) - self.logger.debug( - "Status update; current record queue size: %d. " - "current number of threads: %d." % (qsize, threading.active_count()) - ) - self._last_update = time.time() - - def handle_special(self, data: Dict[str, Any]) -> None: - """ - Messages for the table RECORD_TYPE_SPECIAL are metainformation - communicated to the aggregator - Supported message types: - - finalize: A message sent by the extension to - signal that a visit_id is complete. 
- """ - if data["action"] == ACTION_TYPE_INITIALIZE: - self.curent_visit_ids.append(data["visit_id"]) - elif data["action"] == ACTION_TYPE_FINALIZE: - try: - self.curent_visit_ids.remove(data["visit_id"]) - except ValueError: - self.logger.error( - "Trying to remove visit_id %i " "from current_visit_ids failed", - data["visit_id"], - ) - - self.run_visit_completion_tasks( - data["visit_id"], interrupted=not data["success"] - ) - else: - raise ValueError( - "Unexpected meta " "information type: %s" % data["meta_type"] - ) - - def mark_visit_complete(self, visit_id: int) -> None: - """This function should be called to indicate that all records - relating to a certain visit_id have been saved""" - self.completion_queue.put((visit_id, False)) - - def mark_visit_incomplete(self, visit_id: int): - """This function should be called to indicate that a certain visit - has been interrupted and will forever be incomplete - """ - self.completion_queue.put((visit_id, True)) - - def shutdown(self): - """Run shutdown tasks defined in the base listener - - Note: Child classes should call this method""" - self.sock.close() - for visit_id in self.curent_visit_ids: - self.run_visit_completion_tasks(visit_id, interrupted=not self._relaxed) - - def drain_queue(self): - """ Ensures queue is empty before closing """ - time.sleep(3) # TODO: the socket needs a better way of closing - while not self.record_queue.empty(): - record = self.record_queue.get() - self.process_record(record) - self.logger.info("Queue was flushed completely") - - -class BaseAggregator: - """Base class for the data aggregator interface. This class is used - alongside the BaseListener class to spawn an aggregator process that - combines data from multiple crawl processes. The BaseAggregator class - manages the child listener process. 
- - Parameters - ---------- - manager_params : ManagerParamsInternal - TaskManager configuration parameters - browser_params : list of BrowserParamsInternal - List of browser configuration class""" - - __metaclass__ = abc.ABCMeta - - def __init__( - self, - manager_params: ManagerParamsInternal, - browser_params: List[BrowserParamsInternal], - ): - self.manager_params = manager_params - self.browser_params = browser_params - self.listener_address = None - self.listener_process = None - self.status_queue = Queue() - self.completion_queue = Queue() - self.shutdown_queue = Queue() - self._last_status = None - self._last_status_received = None - self.logger = logging.getLogger("openwpm") - - @abc.abstractmethod - def save_configuration(self, openwpm_version, browser_version): - """Save configuration details to the database""" - - @abc.abstractmethod - def get_next_visit_id(self): - """Return a unique visit ID to be used as a key for a single visit""" - - @abc.abstractmethod - def get_next_browser_id(self): - """Return a unique crawl ID used as a key for a browser instance""" - - def get_most_recent_status(self): - """Return the most recent queue size sent from the listener process""" - - # Block until we receive the first status update - if self._last_status is None: - return self.get_status() - - # Drain status queue until we receive most recent update - while not self.status_queue.empty(): - self._last_status = self.status_queue.get() - self._last_status_received = time.time() - - # Check last status signal - if (time.time() - self._last_status_received) > STATUS_TIMEOUT: - raise RuntimeError( - "No status update from DataAggregator listener process " - "for %d seconds." % (time.time() - self._last_status_received) - ) - - return self._last_status - - def get_status(self): - """Get listener process status. 
If the status queue is empty, block.""" - try: - self._last_status = self.status_queue.get( - block=True, timeout=STATUS_TIMEOUT - ) - self._last_status_received = time.time() - except queue.Empty: - raise RuntimeError( - "No status update from DataAggregator listener process " - "for %d seconds." % (time.time() - self._last_status_received) - ) - return self._last_status - - def get_new_completed_visits(self) -> List[Tuple[int, bool]]: - """ - Returns a list of all visit ids that have been processed since - the last time the method was called and whether or not they - have been interrupted. - - This method will return an empty list in case no visit ids have - been processed since the last time this method was called - """ - finished_visit_ids = list() - while not self.completion_queue.empty(): - finished_visit_ids.append(self.completion_queue.get()) - return finished_visit_ids - - def launch(self, listener_process_runner, *args): - """Launch the aggregator listener process""" - args = ((self.status_queue, self.completion_queue, self.shutdown_queue),) + args - self.listener_process = Process(target=listener_process_runner, args=args) - self.listener_process.daemon = True - self.listener_process.start() - self.listener_address = self.status_queue.get() - - def shutdown(self, relaxed: bool = True): - """ Terminate the aggregator listener process""" - self.logger.debug( - "Sending the shutdown signal to the %s listener process..." - % type(self).__name__ - ) - self.shutdown_queue.put((SHUTDOWN_SIGNAL, relaxed)) - start_time = time.time() - self.listener_process.join(300) - self.logger.debug( - "%s took %s seconds to close." 
- % (type(self).__name__, str(time.time() - start_time)) - ) - self.listener_address = None - self.listener_process = None diff --git a/openwpm/DataAggregator/local_aggregator.py b/openwpm/DataAggregator/local_aggregator.py deleted file mode 100644 index 44102d71c..000000000 --- a/openwpm/DataAggregator/local_aggregator.py +++ /dev/null @@ -1,303 +0,0 @@ -import base64 -import json -import os -import sqlite3 -import time -from sqlite3 import IntegrityError, InterfaceError, OperationalError, ProgrammingError -from typing import Any, Dict, List, Tuple, Union - -import plyvel - -from openwpm.config import BrowserParamsInternal, ManagerParamsInternal - -from .base_aggregator import ( - RECORD_TYPE_CONTENT, - RECORD_TYPE_CREATE, - RECORD_TYPE_SPECIAL, - BaseAggregator, - BaseListener, -) - -SQL_BATCH_SIZE = 1000 -LDB_BATCH_SIZE = 100 -MIN_TIME = 5 # seconds -SCHEMA_FILE = os.path.join(os.path.dirname(__file__), "schema.sql") -LDB_NAME = "content.ldb" - - -def listener_process_runner(base_params, manager_params, ldb_enabled): - """LocalListener runner. 
Pass to new process""" - listener = LocalListener(base_params, manager_params, ldb_enabled) - listener.startup() - - while True: - listener.update_status_queue() - if listener.should_shutdown(): - break - - if listener.record_queue.empty(): - time.sleep(1) - listener.maybe_commit_records() - continue - - # Process record - record = listener.record_queue.get() - listener.process_record(record) - - # batch commit if necessary - listener.maybe_commit_records() - - listener.drain_queue() - listener.shutdown() - - -class LocalListener(BaseListener): - """Listener that interfaces with a local SQLite database.""" - - def __init__(self, base_params, manager_params, ldb_enabled): - db_path = manager_params.database_name - self.db = sqlite3.connect(db_path, check_same_thread=False) - self.cur = self.db.cursor() - self.ldb_enabled = ldb_enabled - if self.ldb_enabled: - self.ldb = plyvel.DB( - os.path.join(manager_params.data_directory, LDB_NAME), - create_if_missing=True, - write_buffer_size=128 * 10 ** 6, - compression="snappy", - ) - self.content_batch = self.ldb.write_batch() - self._ldb_counter = 0 - self._ldb_commit_time = 0 - self._sql_counter = 0 - self._sql_commit_time = 0 - - super(LocalListener, self).__init__(*base_params) - - def _generate_insert(self, table, data): - """Generate a SQL query from `record`""" - statement = "INSERT INTO %s (" % table - value_str = "VALUES (" - values = list() - first = True - for field, value in data.items(): - statement += "" if first else ", " - statement += field - value_str += "?" if first else ",?" 
- values.append(value) - first = False - statement = statement + ") " + value_str + ")" - return statement, values - - def process_record(self, record: Tuple[str, Union[str, Dict[str, Any]]]): - """Add `record` to database""" - - if len(record) != 2: - self.logger.error("Query is not the correct length %s", repr(record)) - return - - table, data = record - if table == RECORD_TYPE_CREATE: - assert isinstance(data, str) - self.cur.execute(data) - self.db.commit() - return - if table == RECORD_TYPE_CONTENT: - self.process_content(record) - return - - assert isinstance(data, dict) - - if table == RECORD_TYPE_SPECIAL: - self.handle_special(data) - return - - statement, args = self._generate_insert(table=table, data=data) - for i in range(len(args)): - if isinstance(args[i], bytes): - args[i] = str(args[i], errors="ignore") - elif callable(args[i]): - args[i] = str(args[i]) - elif type(args[i]) == dict: - args[i] = json.dumps(args[i]) - try: - self.cur.execute(statement, args) - self._sql_counter += 1 - except ( - OperationalError, - ProgrammingError, - IntegrityError, - InterfaceError, - ) as e: - self.logger.error( - "Unsupported record:\n%s\n%s\n%s\n%s\n" - % (type(e), e, statement, repr(args)) - ) - - def process_content(self, record): - """Add page content to the LevelDB database""" - table, data = record - if table != RECORD_TYPE_CONTENT: - raise ValueError( - "Incorrect record type passed to `process_content`. Expected " - "record of type `%s`, received `%s`." % (RECORD_TYPE_CONTENT, table) - ) - if not self.ldb_enabled: - raise RuntimeError( - "Attempted to save page content but the LevelDB content " - "database is not enabled." 
- ) - content, content_hash = data - content = base64.b64decode(content) - content_hash = str(content_hash).encode("ascii") - if self.ldb.get(content_hash) is not None: - return - self.content_batch.put(content_hash, content) - self._ldb_counter += 1 - - def _write_content_batch(self): - """Write out content batch to LevelDB database""" - self.content_batch.write() - self.content_batch = self.ldb.write_batch() - - def maybe_commit_records(self): - """Commit records to database if record count or timer is over limit""" - - # Commit SQLite Database inserts - sql_over_time = (time.time() - self._sql_commit_time) > MIN_TIME - if self._sql_counter >= SQL_BATCH_SIZE or ( - self._sql_counter > 0 and sql_over_time - ): - self.db.commit() - self._sql_counter = 0 - self._sql_commit_time = time.time() - - # Write LevelDB batch to DB - if not self.ldb_enabled: - return - ldb_over_time = (time.time() - self._ldb_commit_time) > MIN_TIME - if self._ldb_counter >= LDB_BATCH_SIZE or ( - self._ldb_counter > 0 and ldb_over_time - ): - self._write_content_batch() - self._ldb_counter = 0 - self._ldb_commit_time = time.time() - - def run_visit_completion_tasks(self, visit_id: int, interrupted: bool = False): - if interrupted: - self.logger.warning("Visit with visit_id %d got interrupted", visit_id) - self.cur.execute("INSERT INTO incomplete_visits VALUES (?)", (visit_id,)) - self.mark_visit_incomplete(visit_id) - else: - self.mark_visit_complete(visit_id) - - def shutdown(self): - super(LocalListener, self).shutdown() - self.db.commit() - self.db.close() - if self.ldb_enabled: - self._write_content_batch() - self.ldb.close() - - -class LocalAggregator(BaseAggregator): - """ - Receives SQL queries from other processes and writes them to the central - database. Executes queries until being told to die (then it will finish - work and shut down). Killing this process will result in data loss. - - If content saving is enabled, we write page content to a LevelDB database. 
- """ - - def __init__( - self, - manager_params: ManagerParamsInternal, - browser_params: List[BrowserParamsInternal], - ): - super(LocalAggregator, self).__init__(manager_params, browser_params) - db_path = self.manager_params.database_name - if not os.path.exists(manager_params.data_directory): - os.mkdir(manager_params.data_directory) - self.db = sqlite3.connect(db_path, check_same_thread=False) - self.cur = self.db.cursor() - self._create_tables() - self._get_last_used_ids() - - # Mark if LDBAggregator is needed - # (if content saving is enabled on any browser) - self.ldb_enabled = False - for params in browser_params: - if params.save_content: - self.ldb_enabled = True - break - - def _create_tables(self): - """Create tables (if this is a new database)""" - with open(SCHEMA_FILE, "r") as f: - self.db.executescript(f.read()) - self.db.commit() - - def _get_last_used_ids(self): - """Query max ids from database""" - self.cur.execute("SELECT MAX(visit_id) from site_visits") - last_visit_id = self.cur.fetchone()[0] - if last_visit_id is None: - last_visit_id = 0 - self.current_visit_id = last_visit_id - - self.cur.execute("SELECT MAX(browser_id) from crawl") - last_browser_id = self.cur.fetchone()[0] - if last_browser_id is None: - last_browser_id = 0 - self.current_browser_id = last_browser_id - - def save_configuration(self, openwpm_version, browser_version): - """Save configuration details for this crawl to the database""" - - # Record task details - self.cur.execute( - "INSERT INTO task " - "(manager_params, openwpm_version, browser_version) " - "VALUES (?,?,?)", - ( - self.manager_params.to_json(), - openwpm_version, - browser_version, - ), - ) - self.db.commit() - self.task_id = self.cur.lastrowid - - # Record browser details for each brower - for i in range(self.manager_params.num_browsers): - self.cur.execute( - "INSERT INTO crawl (browser_id, task_id, browser_params) " - "VALUES (?,?,?)", - ( - self.browser_params[i].browser_id, - self.task_id, - 
self.browser_params[i].to_json(), - ), - ) - self.db.commit() - - def get_next_visit_id(self): - """Returns the next visit id""" - self.current_visit_id += 1 - return self.current_visit_id - - def get_next_browser_id(self): - """Returns the next crawl id""" - self.current_browser_id += 1 - return self.current_browser_id - - def launch(self): - """Launch the aggregator listener process""" - super(LocalAggregator, self).launch( - listener_process_runner, self.manager_params, self.ldb_enabled - ) - - def shutdown(self, relaxed: bool = False) -> None: - """ Terminates the aggregator""" - super(LocalAggregator, self).shutdown(relaxed) - self.db.close() diff --git a/openwpm/Extension/firefox/README.md b/openwpm/Extension/firefox/README.md index 8bbcdc89b..ac226c026 100644 --- a/openwpm/Extension/firefox/README.md +++ b/openwpm/Extension/firefox/README.md @@ -1,4 +1,3 @@ # OpenWPM Client Extension -Used by the OpenWPM platform. This extension implements the OpenWPM instrumentation -module (https://github.com/mozilla/openwpm-webext-instrumentation/) in a WebExtension. \ No newline at end of file +Used by the OpenWPM platform. This extension implements the OpenWPM instrumentation module (https://github.com/mozilla/openwpm-webext-instrumentation/) in a WebExtension. 
\ No newline at end of file diff --git a/openwpm/Extension/firefox/content.js/index.js b/openwpm/Extension/firefox/content.js/index.js index d925d3bbc..c60740d92 100644 --- a/openwpm/Extension/firefox/content.js/index.js +++ b/openwpm/Extension/firefox/content.js/index.js @@ -1,4 +1,4 @@ -import { injectJavascriptInstrumentPageScript } from "openwpm-webext-instrumentation"; +import {injectJavascriptInstrumentPageScript} from "openwpm-webext-instrumentation"; injectJavascriptInstrumentPageScript(window.openWpmContentScriptConfig || {}); delete window.openWpmContentScriptConfig; diff --git a/openwpm/Extension/firefox/feature.js/index.js b/openwpm/Extension/firefox/feature.js/index.js index 183b95b39..daf634bcb 100644 --- a/openwpm/Extension/firefox/feature.js/index.js +++ b/openwpm/Extension/firefox/feature.js/index.js @@ -1,13 +1,13 @@ import { - CookieInstrument, - JavascriptInstrument, - HttpInstrument, - NavigationInstrument, - DnsInstrument + CookieInstrument, + DnsInstrument, + HttpInstrument, + JavascriptInstrument, + NavigationInstrument } from "openwpm-webext-instrumentation"; import * as loggingDB from "./loggingdb.js"; -import { CallstackInstrument } from "./callstack-instrument.js"; +import {CallstackInstrument} from "./callstack-instrument.js"; async function main() { // Read the browser configuration from file @@ -50,7 +50,7 @@ async function main() { "the extension. 
Outputting all queries to console.", {config}); } - await loggingDB.open(config['aggregator_address'], + await loggingDB.open(config['storage_controller_address'], config['logger_address'], config['browser_id']); diff --git a/openwpm/Extension/firefox/feature.js/loggingdb.js b/openwpm/Extension/firefox/feature.js/loggingdb.js index 87fdd8bfb..b8b7caa01 100644 --- a/openwpm/Extension/firefox/feature.js/loggingdb.js +++ b/openwpm/Extension/firefox/feature.js/loggingdb.js @@ -3,7 +3,7 @@ import * as socket from "./socket.js"; let crawlID = null; let visitID = null; let debugging = false; -let dataAggregator = null; +let storageController = null; let logAggregator = null; let listeningSocket = null; @@ -19,19 +19,19 @@ let listeningSocketCallback = async (data) => { } visitID = _visitID; data["browser_id"] = crawlID; - dataAggregator.send(JSON.stringify(["meta_information", data])); + storageController.send(JSON.stringify(["meta_information", data])); break; case "Finalize": if (!visitID) { logWarn("Send Finalize while no visit_id was set") } - if (_visitID != visitID ) { + if (_visitID !== visitID ) { logError("Send Finalize but visit_id didn't match. 
" + `Current visit_id ${visit_id}, sent visit_id ${_visit_id}.`); } data["browser_id"] = crawlID; data["success"] = true; - dataAggregator.send(JSON.stringify(["meta_information", data])); + storageController.send(JSON.stringify(["meta_information", data])); visitID = null; break; default: @@ -43,8 +43,8 @@ let listeningSocketCallback = async (data) => { } } -export let open = async function(aggregatorAddress, logAddress, curr_crawlID) { - if (aggregatorAddress == null && logAddress == null && curr_crawlID == '') { +export let open = async function(storageControllerAddress, logAddress, curr_crawlID) { + if (storageControllerAddress == null && logAddress == null && curr_crawlID === '') { console.log("Debugging, everything will output to console"); debugging = true; return; @@ -61,10 +61,10 @@ export let open = async function(aggregatorAddress, logAddress, curr_crawlID) { } // Connect to databases for saving data - if (aggregatorAddress != null) { - dataAggregator = new socket.SendingSocket(); - let rv = await dataAggregator.connect(aggregatorAddress[0], aggregatorAddress[1]); - console.log("sqliteSocket started?",rv); + if (storageControllerAddress != null) { + storageController = new socket.SendingSocket(); + let rv = await storageController.connect(storageControllerAddress[0], storageControllerAddress[1]); + console.log("StorageController started?",rv); } // Listen for incoming urls as visit ids @@ -76,8 +76,8 @@ export let open = async function(aggregatorAddress, logAddress, curr_crawlID) { }; export let close = function() { - if (dataAggregator != null) { - dataAggregator.close(); + if (storageController != null) { + storageController.close(); } if (logAggregator != null) { logAggregator.close(); @@ -189,7 +189,7 @@ export let saveRecord = function(instrument, record) { console.log("EXTENSION", instrument, record); return; } - dataAggregator.send(JSON.stringify([instrument, record])); + storageController.send(JSON.stringify([instrument, record])); }; // Stub for 
now @@ -203,7 +203,7 @@ export let saveContent = async function(content, contentHash) { // Since the content might not be a valid utf8 string and it needs to be // json encoded later, it is encoded using base64 first. const b64 = Uint8ToBase64(content); - dataAggregator.send(JSON.stringify(['page_content', [b64, contentHash]])); + storageController.send(JSON.stringify(['page_content', [b64, contentHash]])); }; function encode_utf8(s) { diff --git a/openwpm/Extension/firefox/package-lock.json b/openwpm/Extension/firefox/package-lock.json index 169f24fee..b038c4c35 100644 --- a/openwpm/Extension/firefox/package-lock.json +++ b/openwpm/Extension/firefox/package-lock.json @@ -12079,7 +12079,10 @@ "from": "github:conventional-changelog/standard-version#master", "requires": { "chalk": "^2.4.2", + "conventional-changelog": "3.1.23", "conventional-changelog-config-spec": "2.1.0", + "conventional-changelog-conventionalcommits": "4.4.0", + "conventional-recommended-bump": "6.0.10", "detect-indent": "^6.0.0", "detect-newline": "^3.1.0", "dotgitignore": "^2.1.0", @@ -12088,7 +12091,8 @@ "fs-access": "^1.0.1", "git-semver-tags": "^4.0.0", "semver": "^7.1.1", - "stringify-package": "^1.0.1" + "stringify-package": "^1.0.1", + "yargs": "^15.3.1" }, "dependencies": { "ansi-styles": { diff --git a/openwpm/Extension/webext-instrumentation/package-lock.json b/openwpm/Extension/webext-instrumentation/package-lock.json index a17353ab2..de7b2148e 100644 --- a/openwpm/Extension/webext-instrumentation/package-lock.json +++ b/openwpm/Extension/webext-instrumentation/package-lock.json @@ -1464,25 +1464,6 @@ "xdg-basedir": "^4.0.0" } }, - "conventional-changelog": { - "version": "3.1.24", - "resolved": "https://registry.npmjs.org/conventional-changelog/-/conventional-changelog-3.1.24.tgz", - "integrity": "sha512-ed6k8PO00UVvhExYohroVPXcOJ/K1N0/drJHx/faTH37OIZthlecuLIRX/T6uOp682CAoVoFpu+sSEaeuH6Asg==", - "dev": true, - "requires": { - "conventional-changelog-angular": "^5.0.12", - 
"conventional-changelog-atom": "^2.0.8", - "conventional-changelog-codemirror": "^2.0.8", - "conventional-changelog-conventionalcommits": "^4.5.0", - "conventional-changelog-core": "^4.2.1", - "conventional-changelog-ember": "^2.0.9", - "conventional-changelog-eslint": "^3.0.9", - "conventional-changelog-express": "^2.0.6", - "conventional-changelog-jquery": "^3.0.11", - "conventional-changelog-jshint": "^2.0.9", - "conventional-changelog-preset-loader": "^2.3.4" - } - }, "conventional-changelog-angular": { "version": "5.0.12", "resolved": "https://registry.npmjs.org/conventional-changelog-angular/-/conventional-changelog-angular-5.0.12.tgz", @@ -1517,17 +1498,6 @@ "integrity": "sha512-IpVePh16EbbB02V+UA+HQnnPIohgXvJRxHcS5+Uwk4AT5LjzCZJm5sp/yqs5C6KZJ1jMsV4paEV13BN1pvDuxQ==", "dev": true }, - "conventional-changelog-conventionalcommits": { - "version": "4.5.0", - "resolved": "https://registry.npmjs.org/conventional-changelog-conventionalcommits/-/conventional-changelog-conventionalcommits-4.5.0.tgz", - "integrity": "sha512-buge9xDvjjOxJlyxUnar/+6i/aVEVGA7EEh4OafBCXPlLUQPGbRUBhBUveWRxzvR8TEjhKEP4BdepnpG2FSZXw==", - "dev": true, - "requires": { - "compare-func": "^2.0.0", - "lodash": "^4.17.15", - "q": "^1.5.1" - } - }, "conventional-changelog-core": { "version": "4.2.2", "resolved": "https://registry.npmjs.org/conventional-changelog-core/-/conventional-changelog-core-4.2.2.tgz", @@ -1956,85 +1926,6 @@ } } }, - "conventional-recommended-bump": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/conventional-recommended-bump/-/conventional-recommended-bump-6.1.0.tgz", - "integrity": "sha512-uiApbSiNGM/kkdL9GTOLAqC4hbptObFo4wW2QRyHsKciGAfQuLU1ShZ1BIVI/+K2BE/W1AWYQMCXAsv4dyKPaw==", - "dev": true, - "requires": { - "concat-stream": "^2.0.0", - "conventional-changelog-preset-loader": "^2.3.4", - "conventional-commits-filter": "^2.0.7", - "conventional-commits-parser": "^3.2.0", - "git-raw-commits": "^2.0.8", - "git-semver-tags": "^4.1.1", - "meow": "^8.0.0", 
- "q": "^1.5.1" - }, - "dependencies": { - "hosted-git-info": { - "version": "3.0.8", - "resolved": "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-3.0.8.tgz", - "integrity": "sha512-aXpmwoOhRBrw6X3j0h5RloK4x1OzsxMPyxqIHyNfSe2pypkVTZFpEiRoSipPEPlMrh0HW/XsjkJ5WgnCirpNUw==", - "dev": true, - "requires": { - "lru-cache": "^6.0.0" - } - }, - "lru-cache": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", - "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", - "dev": true, - "requires": { - "yallist": "^4.0.0" - } - }, - "meow": { - "version": "8.1.2", - "resolved": "https://registry.npmjs.org/meow/-/meow-8.1.2.tgz", - "integrity": "sha512-r85E3NdZ+mpYk1C6RjPFEMSE+s1iZMuHtsHAqY0DT3jZczl0diWUZ8g6oU7h0M9cD2EL+PzaYghhCLzR0ZNn5Q==", - "dev": true, - "requires": { - "@types/minimist": "^1.2.0", - "camelcase-keys": "^6.2.2", - "decamelize-keys": "^1.1.0", - "hard-rejection": "^2.1.0", - "minimist-options": "4.1.0", - "normalize-package-data": "^3.0.0", - "read-pkg-up": "^7.0.1", - "redent": "^3.0.0", - "trim-newlines": "^3.0.0", - "type-fest": "^0.18.0", - "yargs-parser": "^20.2.3" - } - }, - "normalize-package-data": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/normalize-package-data/-/normalize-package-data-3.0.0.tgz", - "integrity": "sha512-6lUjEI0d3v6kFrtgA/lOx4zHCWULXsFNIjHolnZCKCTLA6m/G625cdn3O7eNmT0iD3jfo6HZ9cdImGZwf21prw==", - "dev": true, - "requires": { - "hosted-git-info": "^3.0.6", - "resolve": "^1.17.0", - "semver": "^7.3.2", - "validate-npm-package-license": "^3.0.1" - } - }, - "type-fest": { - "version": "0.18.1", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.18.1.tgz", - "integrity": "sha512-OIAYXk8+ISY+qTOwkHtKqzAuxchoMiD9Udx+FSGQDuiRR+PJKJHc2NJAXlbhkGwTt/4/nKZxELY1w3ReWOL8mw==", - "dev": true - }, - "yallist": { - "version": "4.0.0", - "resolved": 
"https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", - "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", - "dev": true - } - } - }, "convert-source-map": { "version": "1.7.0", "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-1.7.0.tgz", @@ -4933,6 +4824,12 @@ "integrity": "sha512-PlhdFcillOINfeV7Ni6oF1TAEayyZBoZ8bcshTHqOYJYlrqzRK5hagpagky5o4HfCzzd1TRkXPMFq6cKk9rGmA==", "dev": true }, + "lodash._reinterpolate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/lodash._reinterpolate/-/lodash._reinterpolate-3.0.0.tgz", + "integrity": "sha1-DM8tiRZq8Ds2Y8eWU4t1rG4RTZ0=", + "dev": true + }, "lodash.flattendeep": { "version": "4.4.0", "resolved": "https://registry.npmjs.org/lodash.flattendeep/-/lodash.flattendeep-4.4.0.tgz", @@ -4951,6 +4848,25 @@ "integrity": "sha1-dx7Hg540c9nEzeKLGTlMNWL09tM=", "dev": true }, + "lodash.template": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.template/-/lodash.template-4.5.0.tgz", + "integrity": "sha512-84vYFxIkmidUiFxidA/KjjH9pAycqW+h980j7Fuz5qxRtO9pgB7MDFTdys1N7A5mcucRiDyEq4fusljItR1T/A==", + "dev": true, + "requires": { + "lodash._reinterpolate": "^3.0.0", + "lodash.templatesettings": "^4.0.0" + } + }, + "lodash.templatesettings": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/lodash.templatesettings/-/lodash.templatesettings-4.2.0.tgz", + "integrity": "sha512-stgLz+i3Aa9mZgnjr/O+v9ruKZsPsndy7qPZOchbqk2cnTU1ZaldKK+v7m54WoKIyxiuMZTKT2H81F8BeAc3ZQ==", + "dev": true, + "requires": { + "lodash._reinterpolate": "^3.0.0" + } + }, "lodash.toarray": { "version": "4.4.0", "resolved": "https://registry.npmjs.org/lodash.toarray/-/lodash.toarray-4.4.0.tgz", @@ -5585,6 +5501,12 @@ "integrity": "sha1-l33/1xdgErnsMNKjnbXPcqBDnt0=", "dev": true }, + "number-is-nan": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/number-is-nan/-/number-is-nan-1.0.1.tgz", + "integrity": 
"sha1-CXtgK1NCKlIsGvuHkDGDNpQaAR0=", + "dev": true + }, "nyc": { "version": "15.1.0", "resolved": "https://registry.npmjs.org/nyc/-/nyc-15.1.0.tgz", @@ -7274,15 +7196,15 @@ } }, "standard-version": { - "version": "github:conventional-changelog/standard-version#65dd070b9f01ffe1764e64ba739bc064b84f4129", + "version": "github:conventional-changelog/standard-version#0a801d9ddb88941158271073190e2d5eb2bc67d0", "from": "github:conventional-changelog/standard-version#master", "dev": true, "requires": { "chalk": "^2.4.2", - "conventional-changelog": "3.1.24", + "conventional-changelog": "3.1.23", "conventional-changelog-config-spec": "2.1.0", - "conventional-changelog-conventionalcommits": "4.5.0", - "conventional-recommended-bump": "6.1.0", + "conventional-changelog-conventionalcommits": "4.4.0", + "conventional-recommended-bump": "6.0.10", "detect-indent": "^6.0.0", "detect-newline": "^3.1.0", "dotgitignore": "^2.1.0", @@ -7292,7 +7214,7 @@ "git-semver-tags": "^4.0.0", "semver": "^7.1.1", "stringify-package": "^1.0.1", - "yargs": "^16.0.0" + "yargs": "^15.3.1" }, "dependencies": { "ansi-styles": { @@ -7304,6 +7226,29 @@ "color-convert": "^1.9.0" } }, + "arrify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/arrify/-/arrify-1.0.1.tgz", + "integrity": "sha1-iYUI2iIm84DfkEcoRWhJwVAaSw0=", + "dev": true + }, + "camelcase": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", + "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", + "dev": true + }, + "camelcase-keys": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/camelcase-keys/-/camelcase-keys-4.2.0.tgz", + "integrity": "sha1-oqpfsa9oh1glnDLBQUJteJI7m3c=", + "dev": true, + "requires": { + "camelcase": "^4.1.0", + "map-obj": "^2.0.0", + "quick-lru": "^1.0.0" + } + }, "chalk": { "version": "2.4.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz", @@ -7315,6 +7260,17 @@ "supports-color": "^5.3.0" } }, + "cliui": { + "version": 
"6.0.0", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-6.0.0.tgz", + "integrity": "sha512-t6wbgtoCXvAzst7QgXxJYqPt0usEfbgQdftEPbLL/cvv6HPE5VgvqCuAIDR0NgU52ds6rFwqrgakNLrHEjCbrQ==", + "dev": true, + "requires": { + "string-width": "^4.2.0", + "strip-ansi": "^6.0.0", + "wrap-ansi": "^6.2.0" + } + }, "color-convert": { "version": "1.9.3", "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", @@ -7330,6 +7286,61 @@ "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=", "dev": true }, + "conventional-changelog": { + "version": "3.1.23", + "resolved": "https://registry.npmjs.org/conventional-changelog/-/conventional-changelog-3.1.23.tgz", + "integrity": "sha512-sScUu2NHusjRC1dPc5p8/b3kT78OYr95/Bx7Vl8CPB8tF2mG1xei5iylDTRjONV5hTlzt+Cn/tBWrKdd299b7A==", + "dev": true, + "requires": { + "conventional-changelog-angular": "^5.0.11", + "conventional-changelog-atom": "^2.0.7", + "conventional-changelog-codemirror": "^2.0.7", + "conventional-changelog-conventionalcommits": "^4.4.0", + "conventional-changelog-core": "^4.2.0", + "conventional-changelog-ember": "^2.0.8", + "conventional-changelog-eslint": "^3.0.8", + "conventional-changelog-express": "^2.0.5", + "conventional-changelog-jquery": "^3.0.10", + "conventional-changelog-jshint": "^2.0.8", + "conventional-changelog-preset-loader": "^2.3.4" + } + }, + "conventional-changelog-conventionalcommits": { + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/conventional-changelog-conventionalcommits/-/conventional-changelog-conventionalcommits-4.4.0.tgz", + "integrity": "sha512-ybvx76jTh08tpaYrYn/yd0uJNLt5yMrb1BphDe4WBredMlvPisvMghfpnJb6RmRNcqXeuhR6LfGZGewbkRm9yA==", + "dev": true, + "requires": { + "compare-func": "^2.0.0", + "lodash": "^4.17.15", + "q": "^1.5.1" + } + }, + "conventional-recommended-bump": { + "version": "6.0.10", + "resolved": "https://registry.npmjs.org/conventional-recommended-bump/-/conventional-recommended-bump-6.0.10.tgz", + "integrity": 
"sha512-2ibrqAFMN3ZA369JgVoSbajdD/BHN6zjY7DZFKTHzyzuQejDUCjQ85S5KHxCRxNwsbDJhTPD5hOKcis/jQhRgg==", + "dev": true, + "requires": { + "concat-stream": "^2.0.0", + "conventional-changelog-preset-loader": "^2.3.4", + "conventional-commits-filter": "^2.0.6", + "conventional-commits-parser": "^3.1.0", + "git-raw-commits": "2.0.0", + "git-semver-tags": "^4.1.0", + "meow": "^7.0.0", + "q": "^1.5.1" + } + }, + "dargs": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/dargs/-/dargs-4.1.0.tgz", + "integrity": "sha1-A6nbtLXC8Tm/FK5T8LiipqhvThc=", + "dev": true, + "requires": { + "number-is-nan": "^1.0.0" + } + }, "find-up": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", @@ -7340,12 +7351,62 @@ "path-exists": "^4.0.0" } }, + "git-raw-commits": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/git-raw-commits/-/git-raw-commits-2.0.0.tgz", + "integrity": "sha512-w4jFEJFgKXMQJ0H0ikBk2S+4KP2VEjhCvLCNqbNRQC8BgGWgLKNCO7a9K9LI+TVT7Gfoloje502sEnctibffgg==", + "dev": true, + "requires": { + "dargs": "^4.0.1", + "lodash.template": "^4.0.2", + "meow": "^4.0.0", + "split2": "^2.0.0", + "through2": "^2.0.0" + }, + "dependencies": { + "meow": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/meow/-/meow-4.0.1.tgz", + "integrity": "sha512-xcSBHD5Z86zaOc+781KrupuHAzeGXSLtiAOmBsiLDiPSaYSB6hdew2ng9EBAnZ62jagG9MHAOdxpDi/lWBFJ/A==", + "dev": true, + "requires": { + "camelcase-keys": "^4.0.0", + "decamelize-keys": "^1.0.0", + "loud-rejection": "^1.0.0", + "minimist": "^1.1.3", + "minimist-options": "^3.0.1", + "normalize-package-data": "^2.3.4", + "read-pkg-up": "^3.0.0", + "redent": "^2.0.0", + "trim-newlines": "^2.0.0" + } + } + } + }, "has-flag": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", "dev": true }, + "indent-string": { + "version": "3.2.0", + "resolved": 
"https://registry.npmjs.org/indent-string/-/indent-string-3.2.0.tgz", + "integrity": "sha1-Sl/W0nzDMvN+VBmlBNu4NxBckok=", + "dev": true + }, + "load-json-file": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", + "integrity": "sha1-L19Fq5HjMhYjT9U62rZo607AmTs=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "parse-json": "^4.0.0", + "pify": "^3.0.0", + "strip-bom": "^3.0.0" + } + }, "locate-path": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", @@ -7355,6 +7416,208 @@ "p-locate": "^5.0.0" } }, + "map-obj": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/map-obj/-/map-obj-2.0.0.tgz", + "integrity": "sha1-plzSkIepJZi4eRJXpSPgISIqwfk=", + "dev": true + }, + "meow": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/meow/-/meow-7.1.1.tgz", + "integrity": "sha512-GWHvA5QOcS412WCo8vwKDlTelGLsCGBVevQB5Kva961rmNfun0PCbv5+xta2kUMFJyR8/oWnn7ddeKdosbAPbA==", + "dev": true, + "requires": { + "@types/minimist": "^1.2.0", + "camelcase-keys": "^6.2.2", + "decamelize-keys": "^1.1.0", + "hard-rejection": "^2.1.0", + "minimist-options": "4.1.0", + "normalize-package-data": "^2.5.0", + "read-pkg-up": "^7.0.1", + "redent": "^3.0.0", + "trim-newlines": "^3.0.0", + "type-fest": "^0.13.1", + "yargs-parser": "^18.1.3" + }, + "dependencies": { + "camelcase": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-5.3.1.tgz", + "integrity": "sha512-L28STB170nwWS63UjtlEOE3dldQApaJXZkOI1uMFfzf3rRuPegHaHesyee+YxQ+W6SvRDQV6UrdOdRiR153wJg==", + "dev": true + }, + "camelcase-keys": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/camelcase-keys/-/camelcase-keys-6.2.2.tgz", + "integrity": "sha512-YrwaA0vEKazPBkn0ipTiMpSajYDSe+KjQfrjhcBMxJt/znbvlHd8Pw/Vamaz5EB4Wfhs3SUR3Z9mwRu/P3s3Yg==", + "dev": true, + "requires": { + "camelcase": "^5.3.1", + "map-obj": "^4.0.0", + "quick-lru": "^4.0.1" + } + 
}, + "find-up": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-4.1.0.tgz", + "integrity": "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==", + "dev": true, + "requires": { + "locate-path": "^5.0.0", + "path-exists": "^4.0.0" + } + }, + "indent-string": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/indent-string/-/indent-string-4.0.0.tgz", + "integrity": "sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg==", + "dev": true + }, + "locate-path": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", + "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==", + "dev": true, + "requires": { + "p-locate": "^4.1.0" + } + }, + "map-obj": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/map-obj/-/map-obj-4.1.0.tgz", + "integrity": "sha512-glc9y00wgtwcDmp7GaE/0b0OnxpNJsVf3ael/An6Fe2Q51LLwN1er6sdomLRzz5h0+yMpiYLhWYF5R7HeqVd4g==", + "dev": true + }, + "minimist-options": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/minimist-options/-/minimist-options-4.1.0.tgz", + "integrity": "sha512-Q4r8ghd80yhO/0j1O3B2BjweX3fiHg9cdOwjJd2J76Q135c+NDxGCqdYKQ1SKBuFfgWbAUzBfvYjPUEeNgqN1A==", + "dev": true, + "requires": { + "arrify": "^1.0.1", + "is-plain-obj": "^1.1.0", + "kind-of": "^6.0.3" + } + }, + "p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "requires": { + "p-try": "^2.0.0" + } + }, + "p-locate": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-4.1.0.tgz", + "integrity": "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==", + "dev": 
true, + "requires": { + "p-limit": "^2.2.0" + } + }, + "p-try": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", + "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==", + "dev": true + }, + "parse-json": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", + "integrity": "sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==", + "dev": true, + "requires": { + "@babel/code-frame": "^7.0.0", + "error-ex": "^1.3.1", + "json-parse-even-better-errors": "^2.3.0", + "lines-and-columns": "^1.1.6" + } + }, + "quick-lru": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-4.0.1.tgz", + "integrity": "sha512-ARhCpm70fzdcvNQfPoy49IaanKkTlRWF2JMzqhcJbhSFRZv7nPTvZJdcY7301IPmvW+/p0RgIWnQDLJxifsQ7g==", + "dev": true + }, + "read-pkg": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-5.2.0.tgz", + "integrity": "sha512-Ug69mNOpfvKDAc2Q8DRpMjjzdtrnv9HcSMX+4VsZxD1aZ6ZzrIE7rlzXBtWTyhULSMKg076AW6WR5iZpD0JiOg==", + "dev": true, + "requires": { + "@types/normalize-package-data": "^2.4.0", + "normalize-package-data": "^2.5.0", + "parse-json": "^5.0.0", + "type-fest": "^0.6.0" + }, + "dependencies": { + "type-fest": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.6.0.tgz", + "integrity": "sha512-q+MB8nYR1KDLrgr4G5yemftpMC7/QLqVndBmEEdqzmNj5dcFOO4Oo8qlwZE3ULT3+Zim1F8Kq4cBnikNhlCMlg==", + "dev": true + } + } + }, + "read-pkg-up": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-7.0.1.tgz", + "integrity": "sha512-zK0TB7Xd6JpCLmlLmufqykGE+/TlOePD6qKClNW7hHDKFh/J7/7gCWGR7joEQEW1bKq3a3yUZSObOoWLFQ4ohg==", + "dev": true, + "requires": { + "find-up": "^4.1.0", + "read-pkg": "^5.2.0", + "type-fest": "^0.8.1" + }, + "dependencies": { + "type-fest": { + 
"version": "0.8.1", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.8.1.tgz", + "integrity": "sha512-4dbzIzqvjtgiM5rw1k5rEHtBANKmdudhGyBEajN01fEyhaAIhsoKNy6y7+IN93IfpFtwY9iqi7kD+xwKhQsNJA==", + "dev": true + } + } + }, + "redent": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/redent/-/redent-3.0.0.tgz", + "integrity": "sha512-6tDA8g98We0zd0GvVeMT9arEOnTw9qM03L9cJXaCjrip1OO764RDBLBfrB4cwzNGDj5OA5ioymC9GkizgWJDUg==", + "dev": true, + "requires": { + "indent-string": "^4.0.0", + "strip-indent": "^3.0.0" + } + }, + "strip-indent": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-indent/-/strip-indent-3.0.0.tgz", + "integrity": "sha512-laJTa3Jb+VQpaC6DseHhF7dXVqHTfJPCRDaEbid/drOhgitgYku/letMUqOXFoWV0zIIUbjpdH2t+tYj4bQMRQ==", + "dev": true, + "requires": { + "min-indent": "^1.0.0" + } + }, + "trim-newlines": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/trim-newlines/-/trim-newlines-3.0.0.tgz", + "integrity": "sha512-C4+gOpvmxaSMKuEf9Qc134F1ZuOHVXKRbtEflf4NTtuuJDEIJ9p5PXsalL8SkeRw+qit1Mo+yuvMPAKwWg/1hA==", + "dev": true + } + } + }, + "minimist-options": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/minimist-options/-/minimist-options-3.0.2.tgz", + "integrity": "sha512-FyBrT/d0d4+uiZRbqznPXqw3IpZZG3gl3wKWiX784FycUKVwBt0uLBFkQrtE4tZOrgo78nZp2jnKz3L65T5LdQ==", + "dev": true, + "requires": { + "arrify": "^1.0.1", + "is-plain-obj": "^1.1.0" + } + }, "p-limit": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", @@ -7373,6 +7636,148 @@ "p-limit": "^3.0.2" } }, + "p-try": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-1.0.0.tgz", + "integrity": "sha1-y8ec26+P1CKOE/Yh8rGiN8GyB7M=", + "dev": true + }, + "path-type": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-3.0.0.tgz", + "integrity": 
"sha512-T2ZUsdZFHgA3u4e5PfPbjd7HDDpxPnQb5jN0SrDsjNSuVXHJqtwTnWqG0B1jZrgmJ/7lj1EmVIByWt1gxGkWvg==", + "dev": true, + "requires": { + "pify": "^3.0.0" + } + }, + "pify": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-3.0.0.tgz", + "integrity": "sha1-5aSs0sEB/fPZpNB/DbxNtJ3SgXY=", + "dev": true + }, + "quick-lru": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-1.1.0.tgz", + "integrity": "sha1-Q2CxfGETatOAeDl/8RQW4Ybc+7g=", + "dev": true + }, + "read-pkg": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-3.0.0.tgz", + "integrity": "sha1-nLxoaXj+5l0WwA4rGcI3/Pbjg4k=", + "dev": true, + "requires": { + "load-json-file": "^4.0.0", + "normalize-package-data": "^2.3.2", + "path-type": "^3.0.0" + } + }, + "read-pkg-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-3.0.0.tgz", + "integrity": "sha1-PtSWaF26D4/hGNBpHcUfSh/5bwc=", + "dev": true, + "requires": { + "find-up": "^2.0.0", + "read-pkg": "^3.0.0" + }, + "dependencies": { + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + }, + "locate-path": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-2.0.0.tgz", + "integrity": "sha1-K1aLJl7slExtnA3pw9u7ygNUzY4=", + "dev": true, + "requires": { + "p-locate": "^2.0.0", + "path-exists": "^3.0.0" + } + }, + "p-limit": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-1.3.0.tgz", + "integrity": "sha512-vvcXsLAJ9Dr5rQOPk7toZQZJApBl2K4J6dANSsEuh6QI41JYcsS/qhTGa9ErIUUgK3WNQoJYvylxvjqmiqEA9Q==", + "dev": true, + "requires": { + "p-try": "^1.0.0" + } + }, + "p-locate": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-2.0.0.tgz", + "integrity": 
"sha1-IKAQOyIqcMj9OcwuWAaA893l7EM=", + "dev": true, + "requires": { + "p-limit": "^1.1.0" + } + }, + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + } + } + }, + "readable-stream": { + "version": "2.3.7", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.7.tgz", + "integrity": "sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw==", + "dev": true, + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "redent": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/redent/-/redent-2.0.0.tgz", + "integrity": "sha1-wbIAe0LVfrE4kHmzyDM2OdXhzKo=", + "dev": true, + "requires": { + "indent-string": "^3.0.0", + "strip-indent": "^2.0.0" + } + }, + "split2": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/split2/-/split2-2.2.0.tgz", + "integrity": "sha512-RAb22TG39LhI31MbreBgIuKiIKhVsawfTgEGqKHTK87aG+ul/PB8Sqoi3I7kVdRWiCfrKxK3uo4/YUkpNvhPbw==", + "dev": true, + "requires": { + "through2": "^2.0.2" + } + }, + "string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "dev": true, + "requires": { + "safe-buffer": "~5.1.0" + } + }, + "strip-indent": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/strip-indent/-/strip-indent-2.0.0.tgz", + "integrity": "sha1-XvjbKV0B5u1sv3qrlpmNeCJSe2g=", + "dev": true + }, "supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", @@ -7381,6 +7786,153 @@ "requires": { "has-flag": 
"^3.0.0" } + }, + "through2": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/through2/-/through2-2.0.5.tgz", + "integrity": "sha512-/mrRod8xqpA+IHSLyGCQ2s8SPHiCDEeQJSep1jqLYeEUClOFG2Qsh+4FU6G9VeqpZnGW/Su8LQGc4YKni5rYSQ==", + "dev": true, + "requires": { + "readable-stream": "~2.3.6", + "xtend": "~4.0.1" + } + }, + "trim-newlines": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/trim-newlines/-/trim-newlines-2.0.0.tgz", + "integrity": "sha1-tAPQuRvlDDMd/EuC7s6yLD3hbSA=", + "dev": true + }, + "type-fest": { + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz", + "integrity": "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==", + "dev": true + }, + "wrap-ansi": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-6.2.0.tgz", + "integrity": "sha512-r6lPcBGxZXlIcymEu7InxDMhdW0KDxpLgoFLcguasxCaJ/SOIZwINatK9KY/tf+ZrlywOKU0UDj3ATXUBfxJXA==", + "dev": true, + "requires": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "dependencies": { + "ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "requires": { + "color-convert": "^2.0.1" + } + }, + "color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "requires": { + "color-name": "~1.1.4" + } + }, + "color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": 
true + } + } + }, + "y18n": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.1.tgz", + "integrity": "sha512-wNcy4NvjMYL8gogWWYAO7ZFWFfHcbdbE57tZO8e4cbpj8tfUcwrwqSl3ad8HxpYWCdXcJUCeKKZS62Av1affwQ==", + "dev": true + }, + "yargs": { + "version": "15.4.1", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-15.4.1.tgz", + "integrity": "sha512-aePbxDmcYW++PaqBsJ+HYUFwCdv4LVvdnhBy78E57PIor8/OVvhMrADFFEDh8DHDFRv/O9i3lPhsENjO7QX0+A==", + "dev": true, + "requires": { + "cliui": "^6.0.0", + "decamelize": "^1.2.0", + "find-up": "^4.1.0", + "get-caller-file": "^2.0.1", + "require-directory": "^2.1.1", + "require-main-filename": "^2.0.0", + "set-blocking": "^2.0.0", + "string-width": "^4.2.0", + "which-module": "^2.0.0", + "y18n": "^4.0.0", + "yargs-parser": "^18.1.2" + }, + "dependencies": { + "find-up": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-4.1.0.tgz", + "integrity": "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==", + "dev": true, + "requires": { + "locate-path": "^5.0.0", + "path-exists": "^4.0.0" + } + }, + "locate-path": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", + "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==", + "dev": true, + "requires": { + "p-locate": "^4.1.0" + } + }, + "p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "requires": { + "p-try": "^2.0.0" + } + }, + "p-locate": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-4.1.0.tgz", + "integrity": "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==", + "dev": true, + "requires": { + "p-limit": 
"^2.2.0" + } + }, + "p-try": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", + "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==", + "dev": true + } + } + }, + "yargs-parser": { + "version": "18.1.3", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-18.1.3.tgz", + "integrity": "sha512-o50j0JeToy/4K6OZcaQmW6lyXXKhq7csREXcDwk2omFPJEwUNOVtJKvmDr9EI1fAJZUyZcRF7kxGBWmRXudrCQ==", + "dev": true, + "requires": { + "camelcase": "^5.0.0", + "decamelize": "^1.2.0" + }, + "dependencies": { + "camelcase": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-5.3.1.tgz", + "integrity": "sha512-L28STB170nwWS63UjtlEOE3dldQApaJXZkOI1uMFfzf3rRuPegHaHesyee+YxQ+W6SvRDQV6UrdOdRiR153wJg==", + "dev": true + } + } } } }, diff --git a/openwpm/Extension/webext-instrumentation/src/background/cookie-instrument.ts b/openwpm/Extension/webext-instrumentation/src/background/cookie-instrument.ts index b9d356b48..72d6aa7ea 100644 --- a/openwpm/Extension/webext-instrumentation/src/background/cookie-instrument.ts +++ b/openwpm/Extension/webext-instrumentation/src/background/cookie-instrument.ts @@ -1,9 +1,9 @@ -import { incrementedEventOrdinal } from "../lib/extension-session-event-ordinal"; -import { extensionSessionUuid } from "../lib/extension-session-uuid"; -import { boolToInt, escapeString } from "../lib/string-utils"; +import {incrementedEventOrdinal} from "../lib/extension-session-event-ordinal"; +import {extensionSessionUuid} from "../lib/extension-session-uuid"; +import {boolToInt, escapeString} from "../lib/string-utils"; +import {JavascriptCookie, JavascriptCookieRecord} from "../schema"; import Cookie = browser.cookies.Cookie; import OnChangedCause = browser.cookies.OnChangedCause; -import { JavascriptCookie, JavascriptCookieRecord } from "../schema"; export const transformCookieObjectToMatchOpenWPMSchema = (cookie: Cookie) => { 
const javascriptCookie = {} as JavascriptCookie; diff --git a/openwpm/Extension/webext-instrumentation/src/background/dns-instrument.ts b/openwpm/Extension/webext-instrumentation/src/background/dns-instrument.ts index b59ad9daf..728f7b01a 100755 --- a/openwpm/Extension/webext-instrumentation/src/background/dns-instrument.ts +++ b/openwpm/Extension/webext-instrumentation/src/background/dns-instrument.ts @@ -1,12 +1,8 @@ -import { PendingResponse } from "../lib/pending-response"; -import { DnsResolved } from "../schema"; +import {PendingResponse} from "../lib/pending-response"; +import {DnsResolved} from "../schema"; +import {WebRequestOnCompletedEventDetails,} from "../types/browser-web-request-event-details"; +import {allTypes} from "./http-instrument"; import RequestFilter = browser.webRequest.RequestFilter; -import { - WebRequestOnCompletedEventDetails, -} from "../types/browser-web-request-event-details"; -import { - allTypes -} from "./http-instrument"; export class DnsInstrument { @@ -15,48 +11,48 @@ export class DnsInstrument { private pendingResponses: { [requestId: number]: PendingResponse; } = {}; - + constructor(dataReceiver) { this.dataReceiver = dataReceiver; } - + public run(crawlID){ const filter: RequestFilter = { urls: [""], types: allTypes }; - + const requestStemsFromExtension = details => { return ( - details.originUrl && - details.originUrl.indexOf("moz-extension://") > -1 && + details.originUrl && + details.originUrl.indexOf("moz-extension://") > -1 && details.originUrl.includes("fakeRequest") ); }; - + /* * Attach handlers to event listeners */ this.onCompleteListener = ( details: WebRequestOnCompletedEventDetails, - ) => { + ) => { // Ignore requests made by extensions if (requestStemsFromExtension(details)) { return; } const pendingResponse = this.getPendingResponse(details.requestId); - pendingResponse.resolveOnCompletedEventDetails(details); - + pendingResponse.resolveOnCompletedEventDetails(details); + this.onCompleteDnsHandler( details, 
crawlID, ); }; - + browser.webRequest.onCompleted.addListener( this.onCompleteListener, filter, ); } - + public cleanup() { if (this.onCompleteListener) { browser.webRequest.onCompleted.removeListener( @@ -64,7 +60,7 @@ export class DnsInstrument { ); } } - + private getPendingResponse(requestId): PendingResponse { if (!this.pendingResponses[requestId]) { this.pendingResponses[requestId] = new PendingResponse(); @@ -77,14 +73,14 @@ export class DnsInstrument { return function(record) { // Get data from API call dnsRecordObj.addresses = record.addresses.toString() - dnsRecordObj.canonical_name = record.canonicalName + dnsRecordObj.canonical_name = record.canonicalName dnsRecordObj.is_TRR = record.isTRR // Send data to main OpenWPM data aggregator. dataReceiver.saveRecord("dns_responses", dnsRecordObj); } } - + private async onCompleteDnsHandler( details: WebRequestOnCompletedEventDetails, crawlID, @@ -96,12 +92,12 @@ export class DnsInstrument { dnsRecord.used_address = details.ip; const currentTime = new Date(details.timeStamp); dnsRecord.time_stamp = currentTime.toISOString(); - + // Query DNS API const url = new URL(details.url); dnsRecord.hostname = url.hostname; const dnsResolve = browser.dns.resolve(dnsRecord.hostname, ["canonical_name"]); dnsResolve.then(this.handleResolvedDnsData(dnsRecord, this.dataReceiver)); } - + } diff --git a/openwpm/Extension/webext-instrumentation/src/background/http-instrument.ts b/openwpm/Extension/webext-instrumentation/src/background/http-instrument.ts index 2eb08efaa..658d08417 100644 --- a/openwpm/Extension/webext-instrumentation/src/background/http-instrument.ts +++ b/openwpm/Extension/webext-instrumentation/src/background/http-instrument.ts @@ -1,20 +1,20 @@ -import { incrementedEventOrdinal } from "../lib/extension-session-event-ordinal"; -import { extensionSessionUuid } from "../lib/extension-session-uuid"; -import { HttpPostParser, ParsedPostRequest } from "../lib/http-post-parser"; -import { PendingRequest } from 
"../lib/pending-request"; -import { PendingResponse } from "../lib/pending-response"; -import ResourceType = browser.webRequest.ResourceType; -import RequestFilter = browser.webRequest.RequestFilter; -import BlockingResponse = browser.webRequest.BlockingResponse; -import HttpHeaders = browser.webRequest.HttpHeaders; -import { boolToInt, escapeString, escapeUrl } from "../lib/string-utils"; -import { HttpRedirect, HttpRequest, HttpResponse } from "../schema"; +import {incrementedEventOrdinal} from "../lib/extension-session-event-ordinal"; +import {extensionSessionUuid} from "../lib/extension-session-uuid"; +import {HttpPostParser, ParsedPostRequest} from "../lib/http-post-parser"; +import {PendingRequest} from "../lib/pending-request"; +import {PendingResponse} from "../lib/pending-response"; +import {boolToInt, escapeString, escapeUrl} from "../lib/string-utils"; +import {HttpRedirect, HttpRequest, HttpResponse} from "../schema"; import { WebRequestOnBeforeRedirectEventDetails, WebRequestOnBeforeRequestEventDetails, WebRequestOnBeforeSendHeadersEventDetails, WebRequestOnCompletedEventDetails, } from "../types/browser-web-request-event-details"; +import ResourceType = browser.webRequest.ResourceType; +import RequestFilter = browser.webRequest.RequestFilter; +import BlockingResponse = browser.webRequest.BlockingResponse; +import HttpHeaders = browser.webRequest.HttpHeaders; type SaveContentOption = boolean | string; @@ -25,7 +25,7 @@ type SaveContentOption = boolean | string; * redirect = original request headers+body, followed by a onBeforeRedirect and then a new set of request headers+body and response headers+body * Docs: https://developer.mozilla.org/en-US/docs/User:wbamberg/webRequest.RequestDetails */ - + const allTypes: ResourceType[] = [ "beacon", "csp_report", diff --git a/openwpm/Extension/webext-instrumentation/src/background/javascript-instrument.ts b/openwpm/Extension/webext-instrumentation/src/background/javascript-instrument.ts index 
82cb04e78..f49b54e30 100644 --- a/openwpm/Extension/webext-instrumentation/src/background/javascript-instrument.ts +++ b/openwpm/Extension/webext-instrumentation/src/background/javascript-instrument.ts @@ -1,8 +1,8 @@ import MessageSender = browser.runtime.MessageSender; -import { incrementedEventOrdinal } from "../lib/extension-session-event-ordinal"; -import { extensionSessionUuid } from "../lib/extension-session-uuid"; -import { boolToInt, escapeString, escapeUrl } from "../lib/string-utils"; -import { JavascriptOperation } from "../schema"; +import {incrementedEventOrdinal} from "../lib/extension-session-event-ordinal"; +import {extensionSessionUuid} from "../lib/extension-session-uuid"; +import {boolToInt, escapeString, escapeUrl} from "../lib/string-utils"; +import {JavascriptOperation} from "../schema"; export class JavascriptInstrument { /** diff --git a/openwpm/Extension/webext-instrumentation/src/background/navigation-instrument.ts b/openwpm/Extension/webext-instrumentation/src/background/navigation-instrument.ts index 223f5a74d..5fee53877 100644 --- a/openwpm/Extension/webext-instrumentation/src/background/navigation-instrument.ts +++ b/openwpm/Extension/webext-instrumentation/src/background/navigation-instrument.ts @@ -1,9 +1,9 @@ -import { incrementedEventOrdinal } from "../lib/extension-session-event-ordinal"; -import { extensionSessionUuid } from "../lib/extension-session-uuid"; -import { PendingNavigation } from "../lib/pending-navigation"; -import { boolToInt, escapeString, escapeUrl } from "../lib/string-utils"; -import { makeUUID } from "../lib/uuid"; -import { Navigation } from "../schema"; +import {incrementedEventOrdinal} from "../lib/extension-session-event-ordinal"; +import {extensionSessionUuid} from "../lib/extension-session-uuid"; +import {PendingNavigation} from "../lib/pending-navigation"; +import {boolToInt, escapeString, escapeUrl} from "../lib/string-utils"; +import {makeUUID} from "../lib/uuid"; +import {Navigation} from 
"../schema"; import { WebNavigationBaseEventDetails, WebNavigationOnBeforeNavigateEventDetails, diff --git a/openwpm/Extension/webext-instrumentation/src/content/javascript-instrument-content-scope.ts b/openwpm/Extension/webext-instrumentation/src/content/javascript-instrument-content-scope.ts index b5aa4a06f..b63e4c80c 100755 --- a/openwpm/Extension/webext-instrumentation/src/content/javascript-instrument-content-scope.ts +++ b/openwpm/Extension/webext-instrumentation/src/content/javascript-instrument-content-scope.ts @@ -1,5 +1,5 @@ -import { getInstrumentJS } from "../lib/js-instruments"; -import { pageScript } from "./javascript-instrument-page-scope"; +import {getInstrumentJS} from "../lib/js-instruments"; +import {pageScript} from "./javascript-instrument-page-scope"; function getPageScriptAsString( jsInstrumentationSettingsString: string, diff --git a/openwpm/Extension/webext-instrumentation/src/lib/extension-session-uuid.ts b/openwpm/Extension/webext-instrumentation/src/lib/extension-session-uuid.ts index 1e5d246ab..7b76e3cfb 100644 --- a/openwpm/Extension/webext-instrumentation/src/lib/extension-session-uuid.ts +++ b/openwpm/Extension/webext-instrumentation/src/lib/extension-session-uuid.ts @@ -1,4 +1,4 @@ -import { makeUUID } from "./uuid"; +import {makeUUID} from "./uuid"; /** * This enables us to access a unique reference to this browser diff --git a/openwpm/Extension/webext-instrumentation/src/lib/http-post-parser.ts b/openwpm/Extension/webext-instrumentation/src/lib/http-post-parser.ts index 5566eac58..90916b5ad 100644 --- a/openwpm/Extension/webext-instrumentation/src/lib/http-post-parser.ts +++ b/openwpm/Extension/webext-instrumentation/src/lib/http-post-parser.ts @@ -1,12 +1,8 @@ // Incorporates code from: https://github.com/redline13/selenium-jmeter/blob/6966d4b326cd78261e31e6e317076569051cac37/content/library/recorder/HttpPostParser.js -import { - WebRequestOnBeforeRequestEventDetails, - // WebRequestOnBeforeSendHeadersEventDetails, -} from 
"../types/browser-web-request-event-details"; +import {WebRequestOnBeforeRequestEventDetails,} from "../types/browser-web-request-event-details"; // import { escapeString, escapeUrl } from "./string-utils"; - -import { escapeString, Uint8ToBase64 } from "./string-utils"; +import {escapeString, Uint8ToBase64} from "./string-utils"; // const components: any = {}; diff --git a/openwpm/Extension/webext-instrumentation/src/lib/number.spec.ts b/openwpm/Extension/webext-instrumentation/src/lib/number.spec.ts index 132254333..16dcf81c0 100644 --- a/openwpm/Extension/webext-instrumentation/src/lib/number.spec.ts +++ b/openwpm/Extension/webext-instrumentation/src/lib/number.spec.ts @@ -1,6 +1,6 @@ // tslint:disable:no-expression-statement import test from "ava"; -import { double, power } from "./number"; +import {double, power} from "./number"; test("double", t => { t.is(double(2), 4); diff --git a/openwpm/Extension/webext-instrumentation/src/lib/pending-navigation.ts b/openwpm/Extension/webext-instrumentation/src/lib/pending-navigation.ts index d7a014bc6..e6f9ea242 100644 --- a/openwpm/Extension/webext-instrumentation/src/lib/pending-navigation.ts +++ b/openwpm/Extension/webext-instrumentation/src/lib/pending-navigation.ts @@ -1,4 +1,4 @@ -import { Navigation } from "../schema"; +import {Navigation} from "../schema"; /** * Ties together the two separate navigation events that together holds information about both parent frame id and transition-related attributes diff --git a/openwpm/Extension/webext-instrumentation/src/lib/pending-response.ts b/openwpm/Extension/webext-instrumentation/src/lib/pending-response.ts index 5edf7f642..7ac0607da 100644 --- a/openwpm/Extension/webext-instrumentation/src/lib/pending-response.ts +++ b/openwpm/Extension/webext-instrumentation/src/lib/pending-response.ts @@ -2,7 +2,7 @@ import { WebRequestOnBeforeRequestEventDetails, WebRequestOnCompletedEventDetails, } from "../types/browser-web-request-event-details"; -import { ResponseBodyListener } 
from "./response-body-listener"; +import {ResponseBodyListener} from "./response-body-listener"; /** * Ties together the two separate events that together holds information about both response headers and body diff --git a/openwpm/Extension/webext-instrumentation/src/lib/response-body-listener.ts b/openwpm/Extension/webext-instrumentation/src/lib/response-body-listener.ts index 6cd86350f..4fc492a2e 100644 --- a/openwpm/Extension/webext-instrumentation/src/lib/response-body-listener.ts +++ b/openwpm/Extension/webext-instrumentation/src/lib/response-body-listener.ts @@ -1,5 +1,5 @@ -import { WebRequestOnBeforeRequestEventDetails } from "../types/browser-web-request-event-details"; -import { digestMessage } from "./sha256"; +import {WebRequestOnBeforeRequestEventDetails} from "../types/browser-web-request-event-details"; +import {digestMessage} from "./sha256"; export class ResponseBodyListener { private readonly responseBody: Promise; diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index c0d312422..544b8d817 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -17,9 +17,11 @@ from tblib import pickling_support from .commands.types import BaseCommand, ShutdownSignal +from .config import BrowserParamsInternal, ManagerParamsInternal from .deploy_browsers import deploy_firefox from .errors import BrowserConfigError, BrowserCrashError, ProfileLoadError from .socket_interface import ClientSocket +from .types import BrowserId, VisitId from .utilities.multiprocess_utils import ( Process, kill_process_and_children, @@ -40,23 +42,28 @@ class Browser: this browser is headless, etc.) 
""" - def __init__(self, manager_params, browser_params) -> None: + def __init__( + self, + manager_params: ManagerParamsInternal, + browser_params: BrowserParamsInternal, + ) -> None: # Constants self._SPAWN_TIMEOUT = 120 # seconds self._UNSUCCESSFUL_SPAWN_LIMIT = 4 # manager parameters self.current_profile_path = None - self.db_socket_address = manager_params.aggregator_address - self.browser_id = browser_params.browser_id - self.curr_visit_id: int = None + self.db_socket_address = manager_params.storage_controller_address + assert browser_params.browser_id is not None + self.browser_id: BrowserId = browser_params.browser_id + self.curr_visit_id: Optional[VisitId] = None self.browser_params = browser_params self.manager_params = manager_params # Queues and process IDs for BrowserManager # thread to run commands issues from TaskManager - self.command_thread: threading.Thread = None + self.command_thread: Optional[threading.Thread] = None # queue for passing command tuples to BrowserManager self.command_queue: Optional[Queue] = None # queue for receiving command execution status from BrowserManager @@ -74,7 +81,7 @@ def __init__(self, manager_params, browser_params) -> None: self.restart_required = False self.current_timeout: Optional[int] = None # timeout of the current command - self.browser_manager = None # process that controls browser + self.browser_manager: Optional[Process] = None # process that controls browser self.logger = logging.getLogger("openwpm") @@ -106,7 +113,7 @@ def launch_browser_manager(self): ) # make sure browser loads crashed profile self.browser_params.recovery_tar = tempdir - + crash_recovery = True else: """ @@ -245,6 +252,7 @@ def close_browser_manager(self, force: bool = False): If the browser manager process is unresponsive, the process is killed. """ self.logger.debug("BROWSER %i: Closing browser..." 
% self.browser_id) + assert self.status_queue is not None if force: self.kill_browser_manager() @@ -286,7 +294,7 @@ def close_browser_manager(self, force: bool = False): # Send the shutdown command command = ShutdownSignal() - self.command_queue.put((command)) + self.command_queue.put(command) # Verify that webdriver has closed (30 second timeout) try: @@ -308,13 +316,13 @@ def close_browser_manager(self, force: bool = False): # Verify that the browser process has closed (30 second timeout) if self.browser_manager is not None: self.browser_manager.join(30) - if self.browser_manager.is_alive(): - self.logger.debug( - "BROWSER %i: Browser manager process still alive 30 seconds " - "after executing shutdown command." % self.browser_id - ) - self.kill_browser_manager() - return + if self.browser_manager.is_alive(): + self.logger.debug( + "BROWSER %i: Browser manager process still alive 30 seconds " + "after executing shutdown command." % self.browser_id + ) + self.kill_browser_manager() + return self.logger.debug( "BROWSER %i: Browser manager closed successfully." 
% self.browser_id diff --git a/openwpm/command_sequence.py b/openwpm/command_sequence.py index 394c81c3f..a5eca5b0c 100644 --- a/openwpm/command_sequence.py +++ b/openwpm/command_sequence.py @@ -180,7 +180,7 @@ def recursive_dump_page_source(self, suffix="", timeout=30): def append_command(self, command: BaseCommand, timeout: int = 30) -> None: self._commands_with_timeout.append((command, timeout)) - def mark_done(self, success: bool): + def mark_done(self, success: bool) -> None: if self.callback is not None: self.callback(success) diff --git a/openwpm/commands/browser_commands.py b/openwpm/commands/browser_commands.py index cbfde6661..e1692f83b 100644 --- a/openwpm/commands/browser_commands.py +++ b/openwpm/commands/browser_commands.py @@ -500,7 +500,7 @@ class InitializeCommand(BaseCommand): """The command is automatically prepended to the beginning of a CommandSequence It initializes state both in the extensions as well in as the - Aggregator + StorageController """ def __repr__(self): diff --git a/openwpm/commands/profile_commands.py b/openwpm/commands/profile_commands.py index 1f6a26e8e..d44781d49 100644 --- a/openwpm/commands/profile_commands.py +++ b/openwpm/commands/profile_commands.py @@ -1,12 +1,11 @@ import logging -import os import shutil import tarfile -from typing import Any, Dict +from pathlib import Path from selenium.webdriver import Firefox -from openwpm.config import BrowserParams, ManagerParams +from openwpm.config import BrowserParamsInternal, ManagerParamsInternal from ..errors import ProfileLoadError from ..socket_interface import ClientSocket @@ -18,70 +17,57 @@ class DumpProfileCommand(BaseCommand): """ - dumps a browser profile currently stored in to - in which both folders are absolute paths. 
+ Dumps a browser profile currently stored in to + """ - def __init__(self, dump_folder, close_webdriver, compress): - self.dump_folder = dump_folder + def __init__(self, tar_path: Path, close_webdriver: bool, compress: bool) -> None: + self.tar_path = tar_path self.close_webdriver = close_webdriver self.compress = compress + raise NotImplementedError( + "Profile dumping is currently unsupported. " + "See: https://github.com/mozilla/OpenWPM/projects/2." + ) - def __repr__(self): + def __repr__(self) -> str: return "DumpProfCommand({},{},{})".format( - self.dump_folder, self.close_webdriver, self.compress + self.tar_path, self.close_webdriver, self.compress ) def execute( self, webdriver: Firefox, - browser_params: BrowserParams, - manager_params: ManagerParams, + browser_params: BrowserParamsInternal, + manager_params: ManagerParamsInternal, extension_socket: ClientSocket, ) -> None: - logger.debug( - "BROWSER %i: Profile dumping is currently unsupported. " - "See: https://github.com/mozilla/OpenWPM/projects/2." 
- % browser_params.browser_id - ) - return browser_profile_folder = browser_params.profile_path + assert browser_profile_folder is not None - # ensures that folder paths end with slashes - if browser_profile_folder[-1] != "/": - browser_profile_folder = browser_profile_folder + "/" - if tar_location[-1] != "/": - tar_location = tar_location + "/" - - if not os.path.exists(tar_location): - os.makedirs(tar_location) - - if compress: - tar_name = "profile.tar.gz" - else: - tar_name = "profile.tar" + # Creating the folders if need be + self.tar_path.parent.mkdir(exist_ok=True, parents=True) # see if this file exists first # if it does, delete it before we try to save the current session - if os.path.isfile(tar_location + tar_name): - os.remove(tar_location + tar_name) - + if self.tar_path.exists(): + self.tar_path.unlink() # IDK why it's called like this # if this is a dump on close, close the webdriver and wait for checkpoint - if close_webdriver: + if self.close_webdriver: webdriver.close() sleep_until_sqlite_checkpoint(browser_profile_folder) # backup and tar profile - if compress: - tar = tarfile.open(tar_location + tar_name, "w:gz", errorlevel=1) + if self.compress: + tar = tarfile.open(self.tar_path, "w:gz", errorlevel=1) else: - tar = tarfile.open(tar_location + tar_name, "w", errorlevel=1) + tar = tarfile.open(self.tar_path, "w", errorlevel=1) logger.debug( "BROWSER %i: Backing up full profile from %s to %s" % ( - browser_params.browser_id, + self.browser_id, browser_profile_folder, - tar_location + tar_name, + self.tar_path, ) ) storage_vector_files = [ @@ -100,69 +86,66 @@ def execute( "storage", # directory for IndexedDB ] for item in storage_vector_files: - full_path = os.path.join(browser_profile_folder, item) + full_path = browser_profile_folder / item if ( - not os.path.isfile(full_path) - and full_path[-3:] != "shm" - and full_path[-3:] != "wal" + not full_path.is_file() + and not full_path.name.endswith("shm") + and not full_path.name.endswith("wal") 
): logger.critical( "BROWSER %i: %s NOT FOUND IN profile folder, skipping." - % (browser_params.browser_id, full_path) + % (self.browser_id, full_path) ) - elif not os.path.isfile(full_path) and ( - full_path[-3:] == "shm" or full_path[-3:] == "wal" + elif not full_path.is_file() and ( + full_path.name.endswith("shm") or full_path.name.endswith("wal") ): continue # These are just checkpoint files tar.add(full_path, arcname=item) for item in storage_vector_dirs: - full_path = os.path.join(browser_profile_folder, item) - if not os.path.isdir(full_path): + full_path = browser_profile_folder / item + if not full_path.is_dir(): logger.warning( "BROWSER %i: %s NOT FOUND IN profile folder, skipping." - % (browser_params.browser_id, full_path) + % (self.browser_id, full_path) ) continue tar.add(full_path, arcname=item) tar.close() -def load_profile(browser_profile_folder, manager_params, browser_params, tar_location): +def load_profile( + browser_profile_folder: Path, + manager_params: ManagerParamsInternal, + browser_params: BrowserParamsInternal, + tar_path: Path, +) -> None: """ - loads a zipped cookie-based profile stored in and - unzips it to . This will load whatever profile - is in the folder, either full_profile.tar.gz or profile.tar.gz + loads a zipped cookie-based profile stored at and + unzips it to . + The tar will remain unmodified. 
""" - try: - # ensures that folder paths end with slashes - if browser_profile_folder[-1] != "/": - browser_profile_folder = browser_profile_folder + "/" - if tar_location[-1] != "/": - tar_location = tar_location + "/" - - if os.path.isfile(tar_location + "profile.tar.gz"): - tar_name = "profile.tar.gz" - else: - tar_name = "profile.tar" + assert tar_path.is_file() + assert browser_params.browser_id is not None + try: # Copy and untar the loaded profile logger.debug( "BROWSER %i: Copying profile tar from %s to %s" % ( browser_params.browser_id, - tar_location + tar_name, + tar_path, browser_profile_folder, ) ) - shutil.copy(tar_location + tar_name, browser_profile_folder) - - if tar_name == "profile.tar.gz": - f = tarfile.open(browser_profile_folder + tar_name, "r:gz", errorlevel=1) + shutil.copy(tar_path, browser_profile_folder) + tar_path = browser_profile_folder / tar_path.name + if tar_path.name.endswith("tar.gz"): + f = tarfile.open(tar_path, "r:gz", errorlevel=1) else: - f = tarfile.open(browser_profile_folder + tar_name, "r", errorlevel=1) + f = tarfile.open(tar_path, "r", errorlevel=1) f.extractall(browser_profile_folder) f.close() - os.remove(browser_profile_folder + tar_name) + tar_path.unlink() logger.debug("BROWSER %i: Tarfile extracted" % browser_params.browser_id) except Exception as ex: diff --git a/openwpm/commands/types.py b/openwpm/commands/types.py index 597afcc5c..2b4fcf307 100644 --- a/openwpm/commands/types.py +++ b/openwpm/commands/types.py @@ -2,7 +2,7 @@ from selenium.webdriver import Firefox -from ..config import BrowserParams, ManagerParams +from ..config import BrowserParamsInternal, ManagerParamsInternal from ..socket_interface import ClientSocket @@ -26,8 +26,8 @@ def set_start_time(self, start_time): def execute( self, webdriver: Firefox, - browser_params: BrowserParams, - manager_params: ManagerParams, + browser_params: BrowserParamsInternal, + manager_params: ManagerParamsInternal, extension_socket: ClientSocket, ) -> None: """ 
diff --git a/openwpm/config.py b/openwpm/config.py index 0d931684d..d8f4a4777 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -1,10 +1,14 @@ import os from dataclasses import dataclass, field +from json import JSONEncoder +from pathlib import Path from typing import List, Optional, Tuple, Union -from dataclasses_json import dataclass_json +from dataclasses_json import DataClassJsonMixin +from dataclasses_json import config as DCJConfig from .errors import ConfigError +from .types import BrowserId BOOL_TYPE_VALIDATION_LIST = [True, False] DISPLAY_MODE_VALIDATION_LIST = ["native", "headless", "xvfb"] @@ -12,7 +16,6 @@ "firefox" ] # Using List instead of a str type to future proof the logic as OpenWPM may add support for more browsers in future TP_COOKIES_OPTIONALS_LIST = ["always", "never", "from_visited"] -DB_EXTENSION_TYPE_LIST = [".db", ".sqlite"] LOG_EXTENSION_TYPE_LIST = [".log"] CONFIG_ERROR_STRING = ( "Found {value} as value for {parameter_name} in BrowserParams. " @@ -28,7 +31,6 @@ "Found invalid value `{value}` for {parameter_name} in {params_type}. 
" "Please look at docs/Configuration.md for more information" ) -OUTPUT_FORMAT_LIST = ["local", "s3"] ALL_RESOURCE_TYPES = { "beacon", @@ -54,9 +56,20 @@ } -@dataclass_json +def str_to_path(string: Optional[str]) -> Optional[Path]: + if string is not None: + return Path(string) + return None + + +def path_to_str(path: Optional[Path]) -> Optional[str]: + if path is not None: + return str(path.resolve()) + return None + + @dataclass -class BrowserParams: +class BrowserParams(DataClassJsonMixin): """ Configuration that might differ per browser @@ -68,7 +81,7 @@ class BrowserParams: extension_enabled: bool = True cookie_instrument: bool = True js_instrument: bool = False - js_instrument_settings: List = field( + js_instrument_settings: List[Union[str, dict]] = field( default_factory=lambda: ["collection_fingerprinting"] ) http_instrument: bool = False @@ -76,40 +89,43 @@ class BrowserParams: save_content: Union[bool, str] = False callstack_instrument: bool = False dns_instrument: bool = False - seed_tar: Optional[str] = None + seed_tar: Optional[Path] = field( + default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path) + ) display_mode: str = "native" browser: str = "firefox" prefs: dict = field(default_factory=dict) tp_cookies: str = "always" bot_mitigation: bool = False profile_archive_dir: Optional[str] = None - recovery_tar: Optional[str] = None - donottrack: str = False + recovery_tar: Optional[Path] = None + donottrack: bool = False tracking_protection: bool = False -@dataclass_json @dataclass -class ManagerParams: +class ManagerParams(DataClassJsonMixin): """ Configuration for the TaskManager - The configuration will be the same for all browsers running on the same TaskManager. 
It can be used to control storage locations or which watchdogs should run """ - data_directory: str = "~/openwpm/" - log_directory: str = "~/openwpm/" - screenshot_path: Optional[str] = None - source_dump_path: Optional[str] = None - output_format: str = "local" - database_name: str = "crawl-data.sqlite" - log_file: str = "openwpm.log" + data_directory: Path = field( + default=Path("~/openwpm/"), + metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), + ) + log_directory: Path = field( + default=Path("~/openwpm/"), + metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), + ) + log_file: Path = field( + default=Path("openwpm.log"), + metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), + ) testing: bool = False - s3_bucket: Optional[str] = None - s3_directory: Optional[str] = None memory_watchdog: bool = False process_watchdog: bool = False num_browsers: int = 1 @@ -122,21 +138,26 @@ def failure_limit(self) -> int: return self._failure_limit @failure_limit.setter - def failure_limit(self, value) -> None: + def failure_limit(self, value: int) -> None: self._failure_limit = value @dataclass class BrowserParamsInternal(BrowserParams): - browser_id: Optional[int] = None - profile_path: str = "" + browser_id: Optional[BrowserId] = None + profile_path: Optional[Path] = None @dataclass class ManagerParamsInternal(ManagerParams): - aggregator_address: Tuple[str] = () - logger_address: Tuple[str] = () - ldb_address: Tuple[str] = () + storage_controller_address: Optional[Tuple[str, int]] = None + logger_address: Optional[Tuple[str, ...]] = None + screenshot_path: Optional[Path] = field( + default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path) + ) + source_dump_path: Optional[Path] = field( + default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path) + ) def validate_browser_params(browser_params: BrowserParams) -> None: @@ -228,25 +249,6 @@ def validate_manager_params(manager_params: ManagerParams) -> None: ) ) - 
try: - database_extension = os.path.splitext(manager_params.database_name)[1] - if database_extension.lower() not in DB_EXTENSION_TYPE_LIST: - raise ConfigError( - EXTENSION_ERROR_STRING.format( - extension=database_extension or "no", - value_list=DB_EXTENSION_TYPE_LIST, - parameter_name="database_name", - ) - ) - except (TypeError, AttributeError): - raise ConfigError( - GENERAL_ERROR_STRING.format( - value=manager_params.database_name, - parameter_name="database_name", - params_type="ManagerParams", - ) - ) - # This check is necessary to not cause any internal error if not isinstance(manager_params.failure_limit, int): raise ConfigError( @@ -260,28 +262,13 @@ def validate_manager_params(manager_params: ManagerParams) -> None: ) ) - try: - if manager_params.output_format.lower() not in OUTPUT_FORMAT_LIST: - raise ConfigError( - CONFIG_ERROR_STRING.format( - value=manager_params.output_format, - parameter_name="output_format", - value_list=OUTPUT_FORMAT_LIST, - ).replace( - "Please look at docs/Configuration.md#browser-configuration-options for more information", - "Please look at docs/Configuration.md for more information", - ) - ) - except: - raise ConfigError( - "Something went wrong while validating ManagerParams. " - "Please check values provided for ManagerParams are of expected types" - ) - def validate_crawl_configs( manager_params: ManagerParams, browser_params: List[BrowserParams] ) -> None: + validate_manager_params(manager_params) + for bp in browser_params: + validate_browser_params(bp) if len(browser_params) != manager_params.num_browsers: raise ConfigError( @@ -289,3 +276,10 @@ def validate_crawl_configs( "as manager_params.num_browsers. 
Make sure you are assigning number of browsers " "to be used to manager_params.num_browsers in your entry file" ) + + +class ConfigEncoder(JSONEncoder): + def default(self, obj): + if isinstance(obj, Path): + return str(obj.resolve()) + return JSONEncoder.default(self, obj) diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 7d8f62592..862525689 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -1,7 +1,8 @@ import json import logging import os.path -from typing import Any, List, Optional +from pathlib import Path +from typing import Any, Dict, Optional, Tuple from easyprocess import EasyProcessError from multiprocess import Queue @@ -10,7 +11,7 @@ from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from ..commands.profile_commands import load_profile -from ..config import BrowserParams, ManagerParams +from ..config import BrowserParamsInternal, ConfigEncoder, ManagerParamsInternal from ..utilities.platform_utils import get_firefox_binary_path from . 
import configure_firefox from .selenium_firefox import FirefoxBinary, FirefoxLogInterceptor, Options @@ -21,10 +22,10 @@ def deploy_firefox( status_queue: Queue, - browser_params: List[BrowserParams], - manager_params: ManagerParams, + browser_params: BrowserParamsInternal, + manager_params: ManagerParamsInternal, crash_recovery: bool, -) -> (webdriver.Firefox, str, Optional[Display]): +) -> Tuple[webdriver.Firefox, str, Optional[Display]]: """ launches a firefox instance with parameters set by the input dictionary """ @@ -33,14 +34,14 @@ def deploy_firefox( root_dir = os.path.dirname(__file__) # directory of this file fp = FirefoxProfile() - browser_profile_path = fp.path + "/" + browser_profile_path = Path(fp.path) status_queue.put(("STATUS", "Profile Created", browser_profile_path)) # Use Options instead of FirefoxProfile to set preferences since the # Options method has no "frozen"/restricted options. # https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039 fo = Options() - + assert browser_params.browser_id is not None if browser_params.seed_tar and not crash_recovery: logger.info( "BROWSER %i: Loading initial browser profile from: %s" @@ -70,7 +71,7 @@ def deploy_firefox( display_port = None display = None if display_mode == "headless": - fo.set_headless(True) + fo.headless = True fo.add_argument("--width={}".format(DEFAULT_SCREEN_RES[0])) fo.add_argument("--height={}".format(DEFAULT_SCREEN_RES[1])) if display_mode == "xvfb": @@ -91,18 +92,16 @@ def deploy_firefox( if browser_params.extension_enabled: # Write config file - extension_config = dict() + extension_config: Dict[str, Any] = dict() extension_config.update(browser_params.to_dict()) extension_config["logger_address"] = manager_params.logger_address - extension_config["aggregator_address"] = manager_params.aggregator_address - if manager_params.ldb_address: - extension_config["leveldb_address"] = manager_params.ldb_address - else: - extension_config["leveldb_address"] = None + 
extension_config[ + "storage_controller_address" + ] = manager_params.storage_controller_address extension_config["testing"] = manager_params.testing - ext_config_file = browser_profile_path + "browser_params.json" + ext_config_file = browser_profile_path / "browser_params.json" with open(ext_config_file, "w") as f: - json.dump(extension_config, f) + json.dump(extension_config, f, cls=ConfigEncoder) logger.debug( "BROWSER %i: Saved extension config file to: %s" % (browser_params.browser_id, ext_config_file) diff --git a/openwpm/mp_logger.py b/openwpm/mp_logger.py index 401895256..7b9f46f35 100644 --- a/openwpm/mp_logger.py +++ b/openwpm/mp_logger.py @@ -99,13 +99,13 @@ class MPLogger(object): def __init__( self, log_file, - crawl_context=None, + crawl_reference: str = None, log_level_console=logging.INFO, log_level_file=logging.DEBUG, log_level_sentry_breadcrumb=logging.DEBUG, log_level_sentry_event=logging.ERROR, - ): - self._crawl_context = crawl_context + ) -> None: + self._crawl_reference = crawl_reference self._log_level_console = log_level_console self._log_level_file = log_level_file self._log_level_sentry_breadcrumb = log_level_sentry_breadcrumb @@ -206,14 +206,10 @@ def _initialize_sentry(self): self._event_handler = EventHandler(level=self._log_level_sentry_event) sentry_sdk.init(dsn=self._sentry_dsn, before_send=self._sentry_before_send) with sentry_sdk.configure_scope() as scope: - if self._crawl_context: + if self._crawl_reference: scope.set_tag( "CRAWL_REFERENCE", - "%s/%s" - % ( - self._crawl_context.get("s3_bucket", "UNKNOWN"), - self._crawl_context.get("s3_directory", "UNKNOWN"), - ), + self._crawl_reference, ) def _start_listener(self): diff --git a/openwpm/socket_interface.py b/openwpm/socket_interface.py index 2b8628782..bf3862f80 100644 --- a/openwpm/socket_interface.py +++ b/openwpm/socket_interface.py @@ -1,9 +1,11 @@ +import asyncio import json import socket import struct import threading import traceback from queue import Queue +from typing 
import Any import dill @@ -47,9 +49,7 @@ def _accept(self): thread.start() except ConnectionAbortedError: # Workaround for #278 - print( - "A connection establish request was performed " "on a closed socket" - ) + print("A connection establish request was performed on a closed socket") return def _handle_conn(self, client, address): @@ -76,28 +76,23 @@ def _handle_conn(self, client, address): % (msglen, serialization) ) msg = self.receive_msg(client, msglen) - if serialization != b"n": - try: - if serialization == b"d": # dill serialization - msg = dill.loads(msg) - elif serialization == b"j": # json serialization - msg = json.loads(msg.decode("utf-8")) - elif serialization == b"u": # utf-8 serialization - msg = msg.decode("utf-8") - else: - print("Unrecognized serialization type: %r" % serialization) - continue - except (UnicodeDecodeError, ValueError) as e: - print( - "Error de-serializing message: %s \n %s" - % (msg, traceback.format_exc(e)) - ) - continue - self.queue.put(msg) + try: + msg = _parse(serialization, msg) + except (UnicodeDecodeError, ValueError) as e: + print( + "Error de-serializing message: %s \n %s" + % (msg, traceback.format_exc(e)) + ) + continue + self._put_into_queue(msg) except RuntimeError: if self.verbose: print("Client socket: " + str(address) + " closed") + def _put_into_queue(self, msg): + """Put the parsed message into a queue from where it can be read by consumers""" + self.queue.put(msg) + def receive_msg(self, client, msglen): msg = b"" while len(msg) < msglen: @@ -149,7 +144,9 @@ def send(self, msg): msg = json.dumps(msg).encode("utf-8") serialization = b"j" else: - raise ValueError("Unsupported serialization type set: %s" % serialization) + raise ValueError( + "Unsupported serialization type set: %s" % self.serialization + ) if self.verbose: print("Sending message with serialization %s" % serialization) @@ -166,15 +163,47 @@ def close(self): self.sock.close() +async def get_message_from_reader(reader: asyncio.StreamReader) -> 
Any: + """ + Reads a message from the StreamReader + :exception IncompleteReadError if the underlying socket is closed + + To safely use this method, you should guard against the exception + like this: + ``` + try: + record: Tuple[str, Any] = await get_message_from_reader(reader) + except IncompleteReadError as e: + print("The underlying socket closed", repr(e)) + ``` + """ + msg = await reader.readexactly(5) + msglen, serialization = struct.unpack(">Lc", msg) + msg = await reader.readexactly(msglen) + return _parse(serialization, msg) + + +def _parse(serialization: bytes, msg: bytes) -> Any: + if serialization == b"n": + return msg + if serialization == b"d": # dill serialization + return dill.loads(msg) + if serialization == b"j": # json serialization + return json.loads(msg.decode("utf-8")) + if serialization == b"u": # utf-8 serialization + return msg.decode("utf-8") + raise ValueError("Unknown Encoding") + + def main(): import sys # Just for testing if sys.argv[1] == "s": - sock = ServerSocket(verbose=True) - sock.start_accepting() + ssock = ServerSocket(verbose=True) + ssock.start_accepting() input("Press enter to exit...") - sock.close() + ssock.close() elif sys.argv[1] == "c": host = input("Enter the host name:\n") port = input("Enter the port:\n") diff --git a/openwpm/DataAggregator/__init__.py b/openwpm/storage/__init__.py similarity index 100% rename from openwpm/DataAggregator/__init__.py rename to openwpm/storage/__init__.py diff --git a/openwpm/storage/arrow_storage.py b/openwpm/storage/arrow_storage.py new file mode 100644 index 000000000..78be54b9c --- /dev/null +++ b/openwpm/storage/arrow_storage.py @@ -0,0 +1,171 @@ +import asyncio +import logging +import random +from abc import abstractmethod +from asyncio import Task +from collections import defaultdict +from typing import Any, DefaultDict, Dict, List + +import pandas as pd +import pyarrow as pa +from pyarrow import Table + +from openwpm.types import VisitId + +from .parquet_schema import 
PQ_SCHEMAS +from .storage_providers import INCOMPLETE_VISITS, StructuredStorageProvider, TableName + +CACHE_SIZE = 500 + + +class ArrowProvider(StructuredStorageProvider): + """This class implements a StructuredStorage provider that + serializes records into the arrow format + """ + + storing_lock: asyncio.Lock + + def __init__(self) -> None: + super().__init__() + self.logger = logging.getLogger("openwpm") + + def factory_function() -> DefaultDict[TableName, List[Dict[str, Any]]]: + return defaultdict(list) + + # Raw records per VisitId and Table + self._records: DefaultDict[ + VisitId, DefaultDict[TableName, List[Dict[str, Any]]] + ] = defaultdict(factory_function) + + # Record batches by TableName + self._batches: DefaultDict[TableName, List[pa.RecordBatch]] = defaultdict(list) + self._instance_id = random.getrandbits(32) + + self.flush_events: List[asyncio.Event] = list() + + async def init(self) -> None: + # Used to synchronize the finalizing and the flushing + self.storing_lock = asyncio.Lock() + + async def store_record( + self, table: TableName, visit_id: VisitId, record: Dict[str, Any] + ) -> None: + records = self._records[visit_id] + # Add nulls + for item in PQ_SCHEMAS[table].names: + if item not in record: + record[item] = None + # Add instance_id (for partitioning) + record["instance_id"] = self._instance_id + records[table].append(record) + + def _create_batch(self, visit_id: VisitId) -> None: + """Create record batches for all records from `visit_id`""" + if visit_id not in self._records: + # The batch for this `visit_id` was already created, skip + self.logger.error( + "Trying to create batch for visit_id %d when one was already created", + visit_id, + ) + return + for table_name, data in self._records[visit_id].items(): + try: + df = pd.DataFrame(data) + batch = pa.RecordBatch.from_pandas( + df, schema=PQ_SCHEMAS[table_name], preserve_index=False + ) + self._batches[table_name].append(batch) + self.logger.debug( + "Successfully created batch for 
table %s and " + "visit_id %s" % (table_name, visit_id) + ) + except pa.lib.ArrowInvalid: + self.logger.error( + "Error while creating record batch for table %s\n" % table_name, + exc_info=True, + ) + pass + + del self._records[visit_id] + + def _is_cache_full(self) -> bool: + for batches in self._batches.values(): + if len(batches) > CACHE_SIZE: + return True + return False + + async def finalize_visit_id( + self, visit_id: VisitId, interrupted: bool = False + ) -> Task[None]: + """This method is the reason the finalize_visit_id interface returns a task. + This was necessary as we needed to enable the following pattern. + ``` + token = await structured_storage.finalize_visit_id(1) + structured_storage.flush_cache() + await token + ``` + If there was no task returned and the method would just block/yield after turning the + record into a batch, there would be no way to know, when it's safe to flush_cache as + I couldn't find a way to run a coroutine until it yields and then run a different one. + + With the current setup `token` aka a `wait_on_condition` coroutine will only return once + it's respective event has been set. + """ + if interrupted: + await self.store_record(INCOMPLETE_VISITS, visit_id, {"visit_id": visit_id}) + # This code is pretty tricky as there are a number of things going on + # 1. The awaitable returned by finalize_visit_id should only + # resolve once the data is saved to persistent storage + # 2. 
No new batches should be created while saving out all the batches + async with self.storing_lock: + self._create_batch(visit_id) + + event = asyncio.Event() + self.flush_events.append(event) + + if self._is_cache_full(): + await self.flush_cache(self.storing_lock) + + async def wait_on_condition(e: asyncio.Event) -> None: + await e.wait() + + return asyncio.create_task(wait_on_condition(event)) + + @abstractmethod + async def write_table(self, table_name: TableName, table: Table) -> None: + """Write out the table to persistent storage + This should only return once it's actually saved out + """ + + async def flush_cache(self, lock: asyncio.Lock = None) -> None: + """We need to hack around the fact that asyncio has no reentrant lock + So we either grab the storing_lock ourselves or the caller needs + to pass us the locked storing_lock + """ + has_lock_arg = lock is not None + if not has_lock_arg: + lock = self.storing_lock + await lock.acquire() + + assert lock == self.storing_lock and lock.locked() + + for table_name, batches in self._batches.items(): + table = pa.Table.from_batches(batches) + await self.write_table(table_name, table) + self._batches.clear() + + for event in self.flush_events: + event.set() + self.flush_events.clear() + + if not has_lock_arg: + lock.release() + + async def shutdown(self) -> None: + for table_name, batches in self._batches.items(): + if len(batches) != 0: + self.logger.error( + "While shutting down there were %d cached entries for table %s", + len(batches), + table_name, + ) diff --git a/openwpm/storage/cloud_storage/__init__.py b/openwpm/storage/cloud_storage/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/openwpm/storage/cloud_storage/gcp_storage.py b/openwpm/storage/cloud_storage/gcp_storage.py new file mode 100644 index 000000000..09810f23e --- /dev/null +++ b/openwpm/storage/cloud_storage/gcp_storage.py @@ -0,0 +1,115 @@ +import logging +from typing import Set + +import pyarrow.parquet as pq +from gcsfs 
import GCSFileSystem +from pyarrow.lib import Table + +from ..arrow_storage import ArrowProvider +from ..storage_providers import TableName, UnstructuredStorageProvider + + +class GcsStructuredProvider(ArrowProvider): + """This class allows you to upload Parquet files to GCS. + This might not actually be the thing that we want to do + long term but seeing as GCS is the S3 equivalent of GCP + it is the easiest way forward. + + Inspired by the old S3Aggregator structure the GcsStructuredProvider + will by default store into + base_path/visits/table_name in the given bucket. + + Pass a different sub_dir to change this. + """ + + file_system: GCSFileSystem + + def __init__( + self, + project: str, + bucket_name: str, + base_path: str, + token: str = None, + sub_dir: str = "visits", + ) -> None: + super().__init__() + self.project = project + self.token = token + self.base_path = f"{bucket_name}/{base_path}/{sub_dir}/{{table_name}}" + + def __str__(self) -> str: + return f"GCS:{self.base_path.removesuffix('/{table_name}')}" + + async def init(self) -> None: + await super(GcsStructuredProvider, self).init() + self.file_system = GCSFileSystem( + project=self.project, token=self.token, access="read_write" + ) + + async def write_table(self, table_name: TableName, table: Table) -> None: + self.file_system.start_transaction() + pq.write_to_dataset( + table, + self.base_path.format(table_name=table_name), + filesystem=self.file_system, + ) + self.file_system.end_transaction() + + async def shutdown(self) -> None: + pass + + +class GcsUnstructuredProvider(UnstructuredStorageProvider): + """This class allows you to upload arbitrary bytes to GCS. 
+ They will be stored under bucket_name/base_path/filename + """ + + file_system: GCSFileSystem + + def __init__( + self, + project: str, + bucket_name: str, + base_path: str, + token: str = None, + ) -> None: + super().__init__() + self.project = project + self.bucket_name = bucket_name + self.base_path = base_path + self.token = token + self.base_path = f"{bucket_name}/{base_path}/{{filename}}" + + self.file_name_cache: Set[str] = set() + """The set of all filenames ever uploaded, checked before uploading""" + self.logger = logging.getLogger("openwpm") + + async def init(self) -> None: + await super(GcsUnstructuredProvider, self).init() + self.file_system = GCSFileSystem( + project=self.project, token=self.token, access="read_write" + ) + + async def store_blob( + self, filename: str, blob: bytes, overwrite: bool = False + ) -> None: + target_path = self.base_path.format(filename=filename) + if not overwrite and ( + filename in self.file_name_cache or self.file_system.exists(target_path) + ): + self.logger.info("Not saving out file %s as it already exists", filename) + return + self.file_system.start_transaction() + + with self.file_system.open(target_path, mode="wb") as f: + f.write(blob) + + self.file_system.end_transaction() + + self.file_name_cache.add(filename) + + async def flush_cache(self) -> None: + pass + + async def shutdown(self) -> None: + pass diff --git a/openwpm/storage/cloud_storage/s3_storage.py b/openwpm/storage/cloud_storage/s3_storage.py new file mode 100644 index 000000000..788d5d126 --- /dev/null +++ b/openwpm/storage/cloud_storage/s3_storage.py @@ -0,0 +1,5 @@ +from ..arrow_storage import ArrowProvider + + +class S3StorageProvider(ArrowProvider): + pass diff --git a/openwpm/storage/in_memory_storage.py b/openwpm/storage/in_memory_storage.py new file mode 100644 index 000000000..1da7a9fce --- /dev/null +++ b/openwpm/storage/in_memory_storage.py @@ -0,0 +1,163 @@ +""" +This module contains implementations for various kinds of storage 
providers +that store their results in memory. +These classes are designed to allow for easier parallel testing as there are +no shared resources between tests. It also makes it easier to verify results +by not having to do a round trip through a persistent storage provider +""" + +import asyncio +import logging +from asyncio import Event, Lock, Task +from collections import defaultdict +from typing import Any, DefaultDict, Dict, List + +from multiprocess import Queue +from pyarrow import Table + +from openwpm.types import VisitId + +from .arrow_storage import ArrowProvider +from .storage_providers import ( + StructuredStorageProvider, + TableName, + UnstructuredStorageProvider, +) + + +class MemoryStructuredProvider(StructuredStorageProvider): + """ + This storage provider passes all it's data to the MemoryStructuredProviderHandle in a + process safe way. + + This makes it ideal for testing + + It also aims to only save out data as late as possible to ensure that storage_controller + only relies on the guarantees given in the interface. 
+ """ + + lock: Lock + + def __init__(self) -> None: + super().__init__() + self.queue = Queue() + self.handle = MemoryProviderHandle(self.queue) + self.logger = logging.getLogger("openwpm") + self.cache1: DefaultDict[ + VisitId, DefaultDict[TableName, List[Dict[str, Any]]] + ] = defaultdict(lambda: defaultdict(list)) + """The cache for entries before they are finalized""" + self.cache2: DefaultDict[TableName, List[Dict[str, Any]]] = defaultdict(list) + """For all entries that have been finalized but not yet flushed out to the queue""" + self.signal_list: List[Event] = [] + + async def init(self) -> None: + self.lock = asyncio.Lock() + + async def flush_cache(self) -> None: + async with self.lock as _: + self.logger.info("Flushing cache") + + for table, record_list in self.cache2.items(): + self.logger.info(f"Saving out {len(record_list)} entries for {table}") + for record in record_list: + self.queue.put((table, record)) + self.cache2.clear() + for ev in self.signal_list: + ev.set() + + async def store_record( + self, table: TableName, visit_id: VisitId, record: Dict[str, Any] + ) -> None: + self.logger.info( + "Saving into table %s for visit_id %d record %r", table, visit_id, record + ) + self.cache1[visit_id][table].append(record) + + async def finalize_visit_id( + self, visit_id: VisitId, interrupted: bool = False + ) -> Task[None]: + async with self.lock as _: + self.logger.info( + f"Finalizing visit_id {visit_id} which was {'' if interrupted else 'not'} interrupted" + ) + for table, record_list in self.cache1[visit_id].items(): + self.cache2[table].extend(record_list) + + del self.cache1[visit_id] + + async def wait(signal: Event) -> None: + await signal.wait() + + ev = Event() + self.signal_list.append(ev) + return asyncio.create_task(wait(ev)) + + async def shutdown(self) -> None: + if self.cache1 != {} or self.cache2 != {}: + self.logger.error("Shutting down with unsaved records") + + +class MemoryProviderHandle: + """ + Call poll_queue to load all 
available data into the dict + at self.storage + """ + + def __init__(self, queue: Queue) -> None: + self.queue = queue + self.storage: DefaultDict[str, List[Any]] = defaultdict(list) + + def poll_queue(self, *args: Any, **kwargs: Any) -> None: + while not self.queue.empty(): + table, record = self.queue.get(*args, **kwargs) + self.storage[table].append(record) + + +class MemoryUnstructuredProvider(UnstructuredStorageProvider): + """This storage provider stores all data in memory under self.storage as a dict + from filename to content. + Use this provider for writing tests and for small crawls where no persistence is required + """ + + async def init(self) -> None: + pass + + def __init__(self) -> None: + self.storage: Dict[str, bytes] = {} + self.queue = Queue() + self.handle = MemoryProviderHandle(self.queue) + + async def store_blob( + self, + filename: str, + blob: bytes, + compressed: bool = True, + skip_if_exists: bool = True, + ) -> None: + if skip_if_exists and filename in self.storage: + return + if compressed: + bytesIO = self._compress(blob) + blob = bytesIO.getvalue() + self.storage[filename] = blob + self.queue.put((filename, blob)) + + async def flush_cache(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + +class MemoryArrowProvider(ArrowProvider): + def __init__(self) -> None: + super().__init__() + self.queue = Queue() + self.handle = MemoryProviderHandle(self.queue) + + async def write_table(self, table_name: TableName, table: Table) -> None: + self.queue.put((table_name, table)) + + async def shutdown(self) -> None: + pass diff --git a/openwpm/storage/leveldb.py b/openwpm/storage/leveldb.py new file mode 100644 index 000000000..14a629831 --- /dev/null +++ b/openwpm/storage/leveldb.py @@ -0,0 +1,52 @@ +from pathlib import Path + +import plyvel +from plyvel._plyvel import WriteBatch + +from .storage_providers import UnstructuredStorageProvider + +LDB_BATCH_SIZE = 100 + + +class LevelDbProvider(UnstructuredStorageProvider): + 
ldb: plyvel.DB + content_batch: WriteBatch + + def __init__(self, db_path: Path): + self.db_path = db_path + self._ldb_counter = 0 + self._ldb_commit_time = 0 + + async def init(self) -> None: + self.ldb = plyvel.DB( + str(self.db_path), + create_if_missing=True, + write_buffer_size=128 * 10 ** 6, + compression="snappy", + ) + self.content_batch = self.ldb.write_batch() + + async def flush_cache(self) -> None: + """Write out content batch to LevelDB database""" + self.content_batch.write() + self.content_batch = self.ldb.write_batch() + + async def shutdown(self) -> None: + self.ldb.close() + + async def store_blob( + self, + filename: str, + blob: bytes, + overwrite: bool = False, + ) -> None: + + content_hash = str(filename).encode("ascii") + if self.ldb.get(content_hash) is not None and not overwrite: + return + self.content_batch.put(content_hash, blob) + self._ldb_counter += 1 + + if self._ldb_counter >= LDB_BATCH_SIZE: + await self.flush_cache() + self._ldb_counter = 0 diff --git a/openwpm/storage/local_storage.py b/openwpm/storage/local_storage.py new file mode 100644 index 000000000..e958a3d63 --- /dev/null +++ b/openwpm/storage/local_storage.py @@ -0,0 +1,50 @@ +import logging +from pathlib import Path + +import pyarrow.parquet as pq +from pyarrow.lib import Table + +from .arrow_storage import ArrowProvider +from .storage_providers import TableName, UnstructuredStorageProvider + + +class LocalArrowProvider(ArrowProvider): + """ Stores Parquet files under storage_path/table_name/n.parquet""" + + def __init__(self, storage_path: Path) -> None: + super().__init__() + self.storage_path = storage_path + + async def write_table(self, table_name: TableName, table: Table) -> None: + pq.write_to_dataset(table, str(self.storage_path / table_name)) + + +class LocalGzipProvider(UnstructuredStorageProvider): + """ Stores files as storage_path/hash.zip """ + + async def init(self) -> None: + pass + + def __init__(self, storage_path: Path) -> None: + super().__init__() + 
self.storage_path = storage_path + self.logger = logging.getLogger("openwpm") + + async def store_blob( + self, filename: str, blob: bytes, overwrite: bool = False + ) -> None: + path = self.storage_path / (filename + ".zip") + if path.exists() and not overwrite: + self.logger.debug( + "File %s already exists on disk. Not overwriting", filename + ) + return + compressed = self._compress(blob) + with path.open(mode="wb") as f: + f.write(compressed.read()) + + async def flush_cache(self) -> None: + pass + + async def shutdown(self) -> None: + pass diff --git a/openwpm/DataAggregator/parquet_schema.py b/openwpm/storage/parquet_schema.py similarity index 91% rename from openwpm/DataAggregator/parquet_schema.py rename to openwpm/storage/parquet_schema.py index dbd6dab1d..cf83f4406 100644 --- a/openwpm/DataAggregator/parquet_schema.py +++ b/openwpm/storage/parquet_schema.py @@ -1,7 +1,32 @@ +""" +Arrow schema for our ArrowProvider.py + +IF YOU CHANGE THIS FILE ALSO CHANGE schema.sql and test_values.py +AND Schema-Documentation.md + +""" + import pyarrow as pa PQ_SCHEMAS = dict() +fields = [ + pa.field("task_id", pa.int64(), nullable=False), + pa.field("manager_params", pa.string(), nullable=False), + pa.field("openwpm_version", pa.string(), nullable=False), + pa.field("browser_version", pa.string(), nullable=False), + pa.field("instance_id", pa.uint32(), nullable=False), +] +PQ_SCHEMAS["task"] = pa.schema(fields) + +fields = [ + pa.field("browser_id", pa.uint32(), nullable=False), + pa.field("task_id", pa.int64(), nullable=False), + pa.field("browser_params", pa.string(), nullable=False), + pa.field("instance_id", pa.uint32(), nullable=False), +] +PQ_SCHEMAS["crawl"] = pa.schema(fields) + # site_visits fields = [ pa.field("visit_id", pa.int64(), nullable=False), diff --git a/openwpm/DataAggregator/schema.sql b/openwpm/storage/schema.sql similarity index 96% rename from openwpm/DataAggregator/schema.sql rename to openwpm/storage/schema.sql index 105c4e39c..aab712670 
100644 --- a/openwpm/DataAggregator/schema.sql +++ b/openwpm/storage/schema.sql @@ -1,6 +1,8 @@ /* This file is sourced during the initialization * of the crawler. Make sure everything is CREATE * IF NOT EXISTS, otherwise there will be errors + * IF YOU CHANGE THIS FILE ALSO CHANGE test_values.py and parquet_schema.py + * AND Schema-Documentation.md */ CREATE TABLE IF NOT EXISTS task ( @@ -25,7 +27,7 @@ CREATE TABLE IF NOT EXISTS site_visits ( browser_id INTEGER NOT NULL, site_url VARCHAR(500) NOT NULL, site_rank INTEGER, - FOREIGN KEY(browser_id) REFERENCES crawl(id)); + FOREIGN KEY(browser_id) REFERENCES crawl(browser_id)); /* # crawl_history @@ -41,7 +43,7 @@ CREATE TABLE IF NOT EXISTS crawl_history ( traceback TEXT, duration INTEGER, dtg DATETIME DEFAULT (CURRENT_TIMESTAMP), - FOREIGN KEY(browser_id) REFERENCES crawl(id)); + FOREIGN KEY(browser_id) REFERENCES crawl(browser_id)); /* # http_requests diff --git a/openwpm/storage/sql_provider.py b/openwpm/storage/sql_provider.py new file mode 100644 index 000000000..0e9d66900 --- /dev/null +++ b/openwpm/storage/sql_provider.py @@ -0,0 +1,109 @@ +import json +import logging +import os +import sqlite3 +from pathlib import Path +from sqlite3 import ( + Connection, + Cursor, + IntegrityError, + InterfaceError, + OperationalError, + ProgrammingError, +) +from typing import Any, Dict, List, Tuple + +from openwpm.types import VisitId + +from .storage_providers import StructuredStorageProvider, TableName + +SCHEMA_FILE = os.path.join(os.path.dirname(__file__), "schema.sql") + + +class SQLiteStorageProvider(StructuredStorageProvider): + db: Connection + cur: Cursor + + def __init__(self, db_path: Path) -> None: + super().__init__() + self.db_path = db_path + self._sql_counter = 0 + self._sql_commit_time = 0 + self.logger = logging.getLogger("openwpm") + + async def init(self) -> None: + self.db = sqlite3.connect(str(self.db_path)) + self.cur = self.db.cursor() + self._create_tables() + + def _create_tables(self) -> None: + 
"""Create tables (if this is a new database)""" + with open(SCHEMA_FILE, "r") as f: + self.db.executescript(f.read()) + self.db.commit() + + async def flush_cache(self) -> None: + self.db.commit() + + async def store_record( + self, table: TableName, visit_id: VisitId, record: Dict[str, Any] + ) -> None: + """Submit a record to be stored + The storing might not happen immediately + """ + assert self.cur is not None + statement, args = self._generate_insert(table=table, data=record) + for i in range(len(args)): + if isinstance(args[i], bytes): + args[i] = str(args[i], errors="ignore") + elif callable(args[i]): + args[i] = str(args[i]) + elif type(args[i]) == dict: + args[i] = json.dumps(args[i]) + try: + self.cur.execute(statement, args) + self._sql_counter += 1 + except ( + OperationalError, + ProgrammingError, + IntegrityError, + InterfaceError, + ) as e: + self.logger.error( + "Unsupported record:\n%s\n%s\n%s\n%s\n" + % (type(e), e, statement, repr(args)) + ) + + @staticmethod + def _generate_insert( + table: TableName, data: Dict[str, Any] + ) -> Tuple[str, List[Any]]: + """Generate a SQL query from `record`""" + statement = "INSERT INTO %s (" % table + value_str = "VALUES (" + values = list() + first = True + for field, value in data.items(): + statement += "" if first else ", " + statement += field + value_str += "?" if first else ",?" 
+ values.append(value) + first = False + statement = statement + ") " + value_str + ")" + return statement, values + + def execute_statement(self, statement: str) -> None: + self.cur.execute(statement) + self.db.commit() + + async def finalize_visit_id( + self, visit_id: VisitId, interrupted: bool = False + ) -> None: + if interrupted: + self.logger.warning("Visit with visit_id %d got interrupted", visit_id) + self.cur.execute("INSERT INTO incomplete_visits VALUES (?)", (visit_id,)) + self.db.commit() + + async def shutdown(self) -> None: + self.db.commit() + self.db.close() diff --git a/openwpm/storage/storage_controller.py b/openwpm/storage/storage_controller.py new file mode 100644 index 000000000..28b441f6e --- /dev/null +++ b/openwpm/storage/storage_controller.py @@ -0,0 +1,545 @@ +import asyncio +import base64 +import logging +import queue +import random +import socket +import time +from asyncio import IncompleteReadError, Task +from collections import defaultdict +from typing import Any, DefaultDict, Dict, List, NoReturn, Optional, Tuple + +from multiprocess import Queue + +from openwpm.utilities.multiprocess_utils import Process + +from ..config import BrowserParamsInternal, ManagerParamsInternal +from ..socket_interface import ClientSocket, get_message_from_reader +from ..types import BrowserId, VisitId +from .storage_providers import ( + StructuredStorageProvider, + TableName, + UnstructuredStorageProvider, +) + +RECORD_TYPE_CONTENT = "page_content" +RECORD_TYPE_META = "meta_information" +ACTION_TYPE_FINALIZE = "Finalize" +ACTION_TYPE_INITIALIZE = "Initialize" + +RECORD_TYPE_CREATE = "create_table" +STATUS_TIMEOUT = 120 # seconds +SHUTDOWN_SIGNAL = "SHUTDOWN" +BATCH_COMMIT_TIMEOUT = 30 # commit a batch if no new records for N seconds + + +STATUS_UPDATE_INTERVAL = 5 # seconds +INVALID_VISIT_ID = VisitId(-1) + + +class StorageController: + """ + Manages incoming data and it's saving to disk + + Provides it's status to the task manager via the completion and 
status queue. + Can be shut down via a shutdown signal in the shutdown queue + """ + + def __init__( + self, + structured_storage: StructuredStorageProvider, + unstructured_storage: Optional[UnstructuredStorageProvider], + status_queue: Queue, + completion_queue: Queue, + shutdown_queue: Queue, + ) -> None: + """ + Parameters + ---------- + status_queue + queue through which the StorageControllerHandler + receives updates on the current amount of records to be processed. + Also used for initialization + completion_queue + queue containing the visit_ids of saved records + shutdown_queue + queue that the main process can use to shut down the StorageController + """ + self.status_queue = status_queue + self.completion_queue = completion_queue + self.shutdown_queue = shutdown_queue + self._shutdown_flag = False + self._relaxed = False + self.logger = logging.getLogger("openwpm") + self.store_record_tasks: DefaultDict[VisitId, List[Task[None]]] = defaultdict( + list + ) + """Contains all store_record tasks for a given visit_id""" + self.finalize_tasks: List[Tuple[VisitId, Optional[Task[None]], bool]] = [] + """Contains all information required for update_completion_queue to work + Tuple structure is: VisitId, optional completion token, success + """ + self.structured_storage = structured_storage + self.unstructured_storage = unstructured_storage + self._last_record_received: Optional[float] = None + + async def _handler( + self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter + ) -> None: + """This is a dirty hack around the fact that exceptions get swallowed by the asyncio.Server + and the coroutine just dies without any message. 
+ By having this function be a wrapper we at least get a log message + """ + try: + await self.handler(reader, writer) + except Exception as e: + self.logger.error( + "An exception occurred while processing records", exc_info=e + ) + + async def handler( + self, reader: asyncio.StreamReader, _: asyncio.StreamWriter + ) -> None: + """Created for every new connection to the Server""" + self.logger.debug("Initializing new handler") + while True: + try: + record: Tuple[str, Any] = await get_message_from_reader(reader) + except IncompleteReadError: + self.logger.info( + "Terminating handler, because the underlying socket closed" + ) + break + if len(record) != 2: + self.logger.error("Query is not the correct length %s", repr(record)) + continue + + self._last_record_received = time.time() + record_type, data = record + + if record_type == RECORD_TYPE_CREATE: + raise RuntimeError( + f"""{RECORD_TYPE_CREATE} is no longer supported. + Please change the schema before starting the StorageController. 
+ For an example of that see test/test_custom_function.py + """ + ) + + if record_type == RECORD_TYPE_CONTENT: + assert len(data) == 2 + if self.unstructured_storage is None: + self.logger.error( + """Tried to save content while not having + provided any unstructured storage provider.""" + ) + continue + content, content_hash = data + content = base64.b64decode(content) + await self.unstructured_storage.store_blob( + filename=content_hash, blob=content + ) + continue + + if "visit_id" not in data: + self.logger.error( + "Skipping record: No visit_id contained in record %r", record + ) + continue + + visit_id = VisitId(data["visit_id"]) + + if record_type == RECORD_TYPE_META: + await self._handle_meta(visit_id, data) + continue + + table_name = TableName(record_type) + await self.store_record(table_name, visit_id, data) + + async def store_record( + self, table_name: TableName, visit_id: VisitId, data: Dict[str, Any] + ) -> None: + + if visit_id == INVALID_VISIT_ID: + # Hacking around the fact that task and crawl don't have a VisitID + del data["visit_id"] + # Turning these into task to be able to have them complete without blocking the socket + self.store_record_tasks[visit_id].append( + asyncio.create_task( + self.structured_storage.store_record( + table=table_name, visit_id=visit_id, record=data + ) + ) + ) + + async def _handle_meta(self, visit_id: VisitId, data: Dict[str, Any]) -> None: + """ + Messages for the table RECORD_TYPE_SPECIAL are meta information + communicated to the storage controller + Supported message types: + - finalize: A message sent by the extension to + signal that a visit_id is complete. + - initialize: TODO: Start complaining if we receive data for a visit_id + before the initialize event happened. 
+ See also https://github.com/mozilla/OpenWPM/issues/846 + """ + action: str = data["action"] + if action == ACTION_TYPE_INITIALIZE: + return + elif action == ACTION_TYPE_FINALIZE: + success: bool = data["success"] + completion_token = await self.finalize_visit_id(visit_id, success) + self.finalize_tasks.append((visit_id, completion_token, success)) + else: + raise ValueError("Unexpected action: %s", action) + + async def finalize_visit_id( + self, visit_id: VisitId, success: bool + ) -> Optional[Task[None]]: + """Makes sure all records for a given visit_id + have been processed before we invoke finalize_visit_id + on the structured_storage + + See StructuredStorageProvider::finalize_visit_id for additional + documentation + """ + + if visit_id not in self.store_record_tasks: + self.logger.error( + "There are no records to be stored for visit_id %d, skipping...", + visit_id, + ) + return None + + self.logger.info("Awaiting all tasks for visit_id %d", visit_id) + for task in self.store_record_tasks[visit_id]: + await task + del self.store_record_tasks[visit_id] + self.logger.debug( + "Awaited all tasks for visit_id %d while finalizing", visit_id + ) + + completion_token = await self.structured_storage.finalize_visit_id( + visit_id, interrupted=not success + ) + return completion_token + + async def update_status_queue(self) -> NoReturn: + """Send manager process a status update. 
+ + This coroutine will get cancelled with an exception + so there is no need for an orderly return + """ + while True: + await asyncio.sleep(STATUS_UPDATE_INTERVAL) + visit_id_count = len(self.store_record_tasks.keys()) + task_count = 0 + for task_list in self.store_record_tasks.values(): + for task in task_list: + if not task.done(): + task_count += 1 + self.status_queue.put(task_count) + self.logger.debug( + ( + "StorageController status: There are currently %d scheduled tasks " + "for %d visit_ids" + ), + task_count, + visit_id_count, + ) + + async def shutdown(self, completion_queue_task: Task[None]) -> None: + completion_tokens = {} + visit_ids = list(self.store_record_tasks.keys()) + for visit_id in visit_ids: + t = await self.finalize_visit_id(visit_id, success=False) + if t is not None: + completion_tokens[visit_id] = t + await self.structured_storage.flush_cache() + await completion_queue_task + for visit_id, token in completion_tokens.items(): + await token + self.completion_queue.put((visit_id, False)) + + await self.structured_storage.shutdown() + + if self.unstructured_storage is not None: + await self.unstructured_storage.flush_cache() + await self.unstructured_storage.shutdown() + + async def should_shutdown(self) -> None: + """Returns when we should shut down""" + + while self.shutdown_queue.empty(): + await asyncio.sleep(STATUS_UPDATE_INTERVAL) + _, relaxed = self.shutdown_queue.get() + self._relaxed = relaxed + self._shutdown_flag = True + self.logger.info("Received shutdown signal!") + + async def save_batch_if_past_timeout(self) -> NoReturn: + """Save the current batch of records if no new data has been received. + + If we aren't receiving new data for this batch we commit early + regardless of the current batch size. 
+ + This coroutine will get cancelled with an exception + so there is no need for an orderly return + """ + while True: + if self._last_record_received is None: + await asyncio.sleep(BATCH_COMMIT_TIMEOUT) + continue + + diff = time.time() - self._last_record_received + if diff < BATCH_COMMIT_TIMEOUT: + time_until_timeout = BATCH_COMMIT_TIMEOUT - diff + await asyncio.sleep(time_until_timeout) + continue + + self.logger.debug( + "Saving current records since no new data has " + "been written for %d seconds." % diff + ) + await self.structured_storage.flush_cache() + if self.unstructured_storage: + await self.unstructured_storage.flush_cache() + self._last_record_received = None + + async def update_completion_queue(self) -> None: + """ All completed finalize_visit_id tasks get put into the completion_queue here """ + while not (self._shutdown_flag and len(self.finalize_tasks) == 0): + # This list is needed because iterating over a list and changing it at the same time + # is forbidden + new_finalize_tasks: List[Tuple[VisitId, Optional[Task[None]], bool]] = [] + for visit_id, token, success in self.finalize_tasks: + if ( + not token or token.done() + ): # Either way all data for the visit_id was saved out + self.completion_queue.put((visit_id, success)) + else: + new_finalize_tasks.append((visit_id, token, success)) + self.finalize_tasks = new_finalize_tasks + await asyncio.sleep(5) + + async def _run(self) -> None: + await self.structured_storage.init() + if self.unstructured_storage: + await self.unstructured_storage.init() + server: asyncio.AbstractServer = await asyncio.start_server( + self._handler, "localhost", 0, family=socket.AF_INET + ) + sockets = server.sockets + assert sockets is not None + socketname = sockets[0].getsockname() + self.status_queue.put(socketname) + status_queue_update = asyncio.create_task( + self.update_status_queue(), name="StatusQueue" + ) + timeout_check = asyncio.create_task( + self.save_batch_if_past_timeout(), name="TimeoutCheck" + 
) + + update_completion_queue = asyncio.create_task( + self.update_completion_queue(), name="CompletionQueueFeeder" + ) + # Blocks until we should shutdown + await self.should_shutdown() + + server.close() + status_queue_update.cancel() + timeout_check.cancel() + await server.wait_closed() + await self.shutdown(update_completion_queue) + + def run(self) -> None: + logging.getLogger("asyncio").setLevel(logging.WARNING) + asyncio.run(self._run(), debug=True) + + +class DataSocket: + """Wrapper around ClientSocket to make sending records to the StorageController more convenient""" + + def __init__(self, listener_address: Tuple[str, int]) -> None: + self.socket = ClientSocket(serialization="dill") + self.socket.connect(*listener_address) + self.logger = logging.getLogger("openwpm") + + def store_record( + self, table_name: TableName, visit_id: VisitId, data: Dict[str, Any] + ) -> None: + data["visit_id"] = visit_id + self.socket.send( + ( + table_name, + data, + ) + ) + + def finalize_visit_id(self, visit_id: VisitId, success: bool) -> None: + self.socket.send( + ( + RECORD_TYPE_META, + { + "action": ACTION_TYPE_FINALIZE, + "visit_id": visit_id, + "success": success, + }, + ) + ) + + def close(self) -> None: + self.socket.close() + + +class StorageControllerHandle: + """This class contains all methods relevant for the TaskManager + to interact with the StorageController + """ + + def __init__( + self, + structured_storage: StructuredStorageProvider, + unstructured_storage: Optional[UnstructuredStorageProvider], + ) -> None: + + self.listener_address: Optional[Tuple[str, int]] = None + self.listener_process: Optional[Process] = None + self.status_queue = Queue() + self.completion_queue = Queue() + self.shutdown_queue = Queue() + self._last_status = None + self._last_status_received: Optional[float] = None + self.logger = logging.getLogger("openwpm") + self.storage_controller = StorageController( + structured_storage, + unstructured_storage, + 
status_queue=self.status_queue, + completion_queue=self.completion_queue, + shutdown_queue=self.shutdown_queue, + ) + + def get_next_visit_id(self) -> VisitId: + """Generate visit id as randomly generated positive integer less than 2^53. + + Parquet can support integers up to 64 bits, but Javascript can only + represent integers up to 53 bits: + https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Number/MAX_SAFE_INTEGER + Thus, we cap these values at 53 bits. + """ + return VisitId(random.getrandbits(53)) + + def get_next_browser_id(self) -> BrowserId: + """Generate crawl id as randomly generated positive 32bit integer + + Note: Parquet's partitioned dataset reader only supports integer + partition columns up to 32 bits. + """ + return BrowserId(random.getrandbits(32)) + + def save_configuration( + self, + manager_params: ManagerParamsInternal, + browser_params: List[BrowserParamsInternal], + openwpm_version: str, + browser_version: str, + ) -> None: + assert self.listener_address is not None + sock = DataSocket(self.listener_address) + task_id = random.getrandbits(32) + sock.store_record( + TableName("task"), + INVALID_VISIT_ID, + { + "task_id": task_id, + "manager_params": manager_params.to_json(), + "openwpm_version": openwpm_version, + "browser_version": browser_version, + }, + ) + # Record browser details for each browser + for browser_param in browser_params: + sock.store_record( + TableName("crawl"), + INVALID_VISIT_ID, + { + "browser_id": browser_param.browser_id, + "task_id": task_id, + "browser_params": browser_param.to_json(), + }, + ) + sock.finalize_visit_id(INVALID_VISIT_ID, success=True) + + def launch(self) -> None: + """Starts the storage controller""" + self.storage_controller = Process( + name="StorageController", + target=StorageController.run, + args=(self.storage_controller,), + ) + self.storage_controller.daemon = True + self.storage_controller.start() + + self.listener_address = self.status_queue.get() + + def 
get_new_completed_visits(self) -> List[Tuple[int, bool]]: + """ + Returns a list of all visit ids that have been processed since + the last time the method was called and whether or not they + ran successfully. + + This method will return an empty list in case no visit ids have + been processed since the last time this method was called + """ + finished_visit_ids = list() + while not self.completion_queue.empty(): + finished_visit_ids.append(self.completion_queue.get()) + return finished_visit_ids + + def shutdown(self, relaxed: bool = True) -> None: + """ Terminate the storage controller process""" + assert isinstance(self.storage_controller, Process) + self.logger.debug("Sending the shutdown signal to the Storage Controller...") + self.shutdown_queue.put((SHUTDOWN_SIGNAL, relaxed)) + start_time = time.time() + self.storage_controller.join(300) + self.logger.debug( + "%s took %s seconds to close." + % (type(self).__name__, str(time.time() - start_time)) + ) + + def get_most_recent_status(self) -> int: + """Return the most recent queue size sent from the Storage Controller process""" + + # Block until we receive the first status update + if self._last_status is None: + return self.get_status() + + # Drain status queue until we receive most recent update + while not self.status_queue.empty(): + self._last_status = self.status_queue.get() + self._last_status_received = time.time() + + # Check last status signal + if (time.time() - self._last_status_received) > STATUS_TIMEOUT: + raise RuntimeError( + "No status update from the storage controller process " + "for %d seconds." % (time.time() - self._last_status_received) + ) + + return self._last_status + + def get_status(self) -> int: + """Get listener process status. 
If the status queue is empty, block.""" + try: + self._last_status = self.status_queue.get( + block=True, timeout=STATUS_TIMEOUT + ) + self._last_status_received = time.time() + except queue.Empty: + assert self._last_status_received is not None + raise RuntimeError( + "No status update from the storage controller process " + "for %d seconds." % (time.time() - self._last_status_received) + ) + assert isinstance(self._last_status, int) + return self._last_status diff --git a/openwpm/storage/storage_providers.py b/openwpm/storage/storage_providers.py new file mode 100644 index 000000000..2a2bc6aa6 --- /dev/null +++ b/openwpm/storage/storage_providers.py @@ -0,0 +1,114 @@ +""" +This module contains all base classes of the storage provider hierarchy +Any subclass of these classes should be able to be used in OpenWPM +without any changes to the rest of the code base +""" +import gzip +import io +from abc import ABC, abstractmethod +from asyncio import Task +from typing import Any, Dict, NewType, Optional + +from openwpm.types import VisitId + +TableName = NewType("TableName", str) +INCOMPLETE_VISITS = TableName("incomplete_visits") + + +class StorageProvider(ABC): + """Base class that defines some general helper methods + Do not inherit from this class directly + Inherit from StructuredStorageProvider or UnstructuredStorageProvider instead + """ + + @abstractmethod + async def init(self) -> None: + """Initializes the StorageProvider for use + + Guaranteed to be called in the process the + StorageController runs in. 
+ """ + pass + + @abstractmethod + async def flush_cache(self) -> None: + """ Blockingly write out any cached data to the respective storage """ + pass + + @abstractmethod + async def shutdown(self) -> None: + """Close all open resources + After this method has been called no further calls should be made to the object + """ + pass + + +class StructuredStorageProvider(StorageProvider): + """Structured Storage Providers are responsible for handling all structured data + that OpenWPM emits. + This includes: + - All data that is collected by the WebExtension instrumentation + - Data about browser configuration and + - Any data that custom commands send to the Storage Controller + + See docs/Schema-Documentation.md to see what an unmodified OpenWPM will attempt + to store + """ + + def __init__(self) -> None: + super().__init__() + + @abstractmethod + async def store_record( + self, table: TableName, visit_id: VisitId, record: Dict[str, Any] + ) -> None: + """Submit a record to be stored + The storing might not happen immediately + """ + pass + + @abstractmethod + async def finalize_visit_id( + self, visit_id: VisitId, interrupted: bool = False + ) -> Optional[Task[None]]: + """This method is invoked to inform the StructuredStorageProvider that no more + records for this visit_id will be submitted + + This method returns once the data is ready to be written out. + If the data is immediately written out nothing will be returned. + Otherwise an awaitable will returned that resolve onces the records have been + saved out to persistent storage + """ + pass + + +class UnstructuredStorageProvider(StorageProvider): + """Unstructured Storage Providers are responsible for handling the unstructured data + that OpenWPM emits. + This is primarily content loaded by websites. 
+ Don't make any assumptions about the data (especially don't assume it's valid unicode) + + In the future this interface will be expanded to address the needs of https://github.com/mozilla/OpenWPM/issues/232 + """ + + @abstractmethod + async def store_blob( + self, + filename: str, + blob: bytes, + overwrite: bool = False, + ) -> None: + """Stores the given bytes under the provided filename""" + pass + + @staticmethod + def _compress(blob: bytes) -> io.BytesIO: + """Takes a byte blob and compresses it with gzip + The returned BytesIO object is at stream position 0. + This means it can be treated like a zip file on disk. + """ + out_f = io.BytesIO() + with gzip.GzipFile(fileobj=out_f, mode="w") as writer: + writer.write(blob) + out_f.seek(0) + return out_f diff --git a/openwpm/task_manager.py b/openwpm/task_manager.py index c05d8a54f..e61d98f04 100644 --- a/openwpm/task_manager.py +++ b/openwpm/task_manager.py @@ -2,11 +2,13 @@ import logging import os import pickle +import sys import threading import time import traceback from queue import Empty as EmptyQueue -from typing import Any, Dict, List, Optional, Set, Tuple +from types import TracebackType +from typing import Any, Dict, List, Optional, Set, Tuple, Type import psutil import tblib @@ -25,12 +27,15 @@ from .command_sequence import CommandSequence from .commands.browser_commands import FinalizeCommand from .commands.utils.webdriver_utils import parse_neterror -from .DataAggregator import S3_aggregator, base_aggregator, local_aggregator -from .DataAggregator.base_aggregator import ACTION_TYPE_FINALIZE, RECORD_TYPE_SPECIAL from .errors import CommandExecutionError from .js_instrumentation import clean_js_instrumentation_settings from .mp_logger import MPLogger -from .socket_interface import ClientSocket +from .storage.storage_controller import DataSocket, StorageControllerHandle +from .storage.storage_providers import ( + StructuredStorageProvider, + TableName, + UnstructuredStorageProvider, +) from 
.utilities.multiprocess_utils import kill_process_and_children from .utilities.platform_utils import get_configuration_string, get_version @@ -39,15 +44,15 @@ SLEEP_CONS = 0.1 # command sleep constant (in seconds) BROWSER_MEMORY_LIMIT = 1500 # in MB -AGGREGATOR_QUEUE_LIMIT = 10000 # number of records in the queue +STORAGE_CONTROLLER_JOB_LIMIT = 10000 # number of records in the queue class TaskManager: """User-facing Class for interfacing with OpenWPM The TaskManager spawns several child processes to run the automation tasks. - - DataAggregator to aggregate data across browsers and save to the - database. + - StorageController to receive data from across browsers and save it to + the provided StorageProviders - MPLogger to aggregate logs across processes - BrowserManager processes to isolate Browsers in a separate process """ @@ -56,6 +61,8 @@ def __init__( self, manager_params_temp: ManagerParams, browser_params_temp: List[BrowserParams], + structured_storage_provider: StructuredStorageProvider, + unstructured_storage_provider: Optional[UnstructuredStorageProvider], logger_kwargs: Dict[Any, Any] = {}, ) -> None: """Initialize the TaskManager with browser and manager config params @@ -71,38 +78,26 @@ def __init__( Keyword arguments to pass to MPLogger on initialization. 
""" - validate_manager_params(manager_params_temp) - for bp in browser_params_temp: - validate_browser_params(bp) validate_crawl_configs(manager_params_temp, browser_params_temp) - - manager_params = ManagerParamsInternal(**manager_params_temp.to_dict()) + manager_params = ManagerParamsInternal.from_dict(manager_params_temp.to_dict()) browser_params = [ - BrowserParamsInternal(**bp.to_dict()) for bp in browser_params_temp + BrowserParamsInternal.from_dict(bp.to_dict()) for bp in browser_params_temp ] # Make paths absolute in manager_params if manager_params.data_directory: - manager_params.data_directory = os.path.expanduser( - manager_params.data_directory - ) + manager_params.data_directory = manager_params.data_directory.expanduser() + if manager_params.log_directory: - manager_params.log_directory = os.path.expanduser( - manager_params.log_directory - ) + manager_params.log_directory = manager_params.log_directory.expanduser() - manager_params.database_name = os.path.join( - manager_params.data_directory, manager_params.database_name - ) - manager_params.log_file = os.path.join( - manager_params.log_directory, manager_params.log_file - ) - manager_params.screenshot_path = os.path.join( - manager_params.data_directory, "screenshots" - ) - manager_params.source_dump_path = os.path.join( - manager_params.data_directory, "sources" + manager_params.log_file = ( + manager_params.log_directory / manager_params.log_file.name ) + manager_params.screenshot_path = manager_params.data_directory / "screenshots" + + manager_params.source_dump_path = manager_params.data_directory / "sources" + self.manager_params = manager_params self.browser_params = browser_params self._logger_kwargs = logger_kwargs @@ -129,16 +124,19 @@ def __init__( self.failure_count = 0 self.failure_limit = manager_params.failure_limit - # Start logging server thread self.logging_server = MPLogger( - self.manager_params.log_file, self.manager_params, **self._logger_kwargs + self.manager_params.log_file, 
+ str(structured_storage_provider), + **self._logger_kwargs ) self.manager_params.logger_address = self.logging_server.logger_address self.logger = logging.getLogger("openwpm") - # Initialize the data aggregators - self._launch_aggregators() + # Initialize the storage controller + self._launch_storage_controller( + structured_storage_provider, unstructured_storage_provider + ) # Sets up the BrowserManager(s) + associated queues self.browsers = self._initialize_browsers(browser_params) @@ -152,7 +150,9 @@ def __init__( # Save crawl config information to database openwpm_v, browser_v = get_version() - self.data_aggregator.save_configuration(openwpm_v, browser_v) + self.storage_controller_handle.save_configuration( + manager_params, browser_params, openwpm_v, browser_v + ) self.logger.info( get_configuration_string( self.manager_params, browser_params, (openwpm_v, browser_v) @@ -165,13 +165,18 @@ def __init__( self.callback_thread.name = "OpenWPM-completion_handler" self.callback_thread.start() - def __enter__(): + def __enter__(self): """ Execute starting procedure for TaskManager """ return self - def __exit__(): + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> None: """ Execute shutdown procedure for TaskManager """ @@ -183,7 +188,9 @@ def _initialize_browsers( """ initialize the browser classes, each its unique set of params """ browsers = list() for i in range(self.num_browsers): - browser_params[i].browser_id = self.data_aggregator.get_next_browser_id() + browser_params[ + i + ].browser_id = self.storage_controller_handle.get_next_browser_id() browsers.append(Browser(self.manager_params, browser_params[i])) return browsers @@ -272,24 +279,21 @@ def _manager_watchdog(self) -> None: ) kill_process_and_children(process, self.logger) - def _launch_aggregators(self) -> None: - """Launch the necessary data aggregators""" - self.data_aggregator: 
base_aggregator.BaseAggregator - if self.manager_params.output_format == "local": - self.data_aggregator = local_aggregator.LocalAggregator( - self.manager_params, self.browser_params - ) - elif self.manager_params.output_format == "s3": - self.data_aggregator = S3_aggregator.S3Aggregator( - self.manager_params, self.browser_params - ) - - self.data_aggregator.launch() - self.manager_params.aggregator_address = self.data_aggregator.listener_address - - # open connection to aggregator for saving crawl details - self.sock = ClientSocket(serialization="dill") - self.sock.connect(*self.manager_params.aggregator_address) + def _launch_storage_controller( + self, + structured_storage_provider: StructuredStorageProvider, + unstructured_storage_provider: Optional[UnstructuredStorageProvider], + ) -> None: + self.storage_controller_handle = StorageControllerHandle( + structured_storage_provider, unstructured_storage_provider + ) + self.storage_controller_handle.launch() + self.manager_params.storage_controller_address = ( + self.storage_controller_handle.listener_address + ) + assert self.manager_params.storage_controller_address is not None + # open connection to storage controller for saving crawl details + self.sock = DataSocket(self.manager_params.storage_controller_address) def _shutdown_manager( self, during_init: bool = False, relaxed: bool = True @@ -321,8 +325,8 @@ def _shutdown_manager( browser.command_thread.join() browser.shutdown_browser(during_init, force=not relaxed) - self.sock.close() # close socket to data aggregator - self.data_aggregator.shutdown(relaxed=relaxed) + self.sock.close() # close socket to storage controller + self.storage_controller_handle.shutdown(relaxed=relaxed) self.logging_server.close() if hasattr(self, "callback_thread"): self.callback_thread.join() @@ -369,21 +373,20 @@ def _start_thread( self.logger.error("Attempted to execute command on a closed TaskManager") raise RuntimeError("Attempted to execute command on a closed TaskManager") 
self._check_failure_status() - visit_id = self.data_aggregator.get_next_visit_id() + visit_id = self.storage_controller_handle.get_next_visit_id() browser.set_visit_id(visit_id) if command_sequence.callback: self.unsaved_command_sequences[visit_id] = command_sequence - self.sock.send( - ( - "site_visits", - { - "visit_id": visit_id, - "browser_id": browser.browser_id, - "site_url": command_sequence.url, - "site_rank": command_sequence.site_rank, - }, - ) + self.sock.store_record( + TableName("site_visits"), + visit_id, + { + "visit_id": visit_id, + "browser_id": browser.browser_id, + "site_url": command_sequence.url, + "site_rank": command_sequence.site_rank, + }, ) # Start command execution thread @@ -395,7 +398,7 @@ def _start_thread( return thread def _mark_command_sequences_complete(self) -> None: - """Polls the data aggregator for saved records + """Polls the storage controller for saved records and calls their callbacks """ while True: @@ -403,16 +406,16 @@ def _mark_command_sequences_complete(self) -> None: # we're shutting down and have no unprocessed callbacks break - visit_id_list = self.data_aggregator.get_new_completed_visits() + visit_id_list = self.storage_controller_handle.get_new_completed_visits() if not visit_id_list: time.sleep(1) continue - for visit_id, interrupted in visit_id_list: + for visit_id, successful in visit_id_list: self.logger.debug("Invoking callback of visit_id %d", visit_id) cs = self.unsaved_command_sequences.pop(visit_id, None) if cs: - cs.mark_done(not interrupted) + cs.mark_done(successful) def _unpack_pickled_error(self, pickled_error: bytes) -> Tuple[str, str]: """Unpacks `pickled_error` into an error `message` and `tb` string.""" @@ -428,7 +431,8 @@ def _issue_command( Sends CommandSequence to the BrowserManager one command at a time """ browser.is_fresh = False - + assert browser.browser_id is not None + assert browser.curr_visit_id is not None reset = command_sequence.reset if not reset: self.logger.warning( @@ -447,6 
+451,9 @@ def _issue_command( browser.curr_visit_id, browser.browser_id, ) + assert browser.command_queue is not None + assert browser.status_queue is not None + for command_and_timeout in command_sequence.get_commands_with_timeout(): command, timeout = command_and_timeout command.set_visit_browser_id(browser.curr_visit_id, browser.browser_id) @@ -466,7 +473,6 @@ def _issue_command( try: status = browser.status_queue.get(True, browser.current_timeout) except EmptyQueue: - command_status = "timeout" self.logger.info( "BROWSER %i: Timeout while executing command, %s, killing " "browser manager" % (browser.browser_id, repr(command)) @@ -475,6 +481,7 @@ def _issue_command( if status is None: # allows us to skip this entire block without having to bloat # every if statement + command_status = "timeout" pass elif status == "OK": command_status = "ok" @@ -509,36 +516,28 @@ def _issue_command( else: raise ValueError("Unknown browser status message %s" % status) - self.sock.send( - ( - "crawl_history", - { - "browser_id": browser.browser_id, - "visit_id": browser.curr_visit_id, - "command": type(command).__name__, - "arguments": json.dumps( - command.__dict__, default=lambda x: repr(x) - ).encode("utf-8"), - "retry_number": command_sequence.retry_number, - "command_status": command_status, - "error": error_text, - "traceback": tb, - "duration": int((time.time_ns() - t1) / 1000000), - }, - ) + self.sock.store_record( + TableName("crawl_history"), + browser.curr_visit_id, + { + "browser_id": browser.browser_id, + "visit_id": browser.curr_visit_id, + "command": type(command).__name__, + "arguments": json.dumps( + command.__dict__, default=lambda x: repr(x) + ).encode("utf-8"), + "retry_number": command_sequence.retry_number, + "command_status": command_status, + "error": error_text, + "traceback": tb, + "duration": int((time.time_ns() - t1) / 1000000), + }, ) if command_status == "critical": - self.sock.send( - ( - RECORD_TYPE_SPECIAL, - { - "browser_id": browser.browser_id, - 
"success": False, - "action": ACTION_TYPE_FINALIZE, - "visit_id": browser.curr_visit_id, - }, - ) + self.sock.finalize_visit_id( + success=False, + visit_id=browser.curr_visit_id, ) return @@ -566,16 +565,8 @@ def _issue_command( self.failure_count = 0 if browser.restart_required: - self.sock.send( - ( - RECORD_TYPE_SPECIAL, - { - "browser_id": browser.browser_id, - "success": False, - "action": ACTION_TYPE_FINALIZE, - "visit_id": browser.curr_visit_id, - }, - ) + self.sock.finalize_visit_id( + success=False, visit_id=browser.curr_visit_id ) break @@ -617,16 +608,16 @@ def execute_command_sequence( int -> index of browser to send command to """ - # Block if the aggregator queue is too large - agg_queue_size = self.data_aggregator.get_most_recent_status() - if agg_queue_size >= AGGREGATOR_QUEUE_LIMIT: - while agg_queue_size >= AGGREGATOR_QUEUE_LIMIT: + # Block if the storage controller has too many unfinished records + agg_queue_size = self.storage_controller_handle.get_most_recent_status() + if agg_queue_size >= STORAGE_CONTROLLER_JOB_LIMIT: + while agg_queue_size >= STORAGE_CONTROLLER_JOB_LIMIT: self.logger.info( - "Blocking command submission until the DataAggregator " + "Blocking command submission until the storage controller " "is below the max queue size of %d. Current queue " - "length %d. " % (AGGREGATOR_QUEUE_LIMIT, agg_queue_size) + "length %d. 
" % (STORAGE_CONTROLLER_JOB_LIMIT, agg_queue_size) ) - agg_queue_size = self.data_aggregator.get_status() + agg_queue_size = self.storage_controller_handle.get_status() # Distribute command if index is None: diff --git a/openwpm/types.py b/openwpm/types.py new file mode 100644 index 000000000..005258cda --- /dev/null +++ b/openwpm/types.py @@ -0,0 +1,4 @@ +from typing import NewType + +VisitId = NewType("VisitId", int) +BrowserId = NewType("BrowserId", int) diff --git a/openwpm/utilities/build_cookie_table.py b/openwpm/utilities/build_cookie_table.py index e6faa347d..9d4130deb 100644 --- a/openwpm/utilities/build_cookie_table.py +++ b/openwpm/utilities/build_cookie_table.py @@ -8,7 +8,7 @@ # This should be the modified Cookie.py included # the standard lib Cookie.py has many bugs -from . import Cookie +from . import cookie as Cookie # Potential formats for expires timestamps DATE_FORMATS = [ diff --git a/openwpm/utilities/db_utils.py b/openwpm/utilities/db_utils.py index d54bbb894..2ae04dcf1 100644 --- a/openwpm/utilities/db_utils.py +++ b/openwpm/utilities/db_utils.py @@ -1,12 +1,14 @@ -import os import sqlite3 +from collections.abc import Iterable +from pathlib import Path +from typing import Any, AnyStr, Iterator, List, Tuple, Union import plyvel -CONTENT_DB_NAME = "content.ldb" - -def query_db(db, query, params=None, as_tuple=False): +def query_db( + db: Path, query: str, params: Iterable = None, as_tuple: bool = False +) -> List[Union[sqlite3.Row, tuple]]: """Run a query against the given db. 
If params is not None, securely construct a query from the given @@ -22,34 +24,36 @@ def query_db(db, query, params=None, as_tuple=False): return rows -def get_content(data_directory): +def get_content(db_name: Path) -> Iterator[Tuple[AnyStr, AnyStr]]: """Yield key, value pairs from the deduplicated leveldb content database Parameters ---------- - data_directory : string - root directory of the crawl files containing the content database + db_name : Path + The full path to the current db """ - db_path = os.path.join(data_directory, CONTENT_DB_NAME) - db = plyvel.DB(db_path, create_if_missing=False, compression="snappy") + db = plyvel.DB(str(db_name), create_if_missing=False, compression="snappy") for content_hash, content in db.iterator(): yield content_hash, content db.close() -def get_javascript_entries(db, all_columns=False, as_tuple=False): +def get_javascript_entries( + db: Path, all_columns: bool = False, as_tuple: bool = False +) -> List[Union[Tuple[Any, ...], sqlite3.Row]]: if all_columns: select_columns = "*" else: select_columns = "script_url, symbol, operation, value, arguments" - return query_db(db, "SELECT %s FROM javascript" % select_columns, as_tuple=as_tuple) + return query_db(db, f"SELECT {select_columns} FROM javascript", as_tuple=as_tuple) -def any_command_failed(db): +def any_command_failed(db: Path) -> bool: """Returns True if any command in a given database failed""" rows = query_db(db, "SELECT * FROM crawl_history;") for row in rows: + assert isinstance(row, sqlite3.Row) if row["command_status"] != "ok": return True return False diff --git a/openwpm/utilities/platform_utils.py b/openwpm/utilities/platform_utils.py index bf94a0d4f..cee880cc0 100644 --- a/openwpm/utilities/platform_utils.py +++ b/openwpm/utilities/platform_utils.py @@ -7,6 +7,8 @@ from tabulate import tabulate +from openwpm.config import ConfigEncoder + def parse_http_stack_trace_str(trace_str): """Parse a stacktrace string and return an array of dict.""" @@ -95,8 +97,13 @@ def 
get_configuration_string(manager_params, browser_params, versions): config_str = "\n\nOpenWPM Version: %s\nFirefox Version: %s\n" % versions config_str += "\n========== Manager Configuration ==========\n" + config_str += json.dumps( - manager_params.to_dict(), sort_keys=True, indent=2, separators=(",", ": ") + manager_params.to_dict(), + sort_keys=True, + indent=2, + separators=(",", ": "), + cls=ConfigEncoder, ) config_str += "\n\n========== Browser Configuration ==========\n" @@ -116,13 +123,13 @@ def get_configuration_string(manager_params, browser_params, versions): archive_all_none = False # Separate out long profile directory strings - profile_dirs[browser_id] = item.pop("seed_tar") - archive_dirs[browser_id] = item.pop("profile_archive_dir") + profile_dirs[browser_id] = str(item.pop("seed_tar")) + archive_dirs[browser_id] = str(item.pop("profile_archive_dir")) js_config[browser_id] = item.pop("js_instrument_settings") # Copy items in sorted order dct = OrderedDict() - dct[u"browser_id"] = browser_id + dct["browser_id"] = browser_id for key in sorted(item.keys()): dct[key] = item[key] table_input.append(dct) diff --git a/test/pytest.ini b/pytest.ini similarity index 92% rename from test/pytest.ini rename to pytest.ini index 866c7203a..0ee2310c5 100644 --- a/test/pytest.ini +++ b/pytest.ini @@ -1,5 +1,6 @@ [pytest] python_files=test_*.py +testpaths=test markers = pyonly: marks a test as being python only and so server and xpi not needed diff --git a/scripts/ci.sh b/scripts/ci.sh index 10472fdc3..e192d9b72 100755 --- a/scripts/ci.sh +++ b/scripts/ci.sh @@ -1,7 +1,6 @@ #!/bin/bash -cd test; -python -m pytest --cov=../openwpm --cov-report=xml $TESTS -s -v --durations=10; +python -m pytest --cov=openwpm --cov-report=xml $TESTS -s -v --durations=10; exit_code=$?; if [[ "$exit_code" -ne 0 ]]; then exit $exit_code; diff --git a/scripts/environment-unpinned-dev.yaml b/scripts/environment-unpinned-dev.yaml index 2000e1109..455b85557 100644 --- 
a/scripts/environment-unpinned-dev.yaml +++ b/scripts/environment-unpinned-dev.yaml @@ -5,14 +5,9 @@ dependencies: - codecov - pytest-cov - ipython - - localstack==0.11.1.1 # See https://github.com/mozilla/OpenWPM/pull/682 - pip - pre-commit - pytest - - pip: - # Select depenedencies from localstack[full] that we need - - amazon-kclpy - - crontab - - flask-cors - - moto-ext==1.3.15.15 # See https://github.com/mozilla/OpenWPM/pull/682 - - subprocess32 + - mypy + - pytest-asyncio + diff --git a/scripts/environment-unpinned.yaml b/scripts/environment-unpinned.yaml index 9330db7b2..529ef937b 100644 --- a/scripts/environment-unpinned.yaml +++ b/scripts/environment-unpinned.yaml @@ -7,6 +7,7 @@ dependencies: - click - dill # - firefox-unbranded - when it's available + - gcsfs - geckodriver - leveldb - multiprocess @@ -16,10 +17,10 @@ dependencies: - pillow - psutil - pyarrow - - python==3.8.6 + - python - pyvirtualdisplay - redis-py - - s3fs==0.4.0 # https://github.com/mozilla/OpenWPM/issues/614 + - s3fs - selenium - sentry-sdk - tabulate diff --git a/scripts/prune-environment.py b/scripts/prune-environment.py index 171bb4347..9f81d0b26 100644 --- a/scripts/prune-environment.py +++ b/scripts/prune-environment.py @@ -1,3 +1,5 @@ +from typing import Iterable, List + import yaml with open("environment-unpinned.yaml", "r") as fp: @@ -19,24 +21,46 @@ # Only pin explicit dependencies -def iterate_deps(xs, ys, accumulator): +def iterate_deps(xs: Iterable[str], ys: Iterable[str], accumulator: List[str]) -> None: for x in xs: for y in ys: if x.split("=")[0] == y.split("=")[0]: accumulator.append(x) -deps_not_pip = [] -deps_pip = [] +deps_not_pip: List[str] = [] +deps_pip: List[str] = [] + +env_unpinned_contains_pip = "pip" in env_unpinned["dependencies"][-1] +env_unpinned_dev_contains_pip = "pip" in env_unpinned_dev["dependencies"][-1] iterate_deps( env_pinned["dependencies"][:-1], - env_unpinned["dependencies"][:-1] + env_unpinned_dev["dependencies"][:-1], + ( + 
env_unpinned["dependencies"][:-1] + if env_unpinned_contains_pip + else env_unpinned["dependencies"] + ) + + ( + env_unpinned_dev["dependencies"][:-1] + if env_unpinned_dev_contains_pip + else env_unpinned_dev["dependencies"] + ), deps_not_pip, ) + +# Checking if there are any pip dependencies +try: + deps_pip_unpinned = env_unpinned["dependencies"][-1]["pip"] +except: + deps_pip_unpinned = [] +try: + deps_pip_unpinned_dev = env_unpinned_dev["dependencies"][-1]["pip"] +except: + deps_pip_unpinned_dev = [] + iterate_deps( env_pinned["dependencies"][-1]["pip"], - env_unpinned["dependencies"][-1]["pip"] - + env_unpinned_dev["dependencies"][-1]["pip"], + deps_pip_unpinned_dev + deps_pip_unpinned, deps_pip, ) pruned_dependencies = [ diff --git a/setup.cfg b/setup.cfg index f3297deed..37cb0b7c8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,28 @@ known_future_library = future known_first_party = openwpm,openwpmtest,test default_section = THIRDPARTY skip = venv,openwpm/Extension,firefox-bin + +[mypy] +follow_imports = silent +python_version = 3.9 +warn_unused_configs = True +ignore_missing_imports = True +disallow_incomplete_defs = True +disallow_untyped_defs = True + +[mypy-openwpm.storage.*] +disallow_incomplete_defs = True +disallow_untyped_defs = True + +[mypy-openwpm.*] +disallow_untyped_defs = False + +[mypy-openwpm.utilities.*,openwpm.mp_logger,openwpm.commands.browser_commands] +disallow_incomplete_defs = False + +[mypy-openwpm.browser_manager] +allow_redefinition = True +disallow_incomplete_defs = False + +[mypy-test.*] +allow_untyped_defs = True \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py index 8e308b4b0..e33801938 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,9 +1,18 @@ +import logging import os import subprocess +from pathlib import Path +from typing import Any, Callable, Generator, List, Tuple import pytest +from openwpm.config import BrowserParams, ManagerParams +from openwpm.mp_logger import MPLogger +from 
openwpm.storage.sql_provider import SQLiteStorageProvider +from openwpm.task_manager import TaskManager + from . import utilities +from .openwpmtest import NUM_BROWSERS EXTENSION_DIR = os.path.join( os.path.dirname(os.path.realpath(__file__)), @@ -12,29 +21,96 @@ "Extension", "firefox", ) +pytest_plugins = "test.storage.fixtures" -def create_xpi(): +@pytest.fixture(scope="session") +def xpi(): # Creates a new xpi using npm run build. print("Building new xpi") subprocess.check_call(["npm", "run", "build"], cwd=EXTENSION_DIR) -@pytest.fixture(scope="session", autouse=True) -def prepare_test_setup(request): +@pytest.fixture(scope="session") +def server(): """Run an HTTP server during the tests.""" + print("Starting local_http_server") + server, server_thread = utilities.start_server() + yield + print("\nClosing server thread...") + server.shutdown() + server_thread.join() - if "pyonly" in request.config.invocation_params.args: - return - create_xpi() +@pytest.fixture() +def default_params( + tmp_path: Path, num_browsers: int = NUM_BROWSERS +) -> Tuple[ManagerParams, List[BrowserParams]]: + """Just a simple wrapper around task_manager.load_default_params""" - print("Starting local_http_server") - server, server_thread = utilities.start_server() + manager_params = ManagerParams( + num_browsers=NUM_BROWSERS + ) # num_browsers is necessary to let TaskManager know how many browsers to spawn + + browser_params = [ + BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS) + ] + manager_params.data_directory = tmp_path + manager_params.log_directory = tmp_path + for i in range(num_browsers): + browser_params[i].display_mode = "headless" + return manager_params, browser_params + + +@pytest.fixture() +def task_manager_creator( + server: None, xpi: None +) -> Callable[[Tuple[ManagerParams, List[BrowserParams]]], Tuple[TaskManager, Path]]: + """We create a callable that returns a TaskManager that has + been configured with the Manager and BrowserParams""" + # We need 
to create the fixtures like this because usefixtures doesn't work on fixtures + def _create_task_manager( + params: Tuple[ManagerParams, List[BrowserParams]] + ) -> Tuple[TaskManager, Path]: + manager_params, browser_params = params + db_path = manager_params.data_directory / "crawl-data.sqlite" + structured_provider = SQLiteStorageProvider(db_path) + manager = TaskManager( + manager_params, + browser_params, + structured_provider, + None, + ) + return manager, db_path + + return _create_task_manager + + +@pytest.fixture() +def http_params( + default_params: Tuple[ManagerParams, List[BrowserParams]], +) -> Callable[[str], Tuple[ManagerParams, List[BrowserParams]]]: + manager_params, browser_params = default_params + for browser_param in browser_params: + browser_param.http_instrument = True + + def parameterize( + display_mode: str = "headless", + ) -> Tuple[ManagerParams, List[BrowserParams]]: + for browser_param in browser_params: + browser_param.display_mode = display_mode + return manager_params, browser_params + + return parameterize - def local_http_server_stop(): - print("\nClosing server thread...") - server.shutdown() - server_thread.join() - request.addfinalizer(local_http_server_stop) +@pytest.fixture() +def mp_logger(tmp_path: Path) -> Generator[MPLogger, Any, None]: + log_path = tmp_path / "openwpm.log" + logger = MPLogger(log_path, log_level_console=logging.DEBUG) + yield logger + logger.close() + # The performance hit for this might be unacceptable but it might help us discover bugs + with log_path.open("r") as f: + for line in f: + assert "ERROR" not in line diff --git a/test/manual_test.py b/test/manual_test.py index bf23316de..0f5afb347 100644 --- a/test/manual_test.py +++ b/test/manual_test.py @@ -12,15 +12,15 @@ from openwpm.deploy_browsers import configure_firefox from openwpm.utilities.platform_utils import get_firefox_binary_path -from .conftest import create_xpi +from .conftest import xpi from .utilities import BASE_TEST_URL, start_server # 
import commonly used modules and utilities so they can be easily accessed # in the interactive session from openwpm.commands.utils import webdriver_utils as wd_util # noqa isort:skip import domain_utils as du # noqa isort:skip -from selenium.webdriver.common.keys import Keys # noqa isort:skip from selenium.common.exceptions import * # noqa isort:skip +from selenium.webdriver.common.keys import Keys # noqa isort:skip OPENWPM_LOG_PREFIX = "console.log: openwpm: " INSERT_PREFIX = "Array" @@ -140,7 +140,7 @@ def cleanup_server(): if with_extension: # add openwpm extension to profile - create_xpi() + xpi() ext_xpi = join(EXT_PATH, "dist", "openwpm-1.0.zip") driver.install_addon(ext_xpi, temporary=True) @@ -166,28 +166,27 @@ def start_webext(): thread.join() -flag_opts = dict( - is_flag=True, - default=False, -) - - @click.command() @click.option( "--selenium", help=""" Run a selenium webdriver instance, and drop into an IPython shell""", - **flag_opts, + is_flag=True, + default=False, ) @click.option( "--no-extension", help=""" Use this to prevent the openwpm webextension being loaded. 
Only applies if --selenium is being used.""", - **flag_opts, + is_flag=True, + default=False, ) @click.option( - "--browser-params", help="""Set flag to load browser_params.""", **flag_opts + "--browser-params", + help="""Set flag to load browser_params.""", + is_flag=True, + default=False, ) @click.option( "--browser-params-file", diff --git a/test/openwpm_jstest.py b/test/openwpm_jstest.py index f89dc7851..203d7453d 100644 --- a/test/openwpm_jstest.py +++ b/test/openwpm_jstest.py @@ -1,12 +1,17 @@ import re +from pathlib import Path +from typing import List, Optional, Tuple +from openwpm.config import BrowserParams, ManagerParams from openwpm.utilities import db_utils from .openwpmtest import OpenWPMTest class OpenWPMJSTest(OpenWPMTest): - def get_config(self, data_dir=""): + def get_config( + self, data_dir: Optional[Path] + ) -> Tuple[ManagerParams, List[BrowserParams]]: manager_params, browser_params = self.get_test_config(data_dir) browser_params[0].js_instrument = True manager_params.testing = True diff --git a/test/openwpmtest.py b/test/openwpmtest.py index 8b82ca0c6..cb42810bd 100644 --- a/test/openwpmtest.py +++ b/test/openwpmtest.py @@ -1,17 +1,19 @@ -import os -from os.path import isfile, join +from pathlib import Path +from typing import List, Optional, Tuple import pytest from openwpm import task_manager from openwpm.config import BrowserParams, ManagerParams +from openwpm.storage.sql_provider import SQLiteStorageProvider from . import utilities +NUM_BROWSERS = 2 -class OpenWPMTest(object): - NUM_BROWSERS = 1 +@pytest.mark.usefixtures("xpi", "server") +class OpenWPMTest: @pytest.fixture(autouse=True) def set_tmpdir(self, tmpdir): """Create a tmpdir fixture to be used in `get_test_config`. 
@@ -19,44 +21,46 @@ def set_tmpdir(self, tmpdir): Based on: https://mail.python.org/pipermail/pytest-dev/2014-April/002484.html """ - self.tmpdir = str(tmpdir) + self.tmpdir = Path(tmpdir) - def visit(self, page_url, data_dir="", sleep_after=0): + def get_config( + self, data_dir: Optional[Path] + ) -> Tuple[ManagerParams, List[BrowserParams]]: + raise NotImplementedError() + + def visit( + self, page_url: str, data_dir: Optional[Path] = None, sleep_after: int = 0 + ) -> Path: """Visit a test page with the given parameters.""" manager_params, browser_params = self.get_config(data_dir) - manager = task_manager.TaskManager(manager_params, browser_params) + if data_dir: + db_path = data_dir / "crawl-data.sqlite" + else: + db_path = self.tmpdir / "crawl-data.sqlite" + structured_provider = SQLiteStorageProvider(db_path) + manager = task_manager.TaskManager( + manager_params, browser_params, structured_provider, None + ) if not page_url.startswith("http"): page_url = utilities.BASE_TEST_URL + page_url manager.get(url=page_url, sleep=sleep_after) manager.close() - return manager_params.database_name + return db_path def get_test_config( - self, data_dir="", num_browsers=NUM_BROWSERS, display_mode="headless" - ): + self, + data_dir: Optional[Path] = None, + num_browsers: int = NUM_BROWSERS, + display_mode: str = "headless", + ) -> Tuple[ManagerParams, List[BrowserParams]]: """Load and return the default test parameters.""" if not data_dir: data_dir = self.tmpdir + assert data_dir is not None # Mypy doesn't understand this without help manager_params = ManagerParams(num_browsers=num_browsers) browser_params = [BrowserParams() for _ in range(num_browsers)] - manager_params.data_directory = data_dir manager_params.log_directory = data_dir manager_params.num_browsers = num_browsers for i in range(num_browsers): browser_params[i].display_mode = display_mode - manager_params.database_name = join( - manager_params.data_directory, manager_params.database_name - ) return 
manager_params, browser_params - - def is_installed(self, cmd): - """Check if a program is available via the standard PATH lookup.""" - path = os.environ["PATH"].split(os.pathsep) - for d in path: - candidate = join(d, cmd) - if isfile(candidate) and os.access(candidate, os.X_OK): - return True - return False - - def assert_is_installed(self, cmd): - assert self.is_installed(cmd), "Cannot find %s in your system" % cmd diff --git a/test/storage/__init__.py b/test/storage/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/storage/fixtures.py b/test/storage/fixtures.py new file mode 100644 index 000000000..a25423ca9 --- /dev/null +++ b/test/storage/fixtures.py @@ -0,0 +1,84 @@ +from typing import Any, List + +import pytest +from _pytest.fixtures import FixtureRequest + +from openwpm.storage.in_memory_storage import ( + MemoryArrowProvider, + MemoryStructuredProvider, + MemoryUnstructuredProvider, +) +from openwpm.storage.leveldb import LevelDbProvider +from openwpm.storage.local_storage import LocalGzipProvider +from openwpm.storage.sql_provider import SQLiteStorageProvider +from openwpm.storage.storage_controller import INVALID_VISIT_ID +from openwpm.storage.storage_providers import ( + StructuredStorageProvider, + UnstructuredStorageProvider, +) +from test.storage.test_values import dt_test_values, generate_test_values + +memory_structured = "memory_structured" +sqlite = "sqlite" +memory_arrow = "memory_arrow" + + +@pytest.fixture +def structured_provider( + request: Any, tmp_path_factory: Any +) -> StructuredStorageProvider: + if request.param == memory_structured: + return MemoryStructuredProvider() + elif request.param == sqlite: + tmp_path = tmp_path_factory.mktemp("sqlite") + return SQLiteStorageProvider(tmp_path / "test_db.sqlite") + elif request.param == memory_arrow: + return MemoryArrowProvider() + assert isinstance( + request, FixtureRequest + ) # See https://github.com/pytest-dev/pytest/issues/8073 for why this can't be type 
annotated + request.raiseerror("invalid internal test config") + + +structured_scenarios: List[str] = [ + memory_structured, + sqlite, + memory_arrow, +] + +# Unstructured Providers +memory_unstructured = "memory_unstructured" +leveldb = "leveldb" +local_gzip = "local_gzip" + + +@pytest.fixture +def unstructured_provider( + request: Any, tmp_path_factory: Any +) -> UnstructuredStorageProvider: + if request.param == memory_unstructured: + return MemoryUnstructuredProvider() + elif request.param == leveldb: + tmp_path = tmp_path_factory.mktemp(leveldb) + return LevelDbProvider(tmp_path / "content.ldb") + elif request.param == local_gzip: + tmp_path = tmp_path_factory.mktemp(local_gzip) + return LocalGzipProvider(tmp_path) + assert isinstance( + request, FixtureRequest + ) # See https://github.com/pytest-dev/pytest/issues/8073 for why this can't be type annotated + request.raiseerror("invalid internal test config") + + +unstructured_scenarios: List[str] = [memory_unstructured, leveldb, local_gzip] + + +@pytest.fixture +def test_values() -> dt_test_values: + data, visit_ids = generate_test_values() + for table in data.values(): + table["visit_id"] = ( + table["visit_id"] if "visit_id" in table else INVALID_VISIT_ID + ) + visit_ids.add(INVALID_VISIT_ID) + return data, visit_ids diff --git a/test/storage/test_arrow_cache.py b/test/storage/test_arrow_cache.py new file mode 100644 index 000000000..b728b1256 --- /dev/null +++ b/test/storage/test_arrow_cache.py @@ -0,0 +1,44 @@ +import asyncio +from typing import Awaitable, Dict + +import pytest +from pandas import DataFrame + +from openwpm.mp_logger import MPLogger +from openwpm.storage.arrow_storage import CACHE_SIZE +from openwpm.storage.in_memory_storage import MemoryArrowProvider +from openwpm.storage.storage_providers import TableName +from openwpm.types import VisitId +from test.storage.test_values import dt_test_values + + +@pytest.mark.asyncio +async def test_arrow_cache(mp_logger: MPLogger, test_values: 
dt_test_values) -> None: + prov = MemoryArrowProvider() + await prov.init() + site_visit = test_values[0][TableName("site_visits")] + for j in range(5): # Testing that the cache works repeatedly + d: Dict[VisitId, Awaitable[None]] = {} + for i in range(CACHE_SIZE + 1): + visit_id = VisitId(i + j * 1000) + site_visit["visit_id"] = visit_id + await prov.store_record(TableName("site_visits"), visit_id, site_visit) + d[visit_id] = await prov.finalize_visit_id(visit_id) + + for visit_id in d: + await d[visit_id] + + await asyncio.sleep(1) + handle = prov.handle + # The queue should not be empty at this point + handle.poll_queue(block=False) + + assert len(handle.storage["site_visits"]) == j + 1 + table = handle.storage["site_visits"][j] + + df: DataFrame = table.to_pandas() + for row in df.itertuples(index=False): + del d[row.visit_id] + + assert len(d) == 0 + await prov.shutdown() diff --git a/test/storage/test_gcp.py b/test/storage/test_gcp.py new file mode 100644 index 000000000..ac63e7c90 --- /dev/null +++ b/test/storage/test_gcp.py @@ -0,0 +1,30 @@ +import pytest + +from openwpm.storage.cloud_storage.gcp_storage import GcsStructuredProvider +from openwpm.storage.storage_providers import TableName +from openwpm.types import VisitId + + +@pytest.mark.skip +@pytest.mark.asyncio +async def test_gcp_structured(mp_logger, test_values): + tables, visit_ids = test_values + project = "senglehardt-openwpm-test-1" + bucket_name = "openwpm-test-bucket" + structured_provider = GcsStructuredProvider( + project=project, + bucket_name=bucket_name, + base_path="test/2", + token="/home/stefan/.config/gcloud/legacy_credentials/szabka@mozilla.com/adc.json", + ) + await structured_provider.init() + + for table_name, test_data in tables.items(): + visit_id = VisitId(test_data["visit_id"]) + await structured_provider.store_record( + TableName(table_name), visit_id, test_data + ) + finalize_token = [await structured_provider.finalize_visit_id(i) for i in visit_ids] + await 
structured_provider.flush_cache() + for token in finalize_token: + await token diff --git a/test/storage/test_storage_controller.py b/test/storage/test_storage_controller.py new file mode 100644 index 000000000..98eefcd07 --- /dev/null +++ b/test/storage/test_storage_controller.py @@ -0,0 +1,69 @@ +import pandas as pd +from pandas.testing import assert_frame_equal + +from openwpm.mp_logger import MPLogger +from openwpm.storage.in_memory_storage import ( + MemoryArrowProvider, + MemoryStructuredProvider, +) +from openwpm.storage.storage_controller import ( + INVALID_VISIT_ID, + DataSocket, + StorageControllerHandle, +) +from test.storage.fixtures import dt_test_values + + +def test_startup_and_shutdown(mp_logger: MPLogger, test_values: dt_test_values) -> None: + test_table, visit_ids = test_values + structured = MemoryStructuredProvider() + controller_handle = StorageControllerHandle(structured, None) + controller_handle.launch() + assert controller_handle.listener_address is not None + cs = DataSocket(controller_handle.listener_address) + for table, data in test_table.items(): + visit_id = data["visit_id"] + cs.store_record( + table, visit_id, data + ) # cloning to avoid the modifications in store_record + + for visit_id in visit_ids: + cs.finalize_visit_id(visit_id, True) + cs.close() + controller_handle.shutdown() + + handle = structured.handle + handle.poll_queue() + for table, data in test_table.items(): + if data["visit_id"] == INVALID_VISIT_ID: + del data["visit_id"] + assert handle.storage[table] == [data] + + +def test_arrow_provider(mp_logger: MPLogger, test_values: dt_test_values) -> None: + test_table, visit_ids = test_values + structured = MemoryArrowProvider() + controller_handle = StorageControllerHandle(structured, None) + controller_handle.launch() + + assert controller_handle.listener_address is not None + cs = DataSocket(controller_handle.listener_address) + + for table, data in test_table.items(): + visit_id = data["visit_id"] + 
cs.store_record(table, visit_id, data) + + for visit_id in visit_ids: + cs.finalize_visit_id(visit_id, True) + cs.close() + controller_handle.shutdown() + + handle = structured.handle + handle.poll_queue() + for table, data in test_table.items(): + t1 = handle.storage[table][0].to_pandas().drop(columns=["instance_id"]) + if data["visit_id"] == INVALID_VISIT_ID: + del data["visit_id"] + t2 = pd.DataFrame({k: [v] for k, v in data.items()}) + # Since t2 doesn't get created schema the inferred types are different + assert_frame_equal(t1, t2, check_dtype=False) diff --git a/test/storage/test_storage_providers.py b/test/storage/test_storage_providers.py new file mode 100644 index 000000000..7471335bb --- /dev/null +++ b/test/storage/test_storage_providers.py @@ -0,0 +1,77 @@ +import asyncio +from pathlib import Path + +import pytest +from pandas import DataFrame +from pyarrow.parquet import ParquetDataset + +from openwpm.storage.local_storage import LocalArrowProvider +from openwpm.storage.storage_controller import INVALID_VISIT_ID +from openwpm.storage.storage_providers import ( + StructuredStorageProvider, + TableName, + UnstructuredStorageProvider, +) +from openwpm.types import VisitId + +from .fixtures import structured_scenarios, unstructured_scenarios +from .test_values import dt_test_values + + +@pytest.mark.usefixtures("mp_logger") +@pytest.mark.asyncio +async def test_local_arrow_storage_provider( + tmp_path: Path, test_values: dt_test_values +) -> None: + test_table, visit_ids = test_values + structured_provider = LocalArrowProvider(tmp_path) + await structured_provider.init() + for table_name, test_data in test_table.items(): + await structured_provider.store_record( + TableName(table_name), test_data["visit_id"], test_data + ) + token_list = [] + for i in visit_ids: + token_list.append(await structured_provider.finalize_visit_id(i)) + await structured_provider.flush_cache() + await asyncio.gather(*token_list) + for table_name, test_data in test_table.items(): 
+ dataset = ParquetDataset(tmp_path / table_name) + df: DataFrame = dataset.read().to_pandas() + assert df.shape[0] == 1 + for row in df.itertuples(index=False): + if test_data["visit_id"] == INVALID_VISIT_ID: + del test_data["visit_id"] + assert row._asdict() == test_data + + +@pytest.mark.parametrize("structured_provider", structured_scenarios, indirect=True) +@pytest.mark.asyncio +async def test_basic_access(structured_provider: StructuredStorageProvider) -> None: + data = { + "visit_id": 2, + "browser_id": 3, + "site_url": "https://example.com", + } + + await structured_provider.init() + + await structured_provider.store_record(TableName("site_visits"), VisitId(2), data) + token = await structured_provider.finalize_visit_id(VisitId(2)) + await structured_provider.flush_cache() + if token is not None: + await token + await structured_provider.shutdown() + + +@pytest.mark.parametrize("unstructured_provider", unstructured_scenarios, indirect=True) +@pytest.mark.asyncio +async def test_basic_unstructured_storing( + unstructured_provider: UnstructuredStorageProvider, +) -> None: + test_string = "This is my test string" + blob = test_string.encode() + await unstructured_provider.init() + await unstructured_provider.store_blob("test", blob) + await unstructured_provider.flush_cache() + await unstructured_provider.shutdown() diff --git a/test/storage/test_values.py b/test/storage/test_values.py new file mode 100644 index 000000000..30006fc9c --- /dev/null +++ b/test/storage/test_values.py @@ -0,0 +1,240 @@ +""" This file should contain one entry for every table +so that we can test storing and loading for every single entry +for every structured storage provider. 
+ +IF YOU CHANGE THIS FILE ALSO CHANGE schema.sql and parquet_schema.py +AND Schema-Documentation.md +""" + +import random +import string +from typing import Any, Dict, Set, Tuple + +from openwpm.storage.storage_providers import TableName +from openwpm.types import VisitId + +dt_test_values = Tuple[Dict[TableName, Dict[str, Any]], Set[VisitId]] + + +def generate_test_values() -> dt_test_values: + test_values: Dict[TableName, Dict[str, Any]] = dict() + + def random_word(length): + letters = string.ascii_lowercase + return "".join(random.choice(letters) for _ in range(length)) + + # task + fields = { + "task_id": random.randint(0, 2 ** 63 - 1), + "manager_params": random_word(12), + "openwpm_version": random_word(12), + "browser_version": random_word(12), + } + test_values[TableName("task")] = fields + # crawl + fields = { + "browser_id": random.randint(0, 2 ** 31 - 1), + "task_id": random.randint(0, 2 ** 63 - 1), + "browser_params": random_word(12), + } + test_values[TableName("crawl")] = fields + # site_visits + fields = { + "visit_id": random.randint(0, 2 ** 63 - 1), + "browser_id": random.randint(0, 2 ** 31 - 1), + "site_url": random_word(12), + "site_rank": random.randint(0, 2 ** 31 - 1), + } + test_values[TableName("site_visits")] = fields + # crawl_history + fields = { + "browser_id": random.randint(0, 2 ** 31 - 1), + "visit_id": random.randint(0, 2 ** 63 - 1), + "command": random_word(12), + "arguments": random_word(12), + "retry_number": random.randint(0, 2 ** 7 - 1), + "command_status": random_word(12), + "error": random_word(12), + "traceback": random_word(12), + "duration": random.randint(0, 2 ** 63 - 1), + } + test_values[TableName("crawl_history")] = fields + # http_requests + fields = { + "incognito": random.randint(0, 2 ** 31 - 1), + "browser_id": random.randint(0, 2 ** 31 - 1), + "visit_id": random.randint(0, 2 ** 63 - 1), + "extension_session_uuid": random_word(12), + "event_ordinal": random.randint(0, 2 ** 63 - 1), + "window_id": random.randint(0, 
2 ** 63 - 1), + "tab_id": random.randint(0, 2 ** 63 - 1), + "frame_id": random.randint(0, 2 ** 63 - 1), + "url": random_word(12), + "top_level_url": random_word(12), + "parent_frame_id": random.randint(0, 2 ** 63 - 1), + "frame_ancestors": random_word(12), + "method": random_word(12), + "referrer": random_word(12), + "headers": random_word(12), + "request_id": random.randint(0, 2 ** 63 - 1), + "is_XHR": random.choice([True, False]), + "is_third_party_channel": random.choice([True, False]), + "is_third_party_to_top_window": random.choice([True, False]), + "triggering_origin": random_word(12), + "loading_origin": random_word(12), + "loading_href": random_word(12), + "req_call_stack": random_word(12), + "resource_type": random_word(12), + "post_body": random_word(12), + "post_body_raw": random_word(12), + "time_stamp": random_word(12), + } + test_values[TableName("http_requests")] = fields + # http_responses + fields = { + "incognito": random.randint(0, 2 ** 31 - 1), + "browser_id": random.randint(0, 2 ** 31 - 1), + "visit_id": random.randint(0, 2 ** 63 - 1), + "extension_session_uuid": random_word(12), + "event_ordinal": random.randint(0, 2 ** 63 - 1), + "window_id": random.randint(0, 2 ** 63 - 1), + "tab_id": random.randint(0, 2 ** 63 - 1), + "frame_id": random.randint(0, 2 ** 63 - 1), + "url": random_word(12), + "method": random_word(12), + "response_status": random.randint(0, 2 ** 63 - 1), + "response_status_text": random_word(12), + "is_cached": random.choice([True, False]), + "headers": random_word(12), + "request_id": random.randint(0, 2 ** 63 - 1), + "location": random_word(12), + "time_stamp": random_word(12), + "content_hash": random_word(12), + } + test_values[TableName("http_responses")] = fields + # http_redirects + fields = { + "incognito": random.randint(0, 2 ** 31 - 1), + "browser_id": random.randint(0, 2 ** 31 - 1), + "visit_id": random.randint(0, 2 ** 63 - 1), + "old_request_url": random_word(12), + "old_request_id": random_word(12), + 
"new_request_url": random_word(12), + "new_request_id": random_word(12), + "extension_session_uuid": random_word(12), + "event_ordinal": random.randint(0, 2 ** 63 - 1), + "window_id": random.randint(0, 2 ** 63 - 1), + "tab_id": random.randint(0, 2 ** 63 - 1), + "frame_id": random.randint(0, 2 ** 63 - 1), + "response_status": random.randint(0, 2 ** 63 - 1), + "response_status_text": random_word(12), + "headers": random_word(12), + "time_stamp": random_word(12), + } + test_values[TableName("http_redirects")] = fields + # javascript + fields = { + "incognito": random.randint(0, 2 ** 31 - 1), + "browser_id": random.randint(0, 2 ** 31 - 1), + "visit_id": random.randint(0, 2 ** 63 - 1), + "extension_session_uuid": random_word(12), + "event_ordinal": random.randint(0, 2 ** 63 - 1), + "page_scoped_event_ordinal": random.randint(0, 2 ** 63 - 1), + "window_id": random.randint(0, 2 ** 63 - 1), + "tab_id": random.randint(0, 2 ** 63 - 1), + "frame_id": random.randint(0, 2 ** 63 - 1), + "script_url": random_word(12), + "script_line": random_word(12), + "script_col": random_word(12), + "func_name": random_word(12), + "script_loc_eval": random_word(12), + "document_url": random_word(12), + "top_level_url": random_word(12), + "call_stack": random_word(12), + "symbol": random_word(12), + "operation": random_word(12), + "value": random_word(12), + "arguments": random_word(12), + "time_stamp": random_word(12), + } + test_values[TableName("javascript")] = fields + # javascript_cookies + fields = { + "browser_id": random.randint(0, 2 ** 31 - 1), + "visit_id": random.randint(0, 2 ** 63 - 1), + "extension_session_uuid": random_word(12), + "event_ordinal": random.randint(0, 2 ** 63 - 1), + "record_type": random_word(12), + "change_cause": random_word(12), + "expiry": random_word(12), + "is_http_only": random.choice([True, False]), + "is_host_only": random.choice([True, False]), + "is_session": random.choice([True, False]), + "host": random_word(12), + "is_secure": random.choice([True, 
False]), + "name": random_word(12), + "path": random_word(12), + "value": random_word(12), + "same_site": random_word(12), + "first_party_domain": random_word(12), + "store_id": random_word(12), + "time_stamp": random_word(12), + } + test_values[TableName("javascript_cookies")] = fields + # navigations + fields = { + "incognito": random.randint(0, 2 ** 31 - 1), + "browser_id": random.randint(0, 2 ** 31 - 1), + "visit_id": random.randint(0, 2 ** 63 - 1), + "extension_session_uuid": random_word(12), + "process_id": random.randint(0, 2 ** 63 - 1), + "window_id": random.randint(0, 2 ** 63 - 1), + "tab_id": random.randint(0, 2 ** 63 - 1), + "tab_opener_tab_id": random.randint(0, 2 ** 63 - 1), + "frame_id": random.randint(0, 2 ** 63 - 1), + "parent_frame_id": random.randint(0, 2 ** 63 - 1), + "window_width": random.randint(0, 2 ** 63 - 1), + "window_height": random.randint(0, 2 ** 63 - 1), + "window_type": random_word(12), + "tab_width": random.randint(0, 2 ** 63 - 1), + "tab_height": random.randint(0, 2 ** 63 - 1), + "tab_cookie_store_id": random_word(12), + "uuid": random_word(12), + "url": random_word(12), + "transition_qualifiers": random_word(12), + "transition_type": random_word(12), + "before_navigate_event_ordinal": random.randint(0, 2 ** 63 - 1), + "before_navigate_time_stamp": random_word(12), + "committed_event_ordinal": random.randint(0, 2 ** 63 - 1), + "time_stamp": random_word(12), + } + test_values[TableName("navigations")] = fields + # callstacks + fields = { + "visit_id": random.randint(0, 2 ** 63 - 1), + "request_id": random.randint(0, 2 ** 63 - 1), + "browser_id": random.randint(0, 2 ** 31 - 1), + "call_stack": random_word(12), + } + test_values[TableName("callstacks")] = fields + # incomplete_visits + fields = { + "visit_id": random.randint(0, 2 ** 63 - 1), + } + test_values[TableName("incomplete_visits")] = fields + # dns_responses + fields = { + "request_id": random.randint(0, 2 ** 63 - 1), + "browser_id": random.randint(0, 2 ** 31 - 1), + 
"visit_id": random.randint(0, 2 ** 63 - 1), + "hostname": random_word(12), + "addresses": random_word(12), + "canonical_name": random_word(12), + "is_TRR": random.choice([True, False]), + "time_stamp": random_word(12), + } + test_values[TableName("dns_responses")] = fields + visit_id_set = set( + d["visit_id"] for d in filter(lambda d: "visit_id" in d, test_values.values()) + ) + return test_values, visit_id_set diff --git a/test/test_callback.py b/test/test_callback.py index 852e422fa..472307632 100644 --- a/test/test_callback.py +++ b/test/test_callback.py @@ -2,33 +2,25 @@ from typing import List from openwpm.command_sequence import CommandSequence -from openwpm.task_manager import TaskManager -from .openwpmtest import OpenWPMTest from .utilities import BASE_TEST_URL -class TestCallbackCommand(OpenWPMTest): - """Test test the Aggregators as well as the entire callback machinery +def test_local_callbacks(default_params, task_manager_creator): + """Test the storage controller as well as the entire callback machinery to see if all callbacks get correctly called""" + manager, _ = task_manager_creator(default_params) + TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html" - def get_config(self, data_dir=""): - return self.get_test_config(data_dir) + def callback(argument: List[int], success: bool) -> None: + argument.extend([1, 2, 3]) - def test_local_callbacks(self): - manager_params, browser_params = self.get_config() - TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html" - manager = TaskManager(manager_params, browser_params) + my_list: List[int] = [] + sequence = CommandSequence( + TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list) + ) + sequence.get() - def callback(argument: List[int], success: bool): - argument.extend([1, 2, 3]) - - my_list = [] - sequence = CommandSequence( - TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list) - ) - sequence.get() - - manager.execute_command_sequence(sequence) -
manager.close() - assert my_list == [1, 2, 3] + manager.execute_command_sequence(sequence) + manager.close() + assert my_list == [1, 2, 3] diff --git a/test/test_callstack_instrument.py b/test/test_callstack_instrument.py index 71415a9d3..2b6cad08f 100644 --- a/test/test_callstack_instrument.py +++ b/test/test_callstack_instrument.py @@ -1,9 +1,6 @@ -from openwpm import task_manager from openwpm.utilities import db_utils -from openwpm.utilities.platform_utils import parse_http_stack_trace_str from . import utilities -from .openwpmtest import OpenWPMTest # HTTP request call stack instrumentation # Expected stack frames @@ -27,90 +24,46 @@ "onload@" + HTTP_STACKTRACE_TEST_URL + ":1:1;null" ) -HTTP_STACKTRACES = set( - (STACK_TRACE_INJECT_IMAGE, STACK_TRACE_INJECT_PIXEL, STACK_TRACE_INJECT_JS) -) -# parsed HTTP call stack dict -CALL_STACK_INJECT_IMAGE = [ - { - "func_name": "inject_image", - "filename": HTTP_STACKTRACE_TEST_URL, - "line_no": "18", - "col_no": "7", - "async_cause": "null", - }, - { - "func_name": "inject_all", - "filename": HTTP_STACKTRACE_TEST_URL, - "line_no": "22", - "col_no": "7", - "async_cause": "null", - }, - { - "func_name": "onload", - "filename": HTTP_STACKTRACE_TEST_URL, - "line_no": "1", - "col_no": "1", - "async_cause": "null", - }, -] - +HTTP_STACKTRACES = { + STACK_TRACE_INJECT_IMAGE, + STACK_TRACE_INJECT_PIXEL, + STACK_TRACE_INJECT_JS, +} -class TestCallstackInstrument(OpenWPMTest): - def get_config(self, data_dir=""): - manager_params, browser_params = self.get_test_config(data_dir) - # Record HTTP Requests and Responses - browser_params[0].http_instrument = True - # Record JS Web API calls - browser_params[0].js_instrument = True - # Record the callstack of all WebRequests made - browser_params[0].callstack_instrument = True - return manager_params, browser_params - def test_http_stacktrace(self): - test_url = utilities.BASE_TEST_URL + "/http_stacktrace.html" - manager_params, browser_params = self.get_config() - manager = 
task_manager.TaskManager(manager_params, browser_params) - manager.get(test_url, sleep=10) - db = manager_params.database_name - manager.close() - rows = db_utils.query_db( - db, - ( - "SELECT hr.url, c.call_stack" - " FROM callstacks c" - " JOIN http_requests hr" - " ON c.request_id=hr.request_id" - " AND c.visit_id= hr.visit_id" - " AND c.browser_id = hr.browser_id;" - ), +def test_http_stacktrace(default_params, task_manager_creator): + manager_params, browser_params = default_params + # Record HTTP Requests and Responses + browser_params[0].http_instrument = True + # Record JS Web API calls + browser_params[0].js_instrument = True + # Record the callstack of all WebRequests made + browser_params[0].callstack_instrument = True + test_url = utilities.BASE_TEST_URL + "/http_stacktrace.html" + manager, db = task_manager_creator((manager_params, browser_params)) + manager.get(test_url, sleep=10) + manager.close() + rows = db_utils.query_db( + db, + ( + "SELECT hr.url, c.call_stack" + " FROM callstacks c" + " JOIN http_requests hr" + " ON c.request_id=hr.request_id" + " AND c.visit_id= hr.visit_id" + " AND c.browser_id = hr.browser_id;" + ), + ) + print("Printing callstacks contents") + observed_records = set() + for row in rows: + print(row["call_stack"]) + url, call_stack = row + test_urls = ( + "inject_pixel.js", + "test_image.png", + "Blank.gif", ) - print("Printing callstacks contents") - observed_records = set() - for row in rows: - print(row["call_stack"]) - url, call_stack = row - test_urls = ( - "inject_pixel.js", - "test_image.png", - "Blank.gif", - ) - if url.endswith(test_urls): - observed_records.add(call_stack) - assert HTTP_STACKTRACES == observed_records - - def test_parse_http_stack_trace_str(self): - stacktrace = STACK_TRACE_INJECT_IMAGE - stack_frames = parse_http_stack_trace_str(stacktrace) - assert stack_frames == CALL_STACK_INJECT_IMAGE - - # TODO: webext instrumentation doesn't support req_call_stack yet. 
- # def test_http_stacktrace_nonjs_loads(self): - # # stacktrace should be empty for requests NOT triggered by scripts - # test_url = utilities.BASE_TEST_URL + '/http_test_page.html' - # db = self.visit(test_url, sleep_after=3) - # rows = db_utils.query_db(db, ( - # "SELECT url, req_call_stack FROM http_requests")) - # for row in rows: - # _, stacktrace = row - # assert stacktrace == "" + if url.endswith(test_urls): + observed_records.add(call_stack) + assert HTTP_STACKTRACES == observed_records diff --git a/test/test_crawl.py b/test/test_crawl.py index 8bf17f5ef..c65a2ab2b 100644 --- a/test/test_crawl.py +++ b/test/test_crawl.py @@ -1,10 +1,14 @@ +# type:ignore +# As this file is no longer maintained, mypy shouldn't check this import os import tarfile +from pathlib import Path +from typing import List, Tuple import domain_utils as du import pytest -from openwpm import task_manager +from openwpm.config import BrowserParams, ManagerParams from openwpm.utilities import db_utils from .openwpmtest import OpenWPMTest @@ -46,7 +50,9 @@ class TestCrawl(OpenWPMTest): tests will be easier to debug """ - def get_config(self, data_dir=""): + def get_config( + self, data_dir: Path = None + ) -> Tuple[ManagerParams, List[BrowserParams]]: manager_params, browser_params = self.get_test_config(data_dir) browser_params[0].profile_archive_dir = os.path.join( manager_params.data_directory, "browser_profile" @@ -56,7 +62,7 @@ def get_config(self, data_dir=""): @pytest.mark.xfail(run=False) @pytest.mark.slow - def test_browser_profile_coverage(self, tmpdir): + def test_browser_profile_coverage(self, tmpdir: Path, task_manager_creator) -> None: """Test the coverage of the browser's profile This verifies that Firefox's places.sqlite database contains @@ -64,9 +70,9 @@ def test_browser_profile_coverage(self, tmpdir): it is likely the profile is lost at some point during the crawl """ # Run the test crawl - data_dir = os.path.join(str(tmpdir), "data_dir") + data_dir = tmpdir / "data_dir" 
manager_params, browser_params = self.get_config(data_dir) - manager = task_manager.TaskManager(manager_params, browser_params) + manager, crawl_db = task_manager_creator((manager_params, browser_params)) for site in TEST_SITES: manager.get(site) ff_db_tar = os.path.join( @@ -80,7 +86,6 @@ def test_browser_profile_coverage(self, tmpdir): # Output databases ff_db = os.path.join(browser_params[0].profile_archive_dir, "places.sqlite") - crawl_db = manager_params.database_name # Grab urls from crawl database rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests") diff --git a/test/test_custom_function_command.py b/test/test_custom_function_command.py index 81d9a4552..6768268b5 100644 --- a/test/test_custom_function_command.py +++ b/test/test_custom_function_command.py @@ -1,13 +1,17 @@ +import sqlite3 + from selenium.webdriver import Firefox -from openwpm import command_sequence, task_manager +from openwpm import command_sequence from openwpm.commands.types import BaseCommand -from openwpm.config import BrowserParams, ManagerParams +from openwpm.config import BrowserParams, ManagerParamsInternal from openwpm.socket_interface import ClientSocket +from openwpm.storage.sql_provider import SQLiteStorageProvider +from openwpm.storage.storage_providers import TableName +from openwpm.task_manager import TaskManager from openwpm.utilities import db_utils from . 
import utilities -from .openwpmtest import OpenWPMTest url_a = utilities.BASE_TEST_URL + "/simple_a.html" @@ -30,7 +34,7 @@ class CollectLinksCommand(BaseCommand): """ Collect links with `scheme` and save in table `table_name` """ - def __init__(self, scheme, table_name) -> None: + def __init__(self, table_name: TableName, scheme: str) -> None: self.scheme = scheme self.table_name = table_name @@ -38,9 +42,11 @@ def execute( self, webdriver: Firefox, browser_params: BrowserParams, - manager_params: ManagerParams, + manager_params: ManagerParamsInternal, extension_socket: ClientSocket, ) -> None: + browser_id = self.browser_id + visit_id = self.visit_id link_urls = [ x for x in ( @@ -52,14 +58,8 @@ def execute( current_url = webdriver.current_url sock = ClientSocket() - sock.connect(*manager_params.aggregator_address) - - query = ( - "CREATE TABLE IF NOT EXISTS %s (" - "top_url TEXT, link TEXT, " - "visit_id INTEGER, browser_id INTEGER);" % self.table_name - ) - sock.send(("create_table", query)) + assert manager_params.storage_controller_address is not None + sock.connect(*manager_params.storage_controller_address) for link in link_urls: query = ( @@ -67,33 +67,42 @@ def execute( { "top_url": current_url, "link": link, - "visit_id": self.visit_id, - "browser_id": self.browser_id, + "visit_id": visit_id, + "browser_id": browser_id, }, ) sock.send(query) sock.close() -class TestCustomFunctionCommand(OpenWPMTest): - """Test `custom_function` command's ability to handle inline functions""" - - def get_config(self, data_dir=""): - return self.get_test_config(data_dir) - - def test_custom_function(self): - """ Test `custom_function` with an inline func that collects links """ - - manager_params, browser_params = self.get_config() - manager = task_manager.TaskManager(manager_params, browser_params) - cs = command_sequence.CommandSequence(url_a) - cs.get(sleep=0, timeout=60) - cs.append_command(CollectLinksCommand("http", "page_links")) - 
manager.execute_command_sequence(cs) - manager.close() - query_result = db_utils.query_db( - manager_params.database_name, - "SELECT top_url, link FROM page_links;", - as_tuple=True, - ) - assert PAGE_LINKS == set(query_result) +def test_custom_function(default_params, xpi, server): + """ Test `custom_function` with an inline func that collects links """ + table_name = TableName("page_links") + + manager_params, browser_params = default_params + path = manager_params.data_directory / "crawl-data.sqlite" + db = sqlite3.connect(path) + cur = db.cursor() + + cur.execute( + """CREATE TABLE IF NOT EXISTS %s ( + top_url TEXT, link TEXT, + visit_id INTEGER, browser_id INTEGER);""" + % table_name + ) + cur.close() + db.close() + + storage_provider = SQLiteStorageProvider(path) + manager = TaskManager(manager_params, browser_params, storage_provider, None) + cs = command_sequence.CommandSequence(url_a) + cs.get(sleep=0, timeout=60) + cs.append_command(CollectLinksCommand(table_name, "http")) + manager.execute_command_sequence(cs) + manager.close() + query_result = db_utils.query_db( + path, + "SELECT top_url, link FROM page_links;", + as_tuple=True, + ) + assert PAGE_LINKS == set(query_result) diff --git a/test/test_dataclass_validations.py b/test/test_dataclass_validations.py index 9f32da3ad..1a8386a51 100644 --- a/test/test_dataclass_validations.py +++ b/test/test_dataclass_validations.py @@ -77,14 +77,6 @@ def test_log_file_extension(): validate_manager_params(manager_params) -def test_database_file_extension(): - manager_params = ManagerParams() - - manager_params.database_name = "something.unsupported" - with pytest.raises(ConfigError): - validate_manager_params(manager_params) - - def test_failure_limit(): manager_params = ManagerParams() @@ -99,17 +91,6 @@ def test_failure_limit(): validate_manager_params(manager_params) -def test_output_format(): - manager_params = ManagerParams() - - manager_params.output_format = "not None and not int" - with 
pytest.raises(ConfigError): - validate_manager_params(manager_params) - - manager_params.output_format = "s3" - validate_manager_params(manager_params) - - def test_num_browser_crawl_config(): manager_params = ManagerParams(num_browsers=2) browser_params = [BrowserParams()] diff --git a/test/test_dns_instrument.py b/test/test_dns_instrument.py index 28da4b27a..48f183292 100644 --- a/test/test_dns_instrument.py +++ b/test/test_dns_instrument.py @@ -1,21 +1,18 @@ from openwpm.utilities import db_utils -from .openwpmtest import OpenWPMTest +def test_name_resolution(default_params, task_manager_creator): + manager_params, browser_params = default_params + for browser_param in browser_params: + browser_param.dns_instrument = True -class TestDNSInstrument(OpenWPMTest): - def get_config(self, data_dir=""): - manager_params, browser_params = self.get_test_config(data_dir) - for browser_param in browser_params: - browser_param.dns_instrument = True - return manager_params, browser_params + manager, db = task_manager_creator((manager_params, browser_params)) + manager.get("http://localtest.me:8000") + manager.close() - def test_name_resolution(self): - db = self.visit("http://localtest.me:8000") - result = db_utils.query_db(db, "SELECT * FROM dns_responses") - result = result[0] - print(result.keys()) - assert result["used_address"] == "127.0.0.1" - assert result["addresses"] == "127.0.0.1" - assert result["hostname"] == "localtest.me" - assert result["canonical_name"] == "localtest.me" + result = db_utils.query_db(db, "SELECT * FROM dns_responses") + result = result[0] + assert result["used_address"] == "127.0.0.1" + assert result["addresses"] == "127.0.0.1" + assert result["hostname"] == "localtest.me" + assert result["canonical_name"] == "localtest.me" diff --git a/test/test_env.py b/test/test_env.py deleted file mode 100644 index d9189133a..000000000 --- a/test/test_env.py +++ /dev/null @@ -1,11 +0,0 @@ -from os.path import isfile - -from openwpm.utilities.platform_utils 
import get_firefox_binary_path - -from .openwpmtest import OpenWPMTest - - -class TestDependencies(OpenWPMTest): - def test_dependencies(self): - firefox_binary_path = get_firefox_binary_path() - assert isfile(firefox_binary_path) diff --git a/test/test_extension.py b/test/test_extension.py index 017ef2b72..2527d4912 100644 --- a/test/test_extension.py +++ b/test/test_extension.py @@ -1,9 +1,12 @@ import os from datetime import datetime +from pathlib import Path +from sqlite3 import Row +from typing import List, Tuple import pytest -from openwpm import task_manager +from openwpm.config import BrowserParams, ManagerParams from openwpm.utilities import db_utils from . import utilities @@ -244,16 +247,18 @@ u"test_cookie=Test-0123456789; " "expires=Tue, 31 Dec 2030 00:00:00 UTC; path=/", ) -DOCUMENT_COOKIE_READ_WRITE = set([DOCUMENT_COOKIE_READ, DOCUMENT_COOKIE_WRITE]) +DOCUMENT_COOKIE_READ_WRITE = {DOCUMENT_COOKIE_READ, DOCUMENT_COOKIE_WRITE} class TestExtension(OpenWPMTest): - def get_config(self, data_dir=""): + def get_config( + self, data_dir: Path = None + ) -> Tuple[ManagerParams, List[BrowserParams]]: manager_params, browser_params = self.get_test_config(data_dir) browser_params[0].js_instrument = True return manager_params, browser_params - def test_property_enumeration(self): + def test_property_enumeration(self) -> None: test_url = utilities.BASE_TEST_URL + "/property_enumeration.html" db = self.visit(test_url) rows = db_utils.query_db(db, "SELECT script_url, symbol FROM javascript") @@ -263,12 +268,13 @@ def test_property_enumeration(self): observed_symbols.add(symbol) assert PROPERTIES == observed_symbols - def test_canvas_fingerprinting(self): + def test_canvas_fingerprinting(self) -> None: db = self.visit("/canvas_fingerprinting.html") # Check that all calls and methods are recorded rows = db_utils.get_javascript_entries(db) observed_rows = set() for row in rows: + assert isinstance(row, Row) item = ( row["script_url"], row["symbol"], @@ -279,19 +285,13 
@@ def test_canvas_fingerprinting(self): observed_rows.add(item) assert CANVAS_CALLS == observed_rows - def test_extension_gets_correct_visit_id(self): - manager_params, browser_params = self.get_config() - manager = task_manager.TaskManager(manager_params, browser_params) - + def test_extension_gets_correct_visit_id(self) -> None: url_a = utilities.BASE_TEST_URL + "/simple_a.html" url_b = utilities.BASE_TEST_URL + "/simple_b.html" + self.visit(url_a) + db = self.visit(url_b) - manager.get(url_a) - manager.get(url_b) - manager.close() - qry_res = db_utils.query_db( - manager_params.database_name, "SELECT visit_id, site_url FROM site_visits" - ) + qry_res = db_utils.query_db(db, "SELECT visit_id, site_url FROM site_visits") # Construct dict mapping site_url to visit_id visit_ids = dict() @@ -299,21 +299,21 @@ def test_extension_gets_correct_visit_id(self): visit_ids[row[1]] = row[0] simple_a_visit_id = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM javascript WHERE " "symbol=?", + db, + "SELECT visit_id FROM javascript WHERE symbol=?", ("window.navigator.userAgent",), ) simple_b_visit_id = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM javascript WHERE " "symbol=?", + db, + "SELECT visit_id FROM javascript WHERE symbol=?", ("window.navigator.platform",), ) assert visit_ids[url_a] == simple_a_visit_id[0][0] assert visit_ids[url_b] == simple_b_visit_id[0][0] - def check_webrtc_sdp_offer(self, sdp_str): + def check_webrtc_sdp_offer(self, sdp_str: str) -> None: """Make sure the SDP offer includes expected fields/strings. SDP offer contains randomly generated strings (e.g. GUID). 
That's why @@ -323,12 +323,13 @@ def check_webrtc_sdp_offer(self, sdp_str): for expected_str in WEBRTC_SDP_OFFER_STRINGS: assert expected_str in sdp_str - def test_webrtc_localip(self): + def test_webrtc_localip(self) -> None: db = self.visit("/webrtc_localip.html") # Check that all calls and methods are recorded rows = db_utils.get_javascript_entries(db) observed_rows = set() for row in rows: + assert isinstance(row, Row) if row["symbol"] == "RTCPeerConnection.setLocalDescription" and ( row["operation"] == "call" ): diff --git a/test/test_http_instrumentation.py b/test/test_http_instrumentation.py index 538735bed..ac954d951 100644 --- a/test/test_http_instrumentation.py +++ b/test/test_http_instrumentation.py @@ -5,13 +5,19 @@ import json import os from hashlib import sha256 +from pathlib import Path from time import sleep +from typing import List, Optional, Set, Tuple from urllib.parse import urlparse import pytest from openwpm import command_sequence, task_manager +from openwpm.command_sequence import CommandSequence from openwpm.commands.types import BaseCommand +from openwpm.config import BrowserParams, ManagerParams +from openwpm.storage.leveldb import LevelDbProvider +from openwpm.storage.sql_provider import SQLiteStorageProvider from openwpm.utilities import db_utils from . 
import utilities @@ -590,7 +596,9 @@ class TestHTTPInstrument(OpenWPMTest): - def get_config(self, data_dir=""): + def get_config( + self, data_dir: Optional[Path] + ) -> Tuple[ManagerParams, List[BrowserParams]]: manager_params, browser_params = self.get_test_config(data_dir) browser_params[0].http_instrument = True return manager_params, browser_params @@ -624,7 +632,7 @@ def test_page_visit(self): # HTTP Responses rows = db_utils.query_db(db, "SELECT * FROM http_responses") - observed_records = set() + observed_records: Set[Tuple[str, str]] = set() for row in rows: observed_records.add( ( @@ -656,155 +664,6 @@ def test_page_visit(self): observed_records.add((src, dst, location)) assert HTTP_REDIRECTS == observed_records - def test_cache_hits_recorded(self): - """Verify all http responses are recorded, including cached responses - - Note that we expect to see all of the same requests and responses - during the second vist (even if cached) except for images. Cached - images do not trigger Observer Notification events. - See Bug 634073: https://bugzilla.mozilla.org/show_bug.cgi?id=634073 - - The test page includes an image which does several permanent redirects - before returning a 404. We expect to see new requests and responses - for this image when the page is reloaded. Additionally, the redirects - should be cached. 
- """ - test_url = utilities.BASE_TEST_URL + "/http_test_page.html" - manager_params, browser_params = self.get_config() - manager = task_manager.TaskManager(manager_params, browser_params) - manager.get(test_url, sleep=5) - manager.get(test_url, sleep=5) - manager.close() - db = manager_params.database_name - - request_id_to_url = dict() - - # HTTP Requests - rows = db_utils.query_db(db, "SELECT * FROM http_requests WHERE visit_id = 2") - observed_records = set() - for row in rows: - # HACK: favicon caching is unpredictable, don't bother checking it - if row["url"].split("?")[0].endswith("favicon.ico"): - continue - observed_records.add( - ( - row["url"].split("?")[0], - row["top_level_url"], - row["triggering_origin"], - row["loading_origin"], - row["loading_href"], - row["is_XHR"], - row["is_third_party_channel"], - row["is_third_party_to_top_window"], - row["resource_type"], - ) - ) - request_id_to_url[row["request_id"]] = row["url"] - assert HTTP_CACHED_REQUESTS == observed_records - - # HTTP Responses - rows = db_utils.query_db(db, "SELECT * FROM http_responses WHERE visit_id = 2") - observed_records = set() - for row in rows: - # HACK: favicon caching is unpredictable, don't bother checking it - if row["url"].split("?")[0].endswith("favicon.ico"): - continue - observed_records.add( - ( - row["url"].split("?")[0], - # TODO: referrer isn't available yet in the - # webext instrumentation | row['referrer'], - row["is_cached"], - ) - ) - assert row["request_id"] in request_id_to_url - assert request_id_to_url[row["request_id"]] == row["url"] - assert HTTP_CACHED_RESPONSES == observed_records - - # HTTP Redirects - rows = db_utils.query_db(db, "SELECT * FROM http_redirects WHERE visit_id = 2") - observed_records = set() - for row in rows: - # TODO: new_request_id isn't supported yet - # src = request_id_to_url[row['old_request_id']].split('?')[0] - # dst = request_id_to_url[row['new_request_id']].split('?')[0] - src = row["old_request_url"].split("?")[0] - dst = 
row["new_request_url"].split("?")[0] - observed_records.add((src, dst)) - assert HTTP_CACHED_REDIRECTS == observed_records - - def test_javascript_saving(self, tmpdir): - """ check that javascript content is saved and hashed correctly """ - test_url = utilities.BASE_TEST_URL + "/http_test_page.html" - manager_params, browser_params = self.get_test_config(str(tmpdir)) - browser_params[0].http_instrument = True - browser_params[0].save_content = "script" - manager = task_manager.TaskManager(manager_params, browser_params) - manager.get(url=test_url, sleep=1) - manager.close() - expected_hashes = { - "0110c0521088c74f179615cd7c404816816126fa657550032f75ede67a66c7cc", - "b34744034cd61e139f85f6c4c92464927bed8343a7ac08acf9fb3c6796f80f08", - } - for chash, content in db_utils.get_content(str(tmpdir)): - chash = chash.decode("ascii").lower() - pyhash = sha256(content).hexdigest().lower() - assert pyhash == chash # Verify expected key (sha256 of content) - assert chash in expected_hashes - expected_hashes.remove(chash) - assert len(expected_hashes) == 0 # All expected hashes have been seen - - def test_document_saving(self, tmpdir): - """ check that document content is saved and hashed correctly """ - test_url = utilities.BASE_TEST_URL + "/http_test_page.html" - expected_hashes = { - "2390eceab422db15bc45940b7e042e83e6cbd5f279f57e714bc4ad6cded7f966", - "25343f42d9ffa5c082745f775b172db87d6e14dfbc3160b48669e06d727bfc8d", - } - manager_params, browser_params = self.get_test_config(str(tmpdir)) - browser_params[0].http_instrument = True - browser_params[0].save_content = "main_frame,sub_frame" - manager = task_manager.TaskManager(manager_params, browser_params) - manager.get(url=test_url, sleep=1) - manager.close() - for chash, content in db_utils.get_content(str(tmpdir)): - chash = chash.decode("ascii").lower() - pyhash = sha256(content).hexdigest().lower() - assert pyhash == chash # Verify expected key (sha256 of content) - assert chash in expected_hashes - 
expected_hashes.remove(chash) - assert len(expected_hashes) == 0 # All expected hashes have been seen - - def test_content_saving(self, tmpdir): - """ check that content is saved and hashed correctly """ - test_url = utilities.BASE_TEST_URL + "/http_test_page.html" - manager_params, browser_params = self.get_test_config(str(tmpdir)) - browser_params[0].http_instrument = True - browser_params[0].save_content = True - manager = task_manager.TaskManager(manager_params, browser_params) - manager.get(url=test_url, sleep=1) - manager.close() - db = manager_params.database_name - rows = db_utils.query_db(db, "SELECT * FROM http_responses;") - disk_content = dict() - for row in rows: - if "MAGIC_REDIRECT" in row["url"] or "404" in row["url"]: - continue - path = urlparse(row["url"]).path - with open(os.path.join(BASE_PATH, path[1:]), "rb") as f: - content = f.read() - chash = sha256(content).hexdigest() - assert chash == row["content_hash"] - disk_content[chash] = content - - ldb_content = dict() - for chash, content in db_utils.get_content(str(tmpdir)): - chash = chash.decode("ascii") - ldb_content[chash] = content - - for k, v in disk_content.items(): - assert v == ldb_content[k] - def test_worker_script_requests(self): """Check correct URL attribution for requests made by worker script""" test_url = utilities.BASE_TEST_URL + "/http_worker_page.html" @@ -890,7 +749,9 @@ class TestPOSTInstrument(OpenWPMTest): "line2 line2_word2\r\n" ) - def get_config(self, data_dir=""): + def get_config( + self, data_dir: Optional[Path] = None + ) -> Tuple[ManagerParams, List[BrowserParams]]: manager_params, browser_params = self.get_test_config(data_dir) browser_params[0].http_instrument = True return manager_params, browser_params @@ -978,7 +839,7 @@ def test_record_binary_post_data(self): reason="Firefox is currently not able to return the " "file content for an upload, only the filename" ) - def test_record_file_upload(self): + def test_record_file_upload(self, task_manager_creator): 
"""Test that we correctly capture the uploaded file contents. We upload a CSS file and a PNG file to test both text based and @@ -999,7 +860,7 @@ def test_record_file_upload(self): css_file_path = os.path.abspath("test_pages/shared/test_style.css") manager_params, browser_params = self.get_config() - manager = task_manager.TaskManager(manager_params, browser_params) + manager, db_path = task_manager_creator((manager_params, browser_params)) test_url = utilities.BASE_TEST_URL + "/post_file_upload.html" cs = command_sequence.CommandSequence(test_url) cs.get(sleep=0, timeout=60) @@ -1007,7 +868,7 @@ def test_record_file_upload(self): manager.execute_command_sequence(cs) manager.close() - post_body = self.get_post_request_body_from_db(manager_params.database_name) + post_body = self.get_post_request_body_from_db(db_path) # Binary strings get put into the database as-if they were latin-1. with open(img_file_path, "rb") as f: img_file_content = f.read().strip().decode("latin-1") @@ -1023,8 +884,213 @@ def test_record_file_upload(self): assert expected_body == post_body_decoded +def test_javascript_saving(http_params, xpi, server): + """ check that javascript content is saved and hashed correctly """ + test_url = utilities.BASE_TEST_URL + "/http_test_page.html" + manager_params, browser_params = http_params() + + for browser_param in browser_params: + browser_param.http_instrument = True + browser_param.save_content = "script" + + structured_storage = SQLiteStorageProvider( + db_path=manager_params.data_directory / "crawl-data.sqlite" + ) + ldb_path = Path(manager_params.data_directory) / "content.ldb" + unstructured_storage = LevelDbProvider(db_path=ldb_path) + manager = task_manager.TaskManager( + manager_params, browser_params, structured_storage, unstructured_storage + ) + manager.get(url=test_url, sleep=1) + manager.close() + expected_hashes = { + "0110c0521088c74f179615cd7c404816816126fa657550032f75ede67a66c7cc", + 
"b34744034cd61e139f85f6c4c92464927bed8343a7ac08acf9fb3c6796f80f08", + } + for chash, content in db_utils.get_content(ldb_path): + chash = chash.decode("ascii").lower() + pyhash = sha256(content).hexdigest().lower() + assert pyhash == chash # Verify expected key (sha256 of content) + assert chash in expected_hashes + expected_hashes.remove(chash) + assert len(expected_hashes) == 0 # All expected hashes have been seen + + +def test_document_saving(http_params, xpi, server): + """ check that document content is saved and hashed correctly """ + test_url = utilities.BASE_TEST_URL + "/http_test_page.html" + expected_hashes = { + "2390eceab422db15bc45940b7e042e83e6cbd5f279f57e714bc4ad6cded7f966", + "25343f42d9ffa5c082745f775b172db87d6e14dfbc3160b48669e06d727bfc8d", + } + manager_params, browser_params = http_params() + for browser_param in browser_params: + browser_param.http_instrument = True + browser_param.save_content = "main_frame,sub_frame" + + structured_storage = SQLiteStorageProvider( + db_path=manager_params.data_directory / "crawl-data.sqlite" + ) + ldb_path = Path(manager_params.data_directory) / "content.ldb" + unstructured_storage = LevelDbProvider(db_path=ldb_path) + manager = task_manager.TaskManager( + manager_params, browser_params, structured_storage, unstructured_storage + ) + + manager.get(url=test_url, sleep=1) + manager.close() + for chash, content in db_utils.get_content(ldb_path): + chash = chash.decode("ascii").lower() + pyhash = sha256(content).hexdigest().lower() + assert pyhash == chash # Verify expected key (sha256 of content) + assert chash in expected_hashes + expected_hashes.remove(chash) + assert len(expected_hashes) == 0 # All expected hashes have been seen + + +def test_content_saving(http_params, xpi, server): + """ check that content is saved and hashed correctly """ + test_url = utilities.BASE_TEST_URL + "/http_test_page.html" + manager_params, browser_params = http_params() + for browser_param in browser_params: + 
browser_param.http_instrument = True + browser_param.save_content = True + db = manager_params.data_directory / "crawl-data.sqlite" + structured_storage = SQLiteStorageProvider(db_path=db) + ldb_path = Path(manager_params.data_directory) / "content.ldb" + unstructured_storage = LevelDbProvider(db_path=ldb_path) + manager = task_manager.TaskManager( + manager_params, browser_params, structured_storage, unstructured_storage + ) + manager.get(url=test_url, sleep=1) + manager.close() + + rows = db_utils.query_db(db, "SELECT * FROM http_responses;") + disk_content = dict() + for row in rows: + if "MAGIC_REDIRECT" in row["url"] or "404" in row["url"]: + continue + path = urlparse(row["url"]).path + with open(os.path.join(BASE_PATH, path[1:]), "rb") as f: + content = f.read() + chash = sha256(content).hexdigest() + assert chash == row["content_hash"] + disk_content[chash] = content + + ldb_content = dict() + for chash, content in db_utils.get_content(ldb_path): + chash = chash.decode("ascii") + ldb_content[chash] = content + + for k, v in disk_content.items(): + assert v == ldb_content[k] + + +def test_cache_hits_recorded(http_params, task_manager_creator): + """Verify all http responses are recorded, including cached responses + + Note that we expect to see all of the same requests and responses + during the second vist (even if cached) except for images. Cached + images do not trigger Observer Notification events. + See Bug 634073: https://bugzilla.mozilla.org/show_bug.cgi?id=634073 + + The test page includes an image which does several permanent redirects + before returning a 404. We expect to see new requests and responses + for this image when the page is reloaded. Additionally, the redirects + should be cached. 
+ """ + test_url = utilities.BASE_TEST_URL + "/http_test_page.html" + manager_params, browser_params = http_params() + # ensuring that we only spawn one browser + manager_params.num_browsers = 1 + manager, db = task_manager_creator((manager_params, [browser_params[0]])) + for i in range(2): + cs = CommandSequence(test_url, site_rank=i) + cs.get(sleep=5) + manager.execute_command_sequence(cs) + + manager.close() + + request_id_to_url = dict() + + # HTTP Requests + rows = db_utils.query_db( + db, + """ + SELECT hr.* + FROM http_requests as hr + JOIN site_visits sv ON sv.visit_id = hr.visit_id and sv.browser_id = hr.browser_id + WHERE sv.site_rank = 1""", + ) + observed_records = set() + for row in rows: + # HACK: favicon caching is unpredictable, don't bother checking it + if row["url"].split("?")[0].endswith("favicon.ico"): + continue + observed_records.add( + ( + row["url"].split("?")[0], + row["top_level_url"], + row["triggering_origin"], + row["loading_origin"], + row["loading_href"], + row["is_XHR"], + row["is_third_party_channel"], + row["is_third_party_to_top_window"], + row["resource_type"], + ) + ) + request_id_to_url[row["request_id"]] = row["url"] + assert observed_records == HTTP_CACHED_REQUESTS + + # HTTP Responses + rows = db_utils.query_db( + db, + """ + SELECT hp.* + FROM http_responses as hp + JOIN site_visits sv ON sv.visit_id = hp.visit_id and sv.browser_id = hp.browser_id + WHERE sv.site_rank = 1""", + ) + observed_records = set() + for row in rows: + # HACK: favicon caching is unpredictable, don't bother checking it + if row["url"].split("?")[0].endswith("favicon.ico"): + continue + observed_records.add( + ( + row["url"].split("?")[0], + # TODO: referrer isn't available yet in the + # webext instrumentation | row['referrer'], + row["is_cached"], + ) + ) + assert row["request_id"] in request_id_to_url + assert request_id_to_url[row["request_id"]] == row["url"] + assert HTTP_CACHED_RESPONSES == observed_records + + # HTTP Redirects + rows = 
db_utils.query_db( + db, + """ + SELECT hr.* + FROM http_redirects as hr + JOIN site_visits sv ON sv.visit_id = hr.visit_id and sv.browser_id = hr.browser_id + WHERE sv.site_rank = 1""", + ) + observed_records = set() + for row in rows: + # TODO: new_request_id isn't supported yet + # src = request_id_to_url[row['old_request_id']].split('?')[0] + # dst = request_id_to_url[row['new_request_id']].split('?')[0] + src = row["old_request_url"].split("?")[0] + dst = row["new_request_url"].split("?")[0] + observed_records.add((src, dst)) + assert HTTP_CACHED_REDIRECTS == observed_records + + class FilenamesIntoFormCommand(BaseCommand): - def __init__(self, img_file_path, css_file_path) -> None: + def __init__(self, img_file_path: str, css_file_path: str) -> None: self.img_file_path = img_file_path self.css_file_path = css_file_path @@ -1034,7 +1100,7 @@ def execute( browser_params, manager_params, extension_socket, - ) -> None: + ): img_file_upload_element = webdriver.find_element_by_id("upload-img") css_file_upload_element = webdriver.find_element_by_id("upload-css") img_file_upload_element.send_keys(self.img_file_path) diff --git a/test/test_js_instrument.py b/test/test_js_instrument.py index 66eb3ad67..70d84b744 100644 --- a/test/test_js_instrument.py +++ b/test/test_js_instrument.py @@ -1,3 +1,7 @@ +from pathlib import Path +from typing import List, Optional, Set, Tuple + +from openwpm.config import BrowserParams, ManagerParams from openwpm.utilities import db_utils from . 
import utilities as util @@ -21,7 +25,7 @@ class TestJSInstrumentNonExistingWindowProperty(OpenWPMJSTest): ("window.nonExisting", "get", "undefined"), } - METHOD_CALLS = set() + METHOD_CALLS: Set[Tuple[str, str, str]] = set() TEST_PAGE = "instrument_non_existing_window_property.html" TOP_URL = u"%s/js_instrument/%s" % (util.BASE_TEST_URL, TEST_PAGE) @@ -64,7 +68,7 @@ class TestJSInstrumentExistingWindowProperty(OpenWPMJSTest): # Note 1: nonExistingProp1 is not enumerable even after being set # Note 2: nonExistingMethod1 shows up as a get rather than call - METHOD_CALLS = set() # Note 2 + METHOD_CALLS: Set[Tuple[str, str, str]] = set() # Note 2 TEST_PAGE = "instrument_existing_window_property.html" TOP_URL = u"%s/js_instrument/%s" % (util.BASE_TEST_URL, TEST_PAGE) @@ -99,7 +103,9 @@ class TestJSInstrumentByPython(OpenWPMJSTest): # noqa ("window.fetch", "call", '["https://example.org"]'), } - def get_config(self, data_dir=""): + def get_config( + self, data_dir: Optional[Path] + ) -> Tuple[ManagerParams, List[BrowserParams]]: manager_params, browser_params = super().get_config(data_dir) browser_params[0].prefs = { "network.dns.localDomains": "example.com,example.org" @@ -376,7 +382,7 @@ class TestJSInstrumentRecursiveProperties(OpenWPMJSTest): ("window.test.test.prop2", "get", "test_test_prop2"), } - METHOD_CALLS = set() + METHOD_CALLS: Set[Tuple[str, str, str]] = set() TEST_PAGE = "instrument_do_not_recurse_properties_to_instrument.html" TOP_URL = u"%s/js_instrument/%s" % (util.BASE_TEST_URL, TEST_PAGE) diff --git a/test/test_js_instrument_py.py b/test/test_js_instrument_py.py index 33be8c4af..eafc6e3f3 100644 --- a/test/test_js_instrument_py.py +++ b/test/test_js_instrument_py.py @@ -1,19 +1,17 @@ +""" +Test function that converts our python +objects to our JS string +""" import pytest from jsonschema.exceptions import ValidationError from openwpm import js_instrumentation as jsi -pytestmark = pytest.mark.pyonly - def _no_whitespace(x): return "".join(x.split()) -# 
Test function that converts our python -# objects to our JS string - - def test_python_to_js_lower_true_false(): inpy = [ { diff --git a/test/test_mp_logger.py b/test/test_mp_logger.py index 3ad8b8b54..8cdba99a8 100644 --- a/test/test_mp_logger.py +++ b/test/test_mp_logger.py @@ -2,13 +2,9 @@ import os import time -import pytest - from openwpm import mp_logger from openwpm.utilities.multiprocess_utils import Process -from .openwpmtest import OpenWPMTest - CHILD_INFO_STR_1 = "Child %d - INFO1" CHILD_INFO_STR_2 = "Child %d - INFO2" CHILD_DEBUG_STR = "Child %d - DEBUG" @@ -78,103 +74,102 @@ def child_proc_logging_exception(): ) -class TestMPLogger(OpenWPMTest): - def get_logfile_path(self, directory): - return os.path.join(directory, "mplogger.log") - - def get_logfile_contents(self, logfile): - with open(logfile, "r") as f: - content = f.read().strip() - return content - - def test_multiprocess(self, tmpdir): - # Set up loggingserver - log_file = self.get_logfile_path(str(tmpdir)) - openwpm_logger = mp_logger.MPLogger(log_file) - - child_process_1 = Process(target=child_proc, args=(0,)) - child_process_1.daemon = True - child_process_1.start() - child_process_2 = Process(target=child_proc, args=(1,)) - child_process_2.daemon = True - child_process_2.start() - - # Send some sample logs - logger.info(PARENT_INFO_STR_1) - logger.error(PARENT_ERROR_STR) - logger.critical(PARENT_CRITICAL_STR) - logger.debug(PARENT_DEBUG_STR) - logger.warning(PARENT_WARNING_STR) - - logger1 = logging.getLogger("test1") - logger2 = logging.getLogger("test2") - logger1.info(NAMED_LOGGER_INFO_1) - logger2.info(NAMED_LOGGER_INFO_2) - - # Close the logging server - time.sleep(2) # give some time for logs to be sent - openwpm_logger.close() - child_process_1.join() - child_process_2.join() - print("Child processes joined...") - - log_content = self.get_logfile_contents(log_file) - for child in range(2): - assert log_content.count(CHILD_INFO_STR_1 % child) == 1 - assert 
log_content.count(CHILD_INFO_STR_2 % child) == 1 - assert log_content.count(CHILD_ERROR_STR % child) == 1 - assert log_content.count(CHILD_CRITICAL_STR % child) == 1 - assert log_content.count(CHILD_DEBUG_STR % child) == 1 - assert log_content.count(CHILD_WARNING_STR % child) == 1 - assert log_content.count(PARENT_INFO_STR_1) == 1 - assert log_content.count(PARENT_ERROR_STR) == 1 - assert log_content.count(PARENT_CRITICAL_STR) == 1 - assert log_content.count(PARENT_DEBUG_STR) == 1 - assert log_content.count(PARENT_WARNING_STR) == 1 - - def test_multiple_instances(self, tmpdir): - os.makedirs(str(tmpdir) + "-1") - self.test_multiprocess(str(tmpdir) + "-1") - os.makedirs(str(tmpdir) + "-2") - self.test_multiprocess(str(tmpdir) + "-2") - - @pytest.mark.skipif( - "CI" in os.environ and os.environ["CI"] == "true", - reason="Flaky on CI", - ) - def test_child_process_with_exception(self, tmpdir): - log_file = self.get_logfile_path(str(tmpdir)) - openwpm_logger = mp_logger.MPLogger(log_file) - - child_process_1 = Process(target=child_proc_with_exception, args=(0,)) - child_process_1.daemon = True - child_process_1.start() - child_process_2 = Process(target=child_proc_with_exception, args=(1,)) - child_process_2.daemon = True - child_process_2.start() - - # Close the logging server - time.sleep(2) # give some time for logs to be sent - child_process_1.join() - child_process_2.join() - print("Child processes joined...") - openwpm_logger.close() - - log_content = self.get_logfile_contents(log_file) - for child in range(2): - assert log_content.count(CHILD_INFO_STR_1 % child) == 1 - assert log_content.count(CHILD_INFO_STR_2 % child) == 1 - assert log_content.count(CHILD_EXCEPTION_STR % child) == 1 - - def test_child_process_logging(self, tmpdir): - log_file = self.get_logfile_path(str(tmpdir)) - openwpm_logger = mp_logger.MPLogger(log_file) - child_process = Process(target=child_proc_logging_exception()) - child_process.daemon = True - child_process.start() - time.sleep(2) - 
child_process.join() - print("Child processes joined...") - openwpm_logger.close() - log_content = self.get_logfile_contents(log_file) - assert "I'm logging an exception" in log_content +def get_logfile_path(directory): + return os.path.join(directory, "mplogger.log") + + +def get_logfile_contents(logfile): + with open(logfile, "r") as f: + content = f.read().strip() + return content + + +def test_multiprocess(tmpdir): + # Set up loggingserver + log_file = get_logfile_path(str(tmpdir)) + openwpm_logger = mp_logger.MPLogger(log_file) + + child_process_1 = Process(target=child_proc, args=(0,)) + child_process_1.daemon = True + child_process_1.start() + child_process_2 = Process(target=child_proc, args=(1,)) + child_process_2.daemon = True + child_process_2.start() + + # Send some sample logs + logger.info(PARENT_INFO_STR_1) + logger.error(PARENT_ERROR_STR) + logger.critical(PARENT_CRITICAL_STR) + logger.debug(PARENT_DEBUG_STR) + logger.warning(PARENT_WARNING_STR) + + logger1 = logging.getLogger("test1") + logger2 = logging.getLogger("test2") + logger1.info(NAMED_LOGGER_INFO_1) + logger2.info(NAMED_LOGGER_INFO_2) + + # Close the logging server + time.sleep(2) # give some time for logs to be sent + child_process_1.join() + child_process_2.join() + print("Child processes joined...") + openwpm_logger.close() + + log_content = get_logfile_contents(log_file) + for child in range(2): + assert log_content.count(CHILD_INFO_STR_1 % child) == 1 + assert log_content.count(CHILD_INFO_STR_2 % child) == 1 + assert log_content.count(CHILD_ERROR_STR % child) == 1 + assert log_content.count(CHILD_CRITICAL_STR % child) == 1 + assert log_content.count(CHILD_DEBUG_STR % child) == 1 + assert log_content.count(CHILD_WARNING_STR % child) == 1 + assert log_content.count(PARENT_INFO_STR_1) == 1 + assert log_content.count(PARENT_ERROR_STR) == 1 + assert log_content.count(PARENT_CRITICAL_STR) == 1 + assert log_content.count(PARENT_DEBUG_STR) == 1 + assert log_content.count(PARENT_WARNING_STR) 
== 1 + + +def test_multiple_instances(tmpdir): + os.makedirs(str(tmpdir) + "-1") + test_multiprocess(str(tmpdir) + "-1") + os.makedirs(str(tmpdir) + "-2") + test_multiprocess(str(tmpdir) + "-2") + + +def test_child_process_with_exception(tmpdir): + log_file = get_logfile_path(str(tmpdir)) + openwpm_logger = mp_logger.MPLogger(log_file) + + child_process_1 = Process(target=child_proc_with_exception, args=(0,)) + child_process_1.daemon = True + child_process_1.start() + child_process_2 = Process(target=child_proc_with_exception, args=(1,)) + child_process_2.daemon = True + child_process_2.start() + + # Close the logging server + time.sleep(2) # give some time for logs to be sent + + child_process_1.join() + child_process_2.join() + print("Child processes joined...") + openwpm_logger.close() + + log_content = get_logfile_contents(log_file) + for child in range(2): + assert log_content.count(CHILD_INFO_STR_1 % child) == 1 + assert log_content.count(CHILD_INFO_STR_2 % child) == 1 + assert log_content.count(CHILD_EXCEPTION_STR % child) == 1 + + +def test_child_process_logging(tmpdir): + log_file = get_logfile_path(str(tmpdir)) + openwpm_logger = mp_logger.MPLogger(log_file) + child_process = Process(target=child_proc_logging_exception()) + child_process.daemon = True + child_process.start() + child_process.join() + openwpm_logger.close() + log_content = get_logfile_contents(log_file) + assert "I'm logging an exception" in log_content diff --git a/test/test_profile.py b/test/test_profile.py index f84b4dba4..85c05c80e 100644 --- a/test/test_profile.py +++ b/test/test_profile.py @@ -1,9 +1,12 @@ from os.path import isfile, join +from pathlib import Path +from typing import Any, List, Optional, Tuple import pytest from openwpm.command_sequence import CommandSequence from openwpm.commands.types import BaseCommand +from openwpm.config import BrowserParams, ManagerParams from openwpm.errors import CommandExecutionError, ProfileLoadError from openwpm.task_manager import 
TaskManager from openwpm.utilities import db_utils @@ -14,7 +17,9 @@ class TestProfile(OpenWPMTest): - def get_config(self, data_dir=""): + def get_config( + self, data_dir: Optional[Path] + ) -> Tuple[ManagerParams, List[BrowserParams]]: manager_params, browser_params = self.get_test_config(data_dir) browser_params[0].profile_archive_dir = join( manager_params.data_directory, "browser_profile" @@ -64,7 +69,7 @@ def test_profile_saved_when_launch_crashes(self): manager.ldb_status_queue.put("DIE") manager.browsers[0]._SPAWN_TIMEOUT = 2 # Have timeout occur quickly manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2 # Quick timeout - manager.get("example.com") # Cause a selenium crasht + manager.get("example.com") # Cause a selenium crash # The browser will fail to launch due to the proxy crashes try: @@ -74,30 +79,35 @@ def test_profile_saved_when_launch_crashes(self): manager.close() assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz")) - def test_seed_persistance(self): - manager_params, browser_params = self.get_test_config(num_browsers=1) - browser_params[0].seed_tar = "." 
- command_sequences = [] - for _ in range(2): - cs = CommandSequence(url="https://example.com", reset=True) - cs.get() - cs.append_command(TestConfigSetCommand("test_pref", True)) - command_sequences.append(cs) - manager = TaskManager(manager_params, browser_params) - for cs in command_sequences: - manager.execute_command_sequence(cs) - manager.close() - query_result = db_utils.query_db( - manager_params.database_name, - "SELECT * FROM crawl_history;", - ) - assert len(query_result) > 0 - for row in query_result: - assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok" - -class TestConfigSetCommand(BaseCommand): - def __init__(self, pref_name, expected_value) -> None: +def test_seed_persistance(default_params, task_manager_creator): + manager_params, browser_params = default_params + p = Path("profile.tar.gz") + for browser_param in browser_params: + browser_param.seed_tar = p + manager, db = task_manager_creator(default_params) + + command_sequences = [] + for _ in range(2): + cs = CommandSequence(url="https://example.com", reset=True) + cs.get() + cs.append_command(AssertConfigSetCommand("test_pref", True)) + command_sequences.append(cs) + + for cs in command_sequences: + manager.execute_command_sequence(cs) + manager.close() + query_result = db_utils.query_db( + db, + "SELECT * FROM crawl_history;", + ) + assert len(query_result) > 0 + for row in query_result: + assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok" + + +class AssertConfigSetCommand(BaseCommand): + def __init__(self, pref_name: str, expected_value: Any) -> None: self.pref_name = pref_name self.expected_value = expected_value @@ -107,7 +117,7 @@ def execute( browser_params, manager_params, extension_socket, - ) -> None: + ): webdriver.get("about:config") result = webdriver.execute_script( f""" diff --git a/test/test_s3_aggregator.py b/test/test_s3_aggregator.py deleted file mode 100644 index e7f5f4659..000000000 --- a/test/test_s3_aggregator.py +++ /dev/null @@ 
-1,136 +0,0 @@ -import json -import os -import time -from collections import defaultdict - -import boto3 -import pytest -from localstack.services import infra -from multiprocess import Queue - -from openwpm import task_manager -from openwpm.command_sequence import CommandSequence -from openwpm.DataAggregator.parquet_schema import PQ_SCHEMAS - -from .openwpmtest import OpenWPMTest -from .utilities import BASE_TEST_URL, LocalS3Dataset, LocalS3Session, local_s3_bucket - - -class TestS3Aggregator(OpenWPMTest): - @classmethod - def setup_class(cls): - infra.start_infra(asynchronous=True, apis=["s3"]) - boto3.DEFAULT_SESSION = LocalS3Session() - cls.s3_client = boto3.client("s3") - cls.s3_resource = boto3.resource("s3") - - @classmethod - def teardown_class(cls): - infra.stop_infra() - infra.check_infra(retries=2, expect_shutdown=True, apis=["s3"]) - - def get_config(self, num_browsers=1, data_dir=""): - manager_params, browser_params = self.get_test_config( - data_dir, num_browsers=num_browsers - ) - manager_params.output_format = "s3" - manager_params.s3_bucket = local_s3_bucket(self.s3_resource) - manager_params.s3_directory = "s3-aggregator-tests" - for i in range(num_browsers): - browser_params[i].http_instrument = True - browser_params[i].js_instrument = True - browser_params[i].cookie_instrument = True - browser_params[i].navigation_instrument = True - browser_params[i].callstack_instrument = True - browser_params[i].dns_instrument = True - return manager_params, browser_params - - @pytest.mark.skipif( - "CI" in os.environ and os.environ["CI"] == "true", - reason="Localstack fails to start on CI", - ) - def test_basic_properties(self): - TEST_SITE = "%s/s3_aggregator.html" % BASE_TEST_URL - NUM_VISITS = 2 - NUM_BROWSERS = 4 - manager_params, browser_params = self.get_config(num_browsers=NUM_BROWSERS) - manager = task_manager.TaskManager(manager_params, browser_params) - for _ in range(NUM_VISITS * NUM_BROWSERS): - manager.get(TEST_SITE, sleep=1) - manager.close() 
- - dataset = LocalS3Dataset(manager_params.s3_bucket, manager_params.s3_directory) - - # Test visit_id consistency - visit_ids = defaultdict(set) - expected_tables = dict(PQ_SCHEMAS) - # We don't expect incomplete visits to exist - # since the visit shouldn't be interrupted - expected_tables.pop("incomplete_visits") - for table_name in expected_tables: - table = dataset.load_table(table_name) - visit_ids[table_name] = table.visit_id.unique() - actual = len(visit_ids[table_name]) - expected = NUM_VISITS * NUM_BROWSERS - assert actual == expected, ( - f"Table {table_name} had {actual} " f"visit_ids, we expected {expected}" - ) - for vid in visit_ids[table_name]: - assert (vid >= 0) and (vid < (1 << 53)) - for table_name, ids in visit_ids.items(): - assert set(ids) == set(visit_ids["site_visits"]) - - # Ensure http table is created - assert TEST_SITE in dataset.load_table("http_requests").top_level_url.unique() - - # Ensure config directory is created and contains the correct number - # of configuration files - config_file = dataset.list_files("config", prepend_root=True) - assert len(config_file) == 1 # only one instance started in test - config = json.loads(str(dataset.get_file(config_file[0]), "utf-8")) - assert len(config["browser_params"]) == NUM_BROWSERS - - @pytest.mark.skipif( - "CI" in os.environ and os.environ["CI"] == "true", - reason="Localstack fails to start on CI", - ) - def test_commit_on_timeout(self): - TEST_SITE = "%s/s3_aggregator.html" % BASE_TEST_URL - manager_params, browser_params = self.get_config(num_browsers=1) - manager_params.s3_directory = "s3-aggregator-tests-2" - manager = task_manager.TaskManager(manager_params, browser_params) - manager.get(TEST_SITE, sleep=1) - dataset = LocalS3Dataset(manager_params.s3_bucket, manager_params.s3_directory) - with pytest.raises((FileNotFoundError, OSError)): - requests = dataset.load_table("http_requests") - time.sleep(45) # Current timeout - dataset2 = LocalS3Dataset(manager_params.s3_bucket, 
manager_params.s3_directory) - requests = dataset2.load_table("http_requests") - assert TEST_SITE in requests.top_level_url.unique() - manager.close() - - @pytest.mark.skipif( - "CI" in os.environ and os.environ["CI"] == "true", - reason="Localstack fails to start on CI", - ) - def test_s3_callbacks(self): - TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html" - manager_params, browser_params = self.get_config() - dataset = LocalS3Dataset(manager_params.s3_bucket, manager_params.s3_directory) - manager = task_manager.TaskManager(manager_params, browser_params) - queue = Queue() - - def ensure_site_in_s3(success: bool): - # Ensure http table is created - queue.put( - TEST_SITE in dataset.load_table("http_requests").top_level_url.unique() - ) - - sequence = CommandSequence( - TEST_SITE, reset=True, blocking=True, callback=ensure_site_in_s3 - ) - sequence.get() - manager.execute_command_sequence(sequence) - manager.close() - - assert queue.get() diff --git a/test/test_simple_commands.py b/test/test_simple_commands.py index 6e0c3ede7..46b7787eb 100644 --- a/test/test_simple_commands.py +++ b/test/test_simple_commands.py @@ -1,3 +1,10 @@ +"""Test correctness of simple commands and check +that resulting data is properly keyed. + +This entire test class is parametrized to run against +both headless and xvfb modes to ensure both are exercised +during the test suite and there are no obvious problems. +""" import glob import gzip import json @@ -5,13 +12,13 @@ import re from urllib.parse import urlparse +import pytest from PIL import Image -from openwpm import command_sequence, task_manager +from openwpm import command_sequence from openwpm.utilities import db_utils from . 
import utilities -from .openwpmtest import OpenWPMTest url_a = utilities.BASE_TEST_URL + "/simple_a.html" url_b = utilities.BASE_TEST_URL + "/simple_b.html" @@ -78,404 +85,399 @@ } -def pytest_generate_tests(metafunc): - # Source: https://docs.pytest.org/en/latest/example/parametrize.html#a-quick-port-of-testscenarios # noqa - idlist = [] - argvalues = [] - for scenario in metafunc.cls.scenarios: - idlist.append(scenario[0]) - items = scenario[1].items() - argnames = [x[0] for x in items] - argvalues.append([x[1] for x in items]) - metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") - - -class TestSimpleCommands(OpenWPMTest): - """Test correctness of simple commands and check - that resulting data is properly keyed. - - This entire test class is parametrized to run against - both headless and xvfb modes to ensure both are exercized - during the test suite and there are no obvious problems. +scenarios = [ + pytest.param("headless", id="headless"), + pytest.param("xvfb", id="xvfb"), +] + + +@pytest.mark.parametrize("display_mode", scenarios) +def test_get_site_visits_table_valid(http_params, task_manager_creator, display_mode): + """Check that get works and populates db correctly.""" + # Run the test crawl + manager_params, browser_params = http_params(display_mode) + manager, db = task_manager_creator((manager_params, browser_params)) + + # Set up two sequential get commands to two URLS + cs_a = command_sequence.CommandSequence(url_a) + cs_a.get(sleep=1) + cs_b = command_sequence.CommandSequence(url_b) + cs_b.get(sleep=1) + + # Perform the get commands + manager.execute_command_sequence(cs_a) + manager.execute_command_sequence(cs_b) + manager.close() + + qry_res = db_utils.query_db( + db, + "SELECT site_url FROM site_visits ORDER BY site_url", + ) + + # We had two separate page visits + assert len(qry_res) == 2 + + assert qry_res[0][0] == url_a + assert qry_res[1][0] == url_b + + +@pytest.mark.parametrize("display_mode", scenarios) +def 
test_get_http_tables_valid(http_params, task_manager_creator, display_mode): + """Check that get works and populates http tables correctly.""" + # Run the test crawl + manager_params, browser_params = http_params(display_mode) + manager, db = task_manager_creator((manager_params, browser_params)) + + # Set up two sequential get commands to two URLS + cs_a = command_sequence.CommandSequence(url_a) + cs_a.get(sleep=1) + cs_b = command_sequence.CommandSequence(url_b) + cs_b.get(sleep=1) + + manager.execute_command_sequence(cs_a) + manager.execute_command_sequence(cs_b) + manager.close() + + qry_res = db_utils.query_db(db, "SELECT visit_id, site_url FROM site_visits") + + # Construct dict mapping site_url to visit_id + visit_ids = dict() + for row in qry_res: + visit_ids[row[1]] = row[0] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_requests WHERE url = ?", + (url_a,), + ) + assert qry_res[0][0] == visit_ids[url_a] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_requests WHERE url = ?", + (url_b,), + ) + assert qry_res[0][0] == visit_ids[url_b] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_responses WHERE url = ?", + (url_a,), + ) + assert qry_res[0][0] == visit_ids[url_a] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_responses WHERE url = ?", + (url_b,), + ) + assert qry_res[0][0] == visit_ids[url_b] + + +@pytest.mark.parametrize("display_mode", scenarios) +def test_browse_site_visits_table_valid( + http_params, task_manager_creator, display_mode +): + """Check that CommandSequence.browse() populates db correctly.""" + # Run the test crawl + manager_params, browser_params = http_params(display_mode) + manager, db = task_manager_creator((manager_params, browser_params)) + + # Set up two sequential browse commands to two URLS + cs_a = command_sequence.CommandSequence(url_a, site_rank=0) + cs_a.browse(num_links=1, sleep=1) + cs_b = command_sequence.CommandSequence(url_b, site_rank=1) + 
cs_b.browse(num_links=1, sleep=1) + + manager.execute_command_sequence(cs_a) + manager.execute_command_sequence(cs_b) + manager.close() + + qry_res = db_utils.query_db( + db, + "SELECT site_url, site_rank FROM site_visits ORDER BY site_rank", + ) + + # We had two separate page visits + assert len(qry_res) == 2 + + assert qry_res[0][0] == url_a + assert qry_res[0][1] == 0 + assert qry_res[1][0] == url_b + assert qry_res[1][1] == 1 + + +@pytest.mark.parametrize("display_mode", scenarios) +def test_browse_http_table_valid(http_params, task_manager_creator, display_mode): + """Check CommandSequence.browse() works and populates http tables correctly. + + NOTE: Since the browse command is choosing links randomly, there is a + (very small -- 2*0.5^20) chance this test will fail with valid + code. """ - - scenarios = [ - ("headless", {"display_mode": "headless"}), - ("xvfb", {"display_mode": "xvfb"}), - ] - - def get_config(self, display_mode): - manager_params, browser_params = self.get_test_config(display_mode=display_mode) - browser_params[0].http_instrument = True - return manager_params, browser_params - - def test_get_site_visits_table_valid(self, display_mode): - """Check that get works and populates db correctly.""" - # Run the test crawl - manager_params, browser_params = self.get_config(display_mode) - manager = task_manager.TaskManager(manager_params, browser_params) - - # Set up two sequential get commands to two URLS - cs_a = command_sequence.CommandSequence(url_a) - cs_a.get(sleep=1) - cs_b = command_sequence.CommandSequence(url_b) - cs_b.get(sleep=1) - - # Perform the get commands - manager.execute_command_sequence(cs_a) - manager.execute_command_sequence(cs_b) - manager.close() - - qry_res = db_utils.query_db( - manager_params.database_name, "SELECT site_url FROM site_visits" - ) - - # We had two separate page visits - assert len(qry_res) == 2 - - assert qry_res[0][0] == url_a - assert qry_res[1][0] == url_b - - def test_get_http_tables_valid(self, 
display_mode): - """Check that get works and populates http tables correctly.""" - # Run the test crawl - manager_params, browser_params = self.get_config(display_mode) - manager = task_manager.TaskManager(manager_params, browser_params) - - # Set up two sequential get commands to two URLS - cs_a = command_sequence.CommandSequence(url_a) - cs_a.get(sleep=1) - cs_b = command_sequence.CommandSequence(url_b) - cs_b.get(sleep=1) - - manager.execute_command_sequence(cs_a) - manager.execute_command_sequence(cs_b) - manager.close() - - qry_res = db_utils.query_db( - manager_params.database_name, "SELECT visit_id, site_url FROM site_visits" - ) - - # Construct dict mapping site_url to visit_id - visit_ids = dict() - for row in qry_res: - visit_ids[row[1]] = row[0] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_requests" " WHERE url = ?", - (url_a,), - ) - assert qry_res[0][0] == visit_ids[url_a] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_requests" " WHERE url = ?", - (url_b,), - ) - assert qry_res[0][0] == visit_ids[url_b] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_responses" " WHERE url = ?", - (url_a,), - ) - assert qry_res[0][0] == visit_ids[url_a] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_responses" " WHERE url = ?", - (url_b,), - ) - assert qry_res[0][0] == visit_ids[url_b] - - def test_browse_site_visits_table_valid(self, display_mode): - """Check that CommandSequence.browse() populates db correctly.""" - # Run the test crawl - manager_params, browser_params = self.get_config(display_mode) - manager = task_manager.TaskManager(manager_params, browser_params) - - # Set up two sequential browse commands to two URLS - cs_a = command_sequence.CommandSequence(url_a, site_rank=0) - cs_a.browse(num_links=1, sleep=1) - cs_b = command_sequence.CommandSequence(url_b, site_rank=1) 
- cs_b.browse(num_links=1, sleep=1) - - manager.execute_command_sequence(cs_a) - manager.execute_command_sequence(cs_b) - manager.close() - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT site_url, site_rank" " FROM site_visits", - ) - - # We had two separate page visits - assert len(qry_res) == 2 - - assert qry_res[0][0] == url_a - assert qry_res[0][1] == 0 - assert qry_res[1][0] == url_b - assert qry_res[1][1] == 1 - - def test_browse_http_table_valid(self, display_mode): - """Check CommandSequence.browse() works and populates http tables correctly. - - NOTE: Since the browse command is choosing links randomly, there is a - (very small -- 2*0.5^20) chance this test will fail with valid - code. - """ - # Run the test crawl - manager_params, browser_params = self.get_config(display_mode) - manager = task_manager.TaskManager(manager_params, browser_params) - - # Set up two sequential browse commands to two URLS - cs_a = command_sequence.CommandSequence(url_a) - cs_a.browse(num_links=20, sleep=1) - cs_b = command_sequence.CommandSequence(url_b) - cs_b.browse(num_links=1, sleep=1) - - manager.execute_command_sequence(cs_a) - manager.execute_command_sequence(cs_b) - manager.close() - - qry_res = db_utils.query_db( - manager_params.database_name, "SELECT visit_id, site_url FROM site_visits" - ) - - # Construct dict mapping site_url to visit_id - visit_ids = dict() - for row in qry_res: - visit_ids[row[1]] = row[0] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_requests" " WHERE url = ?", - (url_a,), - ) - assert qry_res[0][0] == visit_ids[url_a] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_requests" " WHERE url = ?", - (url_b,), - ) - assert qry_res[0][0] == visit_ids[url_b] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_responses" " WHERE url = ?", - (url_a,), - ) - assert qry_res[0][0] == 
visit_ids[url_a] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_responses" " WHERE url = ?", - (url_b,), - ) - assert qry_res[0][0] == visit_ids[url_b] - - # Page simple_a.html has three links: - # 1) An absolute link to simple_c.html - # 2) A relative link to simple_d.html - # 3) A javascript: link - # 4) A link to www.google.com - # 5) A link to example.com?localtest.me - # We should see page visits for 1 and 2, but not 3-5. - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_responses" " WHERE url = ?", - (url_c,), - ) - assert qry_res[0][0] == visit_ids[url_a] - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_responses" " WHERE url = ?", - (url_d,), - ) - assert qry_res[0][0] == visit_ids[url_a] - - # We expect 4 urls: a,c,d and a favicon request - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT COUNT(DISTINCT url) FROM http_responses" " WHERE visit_id = ?", - (visit_ids[url_a],), - ) - assert qry_res[0][0] == 4 - - def test_browse_wrapper_http_table_valid(self, display_mode): - """Check that TaskManager.browse() wrapper works and populates - http tables correctly. - - NOTE: Since the browse command is choosing links randomly, there is a - (very small -- 2*0.5^20) chance this test will fail with valid - code. 
- """ - # Run the test crawl - manager_params, browser_params = self.get_config(display_mode) - manager = task_manager.TaskManager(manager_params, browser_params) - - # Set up two sequential browse commands to two URLS - manager.browse(url_a, num_links=20, sleep=1) - manager.browse(url_b, num_links=1, sleep=1) - manager.close() - - qry_res = db_utils.query_db( - manager_params.database_name, "SELECT visit_id, site_url FROM site_visits" - ) - - # Construct dict mapping site_url to visit_id - visit_ids = dict() - for row in qry_res: - visit_ids[row[1]] = row[0] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_requests" " WHERE url = ?", - (url_a,), - ) - assert qry_res[0][0] == visit_ids[url_a] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_requests" " WHERE url = ?", - (url_b,), - ) - assert qry_res[0][0] == visit_ids[url_b] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_responses" " WHERE url = ?", - (url_a,), - ) - assert qry_res[0][0] == visit_ids[url_a] - - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_responses" " WHERE url = ?", - (url_b,), - ) - assert qry_res[0][0] == visit_ids[url_b] - - # Page simple_a.html has three links: - # 1) An absolute link to simple_c.html - # 2) A relative link to simple_d.html - # 3) A javascript: link - # 4) A link to www.google.com - # 5) A link to example.com?localtest.me - # We should see page visits for 1 and 2, but not 3-5. 
- qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_responses" " WHERE url = ?", - (url_c,), - ) - assert qry_res[0][0] == visit_ids[url_a] - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT visit_id FROM http_responses" " WHERE url = ?", - (url_d,), - ) - assert qry_res[0][0] == visit_ids[url_a] - - # We expect 4 urls: a,c,d and a favicon request - qry_res = db_utils.query_db( - manager_params.database_name, - "SELECT COUNT(DISTINCT url) FROM http_responses" " WHERE visit_id = ?", - (visit_ids[url_a],), - ) - assert qry_res[0][0] == 4 - - def test_save_screenshot_valid(self, display_mode): - """Check that 'save_screenshot' works""" - # Run the test crawl - manager_params, browser_params = self.get_config(display_mode) - manager = task_manager.TaskManager(manager_params, browser_params) - cs = command_sequence.CommandSequence(url_a) - cs.get(sleep=1) - cs.save_screenshot("test") - cs.screenshot_full_page("test_full") - manager.execute_command_sequence(cs) - manager.close() - - # Check that viewport image is not blank - pattern = os.path.join(str(self.tmpdir), "screenshots", "1-*-test.png") - screenshot = glob.glob(pattern)[0] - im = Image.open(screenshot) - bands = im.split() - is_blank = all(band.getextrema() == (255, 255) for band in bands) - assert not is_blank - - # Check that full page screenshot is not blank - pattern = os.path.join(str(self.tmpdir), "screenshots", "1-*-test_full.png") - screenshot = glob.glob(pattern)[0] - im = Image.open(screenshot) - bands = im.split() - is_blank = all(band.getextrema() == (255, 255) for band in bands) - assert not is_blank - - def test_dump_page_source_valid(self, display_mode): - """Check that 'dump_page_source' works and source is saved properly.""" - # Run the test crawl - manager_params, browser_params = self.get_config(display_mode) - manager = task_manager.TaskManager(manager_params, browser_params) - cs = command_sequence.CommandSequence(url_a) - 
cs.get(sleep=1) - cs.dump_page_source(suffix="test") - manager.execute_command_sequence(cs) - manager.close() - - # Source filename is of the follow structure: - # `sources/<visit_id>-<md5_of_url>(-suffix).html` - # thus for this test we expect `sources/1-<md5_of_test_url>-test.html`. - outfile = os.path.join(str(self.tmpdir), "sources", "1-*-test.html") - source_file = glob.glob(outfile)[0] - with open(source_file, "rb") as f: - actual_source = f.read() - with open("./test_pages/expected_source.html", "rb") as f: - expected_source = f.read() - - assert actual_source == expected_source - - def test_recursive_dump_page_source_valid(self, display_mode): - """Check that 'recursive_dump_page_source' works""" - # Run the test crawl - manager_params, browser_params = self.get_config(display_mode) - manager = task_manager.TaskManager(manager_params, browser_params) - cs = command_sequence.CommandSequence(NESTED_FRAMES_URL) - cs.get(sleep=1) - cs.recursive_dump_page_source() - manager.execute_command_sequence(cs) - manager.close() - - outfile = os.path.join(str(self.tmpdir), "sources", "1-*.json.gz") - src_file = glob.glob(outfile)[0] - with gzip.GzipFile(src_file, "rb") as f: - visit_source = json.loads(f.read().decode("utf-8")) - - observed_parents = dict() - - def verify_frame(frame, parent_frames=[]): - # Verify structure - observed_parents[frame["doc_url"]] = list(parent_frames) # copy - - # Verify source - path = urlparse(frame["doc_url"]).path - expected_source = "" - with open("."
+ path, "r") as f: - expected_source = re.sub(r"\s", "", f.read().lower()) - if expected_source.startswith("<!doctypehtml>"): - expected_source = expected_source[14:] - observed_source = re.sub(r"\s", "", frame["source"].lower()) - if observed_source.startswith("<!doctypehtml>"): - observed_source = observed_source[14:] - assert observed_source == expected_source - - # Verify children - parent_frames.append(frame["doc_url"]) - for key, child_frame in frame["iframes"].items(): - verify_frame(child_frame, parent_frames) - parent_frames.pop() - - verify_frame(visit_source) - assert EXPECTED_PARENTS == observed_parents + # Run the test crawl + manager_params, browser_params = http_params(display_mode) + manager, db = task_manager_creator((manager_params, browser_params)) + + # Set up two sequential browse commands to two URLS + cs_a = command_sequence.CommandSequence(url_a) + cs_a.browse(num_links=20, sleep=1) + cs_b = command_sequence.CommandSequence(url_b) + cs_b.browse(num_links=1, sleep=1) + + manager.execute_command_sequence(cs_a) + manager.execute_command_sequence(cs_b) + manager.close() + + qry_res = db_utils.query_db(db, "SELECT visit_id, site_url FROM site_visits") + + # Construct dict mapping site_url to visit_id + visit_ids = dict() + for row in qry_res: + visit_ids[row[1]] = row[0] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_requests WHERE url = ?", + (url_a,), + ) + assert qry_res[0][0] == visit_ids[url_a] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_requests WHERE url = ?", + (url_b,), + ) + assert qry_res[0][0] == visit_ids[url_b] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_responses WHERE url = ?", + (url_a,), + ) + assert qry_res[0][0] == visit_ids[url_a] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_responses WHERE url = ?", + (url_b,), + ) + assert qry_res[0][0] == visit_ids[url_b] + + # Page simple_a.html has three links: + # 1) An absolute link to simple_c.html + # 2) A relative link
to simple_d.html + # 3) A javascript: link + # 4) A link to www.google.com + # 5) A link to example.com?localtest.me + # We should see page visits for 1 and 2, but not 3-5. + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_responses WHERE url = ?", + (url_c,), + ) + assert qry_res[0][0] == visit_ids[url_a] + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_responses WHERE url = ?", + (url_d,), + ) + assert qry_res[0][0] == visit_ids[url_a] + + # We expect 4 urls: a,c,d and a favicon request + qry_res = db_utils.query_db( + db, + "SELECT COUNT(DISTINCT url) FROM http_responses WHERE visit_id = ?", + (visit_ids[url_a],), + ) + assert qry_res[0][0] == 4 + + +@pytest.mark.parametrize("display_mode", scenarios) +def test_browse_wrapper_http_table_valid( + http_params, task_manager_creator, display_mode +): + """Check that TaskManager.browse() wrapper works and populates + http tables correctly. + + NOTE: Since the browse command is choosing links randomly, there is a + (very small -- 2*0.5^20) chance this test will fail with valid + code. 
+ """ + # Run the test crawl + manager_params, browser_params = http_params(display_mode) + manager, db = task_manager_creator((manager_params, browser_params)) + + # Set up two sequential browse commands to two URLS + manager.browse(url_a, num_links=20, sleep=1) + manager.browse(url_b, num_links=1, sleep=1) + manager.close() + + qry_res = db_utils.query_db(db, "SELECT visit_id, site_url FROM site_visits") + + # Construct dict mapping site_url to visit_id + visit_ids = dict() + for row in qry_res: + visit_ids[row[1]] = row[0] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_requests WHERE url = ?", + (url_a,), + ) + assert qry_res[0][0] == visit_ids[url_a] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_requests WHERE url = ?", + (url_b,), + ) + assert qry_res[0][0] == visit_ids[url_b] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_responses WHERE url = ?", + (url_a,), + ) + assert qry_res[0][0] == visit_ids[url_a] + + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_responses WHERE url = ?", + (url_b,), + ) + assert qry_res[0][0] == visit_ids[url_b] + + # Page simple_a.html has three links: + # 1) An absolute link to simple_c.html + # 2) A relative link to simple_d.html + # 3) A javascript: link + # 4) A link to www.google.com + # 5) A link to example.com?localtest.me + # We should see page visits for 1 and 2, but not 3-5. 
+ qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_responses WHERE url = ?", + (url_c,), + ) + assert qry_res[0][0] == visit_ids[url_a] + qry_res = db_utils.query_db( + db, + "SELECT visit_id FROM http_responses WHERE url = ?", + (url_d,), + ) + assert qry_res[0][0] == visit_ids[url_a] + + # We expect 4 urls: a,c,d and a favicon request + qry_res = db_utils.query_db( + db, + "SELECT COUNT(DISTINCT url) FROM http_responses WHERE visit_id = ?", + (visit_ids[url_a],), + ) + assert qry_res[0][0] == 4 + + +@pytest.mark.parametrize("display_mode", scenarios) +def test_save_screenshot_valid(http_params, task_manager_creator, display_mode): + """Check that 'save_screenshot' works""" + # Run the test crawl + manager_params, browser_params = http_params(display_mode) + manager, _ = task_manager_creator((manager_params, browser_params)) + + cs = command_sequence.CommandSequence(url_a) + cs.get(sleep=1) + cs.save_screenshot("test") + cs.screenshot_full_page("test_full") + manager.execute_command_sequence(cs) + manager.close() + + # Check that viewport image is not blank + pattern = os.path.join(manager_params.data_directory, "screenshots", "*-*-test.png") + screenshot = glob.glob(pattern)[0] + im = Image.open(screenshot) + bands = im.split() + is_blank = all(band.getextrema() == (255, 255) for band in bands) + assert not is_blank + + # Check that full page screenshot is not blank + pattern = os.path.join( + manager_params.data_directory, "screenshots", "*-*-test_full.png" + ) + screenshot = glob.glob(pattern)[0] + im = Image.open(screenshot) + bands = im.split() + is_blank = all(band.getextrema() == (255, 255) for band in bands) + assert not is_blank + + +@pytest.mark.parametrize("display_mode", scenarios) +def test_dump_page_source_valid(http_params, task_manager_creator, display_mode): + """Check that 'dump_page_source' works and source is saved properly.""" + # Run the test crawl + manager_params, browser_params = http_params(display_mode) + manager, db = 
task_manager_creator((manager_params, browser_params)) + + cs = command_sequence.CommandSequence(url_a) + cs.get(sleep=1) + cs.dump_page_source(suffix="test") + manager.execute_command_sequence(cs) + manager.close() + + # Source filename is of the follow structure: + # `sources/<visit_id>-<md5_of_url>(-suffix).html` + # thus for this test we expect `sources/1-<md5_of_test_url>-test.html`. + outfile = os.path.join(manager_params.data_directory, "sources", "*-*-test.html") + source_file = glob.glob(outfile)[0] + with open(source_file, "rb") as f: + actual_source = f.read() + with open("./test_pages/expected_source.html", "rb") as f: + expected_source = f.read() + + assert actual_source == expected_source + + +@pytest.mark.parametrize("display_mode", scenarios) +def test_recursive_dump_page_source_valid( + http_params, task_manager_creator, display_mode +): + """Check that 'recursive_dump_page_source' works""" + # Run the test crawl + manager_params, browser_params = http_params(display_mode) + manager, db = task_manager_creator((manager_params, browser_params)) + cs = command_sequence.CommandSequence(NESTED_FRAMES_URL) + cs.get(sleep=1) + cs.recursive_dump_page_source() + manager.execute_command_sequence(cs) + manager.close() + + outfile = os.path.join(manager_params.data_directory, "sources", "*-*.json.gz") + src_file = glob.glob(outfile)[0] + with gzip.GzipFile(src_file, "rb") as f: + visit_source = json.loads(f.read().decode("utf-8")) + + observed_parents = dict() + + def verify_frame(frame, parent_frames=[]): + # Verify structure + observed_parents[frame["doc_url"]] = list(parent_frames) # copy + + # Verify source + path = urlparse(frame["doc_url"]).path + expected_source = "" + with open("."
+ path, "r") as f: + expected_source = re.sub(r"\s", "", f.read().lower()) + if expected_source.startswith("<!doctypehtml>"): + expected_source = expected_source[14:] + observed_source = re.sub(r"\s", "", frame["source"].lower()) + if observed_source.startswith("<!doctypehtml>"): + observed_source = observed_source[14:] + assert observed_source == expected_source + + # Verify children + parent_frames.append(frame["doc_url"]) + for key, child_frame in frame["iframes"].items(): + verify_frame(child_frame, parent_frames) + parent_frames.pop() + + verify_frame(visit_source) + assert EXPECTED_PARENTS == observed_parents diff --git a/test/test_storage_vectors.py b/test/test_storage_vectors.py index 73a21ca8e..73755c980 100644 --- a/test/test_storage_vectors.py +++ b/test/test_storage_vectors.py @@ -1,11 +1,16 @@ -from openwpm import command_sequence, task_manager +"""Runs some basic tests to check that the saving of +storage vectors (i.e. profile cookies) works. + +NOTE: These tests are very basic and should be expanded +on to check for completeness and correctness. +""" + +from openwpm import command_sequence from openwpm.utilities import db_utils from . import utilities -from .openwpmtest import OpenWPMTest expected_js_cookie = ( - 1, # visit_id u"added-or-changed", # record_type u"explicit", # change_cause 0, # is_http_only @@ -20,39 +25,29 @@ ) -class TestStorageVectors(OpenWPMTest): - """Runs some basic tests to check that the saving of - storage vectors (i.e. profile cookies) works. - - NOTE: These tests are very basic and should be expanded - on to check for completeness and correctness.
- """ - - def get_config(self, data_dir=""): - return self.get_test_config(data_dir) - - def test_js_profile_cookies(self): - """ Check that profile cookies set by JS are saved """ - # Run the test crawl - manager_params, browser_params = self.get_config() - browser_params[0].cookie_instrument = True - manager = task_manager.TaskManager(manager_params, browser_params) - url = utilities.BASE_TEST_URL + "/js_cookie.html" - cs = command_sequence.CommandSequence(url) - cs.get(sleep=3, timeout=120) - manager.execute_command_sequence(cs) - manager.close() - # Check that the JS cookie we stored is recorded - qry_res = db_utils.query_db( - manager_params.database_name, - ( - "SELECT visit_id, record_type, change_cause, is_http_only, " - "is_host_only, is_session, host, is_secure, name, path, " - "value, same_site FROM javascript_cookies" - ), - as_tuple=True, - ) - assert len(qry_res) == 1 # we store only one cookie - cookies = qry_res[0] # take the first cookie - # compare URL, domain, name, value, origin, path - assert cookies == expected_js_cookie +def test_js_profile_cookies(default_params, task_manager_creator): + """ Check that profile cookies set by JS are saved """ + # Run the test crawl + manager_params, browser_params = default_params + for browser_param in browser_params: + browser_param.cookie_instrument = True + manager, db = task_manager_creator((manager_params, browser_params)) + url = utilities.BASE_TEST_URL + "/js_cookie.html" + cs = command_sequence.CommandSequence(url) + cs.get(sleep=3, timeout=120) + manager.execute_command_sequence(cs) + manager.close() + # Check that the JS cookie we stored is recorded + qry_res = db_utils.query_db( + db, + ( + "SELECT record_type, change_cause, is_http_only, " + "is_host_only, is_session, host, is_secure, name, path, " + "value, same_site FROM javascript_cookies" + ), + as_tuple=True, + ) + assert len(qry_res) == 1 # we store only one cookie + cookies = qry_res[0] # take the first cookie + # compare URL, domain, 
name, value, origin, path + assert cookies == expected_js_cookie diff --git a/test/test_task_manager.py b/test/test_task_manager.py index 67ed79074..40b0c743d 100644 --- a/test/test_task_manager.py +++ b/test/test_task_manager.py @@ -1,43 +1,43 @@ +"""Test TaskManager functionality.""" import pytest from openwpm.errors import CommandExecutionError -from openwpm.task_manager import TaskManager -from .openwpmtest import OpenWPMTest from .utilities import BASE_TEST_URL -class TestTaskManager(OpenWPMTest): - """Test TaskManager functionality.""" - - def get_config(self, data_dir=""): - return self.get_test_config(data_dir) - - def test_failure_limit_value(self): - manager_params, _ = self.get_config() - # The default value for failure_limit is 2 * num_browsers + 10 - assert manager_params.failure_limit == 12 - manager_params.failure_limit = 2 - # Test that the chosen value is not overwritten by the default - assert manager_params.failure_limit == 2 - - def test_failure_limit_exceeded(self): - manager_params, browser_params = self.get_config() - manager_params.failure_limit = 0 - manager = TaskManager(manager_params, browser_params) - with pytest.raises(CommandExecutionError): - manager.get("example.com") # Selenium requires scheme prefix - manager.get("example.com") # Requires two commands to shut down - - def test_failure_limit_reset(self): - """Test that failure_count is reset on command sequence completion.""" - manager_params, browser_params = self.get_config() - manager_params.failure_limit = 1 - manager = TaskManager(manager_params, browser_params) - manager.get("example.com") # Selenium requires scheme prefix - manager.get(BASE_TEST_URL) # Successful command sequence - # Now failure_count should be reset to 0 and the following command - # failure should not raise a CommandExecutionError +def test_failure_limit_value(default_params): + manager_params, _ = default_params + manager_params.num_browsers = 1 + # The default value for failure_limit is 2 * num_browsers 
+ 10 + assert manager_params.failure_limit == 12 + manager_params.failure_limit = 2 + # Test that the chosen value is not overwritten by the default + assert manager_params.failure_limit == 2 + + +def test_failure_limit_exceeded(task_manager_creator, default_params): + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + manager_params.failure_limit = 0 + manager, _ = task_manager_creator((manager_params, browser_params[:1])) + + with pytest.raises(CommandExecutionError): manager.get("example.com") # Selenium requires scheme prefix - manager.get(BASE_TEST_URL) # Requires two commands to shut down - manager.close() + manager.get("example.com") # Requires two commands to shut down + manager.close() + + +def test_failure_limit_reset(task_manager_creator, default_params): + """Test that failure_count is reset on command sequence completion.""" + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + manager_params.failure_limit = 1 + manager, _ = task_manager_creator((manager_params, browser_params[:1])) + manager.get("example.com") # Selenium requires scheme prefix + manager.get(BASE_TEST_URL) # Successful command sequence + # Now failure_count should be reset to 0 and the following command + # failure should not raise a CommandExecutionError + manager.get("example.com") # Selenium requires scheme prefix + manager.get(BASE_TEST_URL) # Requires two commands to shut down + manager.close() diff --git a/test/test_timer.py b/test/test_timer.py index 125fc4db9..0eab01202 100644 --- a/test/test_timer.py +++ b/test/test_timer.py @@ -1,29 +1,23 @@ -from openwpm import task_manager from openwpm.utilities import db_utils -from .openwpmtest import OpenWPMTest from .utilities import BASE_TEST_URL TEST_FILE = "canvas_fingerprinting.html" TEST_URL = BASE_TEST_URL + "/" + TEST_FILE -class TestCommandDuration(OpenWPMTest): - def get_config(self, data_dir=""): - return self.get_test_config(data_dir) +def 
test_command_duration(default_params, task_manager_creator): + manager, db = task_manager_creator(default_params) + manager.get(url=TEST_URL, sleep=5) + manager.close() - def test_command_duration(self): - manager_params, browser_params = self.get_config() - manager = task_manager.TaskManager(manager_params, browser_params) - manager.get(url=TEST_URL, sleep=5) - manager.close() - get_command = db_utils.query_db( - manager_params.database_name, - "SELECT duration FROM crawl_history WHERE command = 'GetCommand'", - as_tuple=True, - )[0] + get_command = db_utils.query_db( + db, + "SELECT duration FROM crawl_history WHERE command = 'GetCommand'", + as_tuple=True, + )[0] - assert get_command[0] > (5 * 1000) # milliseconds conversion for sleep time - assert get_command[0] <= ( - (5 * 1000) + 2 * 1000 - ) # milliseconds conversion for sleep time + time duration a command took (milliseconds) + assert get_command[0] > (5 * 1000) # milliseconds conversion for sleep time + assert get_command[0] <= ( + (5 * 1000) + 2 * 1000 + ) # milliseconds conversion for sleep time + time duration a command took (milliseconds) diff --git a/test/test_webdriver_utils.py b/test/test_webdriver_utils.py index a8e83de9e..a9f518f8f 100644 --- a/test/test_webdriver_utils.py +++ b/test/test_webdriver_utils.py @@ -1,9 +1,6 @@ -from openwpm import task_manager from openwpm.commands.utils.webdriver_utils import parse_neterror from openwpm.utilities import db_utils -from .openwpmtest import OpenWPMTest - def test_parse_neterror(): text = ( @@ -16,21 +13,16 @@ def test_parse_neterror(): assert parse_neterror(text) == "dnsNotFound" -class TestCustomFunctionCommand(OpenWPMTest): - def get_config(self, data_dir=""): - return self.get_test_config(data_dir) - - def test_parse_neterror_integration(self): - manager_params, browser_params = self.get_config() - manager = task_manager.TaskManager(manager_params, browser_params) - manager.get("http://website.invalid") - manager.close() +def 
test_parse_neterror_integration(default_params, task_manager_creator): + manager, db = task_manager_creator(default_params) + manager.get("http://website.invalid") + manager.close() - get_command = db_utils.query_db( - manager_params.database_name, - "SELECT command_status, error FROM crawl_history WHERE command = 'GetCommand'", - as_tuple=True, - )[0] + get_command = db_utils.query_db( + db, + "SELECT command_status, error FROM crawl_history WHERE command ='GetCommand'", + as_tuple=True, + )[0] - assert get_command[0] == "neterror" - assert get_command[1] == "dnsNotFound" + assert get_command[0] == "neterror" + assert get_command[1] == "dnsNotFound" diff --git a/test/test_xvfb_browser.py b/test/test_xvfb_browser.py index 24d0371ff..dc5ec5350 100644 --- a/test/test_xvfb_browser.py +++ b/test/test_xvfb_browser.py @@ -1,35 +1,38 @@ import os -from functools import partial -from typing import List + +from selenium.webdriver import Firefox from openwpm.command_sequence import CommandSequence from openwpm.commands.types import BaseCommand -from openwpm.task_manager import TaskManager +from openwpm.config import BrowserParamsInternal, ManagerParamsInternal +from openwpm.socket_interface import ClientSocket -from .openwpmtest import OpenWPMTest from .utilities import BASE_TEST_URL class ExceptionCommand(BaseCommand): - def execute(self): - raise Exception + def execute( + self, + webdriver: Firefox, + browser_params: BrowserParamsInternal, + manager_params: ManagerParamsInternal, + extension_socket: ClientSocket, + ) -> None: + raise RuntimeError("We simulate a Command failing") -class TestXVFBDisplay(OpenWPMTest): +def test_display_shutdown(task_manager_creator, default_params): """Test the XVFB display option to see if it runs and deletes the lockfile upon shutdown""" - - def get_config(self, data_dir=""): - return self.get_test_config(data_dir, display_mode="xvfb") - - def test_display_shutdown(self): - manager_params, browser_params = self.get_config() - TEST_SITE = 
BASE_TEST_URL + "/test_pages/simple_a.html" - manager = TaskManager(manager_params, browser_params) - port = manager.browsers[0].display_port - - sequence = CommandSequence(TEST_SITE) - sequence.get() - sequence.append_command(ExceptionCommand) - manager.execute_command_sequence(sequence) - manager.close() - assert not os.path.exists("/tmp/.X%s-lock" % port) + manager_params, browser_params = default_params + for browser_param in browser_params: + browser_param.display_mode = "xvfb" + TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html" + manager, db = task_manager_creator((manager_params, browser_params)) + port = manager.browsers[0].display_port + + sequence = CommandSequence(TEST_SITE) + sequence.get() + sequence.append_command(ExceptionCommand()) + manager.execute_command_sequence(sequence) + manager.close() + assert not os.path.exists("/tmp/.X%s-lock" % port) diff --git a/test/utilities.py b/test/utilities.py index eb7668645..621d17ef4 100644 --- a/test/utilities.py +++ b/test/utilities.py @@ -5,12 +5,6 @@ from os.path import dirname, realpath from urllib.parse import parse_qs, urlparse -import boto3 -import pyarrow.parquet as pq -import s3fs -from botocore.credentials import Credentials -from pyarrow.filesystem import S3FSWrapper # noqa - LOCAL_WEBSERVER_PORT = 8000 BASE_TEST_URL_DOMAIN = "localtest.me" BASE_TEST_URL_NOPATH = "http://%s:%s" % (BASE_TEST_URL_DOMAIN, LOCAL_WEBSERVER_PORT) @@ -110,103 +104,3 @@ def start_server(): thread.start() print("...serving at port", LOCAL_WEBSERVER_PORT) return server, thread - - -class LocalS3Session(object): - """ - Ensures that the local s3 service is used when - setup as the default boto3 Session - Based on localstack_client/session.py - """ - - def __init__( - self, - aws_access_key_id="accesskey", - aws_secret_access_key="secretkey", - aws_session_token="token", - region_name="us-east-1", - endpoint_url="http://localhost:4572", - botocore_session=None, - profile_name=None, - localstack_host=None, - ): - self.env 
= "local" - self.session = boto3.session.Session() - self.aws_access_key_id = aws_access_key_id - self.aws_secret_access_key = aws_secret_access_key - self.aws_session_token = aws_session_token - self.region_name = region_name - self.endpoint_url = endpoint_url - - def resource(self, service_name, **kwargs): - return self.session.resource( - service_name, - endpoint_url=self.endpoint_url, - aws_access_key_id=self.aws_access_key_id, - aws_secret_access_key=self.aws_secret_access_key, - region_name=self.region_name, - verify=False, - ) - - def get_credentials(self): - return Credentials( - access_key=self.aws_access_key_id, - secret_key=self.aws_secret_access_key, - token=self.aws_session_token, - ) - - def client(self, service_name, **kwargs): - return self.session.client( - service_name, - endpoint_url=self.endpoint_url, - aws_access_key_id=self.aws_access_key_id, - aws_secret_access_key=self.aws_secret_access_key, - region_name=self.region_name, - verify=False, - ) - - -def local_s3_bucket(resource, name="localstack-foo"): - bucket = resource.Bucket(name) - bucket.create() - return name - - -class LocalS3Dataset(object): - def __init__(self, bucket, directory): - self.bucket = bucket - self.root_directory = directory - self.visits_uri = "%s/%s/visits/%%s" % (self.bucket, self.root_directory) - self.s3_fs = s3fs.S3FileSystem(session=LocalS3Session()) - boto3.DEFAULT_SESSION = LocalS3Session() - self.s3_client = boto3.client("s3") - self.s3_resource = boto3.resource("s3") - - def load_table(self, table_name): - return ( - pq.ParquetDataset(self.visits_uri % table_name, filesystem=self.s3_fs) - .read_pandas() - .to_pandas() - ) - - def list_files(self, directory, prepend_root=False): - bucket = self.s3_resource.Bucket(self.bucket) - files = list() - if prepend_root: - prefix = "%s/%s/" % (self.root_directory, directory) - else: - prefix = directory - for summary in bucket.objects.filter(Prefix=prefix): - files.append(summary.key) - return files - - def get_file(self, 
filename, prepend_root=False): - if prepend_root: - key = "%s/%s" % (self.root_directory, filename) - else: - key = filename - obj = self.s3_client.get_object(Bucket=self.bucket, Key=key) - body = obj["Body"] - content = body.read() - body.close() - return content