diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cef2a2ab999..ed44f1975ae 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,6 +13,8 @@ repos:
   - id: check-yaml
   - id: detect-private-key
   - id: end-of-file-fixer
+    # TODO: Remove this exclusion when we stop committing the schema.
+    exclude: ^schemas/
   - id: trailing-whitespace
   - id: fix-encoding-pragma
    args: ["--remove"]
diff --git a/Makefile b/Makefile
index 024b5d64b6b..292e3a1088e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 .PHONY: develop setup-git test install-python-dependencies
 
-develop: install-python-dependencies setup-git
+develop: install-python-dependencies setup-git fetch-and-validate-schema
 
 setup-git:
	mkdir -p .git/hooks && cd .git/hooks && ln -sf ../../config/hooks/* ./
@@ -12,3 +12,10 @@ test:
 
 install-python-dependencies:
	pip install -e .
+
+fetch-and-validate-schema:
+	mkdir -p schemas
+	curl https://raw.githubusercontent.com/getsentry/sentry-data-schemas/main/relay/event.schema.json -o schemas/event.schema.json
+	# mypy exits 1 when it merely reports type errors; only exit codes above 1 (e.g. an unparsable schema) fail the target.
+	mypy snuba > /dev/null || (if [ "$$?" -gt 1 ]; then exit 1; fi)
+.PHONY: fetch-and-validate-schema
diff --git a/README.md b/README.md
index 8cf30635ee7..8075076e2ae 100644
--- a/README.md
+++ b/README.md
@@ -38,8 +38,7 @@ Note that Snuba assumes that everything is running on UTC time. Otherwise you ma
     mkvirtualenv snuba --python=python3.8
     workon snuba
 
-    make install-python-dependencies
-    make setup-git
+    make develop
 
     # Run API server
     snuba api
diff --git a/config/hooks/post-merge b/config/hooks/post-merge
index dcb24e7f126..89986634d46 100755
--- a/config/hooks/post-merge
+++ b/config/hooks/post-merge
@@ -4,7 +4,15 @@
 red="$(tput setaf 1)"
 bold="$(tput bold)"
 reset="$(tput sgr0)"
 
+# Ensure up-to-date event schemas are downloaded.
+mkdir -p schemas
+curl https://raw.githubusercontent.com/getsentry/sentry-data-schemas/main/relay/event.schema.json -o schemas/event.schema.json
+mypy snuba > /dev/null
+if [ $? -gt 1 ]; then
+    echo "[${red}${bold}!!!${reset}] Could not download or validate event schemas."
+fi
+# Ensure migrations are executed.
 files_changed_upstream="$(mktemp)"
 trap "rm -f ${files_changed_upstream}" EXIT
 
diff --git a/mypy.ini b/mypy.ini
index 7b217649582..08e823b3e78 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,4 +1,6 @@
 [mypy]
+plugins = jsonschema_typed.plugin
+python_version = 3.8
 ignore_missing_imports = False
 
 [mypy-_strptime]
diff --git a/requirements.txt b/requirements.txt
index 93ad2281bc1..603e7e43df8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,13 +19,14 @@ honcho==1.0.1
 idna==2.7
 itsdangerous==0.24
 Jinja2==2.10.1
-jsonschema==3.0.1
+jsonschema==3.2.0
 lz4==2.0.0
 Markdown==2.6.11
 MarkupSafe==1.1.1
 mywsgi==1.0.3
 more-itertools==4.2.0
 mypy>=0.761
+git+https://github.com/getsentry/sentry-data-schemas.git@76c6870d4b81e9c7a3a983cf4f591aeecb579521#subdirectory=py
 packaging==17.1
 parsimonious==0.8.1
 py==1.5.3
diff --git a/scripts/update-schemas.sh b/scripts/update-schemas.sh
new file mode 100755
index 00000000000..ea8207f8230
--- /dev/null
+++ b/scripts/update-schemas.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+SCHEMAS_REPO='https://github.com/getsentry/sentry-data-schemas.git'
+LATEST_VERSION=$(git ls-remote "$SCHEMAS_REPO" HEAD | awk '{ print $1 }')
+
+# Print the pinned requirements line with its placeholder hash swapped for the latest commit.
+TEMP='git+https://github.com/getsentry/sentry-data-schemas.git@c123123#subdirectory=py'
+
+echo "$TEMP" | sed -E "s|@[a-f0-9]+#|@${LATEST_VERSION}#|"
diff --git a/snuba/datasets/errors_processor.py b/snuba/datasets/errors_processor.py
index 43c87739a30..593c657bf71 100644
--- a/snuba/datasets/errors_processor.py
+++ b/snuba/datasets/errors_processor.py
@@ -47,7 +47,6 @@ def extract_custom(
     ) -> None:
         data = event.get("data", {})
         user_dict = data.get("user", data.get("sentry.interfaces.User", None)) or {}
-        user_data: MutableMapping[str, Any] = {}
 
         extract_user(user_data, user_dict)
         output["user_name"] = user_data["username"]
diff --git a/snuba/datasets/events_processor.py b/snuba/datasets/events_processor.py
index 0be94924b13..595556ebf02 100644
--- a/snuba/datasets/events_processor.py
+++ b/snuba/datasets/events_processor.py
@@ -1,7 +1,6 @@
-from typing import Any, Mapping, MutableMapping
-
-import logging
 import _strptime  # NOQA fixes _strptime deferred import issue
+import logging
+from typing import Any, Mapping, MutableMapping
 
 from snuba.clickhouse.columns import ColumnSet
 from snuba.consumer import KafkaMessageMetadata
@@ -47,11 +46,15 @@ def extract_custom(
         metadata: KafkaMessageMetadata,
     ) -> None:
         data = event.get("data", {})
-        output["message"] = _unicodify(event["message"])
 
         # USER REQUEST GEO
-        user = data.get("user", data.get("sentry.interfaces.User", None)) or {}
+        user = (
+            data.get(
+                "user", data.get("sentry.interfaces.User", None)  # type: ignore
+            )
+            or {}
+        )
         extract_user(output, user)
 
         geo = user.get("geo", None) or {}
diff --git a/snuba/datasets/events_processor_base.py b/snuba/datasets/events_processor_base.py
index f6f2b84b707..6ea191d09bb 100644
--- a/snuba/datasets/events_processor_base.py
+++ b/snuba/datasets/events_processor_base.py
@@ -1,10 +1,9 @@
 import logging
 from abc import ABC, abstractmethod
 from datetime import datetime
-from typing import Any, Mapping, MutableMapping, Optional, Sequence
-
-from typing_extensions import TypedDict
+from typing import Any, cast, Mapping, MutableMapping, Optional, Sequence, TypedDict
 
+from schemas import EventData
 from snuba import settings
 from snuba.consumer import KafkaMessageMetadata
 from snuba.datasets.events_format import (
@@ -30,7 +29,6 @@
 logger = logging.getLogger(__name__)
-
 REPLACEMENT_EVENT_TYPES = frozenset(
     [
         "start_delete_groups",
         "start_merge",
         "start_unmerge",
@@ -55,9 +53,10 @@ class InsertEvent(TypedDict):
     message: str
     platform: str
     datetime: str  # snuba.settings.PAYLOAD_DATETIME_FORMAT
-    data: MutableMapping[str, Any]
+    data: EventData
     primary_hash: str  # empty string represents None
     retention_days: int
+    search_message: Any
 
 
 class EventsProcessorBase(MessageProcessor, ABC):
@@ -116,6 +115,7 @@ def extract_required(
 
         # This is not ideal but it should never happen anyways
         timestamp = _ensure_valid_date(
+            # TODO: Switch to using event["data"]["timestamp"]
             datetime.strptime(event["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
         )
         if timestamp is None:
@@ -151,7 +151,7 @@ def process_message(
         type_, event = message[1:3]
         if type_ == "insert":
             try:
-                row = self.process_insert(event, metadata)
+                row = self.process_insert(cast(InsertEvent, event), metadata)
             except EventTooOld:
                 return None
 
@@ -168,6 +168,7 @@ def process_message(
     def process_insert(
         self, event: InsertEvent, metadata: KafkaMessageMetadata
     ) -> Optional[Mapping[str, Any]]:
+
         if not self._should_process(event):
             return None
 
@@ -189,7 +190,7 @@ def process_insert(
         self.extract_common(processed, event, metadata)
         self.extract_custom(processed, event, metadata)
 
-        sdk = data.get("sdk", None) or {}
+        sdk = data.get("sdk", None) or {}  # type: ignore
         self.extract_sdk(processed, sdk)
 
         tags = _as_dict_safe(data.get("tags", None))
@@ -205,9 +206,13 @@ def process_insert(
         processed["tags.key"], processed["tags.value"] = extract_extra_tags(tags)
 
         exception = (
-            data.get("exception", data.get("sentry.interfaces.Exception", None)) or {}
+            data.get(
+                "exception",
+                data.get("sentry.interfaces.Exception", None),  # type: ignore
+            )
+            or {}
         )
-        stacks = exception.get("values", None) or []
+        stacks: Sequence[Any] = exception.get("values", None) or []
         self.extract_stacktraces(processed, stacks)
 
         processed["offset"] = metadata.offset
@@ -223,11 +228,12 @@ def extract_common(
         metadata: KafkaMessageMetadata,
     ) -> None:
         # Properties we get from the top level of the message payload
+        data = event.get("data", {})
         output["platform"] = _unicodify(event["platform"])
 
         # Properties we get from the "data" dict, which is the actual event body.
-        data = event.get("data", {})
-        received = _collapse_uint32(int(data["received"]))
+
+        received = _collapse_uint32(data["received"])
         output["received"] = (
             datetime.utcfromtimestamp(received) if received is not None else None
         )
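
Note on the typing change above: `InsertEvent["data"]` moves from `MutableMapping[str, Any]` to `EventData`, a type derived from `schemas/event.schema.json` via the `jsonschema_typed` mypy plugin enabled in `mypy.ini` (the `schemas` Python module it is imported from is not part of this diff). The sketch below is illustration only: it stands in a hand-written `TypedDict` for the generated type, using two fields the diff actually touches, to show the kind of checking this buys.

```python
# Illustration only -- the real EventData is generated from the JSON
# schema; this hand-written stand-in mimics two of its fields.
from typing import Any, Mapping, Optional, TypedDict


class EventData(TypedDict, total=False):
    received: float
    sdk: Mapping[str, Any]


def extract_received(data: EventData) -> Optional[float]:
    # mypy now knows the key set and value types, which is why the
    # diff can drop the int(...) cast around data["received"], and
    # why misspelled keys, e.g. data.get("recieved"), become errors.
    return data.get("received")
```

The `# type: ignore` comments sprinkled on `data.get(...)` calls in the diff are the flip side of this: legacy keys such as `sentry.interfaces.User` are not part of the schema, so those accesses no longer type-check cleanly.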