From ec8a4e0eab6c93454b7cf2cdda8c5e1c59bf5845 Mon Sep 17 00:00:00 2001
From: cccs-eric
Date: Tue, 27 Dec 2022 17:06:16 -0500
Subject: [PATCH] feat(ingest): upgrade pydantic version (#6858)

This PR also removes the requirement on docker-compose v1 and makes our
tests use v2 instead.

Co-authored-by: Harshal Sheth
---
 metadata-ingestion/setup.py                      |  5 ++-
 .../src/datahub/cli/docker_cli.py                |  2 +-
 .../ingestion/glossary/datahub_classifier.py     | 10 +++---
 .../ingestion/source/delta_lake/config.py        |  4 +--
 .../src/datahub/ingestion/source/demo_data.py    |  2 +-
 .../src/datahub/ingestion/source/powerbi.py      |  2 +-
 .../datahub/ingestion/source/unity/config.py     |  2 +-
 .../ingestion/source/usage/redshift_usage.py     |  2 +-
 .../source/usage/starburst_trino_usage.py        | 32 ++++++++++---------
 .../ingestion/source_config/sql/bigquery.py      |  2 +-
 .../ingestion/source_config/sql/snowflake.py     | 13 +++++---
 .../source_config/usage/snowflake_usage.py       |  3 +-
 .../src/datahub/utilities/sample_data.py         |  4 +--
 .../tests/test_helpers/docker_helpers.py         |  9 ++++++
 14 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 8e987680b85c3..7a68f31c236fd 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -396,12 +396,11 @@ def get_long_description():
     "mypy==0.991",
     # pydantic 1.8.2 is incompatible with mypy 0.910.
     # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
-    # Restricting top version to <1.10 until we can fix our types.
-    "pydantic >=1.9.0, <1.10",
+    "pydantic >=1.9.0",
     "pytest>=6.2.2",
     "pytest-asyncio>=0.16.0",
     "pytest-cov>=2.8.1",
-    "pytest-docker[docker-compose-v1]>=1.0.1",
+    "pytest-docker>=1.0.1",
     "deepdiff",
     "requests-mock",
     "freezegun",
diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py
index be5141f3e9744..cb9d8a89d03f7 100644
--- a/metadata-ingestion/src/datahub/cli/docker_cli.py
+++ b/metadata-ingestion/src/datahub/cli/docker_cli.py
@@ -882,7 +882,7 @@ def ingest_sample_data(path: Optional[str], token: Optional[str]) -> None:
 
     if path is None:
         click.echo("Downloading sample data...")
-        path = download_sample_data()
+        path = str(download_sample_data())
 
     # Verify that docker is up.
     issues = check_local_docker_containers()
diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
index 905e3b5d72a0f..2eb774a935851 100644
--- a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
+++ b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
@@ -57,13 +57,15 @@ class Config:
         description="Factors and their weights to consider when predicting info types",
         alias="prediction_factors_and_weights",
     )
-    Name: Optional[NameFactorConfig] = Field(alias="name")
+    Name: Optional[NameFactorConfig] = Field(default=None, alias="name")
 
-    Description: Optional[DescriptionFactorConfig] = Field(alias="description")
+    Description: Optional[DescriptionFactorConfig] = Field(
+        default=None, alias="description"
+    )
 
-    Datatype: Optional[DataTypeFactorConfig] = Field(alias="datatype")
+    Datatype: Optional[DataTypeFactorConfig] = Field(default=None, alias="datatype")
 
-    Values: Optional[ValuesFactorConfig] = Field(alias="values")
+    Values: Optional[ValuesFactorConfig] = Field(default=None, alias="values")
 
 
 # TODO: Generate Classification doc (classification.md) from python source.
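The classifier change above is the pattern repeated throughout the rest of this
patch: a config field that may legitimately be omitted is annotated as Optional
and given an explicit default=None in Field(...), rather than relying on an
implicit default. The old setup.py comment ("Restricting top version to <1.10
until we can fix our types") suggests these are exactly the type fixes needed to
unpin pydantic. A minimal standalone sketch of the pattern in plain pydantic v1;
ExampleConfig and its fields are invented for illustration and are not DataHub
classes:

    from typing import Optional

    import pydantic


    class ExampleConfig(pydantic.BaseModel):
        # Required field: no default, so omitting it fails validation.
        token: str = pydantic.Field(description="API token")
        # Optional field: the annotation permits None and the default is explicit,
        # so the field can be left out and the declared type stays consistent.
        workspace_name: Optional[str] = pydantic.Field(
            default=None, description="Optional workspace name"
        )


    print(ExampleConfig(token="abc").workspace_name)  # prints: None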
diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py
index 0af78fb3c742f..70a0982ff2bc4 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py
@@ -20,7 +20,7 @@ class S3(ConfigModel):
-    aws_config: AwsConnectionConfig = Field(
+    aws_config: Optional[AwsConnectionConfig] = Field(
         default=None, description="AWS configuration"
     )
 
@@ -40,7 +40,7 @@ class DeltaLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase):
         description="Path to table (s3 or local file system). If path is not a delta table path "
         "then all subfolders will be scanned to detect and ingest delta tables."
     )
-    relative_path: str = Field(
+    relative_path: Optional[str] = Field(
         default=None,
         description="If set, delta-tables will be searched at location "
         "'/' and URNs will be created using "
diff --git a/metadata-ingestion/src/datahub/ingestion/source/demo_data.py b/metadata-ingestion/src/datahub/ingestion/source/demo_data.py
index 1764596bb5e8d..27202bb0087de 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/demo_data.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/demo_data.py
@@ -24,7 +24,7 @@ class DemoDataSource(GenericFileSource):
     """
 
     def __init__(self, ctx: PipelineContext, config: DemoDataConfig):
-        file_config = FileSourceConfig(filename=download_sample_data())
+        file_config = FileSourceConfig(path=download_sample_data())
         super().__init__(ctx, file_config)
 
     @classmethod
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py
index 274b06bb7e77c..e531ee676063e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py
@@ -116,7 +116,7 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase):
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns to filter PowerBI workspaces in ingestion",
     )
-    workspace_id: str = pydantic.Field(
+    workspace_id: Optional[str] = pydantic.Field(
         description="[deprecated] Use workspace_id_pattern instead",
         default=None,
     )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
index 741b877395372..f25863611457f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
@@ -15,7 +15,7 @@ class UnityCatalogSourceConfig(StatefulIngestionConfigBase):
     token: str = pydantic.Field(description="Databricks personal access token")
     workspace_url: str = pydantic.Field(description="Databricks workspace url")
-    workspace_name: str = pydantic.Field(
+    workspace_name: Optional[str] = pydantic.Field(
         default=None,
         description="Name of the workspace. Default to deployment name present in workspace_url",
     )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py
index 841217a1e60ff..90247218d8d2a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py
@@ -140,7 +140,7 @@ class RedshiftAccessEvent(BaseModel):
     username: str
     query: int
     tbl: int
-    text: str = Field(None, alias="querytxt")
+    text: Optional[str] = Field(None, alias="querytxt")
     database: str
     schema_: str = Field(alias="schema")
     table: str
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
index 6fe8bf26baa08..5534172770b9c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
@@ -4,7 +4,7 @@
 import logging
 from datetime import datetime
 from email.utils import parseaddr
-from typing import Dict, Iterable, List
+from typing import Dict, Iterable, List, Optional
 
 from dateutil import parser
 from pydantic.fields import Field
@@ -62,23 +62,23 @@ class TrinoConnectorInfo(BaseModel):
 
 
 class TrinoAccessedMetadata(BaseModel):
-    catalog_name: str = Field(None, alias="catalogName")
-    schema_name: str = Field(None, alias="schema")  # type: ignore
-    table: str = None  # type: ignore
+    catalog_name: Optional[str] = Field(None, alias="catalogName")
+    schema_name: Optional[str] = Field(None, alias="schema")
+    table: Optional[str] = None
     columns: List[str]
-    connector_info: TrinoConnectorInfo = Field(None, alias="connectorInfo")
+    connector_info: Optional[TrinoConnectorInfo] = Field(None, alias="connectorInfo")
 
 
 class TrinoJoinedAccessEvent(BaseModel):
-    usr: str = None  # type:ignore
-    query: str = None  # type: ignore
-    catalog: str = None  # type: ignore
-    schema_name: str = Field(None, alias="schema")
-    query_type: str = None  # type:ignore
-    table: str = None  # type:ignore
+    usr: Optional[str] = None
+    query: Optional[str] = None
+    catalog: Optional[str] = None
+    schema_name: Optional[str] = Field(None, alias="schema")
+    query_type: Optional[str] = None
+    table: Optional[str] = None
     accessed_metadata: List[TrinoAccessedMetadata]
-    starttime: datetime = Field(None, alias="create_time")
-    endtime: datetime = Field(None, alias="end_time")
+    starttime: datetime = Field(alias="create_time")
+    endtime: datetime = Field(alias="end_time")
 
 
 class EnvBasedSourceBaseConfig:
@@ -233,7 +233,9 @@ def _aggregate_access_events(
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
             for metadata in event.accessed_metadata:
                 # Skipping queries starting with $system@
-                if metadata.catalog_name.startswith("$system@"):
+                if metadata.catalog_name and metadata.catalog_name.startswith(
+                    "$system@"
+                ):
                     logging.debug(
                         f"Skipping system query for {metadata.catalog_name}..."
                     )
@@ -258,7 +260,7 @@ def _aggregate_access_events(
 
             # add @unknown.com to username
             # current limitation in user stats UI, we need to provide email to show users
-            if "@" in parseaddr(event.usr)[1]:
+            if event.usr and "@" in parseaddr(event.usr)[1]:
                 username = event.usr
             else:
                 username = f"{event.usr if event.usr else 'unknown'}@{self.config.email_domain}"
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py
index 3782d26ea748a..26b524cb44045 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py
@@ -29,7 +29,7 @@ class BigQueryConfig(BigQueryBaseConfig, BaseTimeWindowConfig, SQLAlchemyConfig)
         description="The number of log item will be queried per page for lineage collection",
     )
     credential: Optional[BigQueryCredential] = pydantic.Field(
-        description="BigQuery credential informations"
+        default=None, description="BigQuery credential informations"
     )
     # extra_client_options, include_table_lineage and max_query_duration are relevant only when computing the lineage.
     extra_client_options: Dict[str, Any] = pydantic.Field(
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
index 1924488a4fc5c..6f88871d9aece 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
@@ -83,7 +83,7 @@ class SnowflakeProvisionRoleConfig(ConfigModel):
         description="The username to be used for provisioning of role."
     )
 
-    admin_password: pydantic.SecretStr = pydantic.Field(
+    admin_password: Optional[pydantic.SecretStr] = pydantic.Field(
         default=None,
         exclude=True,
         description="The password to be used for provisioning of role.",
@@ -131,13 +131,16 @@ class BaseSnowflakeConfig(BaseTimeWindowConfig):
         description='The type of authenticator to use when connecting to Snowflake. Supports "DEFAULT_AUTHENTICATOR", "EXTERNAL_BROWSER_AUTHENTICATOR" and "KEY_PAIR_AUTHENTICATOR".',
     )
     host_port: Optional[str] = pydantic.Field(
-        description="DEPRECATED: Snowflake account. e.g. abc48144"
+        default=None, description="DEPRECATED: Snowflake account. e.g. abc48144"
     )  # Deprecated
     account_id: Optional[str] = pydantic.Field(
-        description="Snowflake account identifier. e.g. xy12345, xy12345.us-east-2.aws, xy12345.us-central1.gcp, xy12345.central-us.azure, xy12345.us-west-2.privatelink. Refer [Account Identifiers](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#format-2-legacy-account-locator-in-a-region) for more details."
+        default=None,
+        description="Snowflake account identifier. e.g. xy12345, xy12345.us-east-2.aws, xy12345.us-central1.gcp, xy12345.central-us.azure, xy12345.us-west-2.privatelink. Refer [Account Identifiers](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#format-2-legacy-account-locator-in-a-region) for more details.",
     )  # Once host_port is removed this will be made mandatory
-    warehouse: Optional[str] = pydantic.Field(description="Snowflake warehouse.")
-    role: Optional[str] = pydantic.Field(description="Snowflake role.")
+    warehouse: Optional[str] = pydantic.Field(
+        default=None, description="Snowflake warehouse."
+    )
+    role: Optional[str] = pydantic.Field(default=None, description="Snowflake role.")
     include_table_lineage: bool = pydantic.Field(
         default=True,
         description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role.",
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py b/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py
index 7b35a207d9605..55ca0eba0c1b6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py
@@ -41,7 +41,8 @@ class SnowflakeUsageConfig(
         description="List of regex patterns for databases to include/exclude in usage ingestion.",
     )
     email_domain: Optional[str] = pydantic.Field(
-        description="Email domain of your organisation so users can be displayed on UI appropriately."
+        default=None,
+        description="Email domain of your organisation so users can be displayed on UI appropriately.",
     )
     schema_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
diff --git a/metadata-ingestion/src/datahub/utilities/sample_data.py b/metadata-ingestion/src/datahub/utilities/sample_data.py
index 12810d23f9bff..b908f7f89fd28 100644
--- a/metadata-ingestion/src/datahub/utilities/sample_data.py
+++ b/metadata-ingestion/src/datahub/utilities/sample_data.py
@@ -12,9 +12,9 @@
 BOOTSTRAP_MCES_URL = f"{DOCKER_COMPOSE_BASE}/{BOOTSTRAP_MCES_FILE}"
 
 
-def download_sample_data() -> str:
+def download_sample_data() -> pathlib.Path:
     with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
-        path = str(pathlib.Path(tmp_file.name))
+        path = pathlib.Path(tmp_file.name)
 
         # Download the bootstrap MCE file from GitHub.
         mce_json_download_response = requests.get(BOOTSTRAP_MCES_URL)
diff --git a/metadata-ingestion/tests/test_helpers/docker_helpers.py b/metadata-ingestion/tests/test_helpers/docker_helpers.py
index 62654cf9f611c..ab99b104cbc4e 100644
--- a/metadata-ingestion/tests/test_helpers/docker_helpers.py
+++ b/metadata-ingestion/tests/test_helpers/docker_helpers.py
@@ -42,6 +42,15 @@ def wait_for_port(
             subprocess.run(f"docker logs {container_name}", shell=True, check=True)
 
 
+@pytest.fixture(scope="session")
+def docker_compose_command():
+    """Docker Compose command to use, it could be either `docker-compose`
+    for Docker Compose v1 or `docker compose` for Docker Compose
+    v2."""
+
+    return "docker compose"
+
+
 @pytest.fixture(scope="module")
 def docker_compose_runner(
     docker_compose_command, docker_compose_project_name, docker_setup, docker_cleanup
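For context on the docker_helpers.py change: pytest-docker exposes a
session-scoped docker_compose_command fixture, and overriding it as above makes
every compose-based test shell out to Docker Compose v2 ("docker compose")
instead of the legacy v1 binary, which is why the [docker-compose-v1] extra is
dropped from setup.py. A minimal, hypothetical sketch of how such an override
is consumed with pytest-docker's documented fixtures; the "web" service, port
8080, and /health endpoint are invented for illustration and are not part of
this patch:

    # test_compose_v2_sketch.py -- illustrative only; assumes the suite's
    # docker_compose_file fixture points at a compose file that defines a
    # "web" service exposing port 8080.
    import pytest
    import requests


    @pytest.fixture(scope="session")
    def docker_compose_command():
        # Same override as the new fixture above: use Docker Compose v2.
        return "docker compose"


    def _is_responsive(url):
        # Treat connection errors as "not ready yet" while the container starts.
        try:
            return requests.get(url).status_code == 200
        except requests.ConnectionError:
            return False


    def test_web_service_is_responsive(docker_ip, docker_services):
        # pytest-docker brings the compose services up before the test and
        # tears them down afterwards; port_for maps the service's internal
        # port to the published host port.
        port = docker_services.port_for("web", 8080)
        url = f"http://{docker_ip}:{port}/health"
        docker_services.wait_until_responsive(
            timeout=30.0, pause=0.5, check=lambda: _is_responsive(url)
        )
        assert _is_responsive(url)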