From ec8a4e0eab6c93454b7cf2cdda8c5e1c59bf5845 Mon Sep 17 00:00:00 2001
From: cccs-eric
Date: Tue, 27 Dec 2022 17:06:16 -0500
Subject: [PATCH] feat(ingest): upgrade pydantic version (#6858)

This PR also removes the requirement on docker-compose v1 and makes our
tests use v2 instead.

Co-authored-by: Harshal Sheth
---
 metadata-ingestion/setup.py                      |  5 ++-
 .../src/datahub/cli/docker_cli.py                |  2 +-
 .../ingestion/glossary/datahub_classifier.py     | 10 +++---
 .../ingestion/source/delta_lake/config.py        |  4 +--
 .../src/datahub/ingestion/source/demo_data.py    |  2 +-
 .../src/datahub/ingestion/source/powerbi.py      |  2 +-
 .../datahub/ingestion/source/unity/config.py     |  2 +-
 .../ingestion/source/usage/redshift_usage.py     |  2 +-
 .../source/usage/starburst_trino_usage.py        | 32 ++++++++++---------
 .../ingestion/source_config/sql/bigquery.py      |  2 +-
 .../ingestion/source_config/sql/snowflake.py     | 13 +++++---
 .../source_config/usage/snowflake_usage.py       |  3 +-
 .../src/datahub/utilities/sample_data.py         |  4 +--
 .../tests/test_helpers/docker_helpers.py         |  9 ++++++
 14 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 8e987680b85c3..7a68f31c236fd 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -396,12 +396,11 @@ def get_long_description():
     "mypy==0.991",
     # pydantic 1.8.2 is incompatible with mypy 0.910.
     # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
-    # Restricting top version to <1.10 until we can fix our types.
-    "pydantic >=1.9.0, <1.10",
+    "pydantic >=1.9.0",
     "pytest>=6.2.2",
     "pytest-asyncio>=0.16.0",
     "pytest-cov>=2.8.1",
-    "pytest-docker[docker-compose-v1]>=1.0.1",
+    "pytest-docker>=1.0.1",
     "deepdiff",
     "requests-mock",
     "freezegun",
diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py
index be5141f3e9744..cb9d8a89d03f7 100644
--- a/metadata-ingestion/src/datahub/cli/docker_cli.py
+++ b/metadata-ingestion/src/datahub/cli/docker_cli.py
@@ -882,7 +882,7 @@ def ingest_sample_data(path: Optional[str], token: Optional[str]) -> None:
 
     if path is None:
         click.echo("Downloading sample data...")
-        path = download_sample_data()
+        path = str(download_sample_data())
 
     # Verify that docker is up.
     issues = check_local_docker_containers()
diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
index 905e3b5d72a0f..2eb774a935851 100644
--- a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
+++ b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
@@ -57,13 +57,15 @@ class Config:
         description="Factors and their weights to consider when predicting info types",
         alias="prediction_factors_and_weights",
     )
-    Name: Optional[NameFactorConfig] = Field(alias="name")
+    Name: Optional[NameFactorConfig] = Field(default=None, alias="name")
 
-    Description: Optional[DescriptionFactorConfig] = Field(alias="description")
+    Description: Optional[DescriptionFactorConfig] = Field(
+        default=None, alias="description"
+    )
 
-    Datatype: Optional[DataTypeFactorConfig] = Field(alias="datatype")
+    Datatype: Optional[DataTypeFactorConfig] = Field(default=None, alias="datatype")
 
-    Values: Optional[ValuesFactorConfig] = Field(alias="values")
+    Values: Optional[ValuesFactorConfig] = Field(default=None, alias="values")
 
 
 # TODO: Generate Classification doc (classification.md) from python source.
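The classifier change above is the pattern repeated throughout the rest of this
patch: a config field that may legitimately be omitted is annotated as Optional
and given an explicit default=None in Field(...), rather than relying on an
implicit default. The old setup.py comment ("Restricting top version to <1.10
until we can fix our types") suggests these are exactly the type fixes needed to
unpin pydantic. A minimal standalone sketch of the pattern in plain pydantic v1;
ExampleConfig and its fields are invented for illustration and are not DataHub
classes:

    from typing import Optional

    import pydantic


    class ExampleConfig(pydantic.BaseModel):
        # Required field: no default, so omitting it fails validation.
        token: str = pydantic.Field(description="API token")
        # Optional field: the annotation permits None and the default is explicit,
        # so the field can be left out and the declared type stays consistent.
        workspace_name: Optional[str] = pydantic.Field(
            default=None, description="Optional workspace name"
        )


    print(ExampleConfig(token="abc").workspace_name)  # prints: None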
diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py
index 0af78fb3c742f..70a0982ff2bc4 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py
@@ -20,7 +20,7 @@ class S3(ConfigModel):
-    aws_config: AwsConnectionConfig = Field(
+    aws_config: Optional[AwsConnectionConfig] = Field(
         default=None, description="AWS configuration"
     )
 
@@ -40,7 +40,7 @@ class DeltaLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase):
         description="Path to table (s3 or local file system). If path is not a delta table path "
         "then all subfolders will be scanned to detect and ingest delta tables."
     )
-    relative_path: str = Field(
+    relative_path: Optional[str] = Field(
         default=None,
         description="If set, delta-tables will be searched at location "
         "'/' and URNs will be created using "
diff --git a/metadata-ingestion/src/datahub/ingestion/source/demo_data.py b/metadata-ingestion/src/datahub/ingestion/source/demo_data.py
index 1764596bb5e8d..27202bb0087de 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/demo_data.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/demo_data.py
@@ -24,7 +24,7 @@ class DemoDataSource(GenericFileSource):
     """
 
     def __init__(self, ctx: PipelineContext, config: DemoDataConfig):
-        file_config = FileSourceConfig(filename=download_sample_data())
+        file_config = FileSourceConfig(path=download_sample_data())
         super().__init__(ctx, file_config)
 
     @classmethod
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py
index 274b06bb7e77c..e531ee676063e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py
@@ -116,7 +116,7 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase):
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns to filter PowerBI workspaces in ingestion",
     )
-    workspace_id: str = pydantic.Field(
+    workspace_id: Optional[str] = pydantic.Field(
         description="[deprecated] Use workspace_id_pattern instead",
         default=None,
     )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
index 741b877395372..f25863611457f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
@@ -15,7 +15,7 @@ class UnityCatalogSourceConfig(StatefulIngestionConfigBase):
     token: str = pydantic.Field(description="Databricks personal access token")
     workspace_url: str = pydantic.Field(description="Databricks workspace url")
-    workspace_name: str = pydantic.Field(
+    workspace_name: Optional[str] = pydantic.Field(
         default=None,
         description="Name of the workspace. Default to deployment name present in workspace_url",
     )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py
index 841217a1e60ff..90247218d8d2a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py
@@ -140,7 +140,7 @@ class RedshiftAccessEvent(BaseModel):
     username: str
     query: int
     tbl: int
-    text: str = Field(None, alias="querytxt")
+    text: Optional[str] = Field(None, alias="querytxt")
     database: str
     schema_: str = Field(alias="schema")
     table: str
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
index 6fe8bf26baa08..5534172770b9c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
@@ -4,7 +4,7 @@
 import logging
 from datetime import datetime
 from email.utils import parseaddr
-from typing import Dict, Iterable, List
+from typing import Dict, Iterable, List, Optional
 
 from dateutil import parser
 from pydantic.fields import Field
@@ -62,23 +62,23 @@ class TrinoConnectorInfo(BaseModel):
 
 
 class TrinoAccessedMetadata(BaseModel):
-    catalog_name: str = Field(None, alias="catalogName")
-    schema_name: str = Field(None, alias="schema")  # type: ignore
-    table: str = None  # type: ignore
+    catalog_name: Optional[str] = Field(None, alias="catalogName")
+    schema_name: Optional[str] = Field(None, alias="schema")
+    table: Optional[str] = None
     columns: List[str]
-    connector_info: TrinoConnectorInfo = Field(None, alias="connectorInfo")
+    connector_info: Optional[TrinoConnectorInfo] = Field(None, alias="connectorInfo")
 
 
 class TrinoJoinedAccessEvent(BaseModel):
-    usr: str = None  # type:ignore
-    query: str = None  # type: ignore
-    catalog: str = None  # type: ignore
-    schema_name: str = Field(None, alias="schema")
-    query_type: str = None  # type:ignore
-    table: str = None  # type:ignore
+    usr: Optional[str] = None
+    query: Optional[str] = None
+    catalog: Optional[str] = None
+    schema_name: Optional[str] = Field(None, alias="schema")
+    query_type: Optional[str] = None
+    table: Optional[str] = None
     accessed_metadata: List[TrinoAccessedMetadata]
-    starttime: datetime = Field(None, alias="create_time")
-    endtime: datetime = Field(None, alias="end_time")
+    starttime: datetime = Field(alias="create_time")
+    endtime: datetime = Field(alias="end_time")
 
 
 class EnvBasedSourceBaseConfig:
@@ -233,7 +233,9 @@ def _aggregate_access_events(
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
             for metadata in event.accessed_metadata:
                 # Skipping queries starting with $system@
-                if metadata.catalog_name.startswith("$system@"):
+                if metadata.catalog_name and metadata.catalog_name.startswith(
+                    "$system@"
+                ):
                     logging.debug(
                         f"Skipping system query for {metadata.catalog_name}..."
                     )
@@ -258,7 +260,7 @@ def _aggregate_access_events(
 
             # add @unknown.com to username
             # current limitation in user stats UI, we need to provide email to show users
-            if "@" in parseaddr(event.usr)[1]:
+            if event.usr and "@" in parseaddr(event.usr)[1]:
                 username = event.usr
             else:
                 username = f"{event.usr if event.usr else 'unknown'}@{self.config.email_domain}"
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py
index 3782d26ea748a..26b524cb44045 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py
@@ -29,7 +29,7 @@ class BigQueryConfig(BigQueryBaseConfig, BaseTimeWindowConfig, SQLAlchemyConfig)
         description="The number of log item will be queried per page for lineage collection",
     )
     credential: Optional[BigQueryCredential] = pydantic.Field(
-        description="BigQuery credential informations"
+        default=None, description="BigQuery credential informations"
     )
     # extra_client_options, include_table_lineage and max_query_duration are relevant only when computing the lineage.
     extra_client_options: Dict[str, Any] = pydantic.Field(
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
index 1924488a4fc5c..6f88871d9aece 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
@@ -83,7 +83,7 @@ class SnowflakeProvisionRoleConfig(ConfigModel):
         description="The username to be used for provisioning of role."
     )
 
-    admin_password: pydantic.SecretStr = pydantic.Field(
+    admin_password: Optional[pydantic.SecretStr] = pydantic.Field(
         default=None,
         exclude=True,
         description="The password to be used for provisioning of role.",
@@ -131,13 +131,16 @@ class BaseSnowflakeConfig(BaseTimeWindowConfig):
         description='The type of authenticator to use when connecting to Snowflake. Supports "DEFAULT_AUTHENTICATOR", "EXTERNAL_BROWSER_AUTHENTICATOR" and "KEY_PAIR_AUTHENTICATOR".',
     )
     host_port: Optional[str] = pydantic.Field(
-        description="DEPRECATED: Snowflake account. e.g. abc48144"
+        default=None, description="DEPRECATED: Snowflake account. e.g. abc48144"
     )  # Deprecated
     account_id: Optional[str] = pydantic.Field(
-        description="Snowflake account identifier. e.g. xy12345, xy12345.us-east-2.aws, xy12345.us-central1.gcp, xy12345.central-us.azure, xy12345.us-west-2.privatelink. Refer [Account Identifiers](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#format-2-legacy-account-locator-in-a-region) for more details."
+        default=None,
+        description="Snowflake account identifier. e.g. xy12345, xy12345.us-east-2.aws, xy12345.us-central1.gcp, xy12345.central-us.azure, xy12345.us-west-2.privatelink. Refer [Account Identifiers](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#format-2-legacy-account-locator-in-a-region) for more details.",
     )  # Once host_port is removed this will be made mandatory
-    warehouse: Optional[str] = pydantic.Field(description="Snowflake warehouse.")
-    role: Optional[str] = pydantic.Field(description="Snowflake role.")
+    warehouse: Optional[str] = pydantic.Field(
+        default=None, description="Snowflake warehouse."
+    )
+    role: Optional[str] = pydantic.Field(default=None, description="Snowflake role.")
     include_table_lineage: bool = pydantic.Field(
         default=True,
         description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role.",
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py b/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py
index 7b35a207d9605..55ca0eba0c1b6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py
@@ -41,7 +41,8 @@ class SnowflakeUsageConfig(
         description="List of regex patterns for databases to include/exclude in usage ingestion.",
     )
     email_domain: Optional[str] = pydantic.Field(
-        description="Email domain of your organisation so users can be displayed on UI appropriately."
+        default=None,
+        description="Email domain of your organisation so users can be displayed on UI appropriately.",
     )
     schema_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
diff --git a/metadata-ingestion/src/datahub/utilities/sample_data.py b/metadata-ingestion/src/datahub/utilities/sample_data.py
index 12810d23f9bff..b908f7f89fd28 100644
--- a/metadata-ingestion/src/datahub/utilities/sample_data.py
+++ b/metadata-ingestion/src/datahub/utilities/sample_data.py
@@ -12,9 +12,9 @@
 BOOTSTRAP_MCES_URL = f"{DOCKER_COMPOSE_BASE}/{BOOTSTRAP_MCES_FILE}"
 
 
-def download_sample_data() -> str:
+def download_sample_data() -> pathlib.Path:
     with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
-        path = str(pathlib.Path(tmp_file.name))
+        path = pathlib.Path(tmp_file.name)
 
         # Download the bootstrap MCE file from GitHub.
         mce_json_download_response = requests.get(BOOTSTRAP_MCES_URL)
diff --git a/metadata-ingestion/tests/test_helpers/docker_helpers.py b/metadata-ingestion/tests/test_helpers/docker_helpers.py
index 62654cf9f611c..ab99b104cbc4e 100644
--- a/metadata-ingestion/tests/test_helpers/docker_helpers.py
+++ b/metadata-ingestion/tests/test_helpers/docker_helpers.py
@@ -42,6 +42,15 @@ def wait_for_port(
             subprocess.run(f"docker logs {container_name}", shell=True, check=True)
 
 
+@pytest.fixture(scope="session")
+def docker_compose_command():
+    """Docker Compose command to use, it could be either `docker-compose`
+    for Docker Compose v1 or `docker compose` for Docker Compose
+    v2."""
+
+    return "docker compose"
+
+
 @pytest.fixture(scope="module")
 def docker_compose_runner(
     docker_compose_command, docker_compose_project_name, docker_setup, docker_cleanup
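For context on the docker_helpers.py change: pytest-docker exposes a
session-scoped docker_compose_command fixture, and overriding it as above makes
every compose-based test shell out to Docker Compose v2 ("docker compose")
instead of the legacy v1 binary, which is why the [docker-compose-v1] extra is
dropped from setup.py. A minimal, hypothetical sketch of how such an override
is consumed with pytest-docker's documented fixtures; the "web" service, port
8080, and /health endpoint are invented for illustration and are not part of
this patch:

    # test_compose_v2_sketch.py -- illustrative only; assumes the suite's
    # docker_compose_file fixture points at a compose file that defines a
    # "web" service exposing port 8080.
    import pytest
    import requests


    @pytest.fixture(scope="session")
    def docker_compose_command():
        # Same override as the new fixture above: use Docker Compose v2.
        return "docker compose"


    def _is_responsive(url):
        # Treat connection errors as "not ready yet" while the container starts.
        try:
            return requests.get(url).status_code == 200
        except requests.ConnectionError:
            return False


    def test_web_service_is_responsive(docker_ip, docker_services):
        # pytest-docker brings the compose services up before the test and
        # tears them down afterwards; port_for maps the service's internal
        # port to the published host port.
        port = docker_services.port_for("web", 8080)
        url = f"http://{docker_ip}:{port}/health"
        docker_services.wait_until_responsive(
            timeout=30.0, pause=0.5, check=lambda: _is_responsive(url)
        )
        assert _is_responsive(url)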