Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ingest): hide deprecated path_spec option from config #5944

Merged
merged 10 commits into from
Oct 4, 2022
55 changes: 26 additions & 29 deletions metadata-ingestion/src/datahub/ingestion/source/s3/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,14 @@


class DataLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase):
path_specs: Optional[List[PathSpec]] = Field(
description="List of PathSpec. See below the details about PathSpec"
)
path_spec: Optional[PathSpec] = Field(
description="Path spec will be deprecated in favour of path_specs option."
path_specs: List[PathSpec] = Field(
description="List of PathSpec. See [below](#path-spec) the details about PathSpec"
)
platform: str = Field(
default="", description="The platform that this source connects to"
)
platform_instance: Optional[str] = Field(
default=None,
description="The instance of the platform that all assets produced by this recipe belong to",
# The platform field already exists, but we want to override the type/default/docs.
default="",
description="The platform that this source connects to (either 's3' or 'file'). "
"If not specified, the platform will be inferred from the path_specs.",
)
aws_config: Optional[AwsConnectionConfig] = Field(
default=None, description="AWS configuration"
Expand Down Expand Up @@ -64,23 +60,25 @@ class DataLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase):
description="Maximum number of rows to use when inferring schemas for TSV and CSV files.",
)

@pydantic.root_validator(pre=False)
def validate_platform(cls, values: Dict) -> Dict:
if not values.get("path_specs") and not values.get("path_spec"):
raise ValueError("Either path_specs or path_spec needs to be specified")

if values.get("path_specs") and values.get("path_spec"):
raise ValueError(
"Either path_specs or path_spec needs to be specified but not both"
)

if values.get("path_spec"):
logger.warning(
"path_spec config property is deprecated, please use path_specs instead of it."
)
values["path_specs"] = [values.get("path_spec")]
@pydantic.validator("path_specs", pre=True, always=True)
def rename_path_spec_to_path_specs(
cls, v: Any, values: Dict[str, Any]
) -> List[PathSpec]:
"""
Support the old path_spec field.
"""
if "path_spec" in values:
assert (
"path_specs" not in values
), 'cannot have both "path_spec" and "path_specs"'
logger.warning("path_spec is deprecated. Please use path_specs instead.")
return [values.pop("path_spec")]
return v

@pydantic.root_validator(pre=False)
def check_path_specs_and_infer_platform(cls, values: Dict) -> Dict:
bucket_name: str = ""
path_spec: PathSpec
hsheth2 marked this conversation as resolved.
Show resolved Hide resolved
for path_spec in values.get("path_specs", []):
if path_spec.is_s3:
platform = "s3"
Expand All @@ -92,12 +90,11 @@ def validate_platform(cls, values: Dict) -> Dict:

platform = "file"

if values.get("platform", ""):
if platform == "s3" and values["platform"] != platform:
raise ValueError("all path_spec should belong to the same platform")
if values.get("platform") and values["platform"] != platform:
raise ValueError("all path_spec should belong to the same platform")
else:
logger.debug(f'Setting config "platform": {platform}')
values["platform"] = platform
logger.debug(f'Setting config "platform": {values.get("platform")}')

if platform == "s3":
if bucket_name == "":
Expand Down