diff --git a/metadata-ingestion/docs/sources/s3/s3.md b/metadata-ingestion/docs/sources/s3/s3.md index 744efb49e7573..6569128688c5d 100644 --- a/metadata-ingestion/docs/sources/s3/s3.md +++ b/metadata-ingestion/docs/sources/s3/s3.md @@ -1,4 +1,7 @@ -## Valid path_specs.include + +### Path Spec + +**Valid path_specs.include** ```python s3://my-bucket/foo/tests/bar.avro # single file table @@ -14,13 +17,13 @@ s3://my-bucket/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # tabl s3://my-bucket/*/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # table is present at 3 levels down in bucket ``` -## Valid path_specs.exclude -- **/tests/** +**Valid path_specs.exclude** +- \**/tests/** - s3://my-bucket/hr/** - **/tests/*.csv - s3://my-bucket/foo/*/my_table/** -### Notes +**Notes** - {table} represents folder for which dataset will be created. - include path must end with (*.* or *.[ext]) to represent leaf level. @@ -59,7 +62,7 @@ If you are ingesting datasets from AWS S3, we recommend running the ingestion on ::: -## Compatibility +### Compatibility Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz). diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_boto_utils.py b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_boto_utils.py index caec19d0fb249..a9d7df980bbfb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_boto_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_boto_utils.py @@ -3,7 +3,7 @@ from datahub.emitter.mce_builder import make_tag_urn from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.source.aws.aws_common import AwsSourceConfig +from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig from datahub.ingestion.source.aws.s3_util import ( get_bucket_name, get_bucket_relative_path, @@ -19,7 +19,7 @@ def get_s3_tags( bucket_name: str, key_name: Optional[str], dataset_urn: str, - aws_config: Optional[AwsSourceConfig], + aws_config: Optional[AwsConnectionConfig], ctx: PipelineContext, use_s3_bucket_tags: Optional[bool] = False, use_s3_object_tags: Optional[bool] = False, @@ -75,7 +75,7 @@ def get_s3_tags( def list_folders_path( - s3_uri: str, aws_config: Optional[AwsSourceConfig] + s3_uri: str, aws_config: Optional[AwsConnectionConfig] ) -> Iterable[str]: if not is_s3_uri(s3_uri): raise ValueError("Not a s3 URI: " + s3_uri) @@ -87,7 +87,7 @@ def list_folders_path( def list_folders( - bucket_name: str, prefix: str, aws_config: Optional[AwsSourceConfig] + bucket_name: str, prefix: str, aws_config: Optional[AwsConnectionConfig] ) -> Iterable[str]: if aws_config is None: raise ValueError("aws_config not set. Cannot browse s3") diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py index ed57fe15eea0b..ff68b1db15e4d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py @@ -11,7 +11,7 @@ EnvBasedSourceConfigBase, PlatformSourceConfigBase, ) -from datahub.ingestion.source.aws.aws_common import AwsSourceConfig +from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig from datahub.ingestion.source.aws.s3_util import is_s3_uri # hide annoying debug errors from py4j @@ -20,7 +20,9 @@ class S3(ConfigModel): - aws_config: AwsSourceConfig = Field(default=None, description="AWS configuration") + aws_config: AwsConnectionConfig = Field( + default=None, description="AWS configuration" + ) # Whether or not to create in datahub from the s3 bucket use_s3_bucket_tags: Optional[bool] = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py index 12bfde146cf3c..d84ef529c45e5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py @@ -9,7 +9,7 @@ EnvBasedSourceConfigBase, PlatformSourceConfigBase, ) -from datahub.ingestion.source.aws.aws_common import AwsSourceConfig +from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig from datahub.ingestion.source.aws.path_spec import PathSpec from datahub.ingestion.source.aws.s3_util import get_bucket_name from datahub.ingestion.source.s3.profiling import DataLakeProfilerConfig @@ -33,7 +33,7 @@ class DataLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase): default=None, description="The instance of the platform that all assets produced by this recipe belong to", ) - aws_config: Optional[AwsSourceConfig] = Field( + aws_config: Optional[AwsConnectionConfig] = Field( default=None, description="AWS configuration" )