fix(ingest): use AwsConnectionConfig instead of AwsSourceConfig #5813

Merged: 1 commit, Sep 6, 2022

13 changes: 8 additions & 5 deletions metadata-ingestion/docs/sources/s3/s3.md
@@ -1,4 +1,7 @@
## Valid path_specs.include

### Path Spec

**Valid path_specs.include**

```python
s3://my-bucket/foo/tests/bar.avro # single file table
@@ -14,13 +17,13 @@ s3://my-bucket/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # table is present at 2 levels down in bucket
s3://my-bucket/*/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # table is present at 3 levels down in bucket
```

## Valid path_specs.exclude
- **/tests/**
**Valid path_specs.exclude**
- \**/tests/**
- s3://my-bucket/hr/**
- **/tests/*.csv
- s3://my-bucket/foo/*/my_table/**

### Notes
**Notes**

- {table} represents folder for which dataset will be created.
- include path must end with (*.* or *.[ext]) to represent leaf level.
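
As an illustration of the rules in this list, here is a minimal Python sketch of a single path spec; the `PathSpec` import path is taken from the diff further down, while the `include`/`exclude` field names are assumptions based on the keys documented above.

```python
# Minimal sketch (assumptions noted above): one path spec whose include ends
# with *.[ext] and whose {table} marker names the dataset folder.
from datahub.ingestion.source.aws.path_spec import PathSpec

path_spec = PathSpec(
    include="s3://my-bucket/foo/{table}/*.avro",  # leaf level ends with *.avro
    exclude=["**/tests/**"],                      # skip test folders
)
print(path_spec.include)
```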
@@ -59,7 +62,7 @@ If you are ingesting datasets from AWS S3, we recommend running the ingestion on

:::

## Compatibility
### Compatibility

Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz).
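
For completeness, a small sketch of exporting the two variables mentioned above from Python before profiling runs; the install path is a placeholder, not something this PR prescribes.

```python
# Placeholder path: point SPARK_HOME at wherever spark-3.0.3-bin-hadoop3.2
# was unpacked; SPARK_VERSION should match that distribution.
import os

os.environ["SPARK_HOME"] = "/opt/spark-3.0.3-bin-hadoop3.2"
os.environ["SPARK_VERSION"] = "3.0.3"
```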

@@ -3,7 +3,7 @@

from datahub.emitter.mce_builder import make_tag_urn
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.aws.s3_util import (
get_bucket_name,
get_bucket_relative_path,
@@ -19,7 +19,7 @@ def get_s3_tags(
bucket_name: str,
key_name: Optional[str],
dataset_urn: str,
aws_config: Optional[AwsSourceConfig],
aws_config: Optional[AwsConnectionConfig],
ctx: PipelineContext,
use_s3_bucket_tags: Optional[bool] = False,
use_s3_object_tags: Optional[bool] = False,
@@ -75,7 +75,7 @@ def get_s3_tags(


def list_folders_path(
s3_uri: str, aws_config: Optional[AwsSourceConfig]
s3_uri: str, aws_config: Optional[AwsConnectionConfig]
) -> Iterable[str]:
if not is_s3_uri(s3_uri):
raise ValueError("Not a s3 URI: " + s3_uri)
@@ -87,7 +87,7 @@ def list_folders_path(


def list_folders(
bucket_name: str, prefix: str, aws_config: Optional[AwsSourceConfig]
bucket_name: str, prefix: str, aws_config: Optional[AwsConnectionConfig]
) -> Iterable[str]:
if aws_config is None:
raise ValueError("aws_config not set. Cannot browse s3")
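
To show the renamed type in use, a hypothetical caller of the `list_folders` helper defined above (its module path is not visible in this view); `aws_region` is assumed to be a field on `AwsConnectionConfig`, and the bucket and prefix are placeholders.

```python
# Hypothetical usage sketch, not part of this PR: callers now pass
# AwsConnectionConfig where AwsSourceConfig was expected before.
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig

aws_config = AwsConnectionConfig(aws_region="us-east-1")  # assumed field name

for folder in list_folders("my-bucket", "foo/", aws_config):
    print(folder)
```
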
@@ -11,7 +11,7 @@
EnvBasedSourceConfigBase,
PlatformSourceConfigBase,
)
from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.aws.s3_util import is_s3_uri

# hide annoying debug errors from py4j
@@ -20,7 +20,9 @@


class S3(ConfigModel):
aws_config: AwsSourceConfig = Field(default=None, description="AWS configuration")
aws_config: AwsConnectionConfig = Field(
default=None, description="AWS configuration"
)

# Whether or not to create tags in datahub from the s3 bucket
use_s3_bucket_tags: Optional[bool] = Field(
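
A hedged sketch of constructing the `S3` block above after the rename; `use_s3_bucket_tags` appears in the truncated diff, and the region value is a placeholder.

```python
# Sketch only: S3 is the ConfigModel shown above; aws_config now validates
# against AwsConnectionConfig instead of AwsSourceConfig.
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig

s3_block = S3(
    aws_config=AwsConnectionConfig(aws_region="us-east-1"),  # placeholder region
    use_s3_bucket_tags=True,
)
```
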
4 changes: 2 additions & 2 deletions metadata-ingestion/src/datahub/ingestion/source/s3/config.py
@@ -9,7 +9,7 @@
EnvBasedSourceConfigBase,
PlatformSourceConfigBase,
)
from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.aws.path_spec import PathSpec
from datahub.ingestion.source.aws.s3_util import get_bucket_name
from datahub.ingestion.source.s3.profiling import DataLakeProfilerConfig
@@ -33,7 +33,7 @@ class DataLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase):
default=None,
description="The instance of the platform that all assets produced by this recipe belong to",
)
aws_config: Optional[AwsSourceConfig] = Field(
aws_config: Optional[AwsConnectionConfig] = Field(
default=None, description="AWS configuration"
)

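Finally, a hedged sketch of parsing `DataLakeSourceConfig` from `config.py` above with the renamed connection type; the `path_specs` key and its nested fields are assumptions, not something this PR adds.

```python
# Hedged sketch: keys mirror the fields visible in the diffs above plus an
# assumed path_specs list; pydantic should coerce aws_config into AwsConnectionConfig.
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.s3.config import DataLakeSourceConfig

config = DataLakeSourceConfig.parse_obj(
    {
        "path_specs": [{"include": "s3://my-bucket/foo/{table}/*.avro"}],
        "aws_config": {"aws_region": "us-east-1"},
    }
)
assert isinstance(config.aws_config, AwsConnectionConfig)
```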