Skip to content

Commit

Permalink
fix(ingest): use AwsConnectionConfig instead of AwsSourceConfig (d…
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 authored and shirshanka committed Sep 8, 2022
1 parent d0a10de commit 4429820
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 13 deletions.
13 changes: 8 additions & 5 deletions metadata-ingestion/docs/sources/s3/s3.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
## Valid path_specs.include

### Path Spec

**Valid path_specs.include**

```python
s3://my-bucket/foo/tests/bar.avro # single file table
Expand All @@ -14,13 +17,13 @@ s3://my-bucket/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # tabl
s3://my-bucket/*/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # table is present at 3 levels down in bucket
```

## Valid path_specs.exclude
- **/tests/**
**Valid path_specs.exclude**
- \**/tests/**
- s3://my-bucket/hr/**
- **/tests/*.csv
- s3://my-bucket/foo/*/my_table/**

### Notes
**Notes**

- {table} represents the folder for which a dataset will be created.
- The include path must end with (*.* or *.[ext]) to represent the leaf level.
Expand Down Expand Up @@ -59,7 +62,7 @@ If you are ingesting datasets from AWS S3, we recommend running the ingestion on

:::

## Compatibility
### Compatibility

Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from datahub.emitter.mce_builder import make_tag_urn
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.aws.s3_util import (
get_bucket_name,
get_bucket_relative_path,
Expand All @@ -19,7 +19,7 @@ def get_s3_tags(
bucket_name: str,
key_name: Optional[str],
dataset_urn: str,
aws_config: Optional[AwsSourceConfig],
aws_config: Optional[AwsConnectionConfig],
ctx: PipelineContext,
use_s3_bucket_tags: Optional[bool] = False,
use_s3_object_tags: Optional[bool] = False,
Expand Down Expand Up @@ -75,7 +75,7 @@ def get_s3_tags(


def list_folders_path(
s3_uri: str, aws_config: Optional[AwsSourceConfig]
s3_uri: str, aws_config: Optional[AwsConnectionConfig]
) -> Iterable[str]:
if not is_s3_uri(s3_uri):
raise ValueError("Not a s3 URI: " + s3_uri)
Expand All @@ -87,7 +87,7 @@ def list_folders_path(


def list_folders(
bucket_name: str, prefix: str, aws_config: Optional[AwsSourceConfig]
bucket_name: str, prefix: str, aws_config: Optional[AwsConnectionConfig]
) -> Iterable[str]:
if aws_config is None:
raise ValueError("aws_config not set. Cannot browse s3")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
EnvBasedSourceConfigBase,
PlatformSourceConfigBase,
)
from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.aws.s3_util import is_s3_uri

# hide annoying debug errors from py4j
Expand All @@ -20,7 +20,9 @@


class S3(ConfigModel):
aws_config: AwsSourceConfig = Field(default=None, description="AWS configuration")
aws_config: AwsConnectionConfig = Field(
default=None, description="AWS configuration"
)

# Whether or not to create tags in datahub from the s3 bucket
use_s3_bucket_tags: Optional[bool] = Field(
Expand Down
4 changes: 2 additions & 2 deletions metadata-ingestion/src/datahub/ingestion/source/s3/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
EnvBasedSourceConfigBase,
PlatformSourceConfigBase,
)
from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.aws.path_spec import PathSpec
from datahub.ingestion.source.aws.s3_util import get_bucket_name
from datahub.ingestion.source.s3.profiling import DataLakeProfilerConfig
Expand All @@ -33,7 +33,7 @@ class DataLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase):
default=None,
description="The instance of the platform that all assets produced by this recipe belong to",
)
aws_config: Optional[AwsSourceConfig] = Field(
aws_config: Optional[AwsConnectionConfig] = Field(
default=None, description="AWS configuration"
)

Expand Down

0 comments on commit 4429820

Please sign in to comment.