diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 74560cee94553..5643ddcf76fd3 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -67,7 +67,7 @@ task generateGraphQLDocumentation(type: YarnTask, dependsOn: [yarnInstall, gener args = ['docusaurus', 'docs:generate:graphql'] } -task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLDocumentation, ':metadata-ingestion:modelDocGen'] ) { +task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLDocumentation, ':metadata-ingestion:modelDocGen', ':metadata-ingestion:docGen'] ) { inputs.files(projectMdFiles) outputs.cacheIf { true } args = ['run', 'generate'] diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index 43f116ae3ed24..dc919ed249c56 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -8,7 +8,9 @@ module.exports = { favicon: "img/favicon.ico", organizationName: "linkedin", // Usually your GitHub org/user name. projectName: "datahub", // Usually your repo name. - stylesheets: ["https://fonts.googleapis.com/css2?family=Manrope:wght@400;600&display=swap"], + stylesheets: [ + "https://fonts.googleapis.com/css2?family=Manrope:wght@400;600&display=swap", + ], themeConfig: { colorMode: { switchConfig: { @@ -198,7 +200,10 @@ module.exports = { ], ], plugins: [ - ["@docusaurus/plugin-ideal-image", { quality: 100, sizes: [320, 640, 1280, 1440, 1600] }], + [ + "@docusaurus/plugin-ideal-image", + { quality: 100, sizes: [320, 640, 1280, 1440, 1600] }, + ], "docusaurus-plugin-sass", [ "docusaurus-graphql-plugin", diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index dd7a6fd532aea..831da46b3e6b1 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -65,7 +65,7 @@ function list_markdown_files(): string[] { .trim() .split("\n"); let all_generated_markdown_files = execSync( - "cd .. && ls docs/generated/metamodel/**/*.md" + "cd .. && ls docs/generated/**/**/*.md" ) .toString() .trim() @@ -100,6 +100,8 @@ function list_markdown_files(): string[] { /^datahub-kubernetes\//, // Various other docs/directories to ignore. /^metadata-models\/docs\//, // these are used to generate docs, so we don't want to consider them here + /^metadata-ingestion\/archived\//, // these are archived, so we don't want to consider them here + /^metadata-ingestion\/docs\/sources\//, // these are used to generate docs, so we don't want to consider them here /^metadata-ingestion-examples\//, /^docker\/(?!README|datahub-upgrade|airflow\/local_airflow)/, // Drop all but a few docker docs. 
/^docs\/rfc\/templates\/000-template\.md$/, @@ -321,6 +323,7 @@ function new_url(original: string, filepath: string): string { const updated = path.normalize( `${"../".repeat(up_levels + 2)}/${relation}/${original}` ); + //console.log(`Rewriting ${original} ${filepath} as ${updated}`); return updated; } else { throw new Error(`unknown extension - ${original} in ${filepath}`); @@ -479,12 +482,18 @@ function write_markdown_file( ): void { const pathname = path.dirname(output_filepath); fs.mkdirSync(pathname, { recursive: true }); - fs.writeFileSync(output_filepath, contents.stringify("")); + try { + fs.writeFileSync(output_filepath, contents.stringify("")); + } catch (error) { + console.log(`Failed to write file ${output_filepath}`); + console.log(`contents = ${contents}`); + throw error; + } } (async function main() { for (const filepath of markdown_files) { - // console.log("Processing:", filepath); + //console.log("Processing:", filepath); const contents_string = fs.readFileSync(`../${filepath}`).toString(); const contents = matter(contents_string); @@ -507,7 +516,11 @@ function write_markdown_file( } // Error if a doc is not accounted for in a sidebar. - const autogenerated_sidebar_directories = ["docs/generated/metamodel"]; + const autogenerated_sidebar_directories = [ + "docs/generated/metamodel", + "docs/generated/ingestion", + "metadata-ingestion/archived", + ]; for (const filepath of markdown_files) { if ( autogenerated_sidebar_directories.some((dir) => filepath.startsWith(dir)) diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index c8da03e14378c..3e810bfcaea97 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -69,9 +69,12 @@ module.exports = { id: "metadata-ingestion/README", }, { - Sources: list_ids_in_directory("metadata-ingestion/source_docs", { - "metadata-ingestion/source_docs/s3": "S3", - }), + Sources: [ + { + type: "autogenerated", + dirName: "docs/generated/ingestion/sources", // '.' 
means the current docs folder + }, + ], }, "metadata-ingestion/transformers", { @@ -108,6 +111,9 @@ module.exports = { "docs/how/add-custom-data-platform", "docs/platform-instances", "docs/how/add-user-data", + "metadata-ingestion/docs/dev_guides/stateful", + "metadata-ingestion/docs/dev_guides/reporting_telemetry", + "metadata-ingestion/docs/dev_guides/sql_profiles", ], }, ], @@ -212,11 +218,6 @@ module.exports = { "docs/docker/development", "docs/how/backup-datahub", "docs/how/updating-datahub", - { - type: "doc", - label: "Ingesting files from S3", - id: "metadata-ingestion/source_docs/s3", - }, //"metadata-ingestion/examples/transforms/README" //"docs/what/graph", //"docs/what/search-index", diff --git a/docs/cli.md b/docs/cli.md index 8f08c8c7c1a1e..6ef66e1ac11db 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -63,45 +63,44 @@ We use a plugin architecture so that you can install only the dependencies you a | Plugin Name | Install Command | Provides | |-------------------------------------------------------------------------------------|------------------------------------------------------------| ----------------------------------- | -| [file](../metadata-ingestion/source_docs/file.md) | _included by default_ | File source and sink | -| [athena](../metadata-ingestion/source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source | -| [bigquery](../metadata-ingestion/source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source | -| [bigquery-usage](../metadata-ingestion/source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source | -| [datahub-lineage-file](../metadata-ingestion/source_docs/file_lineage.md) | _no additional dependencies_ | Lineage File source | -| [datahub-business-glossary](../metadata-ingestion/source_docs/business_glossary.md) | _no additional dependencies_ | Business Glossary File source | -| [dbt](../metadata-ingestion/source_docs/dbt.md) | _no additional dependencies_ | dbt source | -| [druid](../metadata-ingestion/source_docs/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source | -| [feast-legacy](../metadata-ingestion/source_docs/feast_legacy.md) | `pip install 'acryl-datahub[feast-legacy]'` | Feast source (legacy) | -| [feast](../metadata-ingestion/source_docs/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source (0.18.0) | -| [glue](../metadata-ingestion/source_docs/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source | -| [hive](../metadata-ingestion/source_docs/hive.md) | `pip install 'acryl-datahub[hive]'` | Hive source | -| [kafka](../metadata-ingestion/source_docs/kafka.md) | `pip install 'acryl-datahub[kafka]'` | Kafka source | -| [kafka-connect](../metadata-ingestion/source_docs/kafka-connect.md) | `pip install 'acryl-datahub[kafka-connect]'` | Kafka connect source | -| [ldap](../metadata-ingestion/source_docs/ldap.md) | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source | -| [looker](../metadata-ingestion/source_docs/looker.md) | `pip install 'acryl-datahub[looker]'` | Looker source | -| [lookml](../metadata-ingestion/source_docs/lookml.md) | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ | -| [metabase](../metadata-ingestion/source_docs/metabase.md) | `pip install 'acryl-datahub[metabase]'` | Metabase source | -| [mode](../metadata-ingestion/source_docs/mode.md) | `pip install 'acryl-datahub[mode]'` | Mode Analytics source | -| 
[mongodb](../metadata-ingestion/source_docs/mongodb.md) | `pip install 'acryl-datahub[mongodb]'` | MongoDB source | -| [mssql](../metadata-ingestion/source_docs/mssql.md) | `pip install 'acryl-datahub[mssql]'` | SQL Server source | -| [mysql](../metadata-ingestion/source_docs/mysql.md) | `pip install 'acryl-datahub[mysql]'` | MySQL source | -| [mariadb](../metadata-ingestion/source_docs/mariadb.md) | `pip install 'acryl-datahub[mariadb]'` | MariaDB source | -| [openapi](../metadata-ingestion/source_docs/openapi.md) | `pip install 'acryl-datahub[openapi]'` | OpenApi Source | -| [oracle](../metadata-ingestion/source_docs/oracle.md) | `pip install 'acryl-datahub[oracle]'` | Oracle source | -| [postgres](../metadata-ingestion/source_docs/postgres.md) | `pip install 'acryl-datahub[postgres]'` | Postgres source | -| [redash](../metadata-ingestion/source_docs/redash.md) | `pip install 'acryl-datahub[redash]'` | Redash source | -| [redshift](../metadata-ingestion/source_docs/redshift.md) | `pip install 'acryl-datahub[redshift]'` | Redshift source | -| [sagemaker](../metadata-ingestion/source_docs/sagemaker.md) | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source | -| [snowflake](../metadata-ingestion/source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake]'` | Snowflake source | -| [snowflake-usage](../metadata-ingestion/source_docs/snowflake.md) | `pip install 'acryl-datahub[snowflake-usage]'` | Snowflake usage statistics source | -| [sql-profiles](../metadata-ingestion/source_docs/sql_profiles.md) | `pip install 'acryl-datahub[sql-profiles]'` | Data profiles for SQL-based systems | -| [sqlalchemy](../metadata-ingestion/source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source | -| [superset](../metadata-ingestion/source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source | -| [tableau](../metadata-ingestion/source_docs/tableau.md) | `pip install 'acryl-datahub[tableau]'` | Tableau source | -| [trino](../metadata-ingestion/source_docs/trino.md) | `pip install 'acryl-datahub[trino]'` | Trino source | -| [starburst-trino-usage](../metadata-ingestion/source_docs/trino.md) | `pip install 'acryl-datahub[starburst-trino-usage]'` | Starburst Trino usage statistics source | -| [nifi](../metadata-ingestion/source_docs/nifi.md) | `pip install 'acryl-datahub[nifi]'` | Nifi source | -| [powerbi](../metadata-ingestion/source_docs/powerbi.md) | `pip install 'acryl-datahub[powerbi]'` | Microsoft Power BI source | +| [file](./generated/ingestion/sources/file.md) | _included by default_ | File source and sink | +| [athena](./generated/ingestion/sources/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source | +| [bigquery](./generated/ingestion/sources/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source | +| [bigquery-usage](./generated/ingestion/sources/bigquery.md#module-bigquery-usage) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source | +| [datahub-lineage-file](./generated/ingestion/sources/file-based-lineage.md) | _no additional dependencies_ | Lineage File source | +| [datahub-business-glossary](./generated/ingestion/sources/business-glossary.md) | _no additional dependencies_ | Business Glossary File source | +| [dbt](./generated/ingestion/sources/dbt.md) | _no additional dependencies_ | dbt source | +| [druid](./generated/ingestion/sources/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source | +| 
[feast-legacy](./generated/ingestion/sources/feast.md#module-feast-legacy) | `pip install 'acryl-datahub[feast-legacy]'` | Feast source (legacy) | +| [feast](./generated/ingestion/sources/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source (0.18.0) | +| [glue](./generated/ingestion/sources/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source | +| [hive](./generated/ingestion/sources/hive.md) | `pip install 'acryl-datahub[hive]'` | Hive source | +| [kafka](./generated/ingestion/sources/kafka.md) | `pip install 'acryl-datahub[kafka]'` | Kafka source | +| [kafka-connect](./generated/ingestion/sources/kafka-connect.md) | `pip install 'acryl-datahub[kafka-connect]'` | Kafka connect source | +| [ldap](./generated/ingestion/sources/ldap.md) | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source | +| [looker](./generated/ingestion/sources/looker.md) | `pip install 'acryl-datahub[looker]'` | Looker source | +| [lookml](./generated/ingestion/sources/looker.md#module-lookml) | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ | +| [metabase](./generated/ingestion/sources/metabase.md) | `pip install 'acryl-datahub[metabase]'` | Metabase source | +| [mode](./generated/ingestion/sources/mode.md) | `pip install 'acryl-datahub[mode]'` | Mode Analytics source | +| [mongodb](./generated/ingestion/sources/mongodb.md) | `pip install 'acryl-datahub[mongodb]'` | MongoDB source | +| [mssql](./generated/ingestion/sources/mssql.md) | `pip install 'acryl-datahub[mssql]'` | SQL Server source | +| [mysql](./generated/ingestion/sources/mysql.md) | `pip install 'acryl-datahub[mysql]'` | MySQL source | +| [mariadb](./generated/ingestion/sources/mariadb.md) | `pip install 'acryl-datahub[mariadb]'` | MariaDB source | +| [openapi](./generated/ingestion/sources/openapi.md) | `pip install 'acryl-datahub[openapi]'` | OpenApi Source | +| [oracle](./generated/ingestion/sources/oracle.md) | `pip install 'acryl-datahub[oracle]'` | Oracle source | +| [postgres](./generated/ingestion/sources/postgres.md) | `pip install 'acryl-datahub[postgres]'` | Postgres source | +| [redash](./generated/ingestion/sources/redash.md) | `pip install 'acryl-datahub[redash]'` | Redash source | +| [redshift](./generated/ingestion/sources/redshift.md) | `pip install 'acryl-datahub[redshift]'` | Redshift source | +| [sagemaker](./generated/ingestion/sources/sagemaker.md) | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source | +| [snowflake](./generated/ingestion/sources/snowflake.md) | `pip install 'acryl-datahub[snowflake]'` | Snowflake source | +| [snowflake-usage](./generated/ingestion/sources/snowflake.md#module-snowflake-usage) | `pip install 'acryl-datahub[snowflake-usage]'` | Snowflake usage statistics source | +| [sqlalchemy](./generated/ingestion/sources/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source | +| [superset](./generated/ingestion/sources/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source | +| [tableau](./generated/ingestion/sources/tableau.md) | `pip install 'acryl-datahub[tableau]'` | Tableau source | +| [trino](./generated/ingestion/sources/trino.md) | `pip install 'acryl-datahub[trino]'` | Trino source | +| [starburst-trino-usage](./generated/ingestion/sources/trino.md) | `pip install 'acryl-datahub[starburst-trino-usage]'` | Starburst Trino usage statistics source | +| [nifi](./generated/ingestion/sources/nifi.md) | `pip install 'acryl-datahub[nifi]'` | Nifi source | +| 
[powerbi](./generated/ingestion/sources/powerbi.md) | `pip install 'acryl-datahub[powerbi]'` | Microsoft Power BI source | ### Sinks diff --git a/docs/debugging.md b/docs/debugging.md index ee4f191db64ff..3697ee508d48e 100644 --- a/docs/debugging.md +++ b/docs/debugging.md @@ -160,7 +160,7 @@ ALTER TABLE metadata_aspect_v2 CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_ ## I've modified the default user.props file to include a custom username and password, but I don't see the new user(s) inside the Users & Groups tab. Why not? Currently, `user.props` is a file used by the JAAS PropertyFileLoginModule solely for the purpose of **Authentication**. The file is not used as an source from which to -ingest additional metadata about the user. For that, you'll need to ingest some custom information about your new user using the Rest.li APIs or the [File-based ingestion source](https://datahubproject.io/docs/metadata-ingestion/source_docs/file). +ingest additional metadata about the user. For that, you'll need to ingest some custom information about your new user using the Rest.li APIs or the [File-based ingestion source](./generated/ingestion/sources/file.md). For an example of a file that ingests user information, check out [single_mce.json](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/mce_files/single_mce.json), which ingests a single user object into DataHub. Notice that the "urn" field provided will need to align with the custom username you've provided in user.props file. For example, if your user.props file contains: @@ -204,4 +204,4 @@ You'll need to ingest some metadata of the following form to see it inside the D }, "proposedDelta": null } -``` \ No newline at end of file +``` diff --git a/docs/how/ui-tabs-guide.md b/docs/how/ui-tabs-guide.md index 74ece6892d189..6a82a36cd813c 100644 --- a/docs/how/ui-tabs-guide.md +++ b/docs/how/ui-tabs-guide.md @@ -5,7 +5,7 @@ Some of the tabs in the UI might not be enabled by default. This guide is suppos ## Datasets ### Stats and Queries Tab -To enable these tabs you need to use one of the usage sources which gets the relevant metadata from your sources and ingests them into DataHub. These usage sources are listed under other sources which support them e.g. [Snowflake source](../../metadata-ingestion/source_docs/snowflake.md), [BigQuery source](../../metadata-ingestion/source_docs/bigquery.md) +To enable these tabs you need to use one of the usage sources which gets the relevant metadata from your sources and ingests them into DataHub. These usage sources are listed under other sources which support them e.g. [Snowflake source](../../docs/generated/ingestion/sources/snowflake.md), [BigQuery source](../../docs/generated/ingestion/sources/bigquery.md) ### Validation Tab diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 4f89707db27c3..4629b5b47b4de 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -20,7 +20,7 @@ Before running any metadata ingestion job, you should make sure that DataHub bac ### Sources -Data systems that we are extracting metadata from are referred to as **Sources**. The `Sources` tab on the left in the sidebar shows you all the sources that are available for you to ingest metadata from. For example, we have sources for [BigQuery](./source_docs/bigquery.md), [Looker](./source_docs/looker.md), [Tableau](./source_docs/tableau.md) and many others. +Data systems that we are extracting metadata from are referred to as **Sources**. 
The `Sources` tab on the left in the sidebar shows you all the sources that are available for you to ingest metadata from. For example, we have sources for [BigQuery](../docs/generated/ingestion/sources/bigquery.md), [Looker](../docs/generated/ingestion/sources/looker.md), [Tableau](../docs/generated/ingestion/sources/tableau.md) and many others. #### Metadata Ingestion Source Status diff --git a/metadata-ingestion/adding-source.md b/metadata-ingestion/adding-source.md index dd696ddfa78ae..2d4a4e4e16a54 100644 --- a/metadata-ingestion/adding-source.md +++ b/metadata-ingestion/adding-source.md @@ -22,6 +22,35 @@ your local environment. We use [pydantic](https://pydantic-docs.helpmanual.io/) for configuration, and all models must inherit from `ConfigModel`. The [file source](./src/datahub/ingestion/source/file.py) is a good example. +#### Documentation for Configuration Classes + +We use [pydantic](https://pydantic-docs.helpmanual.io) conventions for documenting configuration flags. Use the `description` attribute to write rich documentation for your configuration field. + +For example, the following code: +```python +from pydantic import Field +from datahub.api.configuration.common import ConfigModel + +class LookerAPIConfig(ConfigModel): + client_id: str = Field(description="Looker API client id.") + client_secret: str = Field(description="Looker API client secret.") + base_url: str = Field( + description="Url to your Looker instance: `https://company.looker.com:19999` or `https://looker.company.com`, or similar. Used for making API calls to Looker and constructing clickable dashboard and chart urls." + ) + transport_options: Optional[TransportOptionsConfig] = Field( + default=None, + description="Populates the [TransportOptions](https://github.com/looker-open-source/sdk-codegen/blob/94d6047a0d52912ac082eb91616c1e7c379ab262/python/looker_sdk/rtl/transport.py#L70) struct for looker client", + ) +``` + +generates the following documentation: +![Generated Config Documentation](./docs/images/generated_config_docs.png) + +:::note +Inline markdown or code snippets are not yet supported for field level documentation. +::: + + ### 2. Set up the reporter The reporter interface enables the source to report statistics, warnings, failures, and other information about the run. @@ -29,7 +58,7 @@ Some sources use the default `SourceReport` class, but others inherit and extend ### 3. Implement the source itself -The core for the source is the `get_workunits` method, which produces a stream of MCE objects. +The core for the source is the `get_workunits` method, which produces a stream of metadata events (typically MCP objects) wrapped up in a MetadataWorkUnit. The [file source](./src/datahub/ingestion/source/file.py) is a good and simple example. The MetadataChangeEventClass is defined in the metadata models which are generated @@ -51,16 +80,156 @@ Tests go in the `tests` directory. We use the [pytest framework](https://pytest. ### 7. Write docs -Create a copy of [`source-docs-template.md`](./source-docs-template.md) and edit all relevant components. +#### 7.1 Set up the source class for automatic documentation + +- Indicate the platform name that this source class produces metadata for using the `@platform_name` decorator. We prefer using the human-readable platform name, so e.g. BigQuery (not bigquery). +- Indicate the config class being used by the source by using the `@config_class` decorator. +- Indicate the support status of the connector by using the `@support_status` decorator. 
+- Indicate what capabilities the connector supports (and what important capabilities it does NOT support) by using the `@capability` decorator.
+- Add rich documentation for the connector by utilizing docstrings on your Python class. Markdown is supported.
+
+See below a simple example of how to do this for any source.
+
+```python
+
+from datahub.ingestion.api.decorators import (
+    SourceCapability,
+    SupportStatus,
+    capability,
+    config_class,
+    platform_name,
+    support_status,
+)
+
+@platform_name("File")
+@support_status(SupportStatus.CERTIFIED)
+@config_class(FileSourceConfig)
+@capability(
+    SourceCapability.PLATFORM_INSTANCE,
+    "File based ingestion does not support platform instances",
+    supported=False,
+)
+@capability(SourceCapability.DOMAINS, "Enabled by default")
+@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
+@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+class FileSource(Source):
+    """
+
+    The File Source can be used to produce all kinds of metadata from a generic metadata events file.
+    :::note
+    Events in this file can be in MCE form or MCP form.
+    :::
+
+    """
+
+    ... source code goes here
+
+```
+
+
+#### 7.2 Write custom documentation
+
+- Create a copy of [`source-docs-template.md`](./source-docs-template.md) and edit all relevant components.
+- Name the document as `<plugin_name>.md` and move it to `metadata-ingestion/docs/sources/<platform>/<plugin_name>.md`. For example, for the Kafka platform, under the `kafka` plugin, move the document to `metadata-ingestion/docs/sources/kafka/kafka.md`.
+- Add a quickstart recipe corresponding to the plugin under `metadata-ingestion/docs/sources/<platform>/<plugin_name>_recipe.yml`. For example, for the Kafka platform, under the `kafka` plugin, there is a quickstart recipe located at `metadata-ingestion/docs/sources/kafka/kafka_recipe.yml`.
+- To write platform-specific documentation (that is cross-plugin), write the documentation under `metadata-ingestion/docs/sources/<platform>/README.md`. For example, cross-plugin documentation for the BigQuery platform is located under `metadata-ingestion/docs/sources/bigquery/README.md`.
+
+#### 7.3 Viewing the Documentation
+
+Documentation for the source can be viewed by running the documentation generator from the `docs-website` module.
+
+##### Step 1: Build the Ingestion docs
+```console
+# From the root of DataHub repo
+./gradlew :metadata-ingestion:docGen
+```
+
+If this finishes successfully, you will see output messages like:
+```console
+Ingestion Documentation Generation Complete
+############################################
+{
+  "source_platforms": {
+    "discovered": 40,
+    "generated": 40
+  },
+  "plugins": {
+    "discovered": 47,
+    "generated": 47,
+    "failed": 0
+  }
+}
+############################################
+```
+
+You can also find documentation files generated at `./docs/generated/ingestion/sources` relative to the root of the DataHub repo. You should be able to locate your specific source's markdown file here and investigate it to make sure things look as expected.
+
+##### Step 2: Build the Entire Documentation
+To view how this documentation looks in the browser, there is one more step. Just build the entire docusaurus page from the `docs-website` module.
+
+```console
+# From the root of DataHub repo
+./gradlew :docs-website:build
+```
+
+This will generate messages like:
+```console
+...
+> Task :docs-website:yarnGenerate +yarn run v1.22.0 +$ rm -rf genDocs/* && ts-node -O '{ "lib": ["es2020"], "target": "es6" }' generateDocsDir.ts && mv -v docs/* genDocs/ +Including untracked files in docs list: +docs/graphql -> genDocs/graphql +Done in 2.47s. + +> Task :docs-website:yarnBuild +yarn run v1.22.0 +$ docusaurus build + +╭──────────────────────────────────────────────────────────────────────────────╮│ ││ Update available 2.0.0-beta.8 → 2.0.0-beta.18 ││ ││ To upgrade Docusaurus packages with the latest version, run the ││ following command: ││ yarn upgrade @docusaurus/core@latest ││ @docusaurus/plugin-ideal-image@latest @docusaurus/preset-classic@latest ││ │╰──────────────────────────────────────────────────────────────────────────────╯ + + +[en] Creating an optimized production build... +Invalid docusaurus-plugin-ideal-image version 2.0.0-beta.7. +All official @docusaurus/* packages should have the exact same version as @docusaurus/core (2.0.0-beta.8). +Maybe you want to check, or regenerate your yarn.lock or package-lock.json file? +Browserslist: caniuse-lite is outdated. Please run: + npx browserslist@latest --update-db + Why you should do it regularly: https://github.com/browserslist/browserslist#browsers-data-updating +ℹ Compiling Client +ℹ Compiling Server +✔ Client: Compiled successfully in 1.95s +✔ Server: Compiled successfully in 7.52s +Success! Generated static files in "build". + +Use `npm run serve` command to test your build locally. + +Done in 11.59s. + +Deprecated Gradle features were used in this build, making it incompatible with Gradle 7.0. +Use '--warning-mode all' to show the individual deprecation warnings. +See https://docs.gradle.org/6.9.2/userguide/command_line_interface.html#sec:command_line_warnings + +BUILD SUCCESSFUL in 35s +36 actionable tasks: 16 executed, 20 up-to-date +``` + +After this you need to run the following script from the `docs-website` module. +```console +cd docs-website +npm run serve +``` + +Now, browse to http://localhost:3000 or whichever port npm is running on, to browse the docs. +Your source should show up on the left sidebar under `Metadata Ingestion / Sources`. -Add the plugin to the table under [CLI Sources List](../docs/cli.md#sources), and add the source's documentation underneath the [sources folder](https://github.com/datahub-project/datahub/tree/master/metadata-ingestion/source_docs). ### 8. Add SQL Alchemy mapping (if applicable) Add the source in `get_platform_from_sqlalchemy_uri` function in [sql_common.py](./src/datahub/ingestion/source/sql/sql_common.py) if the source has an sqlalchemy source -### 9. Add logo +### 9. 
Add logo for the platform -Add logo image in [images folder](../datahub-web-react/src/images) and add it to be ingested -in [boot](../metadata-service/war/src/main/resources/boot/data_platforms.json) +Add the logo image in [images folder](../datahub-web-react/src/images) and add it to be ingested at [startup](../metadata-service/war/src/main/resources/boot/data_platforms.json) diff --git a/metadata-ingestion/source_docs/athena.md b/metadata-ingestion/archived/source_docs/athena.md similarity index 100% rename from metadata-ingestion/source_docs/athena.md rename to metadata-ingestion/archived/source_docs/athena.md diff --git a/metadata-ingestion/source_docs/azure-ad.md b/metadata-ingestion/archived/source_docs/azure-ad.md similarity index 100% rename from metadata-ingestion/source_docs/azure-ad.md rename to metadata-ingestion/archived/source_docs/azure-ad.md diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/archived/source_docs/bigquery.md similarity index 100% rename from metadata-ingestion/source_docs/bigquery.md rename to metadata-ingestion/archived/source_docs/bigquery.md diff --git a/metadata-ingestion/source_docs/business_glossary.md b/metadata-ingestion/archived/source_docs/business_glossary.md similarity index 100% rename from metadata-ingestion/source_docs/business_glossary.md rename to metadata-ingestion/archived/source_docs/business_glossary.md diff --git a/metadata-ingestion/source_docs/clickhouse.md b/metadata-ingestion/archived/source_docs/clickhouse.md similarity index 100% rename from metadata-ingestion/source_docs/clickhouse.md rename to metadata-ingestion/archived/source_docs/clickhouse.md diff --git a/metadata-ingestion/source_docs/data_lake.md b/metadata-ingestion/archived/source_docs/data_lake.md similarity index 100% rename from metadata-ingestion/source_docs/data_lake.md rename to metadata-ingestion/archived/source_docs/data_lake.md diff --git a/metadata-ingestion/source_docs/dbt.md b/metadata-ingestion/archived/source_docs/dbt.md similarity index 100% rename from metadata-ingestion/source_docs/dbt.md rename to metadata-ingestion/archived/source_docs/dbt.md diff --git a/metadata-ingestion/source_docs/druid.md b/metadata-ingestion/archived/source_docs/druid.md similarity index 100% rename from metadata-ingestion/source_docs/druid.md rename to metadata-ingestion/archived/source_docs/druid.md diff --git a/metadata-ingestion/source_docs/elastic_search.md b/metadata-ingestion/archived/source_docs/elastic_search.md similarity index 100% rename from metadata-ingestion/source_docs/elastic_search.md rename to metadata-ingestion/archived/source_docs/elastic_search.md diff --git a/metadata-ingestion/source_docs/feast.md b/metadata-ingestion/archived/source_docs/feast.md similarity index 100% rename from metadata-ingestion/source_docs/feast.md rename to metadata-ingestion/archived/source_docs/feast.md diff --git a/metadata-ingestion/source_docs/feast_legacy.md b/metadata-ingestion/archived/source_docs/feast_legacy.md similarity index 100% rename from metadata-ingestion/source_docs/feast_legacy.md rename to metadata-ingestion/archived/source_docs/feast_legacy.md diff --git a/metadata-ingestion/source_docs/file.md b/metadata-ingestion/archived/source_docs/file.md similarity index 100% rename from metadata-ingestion/source_docs/file.md rename to metadata-ingestion/archived/source_docs/file.md diff --git a/metadata-ingestion/source_docs/file_lineage.md b/metadata-ingestion/archived/source_docs/file_lineage.md similarity index 100% rename from 
metadata-ingestion/source_docs/file_lineage.md rename to metadata-ingestion/archived/source_docs/file_lineage.md diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/archived/source_docs/glue.md similarity index 100% rename from metadata-ingestion/source_docs/glue.md rename to metadata-ingestion/archived/source_docs/glue.md diff --git a/metadata-ingestion/source_docs/hive.md b/metadata-ingestion/archived/source_docs/hive.md similarity index 100% rename from metadata-ingestion/source_docs/hive.md rename to metadata-ingestion/archived/source_docs/hive.md diff --git a/metadata-ingestion/source_docs/images/azure_ad_api_permissions.png b/metadata-ingestion/archived/source_docs/images/azure_ad_api_permissions.png similarity index 100% rename from metadata-ingestion/source_docs/images/azure_ad_api_permissions.png rename to metadata-ingestion/archived/source_docs/images/azure_ad_api_permissions.png diff --git a/metadata-ingestion/source_docs/images/azure_ad_endpoints.png b/metadata-ingestion/archived/source_docs/images/azure_ad_endpoints.png similarity index 100% rename from metadata-ingestion/source_docs/images/azure_ad_endpoints.png rename to metadata-ingestion/archived/source_docs/images/azure_ad_endpoints.png diff --git a/metadata-ingestion/source_docs/images/looker_datahub_permission_set.png b/metadata-ingestion/archived/source_docs/images/looker_datahub_permission_set.png similarity index 100% rename from metadata-ingestion/source_docs/images/looker_datahub_permission_set.png rename to metadata-ingestion/archived/source_docs/images/looker_datahub_permission_set.png diff --git a/metadata-ingestion/source_docs/images/stale_metadata_deletion.png b/metadata-ingestion/archived/source_docs/images/stale_metadata_deletion.png similarity index 100% rename from metadata-ingestion/source_docs/images/stale_metadata_deletion.png rename to metadata-ingestion/archived/source_docs/images/stale_metadata_deletion.png diff --git a/metadata-ingestion/source_docs/kafka-connect.md b/metadata-ingestion/archived/source_docs/kafka-connect.md similarity index 100% rename from metadata-ingestion/source_docs/kafka-connect.md rename to metadata-ingestion/archived/source_docs/kafka-connect.md diff --git a/metadata-ingestion/source_docs/kafka.md b/metadata-ingestion/archived/source_docs/kafka.md similarity index 100% rename from metadata-ingestion/source_docs/kafka.md rename to metadata-ingestion/archived/source_docs/kafka.md diff --git a/metadata-ingestion/source_docs/ldap.md b/metadata-ingestion/archived/source_docs/ldap.md similarity index 100% rename from metadata-ingestion/source_docs/ldap.md rename to metadata-ingestion/archived/source_docs/ldap.md diff --git a/metadata-ingestion/source_docs/looker.md b/metadata-ingestion/archived/source_docs/looker.md similarity index 100% rename from metadata-ingestion/source_docs/looker.md rename to metadata-ingestion/archived/source_docs/looker.md diff --git a/metadata-ingestion/source_docs/lookml.md b/metadata-ingestion/archived/source_docs/lookml.md similarity index 100% rename from metadata-ingestion/source_docs/lookml.md rename to metadata-ingestion/archived/source_docs/lookml.md diff --git a/metadata-ingestion/source_docs/mariadb.md b/metadata-ingestion/archived/source_docs/mariadb.md similarity index 100% rename from metadata-ingestion/source_docs/mariadb.md rename to metadata-ingestion/archived/source_docs/mariadb.md diff --git a/metadata-ingestion/source_docs/metabase.md b/metadata-ingestion/archived/source_docs/metabase.md similarity index 100% 
rename from metadata-ingestion/source_docs/metabase.md rename to metadata-ingestion/archived/source_docs/metabase.md diff --git a/metadata-ingestion/source_docs/mode.md b/metadata-ingestion/archived/source_docs/mode.md similarity index 100% rename from metadata-ingestion/source_docs/mode.md rename to metadata-ingestion/archived/source_docs/mode.md diff --git a/metadata-ingestion/source_docs/mongodb.md b/metadata-ingestion/archived/source_docs/mongodb.md similarity index 100% rename from metadata-ingestion/source_docs/mongodb.md rename to metadata-ingestion/archived/source_docs/mongodb.md diff --git a/metadata-ingestion/source_docs/mssql.md b/metadata-ingestion/archived/source_docs/mssql.md similarity index 100% rename from metadata-ingestion/source_docs/mssql.md rename to metadata-ingestion/archived/source_docs/mssql.md diff --git a/metadata-ingestion/source_docs/mysql.md b/metadata-ingestion/archived/source_docs/mysql.md similarity index 100% rename from metadata-ingestion/source_docs/mysql.md rename to metadata-ingestion/archived/source_docs/mysql.md diff --git a/metadata-ingestion/source_docs/nifi.md b/metadata-ingestion/archived/source_docs/nifi.md similarity index 100% rename from metadata-ingestion/source_docs/nifi.md rename to metadata-ingestion/archived/source_docs/nifi.md diff --git a/metadata-ingestion/source_docs/okta.md b/metadata-ingestion/archived/source_docs/okta.md similarity index 100% rename from metadata-ingestion/source_docs/okta.md rename to metadata-ingestion/archived/source_docs/okta.md diff --git a/metadata-ingestion/source_docs/openapi.md b/metadata-ingestion/archived/source_docs/openapi.md similarity index 100% rename from metadata-ingestion/source_docs/openapi.md rename to metadata-ingestion/archived/source_docs/openapi.md diff --git a/metadata-ingestion/source_docs/oracle.md b/metadata-ingestion/archived/source_docs/oracle.md similarity index 100% rename from metadata-ingestion/source_docs/oracle.md rename to metadata-ingestion/archived/source_docs/oracle.md diff --git a/metadata-ingestion/source_docs/postgres.md b/metadata-ingestion/archived/source_docs/postgres.md similarity index 100% rename from metadata-ingestion/source_docs/postgres.md rename to metadata-ingestion/archived/source_docs/postgres.md diff --git a/metadata-ingestion/source_docs/powerbi.md b/metadata-ingestion/archived/source_docs/powerbi.md similarity index 100% rename from metadata-ingestion/source_docs/powerbi.md rename to metadata-ingestion/archived/source_docs/powerbi.md diff --git a/metadata-ingestion/source_docs/presto_on_hive.md b/metadata-ingestion/archived/source_docs/presto_on_hive.md similarity index 100% rename from metadata-ingestion/source_docs/presto_on_hive.md rename to metadata-ingestion/archived/source_docs/presto_on_hive.md diff --git a/metadata-ingestion/source_docs/redash.md b/metadata-ingestion/archived/source_docs/redash.md similarity index 100% rename from metadata-ingestion/source_docs/redash.md rename to metadata-ingestion/archived/source_docs/redash.md diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/archived/source_docs/redshift.md similarity index 100% rename from metadata-ingestion/source_docs/redshift.md rename to metadata-ingestion/archived/source_docs/redshift.md diff --git a/metadata-ingestion/source_docs/s3.md b/metadata-ingestion/archived/source_docs/s3.md similarity index 100% rename from metadata-ingestion/source_docs/s3.md rename to metadata-ingestion/archived/source_docs/s3.md diff --git 
a/metadata-ingestion/source_docs/s3_data_lake.md b/metadata-ingestion/archived/source_docs/s3_data_lake.md similarity index 100% rename from metadata-ingestion/source_docs/s3_data_lake.md rename to metadata-ingestion/archived/source_docs/s3_data_lake.md diff --git a/metadata-ingestion/source_docs/sagemaker.md b/metadata-ingestion/archived/source_docs/sagemaker.md similarity index 100% rename from metadata-ingestion/source_docs/sagemaker.md rename to metadata-ingestion/archived/source_docs/sagemaker.md diff --git a/metadata-ingestion/source_docs/snowflake.md b/metadata-ingestion/archived/source_docs/snowflake.md similarity index 100% rename from metadata-ingestion/source_docs/snowflake.md rename to metadata-ingestion/archived/source_docs/snowflake.md diff --git a/metadata-ingestion/source_docs/sql_profiles.md b/metadata-ingestion/archived/source_docs/sql_profiles.md similarity index 100% rename from metadata-ingestion/source_docs/sql_profiles.md rename to metadata-ingestion/archived/source_docs/sql_profiles.md diff --git a/metadata-ingestion/source_docs/sqlalchemy.md b/metadata-ingestion/archived/source_docs/sqlalchemy.md similarity index 100% rename from metadata-ingestion/source_docs/sqlalchemy.md rename to metadata-ingestion/archived/source_docs/sqlalchemy.md diff --git a/metadata-ingestion/source_docs/stateful_ingestion.md b/metadata-ingestion/archived/source_docs/stateful_ingestion.md similarity index 100% rename from metadata-ingestion/source_docs/stateful_ingestion.md rename to metadata-ingestion/archived/source_docs/stateful_ingestion.md diff --git a/metadata-ingestion/source_docs/superset.md b/metadata-ingestion/archived/source_docs/superset.md similarity index 100% rename from metadata-ingestion/source_docs/superset.md rename to metadata-ingestion/archived/source_docs/superset.md diff --git a/metadata-ingestion/source_docs/tableau.md b/metadata-ingestion/archived/source_docs/tableau.md similarity index 100% rename from metadata-ingestion/source_docs/tableau.md rename to metadata-ingestion/archived/source_docs/tableau.md diff --git a/metadata-ingestion/source_docs/trino.md b/metadata-ingestion/archived/source_docs/trino.md similarity index 100% rename from metadata-ingestion/source_docs/trino.md rename to metadata-ingestion/archived/source_docs/trino.md diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index fa356d134f7bd..95a8b2a58e8df 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -43,6 +43,7 @@ task installDev(type: Exec, dependsOn: [install]) { "${venv_name}/bin/pip install -e .[dev] && touch ${venv_name}/.build_install_dev_sentinel" } + task modelDocGen(type: Exec, dependsOn: [codegen]) { inputs.files( file('scripts/modeldocgen.py'), @@ -50,7 +51,7 @@ task modelDocGen(type: Exec, dependsOn: [codegen]) { project.fileTree(dir: "examples/", include: "**/*.py"), project.fileTree(dir: "../metadata-events/mxe-schemas/src/", include: "**/*.avsc") ) - outputs.dir('../docs/generated') + outputs.dir('../docs/generated/metamodel') commandLine 'bash', '-c', "source ${venv_name}/bin/activate && ./scripts/modeldocgen.sh" } @@ -118,6 +119,12 @@ task testSlowIntegration(type: Exec, dependsOn: [testQuick, installDevTest]) { "source ${venv_name}/bin/activate && pytest -m 'slow_integration' -vv --continue-on-collection-errors --junit-xml=junit.full.xml" } +task docGen(type: Exec, dependsOn: [codegen, installDevTest]) { + commandLine 'bash', '-c', "source ${venv_name}/bin/activate && ./scripts/docgen.sh" +} + + + task 
cleanPythonCache(type: Exec) { commandLine 'bash', '-x', '-c', "find src -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -o -type d -empty -delete" diff --git a/metadata-ingestion/source_docs/reporting_telemetry.md b/metadata-ingestion/docs/dev_guides/reporting_telemetry.md similarity index 100% rename from metadata-ingestion/source_docs/reporting_telemetry.md rename to metadata-ingestion/docs/dev_guides/reporting_telemetry.md diff --git a/metadata-ingestion/docs/dev_guides/sql_profiles.md b/metadata-ingestion/docs/dev_guides/sql_profiles.md new file mode 100644 index 0000000000000..c0fde8f6fce53 --- /dev/null +++ b/metadata-ingestion/docs/dev_guides/sql_profiles.md @@ -0,0 +1,33 @@ +# SQL Profiling + +SQL Profiling collects table level and column level statistics. +The SQL-based profiler does not run alone, but rather can be enabled for other SQL-based sources. +Enabling profiling will slow down ingestion runs. + +:::caution + +Running profiling against many tables or over many rows can run up significant costs. +While we've done our best to limit the expensiveness of the queries the profiler runs, you +should be prudent about the set of tables profiling is enabled on or the frequency +of the profiling runs. + +::: + +## Capabilities + +Extracts: + +- Row and column counts for each table +- For each column, if applicable: + - null counts and proportions + - distinct counts and proportions + - minimum, maximum, mean, median, standard deviation, some quantile values + - histograms or frequencies of unique values + +## Supported Sources + +SQL profiling is supported for all SQL sources. Check the individual source page to verify if it supports profiling. + +## Questions + +If you've got any questions on configuring profiling, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/docs/dev_guides/stale_metadata_deletion.png b/metadata-ingestion/docs/dev_guides/stale_metadata_deletion.png new file mode 100644 index 0000000000000..1fa8539229238 Binary files /dev/null and b/metadata-ingestion/docs/dev_guides/stale_metadata_deletion.png differ diff --git a/metadata-ingestion/docs/dev_guides/stateful.md b/metadata-ingestion/docs/dev_guides/stateful.md new file mode 100644 index 0000000000000..a7c6961476e21 --- /dev/null +++ b/metadata-ingestion/docs/dev_guides/stateful.md @@ -0,0 +1,154 @@ +# Stateful Ingestion +The stateful ingestion feature enables sources to be configured to save custom checkpoint states from their +runs, and query these states back from subsequent runs to make decisions about the current run based on the state saved +from the previous run(s) using a supported ingestion state provider. This is an explicit opt-in feature and is not enabled +by default. + +**_NOTE_**: This feature requires the server to be `statefulIngestion` capable. This is a feature of metadata service with version >= `0.8.20`. + +To check if you are running a stateful ingestion capable server: +```console +curl http:///config + +{ +models: { }, +statefulIngestionCapable: true, # <-- this should be present and true +retention: "true", +noCode: "true" +} +``` + +## Config details + +Note that a `.` is used to denote nested fields in the YAML recipe. 
+
+| Field | Required | Default | Description |
+|---|---|---|---|
+| `source.config.stateful_ingestion.enabled` | | False | Indicates whether or not stateful ingestion is enabled for this source. |
+| `source.config.stateful_ingestion.ignore_old_state` | | False | If set to True, ignores the previous checkpoint state. |
+| `source.config.stateful_ingestion.ignore_new_state` | | False | If set to True, ignores the current checkpoint state. |
+| `source.config.stateful_ingestion.max_checkpoint_state_size` | | 2^24 (16MB) | The maximum size of the checkpoint state in bytes. |
+| `source.config.stateful_ingestion.state_provider` | | The default [datahub ingestion state provider](#datahub-ingestion-state-provider) configuration. | The ingestion state provider configuration. |
+| `pipeline_name` | ✅ | | The name of the ingestion pipeline the checkpoint states of various source connector job runs are saved/retrieved against via the ingestion state provider. |
+
+NOTE: If either `dry-run` or `preview` mode is set, stateful ingestion will be turned off regardless of the rest of the configuration.
+## Use-cases powered by stateful ingestion
+Following is the list of current use-cases powered by stateful ingestion in datahub.
+### Removal of stale tables and views
+Stateful ingestion can be used to automatically soft-delete the tables and views that are seen in a previous run
+but absent in the current run (they are either deleted or no longer desired).
+
+![Stale Metadata Deletion](./stale_metadata_deletion.png)
+
+#### Supported sources
+* All SQL-based sources.
+#### Additional config details
+
+Note that a `.` is used to denote nested fields in the YAML recipe.
+
+| Field | Required | Default | Description |
+|---|---|---|---|
+| `stateful_ingestion.remove_stale_metadata` | | True | Soft-deletes the tables and views that were found in the last successful run but missing in the current run with stateful_ingestion enabled. |
+#### Sample configuration
+```yaml
+source:
+  type: "snowflake"
+  config:
+    username:
+    password:
+    host_port:
+    warehouse:
+    role:
+    include_tables: True
+    include_views: True
+    # Rest of the source specific params ...
+    ## Stateful Ingestion config ##
+    stateful_ingestion:
+      enabled: True # False by default
+      remove_stale_metadata: True # default value
+      ## Default state_provider configuration ##
+      # state_provider:
+      #   type: "datahub" # default value
+      #   # This section is needed if the pipeline-level `datahub_api` is not configured.
+      #   config: # default value
+      #     datahub_api:
+      #       server: "http://localhost:8080"
+
+# The pipeline_name is mandatory for stateful ingestion and the state is tied to this.
+# If this is changed after using with stateful ingestion, the previous state will not be available to the next run.
+pipeline_name: "my_snowflake_pipeline_1"
+
+# Pipeline-level datahub_api configuration.
+datahub_api: # Optional. But if provided, this config will be used by the "datahub" ingestion state provider.
+ server: "http://localhost:8080" + +sink: + type: "datahub-rest" + config: + server: 'http://localhost:8080' +``` + +### Prevent redundant reruns for usage source. +Typically, the usage runs are configured to fetch the usage data for the previous day(or hour) for each run. Once a usage +run has finished, subsequent runs until the following day would be fetching the same usage data. With stateful ingestion, +the redundant fetches can be avoided even if the ingestion job is scheduled to run more frequently than the granularity of +usage ingestion. +#### Supported sources +* Snowflake Usage source. +#### Additional config details + +Note that a `.` is used to denote nested fields in the YAML recipe. + +| Field | Required | Default | Description | +|----------------------------------| -------- |---------|-------------------------------------------------------------------------------------------------------------------------------------------| +| `stateful_ingestion.force_rerun` | | False | Custom-alias for `stateful_ingestion.ignore_old_state`. Prevents a rerun for the same time window if there was a previous successful run. | +#### Sample Configuration +```yaml +source: + type: "snowflake-usage" + config: + username: + password: + role: + host_port: + warehouse: + # Rest of the source specific params ... + ## Stateful Ingestion config ## + stateful_ingestion: + enabled: True # default is false + force_rerun: False # Specific to this source(alias for ignore_old_state), used to override default behavior if True. + +# The pipeline_name is mandatory for stateful ingestion and the state is tied to this. +# If this is changed after using with stateful ingestion, the previous state will not be available to the next run. +pipeline_name: "my_snowflake_usage_ingestion_pipeline_1" +sink: + type: "datahub-rest" + config: + server: 'http://localhost:8080' +``` + +## The Checkpointing Ingestion State Provider (Developer Guide) +The ingestion checkpointing state provider is responsible for saving and retrieving the ingestion checkpoint state associated with the ingestion runs +of various jobs inside the source connector of the ingestion pipeline. The checkpointing data model is [DatahubIngestionCheckpoint](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/datajob/datahub/DatahubIngestionCheckpoint.pdl) and it supports any custom state to be stored using the [IngestionCheckpointState](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/datajob/datahub/IngestionCheckpointState.pdl#L9). A checkpointing ingestion state provider needs to implement the +[IngestionCheckpointingProviderBase](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py) interface and +register itself with datahub by adding an entry under `datahub.ingestion.checkpointing_provider.plugins` key of the entry_points section in [setup.py](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/setup.py) with its type and implementation class as shown below. +```python +entry_points = { + # " + "datahub.ingestion.checkpointing_provider.plugins": [ + "datahub = datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider:DatahubIngestionCheckpointingProvider", + ], +} +``` + +### Datahub Checkpointing Ingestion State Provider +This is the state provider implementation that is available out of the box. 
Its type is `datahub` and it is implemented on top +of the `datahub_api` client and the timeseries aspect capabilities of the datahub-backend. +#### Config details + +Note that a `.` is used to denote nested fields in the YAML recipe. + +| Field | Required | Default | Description | +|----------------------------------------------------------| -------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------| +| `state_provider.type` | | `datahub` | The type of the ingestion state provider registered with datahub | +| `state_provider.config` | | The `datahub_api` config if set at pipeline level. Otherwise, the default `DatahubClientConfig`. See the [defaults](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/graph/client.py#L19) here. | The configuration required for initializing the state provider. | diff --git a/metadata-ingestion/docs/images/generated_config_docs.png b/metadata-ingestion/docs/images/generated_config_docs.png new file mode 100644 index 0000000000000..90e5144b292b2 Binary files /dev/null and b/metadata-ingestion/docs/images/generated_config_docs.png differ diff --git a/metadata-ingestion/docs/sources/azure-ad/README.md b/metadata-ingestion/docs/sources/azure-ad/README.md new file mode 100644 index 0000000000000..33afad3176100 --- /dev/null +++ b/metadata-ingestion/docs/sources/azure-ad/README.md @@ -0,0 +1,51 @@ +### Extracting DataHub Users + +#### Usernames + +Usernames serve as unique identifiers for users on DataHub. This connector extracts usernames using the +"userPrincipalName" field of an [Azure AD User Response](https://docs.microsoft.com/en-us/graph/api/user-list?view=graph-rest-1.0&tabs=http#response-1), +which is the unique identifier for your Azure AD users. + +If this is not how you wish to map to DataHub usernames, you can provide a custom mapping using the configurations options detailed below. Namely, `azure_ad_response_to_username_attr` +and `azure_ad_response_to_username_regex`. + +#### Responses + +This connector also extracts basic user response information from Azure. The following fields of the Azure User Response are extracted +and mapped to the DataHub `CorpUserInfo` aspect: + +- display name +- first name +- last name +- email +- title +- country + +### Extracting DataHub Groups + +#### Group Names + +Group names serve as unique identifiers for groups on DataHub. This connector extracts group names using the "name" attribute of an Azure Group Response. +By default, a URL-encoded version of the full group name is used as the unique identifier (CorpGroupKey) and the raw "name" attribute is mapped +as the display name that will appear in DataHub's UI. + +If this is not how you wish to map to DataHub group names, you can provide a custom mapping using the configurations options detailed below. Namely, `azure_ad_response_to_groupname_attr` +and `azure_ad_response_to_groupname_regex`. + +#### Responses + +This connector also extracts basic group information from Azure. 
The following fields of the [Azure AD Group Response](https://docs.microsoft.com/en-us/graph/api/group-list?view=graph-rest-1.0&tabs=http#response-1) are extracted and mapped to the +DataHub `CorpGroupInfo` aspect: + +- name +- description + +### Extracting Group Membership + +This connector additional extracts the edges between Users and Groups that are stored in [Azure AD](https://docs.microsoft.com/en-us/graph/api/group-list-members?view=graph-rest-1.0&tabs=http#response-1). It maps them to the `GroupMembership` aspect +associated with DataHub users (CorpUsers). Today this has the unfortunate side effect of **overwriting** any Group Membership information that +was created outside of the connector. That means if you've used the DataHub REST API to assign users to groups, this information will be overridden +when the Azure AD Source is executed. If you intend to *always* pull users, groups, and their relationships from your Identity Provider, then +this should not matter. + +This is a known limitation in our data model that is being tracked by [this ticket](https://github.com/datahub-project/datahub/issues/3065). \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/azure-ad/azure-ad.md b/metadata-ingestion/docs/sources/azure-ad/azure-ad.md new file mode 100644 index 0000000000000..8b375fbee4f33 --- /dev/null +++ b/metadata-ingestion/docs/sources/azure-ad/azure-ad.md @@ -0,0 +1,10 @@ +As a prerequisite, you should [create a DataHub Application](https://docs.microsoft.com/en-us/graph/toolkit/get-started/add-aad-app-registration) within the Azure AD Portal with the permissions +to read your organization's Users and Groups. The following permissions are required, with the `Application` permission type: + +- `Group.Read.All` +- `GroupMember.Read.All` +- `User.Read.All` + +You can add a permission by navigating to the permissions tab in your DataHub application on the Azure AD portal. ![Azure AD API Permissions](./azure_ad_api_permissions.png) + +You can view the necessary endpoints to configure by clicking on the Endpoints button in the Overview tab. 
![Azure AD Endpoints](./azure_ad_endpoints.png) diff --git a/metadata-ingestion/docs/sources/azure-ad/azure-ad_recipe.yml b/metadata-ingestion/docs/sources/azure-ad/azure-ad_recipe.yml new file mode 100644 index 0000000000000..7a717af4258ee --- /dev/null +++ b/metadata-ingestion/docs/sources/azure-ad/azure-ad_recipe.yml @@ -0,0 +1,21 @@ +source: + type: "azure-ad" + config: + client_id: "00000000-0000-0000-0000-000000000000" + tenant_id: "00000000-0000-0000-0000-000000000000" + client_secret: "xxxxx" + redirect: "https://login.microsoftonline.com/common/oauth2/nativeclient" + authority: "https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000" + token_url: "https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000/oauth2/token" + graph_url: "https://graph.microsoft.com/v1.0" + ingest_users: True + ingest_groups: True + groups_pattern: + allow: + - ".*" + users_pattern: + allow: + - ".*" + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/azure-ad/azure_ad_api_permissions.png b/metadata-ingestion/docs/sources/azure-ad/azure_ad_api_permissions.png new file mode 100644 index 0000000000000..f67bd0a0c20e1 Binary files /dev/null and b/metadata-ingestion/docs/sources/azure-ad/azure_ad_api_permissions.png differ diff --git a/metadata-ingestion/docs/sources/azure-ad/azure_ad_endpoints.png b/metadata-ingestion/docs/sources/azure-ad/azure_ad_endpoints.png new file mode 100644 index 0000000000000..5ec19c48965a4 Binary files /dev/null and b/metadata-ingestion/docs/sources/azure-ad/azure_ad_endpoints.png differ diff --git a/metadata-ingestion/docs/sources/bigquery/README.md b/metadata-ingestion/docs/sources/bigquery/README.md new file mode 100644 index 0000000000000..80ffdfc947918 --- /dev/null +++ b/metadata-ingestion/docs/sources/bigquery/README.md @@ -0,0 +1 @@ +To get all metadata from BigQuery you need to use two plugins `bigquery` and `bigquery-usage`. Both of them are described in this page. These will require 2 separate recipes. We understand this is not ideal and we plan to make this easier in the future. diff --git a/metadata-ingestion/docs/sources/bigquery/bigquery-usage.md b/metadata-ingestion/docs/sources/bigquery/bigquery-usage.md new file mode 100644 index 0000000000000..f5611e57b47e7 --- /dev/null +++ b/metadata-ingestion/docs/sources/bigquery/bigquery-usage.md @@ -0,0 +1,15 @@ +### Prerequisites + +The Google Identity must have one of the following OAuth scopes granted to it: + +- https://www.googleapis.com/auth/logging.read +- https://www.googleapis.com/auth/logging.admin +- https://www.googleapis.com/auth/cloud-platform.read-only +- https://www.googleapis.com/auth/cloud-platform + +And should be authorized on all projects you'd like to ingest usage stats from. + +### Compatibility + +The source was last most recently confirmed compatible with the [December 16, 2021](https://cloud.google.com/bigquery/docs/release-notes#December_16_2021) +release of BigQuery. 
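For reference, a minimal `bigquery-usage` recipe that satisfies the prerequisites above might look like the sketch below. It assumes the Google Identity's key is supplied out of band via Application Default Credentials (for example by exporting `GOOGLE_APPLICATION_CREDENTIALS` before running ingestion); the project ID is a placeholder.

```yaml
source:
  type: bigquery-usage
  config:
    projects:
      - my-project-id # placeholder; the identity above must be authorized on this project
    top_n_queries: 10
# Credentials are resolved via Application Default Credentials, e.g.
#   export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"

sink:
  # sink configs
```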
\ No newline at end of file diff --git a/metadata-ingestion/docs/sources/bigquery/bigquery-usage_recipe.yml b/metadata-ingestion/docs/sources/bigquery/bigquery-usage_recipe.yml new file mode 100644 index 0000000000000..b0d6e8c3b35da --- /dev/null +++ b/metadata-ingestion/docs/sources/bigquery/bigquery-usage_recipe.yml @@ -0,0 +1,21 @@ +source: + type: bigquery-usage + config: + # Coordinates + projects: + - project_id_1 + - project_id_2 + + # Options + top_n_queries: 10 + dataset_pattern: + allow: + - marketing_db + - sales_db + table_pattern: + deny: + - .*feedback.* + - .*salary.* + +sink: + # sink configs diff --git a/metadata-ingestion/docs/sources/bigquery/bigquery.md b/metadata-ingestion/docs/sources/bigquery/bigquery.md new file mode 100644 index 0000000000000..d8f192da7e94f --- /dev/null +++ b/metadata-ingestion/docs/sources/bigquery/bigquery.md @@ -0,0 +1,88 @@ +### Prerequisites +#### Create a datahub profile in GCP +1. Create a custom role for datahub as per [BigQuery docs](https://cloud.google.com/iam/docs/creating-custom-roles#creating_a_custom_role) +2. Grant the following permissions to this role: +``` + bigquery.datasets.get + bigquery.datasets.getIamPolicy + bigquery.jobs.create + bigquery.jobs.list + bigquery.jobs.listAll + bigquery.models.getMetadata + bigquery.models.list + bigquery.routines.get + bigquery.routines.list + bigquery.tables.create # Needs for profiling + bigquery.tables.get + bigquery.tables.getData # Needs for profiling + bigquery.tables.list + # needed for lineage generation via GCP logging + logging.logEntries.list + logging.privateLogEntries.list + resourcemanager.projects.get +``` +#### Create a service account + +1. Setup a ServiceAccount as per [BigQuery docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console) +and assign the previously created role to this service account. +2. Download a service account JSON keyfile. + Example credential file: +```json +{ + "type": "service_account", + "project_id": "project-id-1234567", + "private_key_id": "d0121d0000882411234e11166c6aaa23ed5d74e0", + "private_key": "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----", + "client_email": "test@suppproject-id-1234567.iam.gserviceaccount.com", + "client_id": "113545814931671546333", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test%suppproject-id-1234567.iam.gserviceaccount.com" +} +``` +3. To provide credentials to the source, you can either: + Set an environment variable: + $ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json" + + *or* + + Set credential config in your source based on the credential json file. For example: + +```yml + credential: + project_id: project-id-1234567 + private_key_id: "d0121d0000882411234e11166c6aaa23ed5d74e0" + private_key: "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n" + client_email: "test@suppproject-id-1234567.iam.gserviceaccount.com" + client_id: "123456678890" +``` + +### Lineage Computation Details + +When `use_exported_bigquery_audit_metadata` is set to `true`, lineage information will be computed using exported bigquery logs. 
For details on how to set up exported BigQuery audit logs, refer to the [docs](https://cloud.google.com/bigquery/docs/reference/auditlogs#defining_a_bigquery_log_sink_using_gcloud) on BigQuery audit logs. Note that only protoPayloads with "type.googleapis.com/google.cloud.audit.BigQueryAuditMetadata" are supported by the current ingestion version. The `bigquery_audit_metadata_datasets` parameter will be used only if `use_exported_bigquery_audit_metadata` is set to `true`. + +Note: the `bigquery_audit_metadata_datasets` parameter receives a list of datasets in the format $PROJECT.$DATASET. This way, queries from multiple projects can be used to compute lineage information. + +Note: Since the bigquery source also supports dataset-level lineage, the auth client will require additional permissions to access the Google audit logs. Refer to the permissions section in the bigquery-usage section below, which also accesses the audit logs. + +### Profiling Details + +Profiling supports normal, partitioned, and sharded tables, but for performance reasons we only profile the latest partition of partitioned tables and the latest shard of sharded tables. + +If the limit/offset parameters are set, or when profiling a partitioned or sharded table, Great Expectations (the profiling framework we use) needs to create temporary +views. By default these views are created in the same schema as the profiled table, but you can have them created in a predefined schema by setting the +`profiling.bigquery_temp_table_schema` property. +Temporary tables are removed after profiling. + +```yaml + profiling: + enabled: true + bigquery_temp_table_schema: my-project-id.my-schema-where-views-can-be-created +``` + +:::note + +For performance reasons, we only profile the latest partition of partitioned tables and the latest shard of sharded tables. +You can set the partition explicitly with the `partition.partition_datetime` property if you want (the partition will be applied to all partitioned tables). +::: diff --git a/metadata-ingestion/docs/sources/bigquery/bigquery_recipe.yml b/metadata-ingestion/docs/sources/bigquery/bigquery_recipe.yml new file mode 100644 index 0000000000000..174e2cb74796e --- /dev/null +++ b/metadata-ingestion/docs/sources/bigquery/bigquery_recipe.yml @@ -0,0 +1,21 @@ +source: + type: bigquery + config: + # Coordinates + project_id: my_project_id + + # `schema_pattern` for BQ Datasets + schema_pattern: + allow: + - finance_bq_dataset + + table_pattern: + deny: + # The exact name of the table is revenue_table_name + # The reason we have this `.*` at the beginning is because the current implementation of table_pattern is testing + # project_id.dataset_name.table_name + # We will improve this in the future + - .*revenue_table_name + +sink: + # sink configs diff --git a/metadata-ingestion/docs/sources/business-glossary/datahub-business-glossary.md b/metadata-ingestion/docs/sources/business-glossary/datahub-business-glossary.md new file mode 100644 index 0000000000000..2e3b78c76deda --- /dev/null +++ b/metadata-ingestion/docs/sources/business-glossary/datahub-business-glossary.md @@ -0,0 +1,43 @@ +### Business Glossary File Format + +The business glossary source file should be a `.yml` file with the following top-level keys: + +**Glossary**: the top-level keys of the business glossary file +- **version**: the version of the business glossary file format that the config conforms to. Currently the only version released is `1`. +- **source**: the source format of the terms.
Currently only supports `DataHub` +- **owners**: owners contains two nested fields + - **users**: (optional) a list of user ids + - **groups**: (optional) a list of group ids +- **url**: (optional) external url pointing to where the glossary is defined externally, if applicable. +- **nodes**: (optional) list of child **GlossaryNode** objects +- **terms**: (optional) list of child **GlossaryTerm** objects + + +**GlossaryNode**: a container of **GlossaryNode** and **GlossaryTerm** objects +- **name**: name of the node +- **description**: description of the node +- **owners**: (optional) owners contains two nested fields + - **users**: (optional) a list of user ids + - **groups**: (optional) a list of group ids +- **terms**: (optional) list of child **GlossaryTerm** objects +- **nodes**: (optional) list of child **GlossaryNode** objects + +**GlossaryTerm**: a term in your business glossary +- **name**: name of the term +- **description**: description of the term +- **owners**: (optional) owners contains two nested fields + - **users**: (optional) a list of user ids + - **groups**: (optional) a list of group ids +- **term_source**: One of `EXTERNAL` or `INTERNAL`. Whether the term is coming from an external glossary or one defined in your organization. +- **source_ref**: (optional) If external, what is the name of the source the glossary term is coming from? +- **source_url**: (optional) If external, what is the url of the source definition? +- **inherits**: (optional) List of **GlossaryTerm** that this term inherits from +- **contains**: (optional) List of **GlossaryTerm** that this term contains +- **custom_properties**: A map of key/value pairs of arbitrary custom properties + +You can also view an example business glossary file checked in [here](../examples/bootstrap_data/business_glossary.yml) + +## Compatibility + +Compatible with version 1 of business glossary format. +The source will be evolved as we publish newer versions of this format. 
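In addition to the checked-in example linked above, here is a minimal sketch of a glossary file that uses only the keys described above; the node, term, and owner names are illustrative placeholders.

```yaml
version: 1
source: DataHub
owners:
  users:
    - datahub # placeholder user id
nodes:
  - name: Classification # a GlossaryNode
    description: Terms used to classify data sensitivity
    terms:
      - name: PII # a GlossaryTerm
        description: Personally identifiable information
        term_source: INTERNAL
      - name: GDPR
        description: Data governed by the EU GDPR
        term_source: EXTERNAL
        source_ref: GDPR.eu # placeholder external source
        source_url: https://gdpr.eu/
```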
diff --git a/metadata-ingestion/docs/sources/business-glossary/datahub-business-glossary_recipe.yml b/metadata-ingestion/docs/sources/business-glossary/datahub-business-glossary_recipe.yml new file mode 100644 index 0000000000000..bfd96a554b16e --- /dev/null +++ b/metadata-ingestion/docs/sources/business-glossary/datahub-business-glossary_recipe.yml @@ -0,0 +1,8 @@ +source: + type: datahub-business-glossary + config: + # Coordinates + file: /path/to/business_glossary_yaml + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/clickhouse/clickhouse-usage_recipe.yml b/metadata-ingestion/docs/sources/clickhouse/clickhouse-usage_recipe.yml new file mode 100644 index 0000000000000..444ea83bf9fcc --- /dev/null +++ b/metadata-ingestion/docs/sources/clickhouse/clickhouse-usage_recipe.yml @@ -0,0 +1,14 @@ +source: + type: clickhouse-usage + config: + # Coordinates + host_port: db_host:port + platform_instance: dev_cluster + email_domain: acryl.io + + # Credentials + username: username + password: "password" + +sink: +# sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/clickhouse/clickhouse_recipe.yml b/metadata-ingestion/docs/sources/clickhouse/clickhouse_recipe.yml new file mode 100644 index 0000000000000..838902205ea85 --- /dev/null +++ b/metadata-ingestion/docs/sources/clickhouse/clickhouse_recipe.yml @@ -0,0 +1,38 @@ +source: + type: clickhouse + config: + # Coordinates + host_port: localhost:9000 + + # Credentials + username: user + password: pass + + # Options + platform_instance: DatabaseNameToBeIngested + + include_views: True # whether to include views, defaults to True + include_tables: True # whether to include views, defaults to True + +sink: + # sink configs + +#--------------------------------------------------------------------------- +# For the HTTP interface: +#--------------------------------------------------------------------------- +source: + type: clickhouse + config: + host_port: localhost:8443 + protocol: https + +#--------------------------------------------------------------------------- +# For the Native interface: +#--------------------------------------------------------------------------- + +source: + type: clickhouse + config: + host_port: localhost:9440 + scheme: clickhouse+native + secure: True \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/data-lake-files/data-lake.md b/metadata-ingestion/docs/sources/data-lake-files/data-lake.md new file mode 100644 index 0000000000000..e8dff2d71bd4c --- /dev/null +++ b/metadata-ingestion/docs/sources/data-lake-files/data-lake.md @@ -0,0 +1,5 @@ +## Compatibility + +Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz). + +For an example guide on setting up PyDeequ on AWS, see [this guide](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/). 
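As a quick reference, a profiling-enabled recipe under the compatibility constraints above might look like the sketch below; the install path and the `SPARK_VERSION` value are placeholders for your own environment.

```yaml
# Expected environment (see the compatibility note above):
#   SPARK_HOME=/opt/spark-3.0.3-bin-hadoop3.2   # placeholder install location
#   SPARK_VERSION=3.0                           # placeholder version string
source:
  type: data-lake
  config:
    platform: "local-data-lake"
    base_path: "/path/to/data/folder"
    profiling:
      enabled: true # requires the Spark/PyDeequ setup described above

sink:
  # sink configs
```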
diff --git a/metadata-ingestion/docs/sources/data-lake-files/data-lake_recipe.yml b/metadata-ingestion/docs/sources/data-lake-files/data-lake_recipe.yml new file mode 100644 index 0000000000000..e527217c85fa5 --- /dev/null +++ b/metadata-ingestion/docs/sources/data-lake-files/data-lake_recipe.yml @@ -0,0 +1,11 @@ +source: + type: data-lake + config: + env: "PROD" + platform: "local-data-lake" + base_path: "/path/to/data/folder" + profiling: + enabled: true + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/dbt/dbt.md b/metadata-ingestion/docs/sources/dbt/dbt.md new file mode 100644 index 0000000000000..40a2513007213 --- /dev/null +++ b/metadata-ingestion/docs/sources/dbt/dbt.md @@ -0,0 +1,42 @@ +### dbt meta automated mappings +dbt allows authors to define meta properties for datasets. Check out [dbt meta](https://docs.getdbt.com/reference/resource-configs/meta) to learn more. Our dbt source allows users to define +actions such as adding a tag, term, or owner. For example, if a dbt model has a meta config ```"has_pii": True```, we can define an action +that evaluates whether the property is set to true and adds, let's say, a ```pii``` tag. +To leverage this feature, we require users to define mappings as part of the recipe. Mappings can be defined as follows: +```json + "meta_mapping": { + "business_owner": { + "match": ".*", + "operation": "add_owner", + "config": {"owner_type": "user"}, + }, + "has_pii": { + "match": True, + "operation": "add_tag", + "config": {"tag": "has_pii_test"}, + }, + "int_property": { + "match": 1, + "operation": "add_tag", + "config": {"tag": "int_meta_property"}, + }, + "double_property": { + "match": 2.5, + "operation": "add_term", + "config": {"term": "double_meta_property"}, + }, + "data_governance.team_owner": { + "match": "Finance", + "operation": "add_term", + "config": {"term": "Finance_test"}, + }, + } +``` +We support the following actions: +1. add_tag - Requires a ```tag``` property in config. +2. add_term - Requires a ```term``` property in config. +3. add_owner - Requires an ```owner_type``` property in config, which can be either user or group. + +Note: +1. Currently, dbt meta mapping is only supported for meta configs defined at the topmost level of a node in the manifest file. If that is not present, we will look for meta in the config section of the node. +2. For string-based meta properties, we support regex matching. \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/dbt/dbt_recipe.yml b/metadata-ingestion/docs/sources/dbt/dbt_recipe.yml new file mode 100644 index 0000000000000..6f660a8d685db --- /dev/null +++ b/metadata-ingestion/docs/sources/dbt/dbt_recipe.yml @@ -0,0 +1,14 @@ +source: + type: "dbt" + config: + # Coordinates + manifest_path: "./path/dbt/manifest_file.json" + catalog_path: "./path/dbt/catalog_file.json" + sources_path: "./path/dbt/sources_file.json" + + # Options + target_platform: "my_target_platform_id" # e.g. bigquery/postgres/etc.
+ load_schemas: True # note: if this is disabled + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/druid/druid_recipe.yml b/metadata-ingestion/docs/sources/druid/druid_recipe.yml new file mode 100644 index 0000000000000..3d6d001224073 --- /dev/null +++ b/metadata-ingestion/docs/sources/druid/druid_recipe.yml @@ -0,0 +1,12 @@ +source: + type: druid + config: + # Coordinates + host_port: "localhost:8082" + + # Credentials + username: admin + password: password + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/elastic-search/elasticsearch_recipe.yml b/metadata-ingestion/docs/sources/elastic-search/elasticsearch_recipe.yml new file mode 100644 index 0000000000000..94bb7b99d5de3 --- /dev/null +++ b/metadata-ingestion/docs/sources/elastic-search/elasticsearch_recipe.yml @@ -0,0 +1,19 @@ +source: + type: "elasticsearch" + config: + # Coordinates + host: 'localhost:9200' + + # Credentials + username: user # optional + password: pass # optional + + # Options + url_prefix: "" # optional url_prefix + env: "PROD" + index_pattern: + allow: [".*some_index_name_pattern*"] + deny: [".*skip_index_name_pattern*"] + +sink: +# sink configs diff --git a/metadata-ingestion/docs/sources/feast/feast_recipe.yml b/metadata-ingestion/docs/sources/feast/feast_recipe.yml new file mode 100644 index 0000000000000..8da4825ad8290 --- /dev/null +++ b/metadata-ingestion/docs/sources/feast/feast_recipe.yml @@ -0,0 +1,8 @@ +source: + type: feast + config: + # Coordinates + core_url: "localhost:6565" + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/file-based-lineage/datahub-lineage-file.md b/metadata-ingestion/docs/sources/file-based-lineage/datahub-lineage-file.md new file mode 100644 index 0000000000000..fcc6dd29fbf30 --- /dev/null +++ b/metadata-ingestion/docs/sources/file-based-lineage/datahub-lineage-file.md @@ -0,0 +1,24 @@ +### Lineage File Format + +The lineage source file should be a `.yml` file with the following top-level keys: + +**version**: the version of lineage file config the config conforms to. Currently, the only version released +is `1`. + +**lineage**: the top level key of the lineage file containing a list of **EntityNodeConfig** objects + +**EntityNodeConfig**: + +- **entity**: **EntityConfig** object +- **upstream**: (optional) list of child **EntityNodeConfig** objects + +**EntityConfig**: + +- **name** : name of the entity +- **type**: type of the entity (only `dataset` is supported as of now) +- **env**: the environment of this entity. Should match the values in the + table [here](https://datahubproject.io/docs/graphql/enums/#fabrictype) +- **platform**: a valid platform like kafka, snowflake, etc.. 
+- **platform_instance**: optional string specifying the platform instance of this entity + +You can also view an example lineage file checked in [here](../../../../metadata-ingestion/examples/bootstrap_data/file_lineage.yml) diff --git a/metadata-ingestion/docs/sources/file-based-lineage/datahub-lineage-file_recipe.yml b/metadata-ingestion/docs/sources/file-based-lineage/datahub-lineage-file_recipe.yml new file mode 100644 index 0000000000000..2f0bb78d654db --- /dev/null +++ b/metadata-ingestion/docs/sources/file-based-lineage/datahub-lineage-file_recipe.yml @@ -0,0 +1,10 @@ +source: + type: datahub-lineage-file + config: + # Coordinates + file: /path/to/file_lineage.yml + # Whether we want to query datahub-gms for upstream data + preserve_upstream: False + +sink: +# sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/file/file_recipe.yml b/metadata-ingestion/docs/sources/file/file_recipe.yml new file mode 100644 index 0000000000000..e825505aebf05 --- /dev/null +++ b/metadata-ingestion/docs/sources/file/file_recipe.yml @@ -0,0 +1,8 @@ +source: + type: file + config: + # Coordinates + filename: ./path/to/mce/file.json + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/glue/glue.md b/metadata-ingestion/docs/sources/glue/glue.md new file mode 100644 index 0000000000000..f726f08cb9823 --- /dev/null +++ b/metadata-ingestion/docs/sources/glue/glue.md @@ -0,0 +1,3 @@ +## Compatibility + +To capture lineage across Glue jobs and databases, a requirements must be met – otherwise the AWS API is unable to report any lineage. The job must be created in Glue Studio with the "Generate classic script" option turned on (this option can be accessed in the "Script" tab). Any custom scripts that do not have the proper annotations will not have reported lineage. diff --git a/metadata-ingestion/docs/sources/glue/glue_recipe.yml b/metadata-ingestion/docs/sources/glue/glue_recipe.yml new file mode 100644 index 0000000000000..79d04a244500c --- /dev/null +++ b/metadata-ingestion/docs/sources/glue/glue_recipe.yml @@ -0,0 +1,8 @@ +source: + type: glue + config: + # Coordinates + aws_region: "my-aws-region" + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/hive/hive_recipe.yml b/metadata-ingestion/docs/sources/hive/hive_recipe.yml new file mode 100644 index 0000000000000..091f161936966 --- /dev/null +++ b/metadata-ingestion/docs/sources/hive/hive_recipe.yml @@ -0,0 +1,71 @@ +source: + type: hive + config: + # Coordinates + host_port: localhost:10000 + database: DemoDatabase # optional, if not specified, ingests from all databases + + # Credentials + username: user # optional + password: pass # optional + + # For more details on authentication, see the PyHive docs: + # https://github.com/dropbox/PyHive#passing-session-configuration. + # LDAP, Kerberos, etc. are supported using connect_args, which can be + # added under the `options` config parameter. + #options: + # connect_args: + # auth: KERBEROS + # kerberos_service_name: hive + #scheme: 'hive+http' # set this if Thrift should use the HTTP transport + #scheme: 'hive+https' # set this if Thrift should use the HTTP with SSL transport + #scheme: 'sparksql' # set this for Spark Thrift Server + +sink: + # sink configs + +# --------------------------------------------------------- +# Recipe (Azure HDInsight) +# Connecting to Microsoft Azure HDInsight using TLS. 
+# --------------------------------------------------------- + +source: + type: hive + config: + # Coordinates + host_port: .azurehdinsight.net:443 + + # Credentials + username: admin + password: password + + # Options + options: + connect_args: + http_path: "/hive2" + auth: BASIC + +sink: + # sink configs + +# --------------------------------------------------------- +# Recipe (Databricks) +# Ensure that databricks-dbapi is installed. If not, use ```pip install databricks-dbapi``` to install. +# Use the ```http_path``` from your Databricks cluster in the following recipe. +# See (https://docs.databricks.com/integrations/bi/jdbc-odbc-bi.html#get-server-hostname-port-http-path-and-jdbc-url) for instructions to find ```http_path```. +# --------------------------------------------------------- + +source: + type: hive + config: + host_port: :443 + username: token + password: + scheme: 'databricks+pyhive' + + options: + connect_args: + http_path: 'sql/protocolv1/o/xxxyyyzzzaaasa/1234-567890-hello123' + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml new file mode 100644 index 0000000000000..747753c8461f0 --- /dev/null +++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml @@ -0,0 +1,22 @@ +source: + type: "kafka-connect" + config: + # Coordinates + connect_uri: "http://localhost:8083" + cluster_name: "connect-cluster" + provided_configs: + - provider: env + path_key: MYSQL_CONNECTION_URL + value: jdbc:mysql://test_mysql:3306/librarydb + # Optional mapping of platform types to instance ids + platform_instance_map: # optional + mysql: test_mysql # optional + connect_to_platform_map: # optional + postgres-connector-finance-db: # optional - Connector name + postgres: core_finance_instance # optional - Platform to instance map + # Credentials + username: admin + password: password + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/kafka/README.md b/metadata-ingestion/docs/sources/kafka/README.md new file mode 100644 index 0000000000000..d100252fae85d --- /dev/null +++ b/metadata-ingestion/docs/sources/kafka/README.md @@ -0,0 +1 @@ +Support exists for both Apache Kafka and Confluent Cloud. diff --git a/metadata-ingestion/docs/sources/kafka/kafka.md b/metadata-ingestion/docs/sources/kafka/kafka.md new file mode 100644 index 0000000000000..8b2073557804e --- /dev/null +++ b/metadata-ingestion/docs/sources/kafka/kafka.md @@ -0,0 +1,104 @@ +:::note +Stateful Ingestion is available only when a Platform Instance is assigned to this source. +::: + +### Connecting to Confluent Cloud + +If using Confluent Cloud you can use a recipe like this. In this `consumer_config.sasl.username` and `consumer_config.sasl.password` are the API credentials that you get (in the Confluent UI) from your cluster -> Data Integration -> API Keys. `schema_registry_config.basic.auth.user.info` has API credentials for Confluent schema registry which you get (in Confluent UI) from Schema Registry -> API credentials. + +When creating API Key for the cluster ensure that the ACLs associated with the key are set like below. This is required for DataHub to read topic metadata from topics in Confluent Cloud. 
+``` +Topic Name = * +Permission = ALLOW +Operation = DESCRIBE +Pattern Type = LITERAL +``` + +```yml +source: + type: "kafka" + config: + platform_instance: "YOUR_CLUSTER_ID" + connection: + bootstrap: "abc-defg.eu-west-1.aws.confluent.cloud:9092" + consumer_config: + security.protocol: "SASL_SSL" + sasl.mechanism: "PLAIN" + sasl.username: "${CLUSTER_API_KEY_ID}" + sasl.password: "${CLUSTER_API_KEY_SECRET}" + schema_registry_url: "https://abc-defgh.us-east-2.aws.confluent.cloud" + schema_registry_config: + basic.auth.user.info: "${REGISTRY_API_KEY_ID}:${REGISTRY_API_KEY_SECRET}" + +sink: + # sink configs +``` + +If you are trying to add domains to your topics you can use a configuration like below. + +```yml +source: + type: "kafka" + config: + # ...connection block + domain: + "urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810": + allow: + - ".*" + "urn:li:domain:d6ec9868-6736-4b1f-8aa6-fee4c5948f17": + deny: + - ".*" +``` + +Note that the `domain` in config above can be either an _urn_ or a domain _id_ (i.e. `urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810` or simply `13ae4d85-d955-49fc-8474-9004c663a810`). The Domain should exist in your DataHub instance before ingesting data into the Domain. To create a Domain on DataHub, check out the [Domains User Guide](https://datahubproject.io/docs/domains/). + +If you are using a non-default subject naming strategy in the schema registry, such as [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work), the mapping for the topic's key and value schemas to the schema registry subject names should be provided via `topic_subject_map` as shown in the configuration below. + +```yml +source: + type: "kafka" + config: + # ...connection block + # Defines the mapping for the key & value schemas associated with a topic & the subject name registered with the + # kafka schema registry. + topic_subject_map: + # Defines both key & value schema for topic 'my_topic_1' + "my_topic_1-key": "io.acryl.Schema1" + "my_topic_1-value": "io.acryl.Schema2" + # Defines only the value schema for topic 'my_topic_2' (the topic doesn't have a key schema). + "my_topic_2-value": "io.acryl.Schema3" +``` + +### Custom Schema Registry + +The Kafka Source uses the schema registry to figure out the schema associated with both `key` and `value` for the topic. +By default it uses the [Confluent's Kafka Schema registry](https://docs.confluent.io/platform/current/schema-registry/index.html) +and supports the `AVRO` schema type. + +If you're using a custom schema registry, or you are using schema type other than `AVRO`, then you can provide your own +custom implementation of the `KafkaSchemaRegistryBase` class, and implement the `get_schema_metadata(topic, platform_urn)` method that +given a topic name would return object of `SchemaMetadata` containing schema for that topic. Please refer +`datahub.ingestion.source.confluent_schema_registry::ConfluentSchemaRegistry` for sample implementation of this class. +```python +class KafkaSchemaRegistryBase(ABC): + @abstractmethod + def get_schema_metadata( + self, topic: str, platform_urn: str + ) -> Optional[SchemaMetadata]: + pass +``` + +The custom schema registry class can be configured using the `schema_registry_class` config param of the `kafka` source as shown below. 
+```YAML +source: + type: "kafka" + config: + # Set the custom schema registry implementation class + schema_registry_class: "datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry" + # Coordinates + connection: + bootstrap: "broker:9092" + schema_registry_url: http://localhost:8081 + +# sink configs +``` \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/kafka/kafka_recipe.yml b/metadata-ingestion/docs/sources/kafka/kafka_recipe.yml new file mode 100644 index 0000000000000..0c89c6f946e97 --- /dev/null +++ b/metadata-ingestion/docs/sources/kafka/kafka_recipe.yml @@ -0,0 +1,11 @@ +source: + type: "kafka" + config: + platform_instance: "YOUR_CLUSTER_ID" + connection: + bootstrap: "broker:9092" + schema_registry_url: http://localhost:8081 + +sink: + # sink configs + diff --git a/metadata-ingestion/docs/sources/ldap/ldap_recipe.yml b/metadata-ingestion/docs/sources/ldap/ldap_recipe.yml new file mode 100644 index 0000000000000..bdbe05c3877b7 --- /dev/null +++ b/metadata-ingestion/docs/sources/ldap/ldap_recipe.yml @@ -0,0 +1,15 @@ +source: + type: "ldap" + config: + # Coordinates + ldap_server: ldap://localhost + + # Credentials + ldap_user: "cn=admin,dc=example,dc=org" + ldap_password: "admin" + + # Options + base_dn: "dc=example,dc=org" + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/looker/looker.md b/metadata-ingestion/docs/sources/looker/looker.md new file mode 100644 index 0000000000000..769d3341dca13 --- /dev/null +++ b/metadata-ingestion/docs/sources/looker/looker.md @@ -0,0 +1,23 @@ +#### Configuration Notes + +See the [Looker authentication docs](https://docs.looker.com/reference/api-and-integration/api-auth#authentication_with_an_sdk) for the steps to create a client ID and secret. +You need to provide the following permissions for ingestion to work correctly. +``` +access_data +explore +manage_models +see_datagroups +see_lookml +see_lookml_dashboards +see_looks +see_pdts +see_queries +see_schedules +see_sql +see_system_activity +see_user_dashboards +see_users +``` +Here is an example permission set after configuration. +![Looker DataHub Permission Set](./looker_datahub_permission_set.png) + diff --git a/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png b/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png new file mode 100644 index 0000000000000..7227dc04fb8a0 Binary files /dev/null and b/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png differ diff --git a/metadata-ingestion/docs/sources/looker/looker_recipe.yml b/metadata-ingestion/docs/sources/looker/looker_recipe.yml new file mode 100644 index 0000000000000..42209f8cc6809 --- /dev/null +++ b/metadata-ingestion/docs/sources/looker/looker_recipe.yml @@ -0,0 +1,11 @@ +source: + type: "looker" + config: + # Coordinates + base_url: "https://.cloud.looker.com" + + # Credentials + client_id: ${LOOKER_CLIENT_ID} + client_secret: ${LOOKER_CLIENT_SECRET} + +# sink configs diff --git a/metadata-ingestion/docs/sources/looker/lookml.md b/metadata-ingestion/docs/sources/looker/lookml.md new file mode 100644 index 0000000000000..996f5f68550f3 --- /dev/null +++ b/metadata-ingestion/docs/sources/looker/lookml.md @@ -0,0 +1,13 @@ +#### Configuration Notes + +See the [Looker authentication docs](https://docs.looker.com/reference/api-and-integration/api-auth#authentication_with_an_sdk) for the steps to create a client ID and secret. 
+You need to ensure that the API key is attached to a user that has Admin privileges. If that is not possible, read the configuration section to provide an offline specification of the `connection_to_platform_map` and the `project_name`. + +:::note +The integration can use an SQL parser to try to parse the tables the views depends on. +::: +This parsing is disabled by default, +but can be enabled by setting `parse_table_names_from_sql: True`. The default parser is based on the [`sqllineage`](https://pypi.org/project/sqllineage/) package. +As this package doesn't officially support all the SQL dialects that Looker supports, the result might not be correct. You can, however, implement a +custom parser and take it into use by setting the `sql_parser` configuration value. A custom SQL parser must inherit from `datahub.utilities.sql_parser.SQLParser` +and must be made available to Datahub by ,for example, installing it. The configuration then needs to be set to `module_name.ClassName` of the parser. diff --git a/metadata-ingestion/docs/sources/looker/lookml_recipe.yml b/metadata-ingestion/docs/sources/looker/lookml_recipe.yml new file mode 100644 index 0000000000000..b1150670b7c74 --- /dev/null +++ b/metadata-ingestion/docs/sources/looker/lookml_recipe.yml @@ -0,0 +1,35 @@ +source: + type: "lookml" + config: + # Coordinates + base_folder: /path/to/model/files + + # Options + api: + # Coordinates for your looker instance + base_url: "https://YOUR_INSTANCE.cloud.looker.com" + + # Credentials for your Looker connection (https://docs.looker.com/reference/api-and-integration/api-auth) + client_id: ${LOOKER_CLIENT_ID} + client_secret: ${LOOKER_CLIENT_SECRET} + + # Alternative to API section above if you want a purely file-based ingestion with no api calls to Looker or if you want to provide platform_instance ids for your connections + # project_name: PROJECT_NAME # See (https://docs.looker.com/data-modeling/getting-started/how-project-works) to understand what is your project name + # connection_to_platform_map: + # connection_name_1: + # platform: snowflake # bigquery, hive, etc + # default_db: DEFAULT_DATABASE. # the default database configured for this connection + # default_schema: DEFAULT_SCHEMA # the default schema configured for this connection + # platform_instance: snow_warehouse # optional + # platform_env: PROD # optional + # connection_name_2: + # platform: bigquery # snowflake, hive, etc + # default_db: DEFAULT_DATABASE. 
# the default database configured for this connection + # default_schema: DEFAULT_SCHEMA # the default schema configured for this connection + # platform_instance: bq_warehouse # optional + # platform_env: DEV # optional + + github_info: + repo: org/repo-name + +# sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/mariadb/mariadb_recipe.yml b/metadata-ingestion/docs/sources/mariadb/mariadb_recipe.yml new file mode 100644 index 0000000000000..436fa89cce523 --- /dev/null +++ b/metadata-ingestion/docs/sources/mariadb/mariadb_recipe.yml @@ -0,0 +1,12 @@ +source: + type: mariadb + config: + # Coordinates + host_port: localhost:3306 + database: dbname + + # Credentials + username: root + password: example + +# sink configs diff --git a/metadata-ingestion/docs/sources/metabase/metabase.md b/metadata-ingestion/docs/sources/metabase/metabase.md new file mode 100644 index 0000000000000..74b7b73bbca0c --- /dev/null +++ b/metadata-ingestion/docs/sources/metabase/metabase.md @@ -0,0 +1,14 @@ +Metabase databases will be mapped to a DataHub platform based on the engine listed in the +[api/database](https://www.metabase.com/docs/latest/api-documentation.html#database) response. This mapping can be +customized by using the `engine_platform_map` config option. For example, to map databases using the `athena` engine to +the underlying datasets in the `glue` platform, the following snippet can be used: +```yml + engine_platform_map: + athena: glue +``` +DataHub will try to determine database name from Metabase [api/database](https://www.metabase.com/docs/latest/api-documentation.html#database) +payload. However, the name can be overridden from `database_alias_map` for a given database connected to Metabase. + +## Compatibility + +Metabase version [v0.41.2](https://www.metabase.com/start/oss/) diff --git a/metadata-ingestion/docs/sources/metabase/metabase.yml b/metadata-ingestion/docs/sources/metabase/metabase.yml new file mode 100644 index 0000000000000..e0ef6b4ba72fb --- /dev/null +++ b/metadata-ingestion/docs/sources/metabase/metabase.yml @@ -0,0 +1,20 @@ +source: + type: metabase + config: + # Coordinates + connect_uri: http://localhost:3000 + + # Credentials + username: user + password: pass + + # Options + default_schema: public + database_alias_map: + h2: sample-dataset.db + # Optional mapping of platform types to instance ids + platform_instance_map: # optional + postgres: test_postgres # optional + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/mode/mode.md b/metadata-ingestion/docs/sources/mode/mode.md new file mode 100644 index 0000000000000..a61c429a6ab1c --- /dev/null +++ b/metadata-ingestion/docs/sources/mode/mode.md @@ -0,0 +1 @@ +See Mode's [Authentication documentation](https://mode.com/developer/api-reference/authentication/) on how to generate `token` and `password`. 
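Because the `token` and `password` are secrets, you may prefer to keep them out of the recipe file and reference environment variables instead, as the Looker recipe elsewhere in these docs does. A minimal sketch (the variable names are placeholders you export yourself):

```yaml
source:
  type: mode
  config:
    connect_uri: http://app.mode.com
    # Placeholder environment variable names; export them before running ingestion
    token: ${MODE_API_TOKEN}
    password: ${MODE_API_PASSWORD}
    workspace: "datahub"

sink:
  # sink configs
```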
diff --git a/metadata-ingestion/docs/sources/mode/mode_recipe.yml b/metadata-ingestion/docs/sources/mode/mode_recipe.yml new file mode 100644 index 0000000000000..153fc8b344cc0 --- /dev/null +++ b/metadata-ingestion/docs/sources/mode/mode_recipe.yml @@ -0,0 +1,21 @@ +source: + type: mode + config: + # Coordinates + connect_uri: http://app.mode.com + + # Credentials + token: token + password: pass + + # Options + workspace: "datahub" + default_schema: "public" + owner_username_instead_of_email: False + api_options: + retry_backoff_multiplier: 2 + max_retry_interval: 10 + max_attempts: 5 + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/mongodb/mongodb_recipe.yml b/metadata-ingestion/docs/sources/mongodb/mongodb_recipe.yml new file mode 100644 index 0000000000000..e6cc346ad4f10 --- /dev/null +++ b/metadata-ingestion/docs/sources/mongodb/mongodb_recipe.yml @@ -0,0 +1,18 @@ +source: + type: "mongodb" + config: + # Coordinates + connect_uri: "mongodb://localhost" + + # Credentials + username: admin + password: password + authMechanism: "DEFAULT" + + # Options + enableSchemaInference: True + useRandomSampling: True + maxSchemaSize: 300 + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/mssql/mssql_recipe.yml b/metadata-ingestion/docs/sources/mssql/mssql_recipe.yml new file mode 100644 index 0000000000000..5f1e24ce1e956 --- /dev/null +++ b/metadata-ingestion/docs/sources/mssql/mssql_recipe.yml @@ -0,0 +1,41 @@ +source: + type: mssql + config: + # Coordinates + host_port: localhost:1433 + database: DemoDatabase + + # Credentials + username: user + password: pass + +sink: + # sink configs + +#------------------------------------------------------------------------ +#Example: using ingestion with ODBC and encryption +#This requires you to have already installed the Microsoft ODBC Driver for SQL Server. 
+#See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15 +# ------------------------------------------------------------------------ + +source: + type: mssql + config: + # Coordinates + host_port: localhost:1433 + database: DemoDatabase + + # Credentials + username: admin + password: password + + # Options + use_odbc: "True" + uri_args: + driver: "ODBC Driver 17 for SQL Server" + Encrypt: "yes" + TrustServerCertificate: "Yes" + ssl: "True" + +sink: + # sink configs diff --git a/metadata-ingestion/docs/sources/mysql/mysql_recipe.yml b/metadata-ingestion/docs/sources/mysql/mysql_recipe.yml new file mode 100644 index 0000000000000..a915cba08ad41 --- /dev/null +++ b/metadata-ingestion/docs/sources/mysql/mysql_recipe.yml @@ -0,0 +1,13 @@ +source: + type: mysql + config: + # Coordinates + host_port: localhost:3306 + database: dbname + + # Credentials + username: root + password: example + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/nifi/nifi_recipe.yml b/metadata-ingestion/docs/sources/nifi/nifi_recipe.yml new file mode 100644 index 0000000000000..7b01e766c6a8e --- /dev/null +++ b/metadata-ingestion/docs/sources/nifi/nifi_recipe.yml @@ -0,0 +1,13 @@ +source: + type: "nifi" + config: + # Coordinates + site_url: "https://localhost:8443/nifi/" + + # Credentials + auth: SINGLE_USER + username: admin + password: password + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/okta/okta.md b/metadata-ingestion/docs/sources/okta/okta.md new file mode 100644 index 0000000000000..5c2ab1856dfc5 --- /dev/null +++ b/metadata-ingestion/docs/sources/okta/okta.md @@ -0,0 +1,12 @@ +As a prerequisite, you should create a DataHub Application within the Okta Developer Console with full permissions to read your organization's Users and Groups. + +## Compatibility + + Validated against Okta API Versions: + - `2021.07.2` + + Validated against load: + - User Count: `1000` + - Group Count: `100` + - Group Membership Edges: `1000` (1 per User) + - Run Time (Wall Clock): `2min 7sec` diff --git a/metadata-ingestion/docs/sources/okta/okta_recipe.yml b/metadata-ingestion/docs/sources/okta/okta_recipe.yml new file mode 100644 index 0000000000000..943e8ad16f006 --- /dev/null +++ b/metadata-ingestion/docs/sources/okta/okta_recipe.yml @@ -0,0 +1,11 @@ +source: + type: okta + config: + # Coordinates + okta_domain: "dev-35531955.okta.com" + + # Credentials + okta_api_token: "11be4R_M2MzDqXawbTHfKGpKee0kuEOfX1RCQSRx99" + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/openapi/openapi.md b/metadata-ingestion/docs/sources/openapi/openapi.md new file mode 100644 index 0000000000000..b3231b018bdde --- /dev/null +++ b/metadata-ingestion/docs/sources/openapi/openapi.md @@ -0,0 +1,135 @@ +The dataset metadata should be defined directly in the Swagger file, section `["example"]`. If this is not true, the following procedures will take place. + +## Capabilities + +The plugin read the swagger file where the endopints are defined and searches for the ones which accept +a `GET` call: those are the ones supposed to give back the datasets. + +For every selected endpoint defined in the `paths` section, +the tool searches whether the medatada are already defined in there. 
+For example, if your Swagger file defines `/api/users/` as follows: + +```yaml +paths: + /api/users/: + get: + tags: [ "Users" ] + operationID: GetUsers + description: Retrieve users data + responses: + '200': + description: Return the list of users + content: + application/json: + example: + {"user": "username", "name": "Full Name", "job": "any", "is_active": True} +``` + +then this plugin has all the information needed to create the dataset in DataHub. + +In case there is no example defined, the plugin will try to get the metadata directly from the endpoint. +So, if your Swagger file contains + +```yaml +paths: + /colors/: + get: + tags: [ "Colors" ] + operationID: GetDefinedColors + description: Retrieve colors + responses: + '200': + description: Return the list of colors +``` + +the tool will make a `GET` call to `https://test_endpoint.com/colors` +and parse the response obtained. + +### Automatically recorded examples + +Sometimes an endpoint requires a parameter to work, like +`https://test_endpoint.com/colors/{color}`. + +Since in OpenAPI specifications the listing endpoints are specified +just before the detailed ones, in the list of paths you will find + + https://test_endpoint.com/colors + +defined before + + https://test_endpoint.com/colors/{color} + +This plugin automatically keeps an example of the data returned by the first URL, +which will likely include an example of the attribute needed by the second. + +So, if calling GET on the first URL returns: + + {"pantone code": 100, + "color": "yellow", + ...} + +the `"color": "yellow"` part will be used to complete the second link, which +will become: + + https://test_endpoint.com/colors/yellow + +and this last URL will be called to get back the needed metadata. + +### Automatic guessing of IDs + +If no useful example is found, a second procedure will try to guess a numerical ID. +So if we have: + + https://test_endpoint.com/colors/{colorID} + +and there is no `colorID` example already found by the plugin, +it will try to substitute the number one (1) for the parameter: + + https://test_endpoint.com/colors/1 + +and this URL will be called to get back the needed metadata. + +## Config details + +### Token authentication + +If the tool needs an access token to query the endpoints, it can request one. Two methods are available at the moment: + +* 'get': this requires the username/password combination to be present in the URL. Note that {username} and {password} are mandatory placeholders. They will be replaced with the true credentials at runtime. Note that the username and password will be sent in the request address, so this is insecure. If your provider supports the other method, please use it. +* 'post': the username and password will be inserted in the body of the POST request. + +In both cases, the username and password are the ones defined in the configuration file. + +### Getting dataset metadata from `forced_example` + +Suppose you have an endpoint defined in the Swagger file without an example, and the tool is +unable to guess the URL. In such cases you can still specify it manually in the `forced_examples` part of the +configuration file.
+ +For example, if your Swagger file contains + +```yaml +paths: + /accounts/groupname/{name}/: + get: + tags: [ "Groups" ] + operationID: GetGroup + description: Retrieve group data + responses: + '200': + description: Return details about the group +``` + +and the plugin did not find an example in its previous calls, +then the tool has no idea what to substitute for the `{name}` part. + +By specifying the following in the configuration file + +```yaml + forced_examples: # optionals + /accounts/groupname/{name}: ['test'] +``` + +the plugin is able to build a correct URL, as follows: + +https://test_endpoint.com/accounts/groupname/test diff --git a/metadata-ingestion/docs/sources/openapi/openapi_recipe.yml b/metadata-ingestion/docs/sources/openapi/openapi_recipe.yml new file mode 100644 index 0000000000000..07c08a814a7b1 --- /dev/null +++ b/metadata-ingestion/docs/sources/openapi/openapi_recipe.yml @@ -0,0 +1,20 @@ +source: + type: openapi + config: + name: test_endpoint # this name will appear in DataHub + url: https://test_endpoint.com/ + swagger_file: classicapi/doc/swagger.json # where to search for the OpenApi definitions + get_token: # optional, if you need to get an authentication token beforehand + request_type: get + url: api/authentication/login?username={username}&password={password} + username: your_username # optional + password: your_password # optional + forced_examples: # optionals + /accounts/groupname/{name}: ['test'] + /accounts/username/{name}: ['test'] + ignore_endpoints: [/ignore/this, /ignore/that, /also/that_other] # optional, the endpoints to ignore + +sink: + type: "datahub-rest" + config: + server: 'http://localhost:8080' \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/oracle/oracle.md b/metadata-ingestion/docs/sources/oracle/oracle.md new file mode 100644 index 0000000000000..6043e1c915663 --- /dev/null +++ b/metadata-ingestion/docs/sources/oracle/oracle.md @@ -0,0 +1 @@ +As a SQL-based service, the Oracle integration is also supported by our SQL profiler. See here for more details on configuration.
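As a sketch, enabling the profiler usually amounts to adding a `profiling` block to the Oracle recipe, mirroring the profiling configuration shown for other SQL sources in these docs; confirm against the profiling guide which options apply to Oracle.

```yaml
source:
  type: oracle
  config:
    # Coordinates
    host_port: localhost:5432
    service_name: svc

    # Credentials
    username: user
    password: pass

    # Profiling (assumes the shared SQL profiling options apply to Oracle)
    profiling:
      enabled: true

sink:
  # sink configs
```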
\ No newline at end of file diff --git a/metadata-ingestion/docs/sources/oracle/oracle_recipe.yml b/metadata-ingestion/docs/sources/oracle/oracle_recipe.yml new file mode 100644 index 0000000000000..6a52d4800fa45 --- /dev/null +++ b/metadata-ingestion/docs/sources/oracle/oracle_recipe.yml @@ -0,0 +1,16 @@ +source: + type: oracle + config: + # Coordinates + host_port: localhost:5432 + database: dbname + + # Credentials + username: user + password: pass + + # Options + service_name: svc # omit database if using this option + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/postgres/postgres_recipe.yml b/metadata-ingestion/docs/sources/postgres/postgres_recipe.yml new file mode 100644 index 0000000000000..3d3dcf06a373d --- /dev/null +++ b/metadata-ingestion/docs/sources/postgres/postgres_recipe.yml @@ -0,0 +1,16 @@ +source: + type: postgres + config: + # Coordinates + host_port: localhost:5432 + database: DemoDatabase + + # Credentials + username: user + password: pass + + # Options + database_alias: DatabaseNameToBeIngested + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi.md b/metadata-ingestion/docs/sources/powerbi/powerbi.md new file mode 100644 index 0000000000000..5e949e614bd5c --- /dev/null +++ b/metadata-ingestion/docs/sources/powerbi/powerbi.md @@ -0,0 +1,12 @@ +## Concept mapping + +| Power BI | Datahub | +| ------------------------- | ------------------- | +| `Dashboard` | `Dashboard` | +| `Dataset, Datasource` | `Dataset` | +| `Tile` | `Chart` | +| `Report.webUrl` | `Chart.externalUrl` | +| `Workspace` | `N/A` | +| `Report` | `N/A` | + +If Tile is created from report then Chart.externalUrl is set to Report.webUrl. diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml b/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml new file mode 100644 index 0000000000000..302fabc51a2fd --- /dev/null +++ b/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml @@ -0,0 +1,21 @@ +source: + type: "powerbi" + config: + # Your Power BI tenant identifier + tenant_id: a949d688-67c0-4bf1-a344-e939411c6c0a + # Ingest elements of below PowerBi Workspace into Datahub + workspace_id: 4bd10256-e999-45dd-8e56-571c77153a5f + # Workspace's dataset environments (PROD, DEV, QA, STAGE) + env: DEV + # Azure AD App client identifier + client_id: foo + # Azure AD App client secret + client_secret: bar + # dataset_type_mapping is fixed mapping of Power BI datasources type to equivalent Datahub "data platform" dataset + dataset_type_mapping: + PostgreSql: postgres + Oracle: oracle + + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/presto-on-hive/presto-on-hive_recipe.yml b/metadata-ingestion/docs/sources/presto-on-hive/presto-on-hive_recipe.yml new file mode 100644 index 0000000000000..6482c610263a2 --- /dev/null +++ b/metadata-ingestion/docs/sources/presto-on-hive/presto-on-hive_recipe.yml @@ -0,0 +1,25 @@ +source: + type: presto-on-hive + config: + # Hive metastore DB connection + host_port: localhost:5432 + database: metastore + + # specify the schema where metastore tables reside + schema_pattern: + allow: + - "^public" + + # credentials + username: user # optional + password: pass # optional + + #scheme: 'postgresql+psycopg2' # set this if metastore db is using postgres + #scheme: 'mysql+pymysql' # set this if metastore db is using mysql, default if unset + + # set this to have advanced filters on what to ingest + 
#views_where_clause_suffix: AND d."name" in ('db1') + #tables_where_clause_suffix: AND d."name" in ('db1') + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/source_docs/pulsar.md b/metadata-ingestion/docs/sources/pulsar/pulsar.md similarity index 93% rename from metadata-ingestion/source_docs/pulsar.md rename to metadata-ingestion/docs/sources/pulsar/pulsar.md index b3292ba9dcb21..5149de376778e 100644 --- a/metadata-ingestion/source_docs/pulsar.md +++ b/metadata-ingestion/docs/sources/pulsar/pulsar.md @@ -48,10 +48,10 @@ This ingestion source maps the following Source System Concepts to DataHub Conce | Capability | Status | Notes | |-------------------------------------------------------|:------:|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | Data Container | ❌ | | -| [Stateful Ingestion](./stateful_ingestion.md) | ✅ | Requires recipe configuration, stateful Ingestion is available only when a Platform Instance is assigned to this source. | +| [Stateful Ingestion](../../../../metadata-ingestion/docs/dev_guides/stateful.md) | ✅ | Requires recipe configuration, stateful Ingestion is available only when a Platform Instance is assigned to this source. | | Partition Support | ✅ | Requires recipe configuration, each individual partition topic can be ingest. Behind the scenes, a partitioned topic is actually implemented as N internal topics, where N is the number of partitions. This feature is disabled by default. | -| [Platform Instance](../../docs/platform-instances.md) | ✅ | Requires recipe configuration and is mandatory for Stateful Ingestion. A Pulsar instance consists of one or more Pulsar clusters. | -| [Data Domain](../../docs/domains.md) | ✅ | Requires recipe configuration | +| [Platform Instance](../../../platform-instances.md) | ✅ | Requires recipe configuration and is mandatory for Stateful Ingestion. A Pulsar instance consists of one or more Pulsar clusters. | +| [Data Domain](../../../domains.md) | ✅ | Requires recipe configuration | | Dataset Profiling | ❌ | | | Dataset Usage | ❌ | | | Extract Descriptions | ❌ | | @@ -62,7 +62,7 @@ This ingestion source maps the following Source System Concepts to DataHub Conce ## Metadata Ingestion Quickstart -For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md). +For context on getting started with ingestion, check out our [metadata ingestion guide](../../../../metadata-ingestion/README.md). ### Prerequisites @@ -85,7 +85,7 @@ Run the following commands to install the relevant plugin(s): Use the following recipe(s) to get started with ingestion. See [below](#config-details) for full configuration options. -_For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes)._ +_For general pointers on writing and running a recipe, see our [main recipe guide](../../../../metadata-ingestion/README.md#recipes)._ #### Quickstart recipe Getting started receipt @@ -165,7 +165,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe. | `domain.domain_urn.allow` | ❌ | | List of regex patterns for topics to set domain_urn domain key. There can be multiple domain key specified. | | `domain.domain_urn.deny` | ❌ | | List of regex patterns for topics to not assign domain_urn. There can be multiple domain key specified. 
| | `domain.domain_urn.ignoreCase` | ❌ | `True` | Whether to ignore case sensitivity during pattern matching.There can be multiple domain key specified. | -| `stateful_ingestion` | ❌ | | see [Stateful Ingestion](./stateful_ingestion.md) | +| `stateful_ingestion` | ❌ | | see [Stateful Ingestion](../../../../metadata-ingestion/docs/dev_guides/stateful.md) | @@ -173,4 +173,4 @@ Note that a `.` is used to denote nested fields in the YAML recipe. ### [Common Issue] -[Provide description of common issues with this integration and steps to resolve] \ No newline at end of file +[Provide description of common issues with this integration and steps to resolve] diff --git a/metadata-ingestion/docs/sources/redash/redash.md b/metadata-ingestion/docs/sources/redash/redash.md new file mode 100644 index 0000000000000..8f8c5c85496a0 --- /dev/null +++ b/metadata-ingestion/docs/sources/redash/redash.md @@ -0,0 +1,5 @@ +Note! The integration can use an SQL parser to try to parse the tables the chart depends on. This parsing is disabled by default, +but can be enabled by setting `parse_table_names_from_sql: true`. The default parser is based on the [`sqllineage`](https://pypi.org/project/sqllineage/) package. +As this package doesn't officially support all the SQL dialects that Redash supports, the result might not be correct. You can, however, implement a +custom parser and take it into use by setting the `sql_parser` configuration value. A custom SQL parser must inherit from `datahub.utilities.sql_parser.SQLParser` +and must be made available to Datahub by ,for example, installing it. The configuration then needs to be set to `module_name.ClassName` of the parser. diff --git a/metadata-ingestion/docs/sources/redash/redash_recipe.yml b/metadata-ingestion/docs/sources/redash/redash_recipe.yml new file mode 100644 index 0000000000000..7d72d262d5ed3 --- /dev/null +++ b/metadata-ingestion/docs/sources/redash/redash_recipe.yml @@ -0,0 +1,20 @@ +source: + type: "redash" + config: + connect_uri: http://localhost:5000/ + api_key: REDASH_API_KEY + + # Optionals + # api_page_limit: 1 #default: None, no limit on ingested dashboards and charts API pagination + # skip_draft: true #default: true, only ingest published dashboards and charts + # dashboard_patterns: + # deny: + # - ^denied dashboard.* + # allow: + # - .*allowed dashboard.* + # chart_patterns: + # deny: + # - ^denied chart.* + # allow: + # - .*allowed chart.* + # parse_table_names_from_sql: false \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/redshift/README.md b/metadata-ingestion/docs/sources/redshift/README.md new file mode 100644 index 0000000000000..2e7f1719b51a9 --- /dev/null +++ b/metadata-ingestion/docs/sources/redshift/README.md @@ -0,0 +1 @@ +To get all metadata from Redshift you need to use two plugins `redshift` and `redshift-usage`. Both of them are described in this page. These will require 2 separate recipes. We understand this is not ideal and we plan to make this easier in the future. 
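The Redash note above leaves the custom `sql_parser` hook abstract. Below is a minimal, hypothetical sketch of such a parser; it assumes `datahub.utilities.sql_parser.SQLParser` takes the SQL text in its constructor and expects `get_tables()`/`get_columns()` implementations (worth verifying against your installed DataHub version), and `MyRedashSqlParser` is an illustrative name, not part of this change.

```python
from typing import List

from datahub.utilities.sql_parser import SQLParser


class MyRedashSqlParser(SQLParser):
    """Hypothetical parser: naively collects identifiers that follow FROM/JOIN."""

    def __init__(self, sql_query: str) -> None:
        super().__init__(sql_query)
        self._sql = sql_query

    def get_tables(self) -> List[str]:
        # Naive tokenization for illustration only; a real implementation would
        # use a dialect-aware parser for the databases your Redash instance hits.
        tokens = self._sql.replace(",", " ").split()
        tables = {
            tokens[i + 1].strip("();")
            for i, tok in enumerate(tokens[:-1])
            if tok.upper() in ("FROM", "JOIN")
        }
        return sorted(tables)

    def get_columns(self) -> List[str]:
        # Column-level parsing is not needed for table-level lineage in Redash.
        return []
```

Once packaged and installed, the recipe would reference it via `sql_parser: your_module.MyRedashSqlParser` (module path illustrative) together with `parse_table_names_from_sql: true`.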
\ No newline at end of file diff --git a/metadata-ingestion/docs/sources/redshift/redshift-usage_recipe.yml b/metadata-ingestion/docs/sources/redshift/redshift-usage_recipe.yml new file mode 100644 index 0000000000000..76530b299ee24 --- /dev/null +++ b/metadata-ingestion/docs/sources/redshift/redshift-usage_recipe.yml @@ -0,0 +1,14 @@ +source: + type: redshift-usage + config: + # Coordinates + host_port: db_host:port + database: dev + email_domain: acryl.io + + # Credentials + username: username + password: "password" + +sink: +# sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/redshift/redshift_recipe.yml b/metadata-ingestion/docs/sources/redshift/redshift_recipe.yml new file mode 100644 index 0000000000000..09a8207ae9799 --- /dev/null +++ b/metadata-ingestion/docs/sources/redshift/redshift_recipe.yml @@ -0,0 +1,39 @@ +source: + type: redshift + config: + # Coordinates + host_port: example.something.us-west-2.redshift.amazonaws.com:5439 + database: DemoDatabase + + # Credentials + username: user + password: pass + + # Options + options: + # driver_option: some-option + + include_views: True # whether to include views, defaults to True + include_tables: True # whether to include views, defaults to True + +sink: + # sink configs + +#------------------------------------------------------------------------------ +# Extra options when running Redshift behind a proxy +# This requires you to have already installed the Microsoft ODBC Driver for SQL Server. +# See https://docs.microsoft.com/en-us/sql/connect/python/pyodbc/step-1-configure-development-environment-for-pyodbc-python-development?view=sql-server-ver15 +#------------------------------------------------------------------------------ + +source: + type: redshift + config: + host_port: my-proxy-hostname:5439 + + options: + connect_args: + sslmode: "prefer" # or "require" or "verify-ca" + sslrootcert: ~ # needed to unpin the AWS Redshift certificate + +sink: + # sink configs diff --git a/metadata-ingestion/docs/sources/s3/s3.md b/metadata-ingestion/docs/sources/s3/s3.md new file mode 100644 index 0000000000000..c82874239a0de --- /dev/null +++ b/metadata-ingestion/docs/sources/s3/s3.md @@ -0,0 +1,66 @@ +## Valid path_spec.include + +```python +s3://my-bucket/foo/tests/bar.avro # single file table +s3://my-bucket/foo/tests/*.* # mulitple file level tables +s3://my-bucket/foo/tests/{table}/*.avro #table without partition +s3://my-bucket/foo/tests/{table}/*/*.avro #table where partitions are not specified +s3://my-bucket/foo/tests/{table}/*.* # table where no partitions as well as data type specified +s3://my-bucket/{dept}/tests/{table}/*.avro # specifying key wards to be used in display name +s3://my-bucket/{dept}/tests/{table}/{partition_key[0]}={partition[0]}/{partition_key[1]}={partition[1]}/*.avro # specify partition key and value format +s3://my-bucket/{dept}/tests/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.avro # specify partition value only format +s3://my-bucket/{dept}/tests/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # for all extensions +s3://my-bucket/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # table is present at 2 levels down in bucket +s3://my-bucket/*/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # table is present at 3 levels down in bucket +``` + +## Valid path_spec.exclude +- **/tests/** +- s3://my-bucket/hr/** +- **/tests/*.csv +- s3://my-bucket/foo/*/my_table/** +- +### Notes + +- {table} represents folder for which dataset will be 
created. +- include path must end with (*.* or *.[ext]) to represent leaf level. +- if *.[ext] is provided then only files with specified type will be scanned. +- /*/ represents single folder. +- {partition[i]} represents value of partition. +- {partition_key[i]} represents name of the partition. +- While extracting, “i” will be used to match partition_key to partition. +- all folder levels need to be specified in include. Only exclude path can have ** like matching. +- exclude path cannot have named variables ( {} ). +- Folder names should not contain {, }, *, / in their names. +- {folder} is reserved for internal working. please do not use in named variables. + + + +If you would like to write a more complicated function for resolving file names, then a {transformer} would be a good fit. + +:::caution + +Specify as long fixed prefix ( with out /*/ ) as possible in `path_spec.include`. This will reduce the scanning time and cost, specifically on AWS S3 + +::: + +:::caution + +Running profiling against many tables or over many rows can run up significant costs. +While we've done our best to limit the expensiveness of the queries the profiler runs, you +should be prudent about the set of tables profiling is enabled on or the frequency +of the profiling runs. + +::: + +:::caution + +If you are ingesting datasets from AWS S3, we recommend running the ingestion on a server in the same region to avoid high egress costs. + +::: + +## Compatibility + +Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz). + +For an example guide on setting up PyDeequ on AWS, see [this guide](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/). 
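To make the `path_spec` grammar above concrete, here is a minimal sketch of running the `s3` source programmatically; it assumes the `Pipeline` API from `datahub.ingestion.run.pipeline`, and the bucket name, folder layout, and `console` sink are placeholders rather than anything prescribed by this change.

```python
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "s3",
            "config": {
                "path_spec": {
                    # {table} names the dataset; partition_key/partition pairs are
                    # read from the folder structure; *.avro limits scanning to a
                    # single file type, per the rules listed above.
                    "include": "s3://my-bucket/warehouse/{table}/{partition_key[0]}={partition[0]}/*.avro",
                    "exclude": ["**/tests/**"],
                },
                "aws_config": {"aws_region": "us-east-2"},
                "env": "PROD",
                "profiling": {"enabled": False},
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()
```

The same dictionary maps one-to-one onto the YAML recipe that follows.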
diff --git a/metadata-ingestion/docs/sources/s3/s3_recipe.yml b/metadata-ingestion/docs/sources/s3/s3_recipe.yml new file mode 100644 index 0000000000000..95cfc04a07c10 --- /dev/null +++ b/metadata-ingestion/docs/sources/s3/s3_recipe.yml @@ -0,0 +1,15 @@ +source: + type: s3 + config: + path_spec: + include: "s3://covid19-lake/covid_knowledge_graph/csv/nodes/*.*" + aws_config: + aws_access_key_id: ***** + aws_secret_access_key: ***** + aws_region: us-east-2 + env: "PROD" + profiling: + enabled: false + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/sagemaker/sagemaker_recipe.yml b/metadata-ingestion/docs/sources/sagemaker/sagemaker_recipe.yml new file mode 100644 index 0000000000000..a84ca671211ae --- /dev/null +++ b/metadata-ingestion/docs/sources/sagemaker/sagemaker_recipe.yml @@ -0,0 +1,8 @@ +source: + type: sagemaker + config: + # Coordinates + aws_region: "my-aws-region" + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/sqlalchemy/sqlalchemy_recipe.yml b/metadata-ingestion/docs/sources/sqlalchemy/sqlalchemy_recipe.yml new file mode 100644 index 0000000000000..8059ea32af810 --- /dev/null +++ b/metadata-ingestion/docs/sources/sqlalchemy/sqlalchemy_recipe.yml @@ -0,0 +1,8 @@ +source: + type: sqlalchemy + config: + # Coordinates + connect_uri: "dialect+driver://username:password@host:port/database" + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/superset/superset.md b/metadata-ingestion/docs/sources/superset/superset.md new file mode 100644 index 0000000000000..a79d690ffbc21 --- /dev/null +++ b/metadata-ingestion/docs/sources/superset/superset.md @@ -0,0 +1,20 @@ +If you were using `database_alias` in one of your other ingestions to rename your databases to something else based on business needs you can rename them in superset also + +```yml +source: + type: superset + config: + # Coordinates + connect_uri: http://localhost:8088 + + # Credentials + username: user + password: pass + provider: ldap + database_alias: + example_name_1: business_name_1 + example_name_2: business_name_2 + +sink: + # sink configs +``` \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/superset/superset_recipe.yml b/metadata-ingestion/docs/sources/superset/superset_recipe.yml new file mode 100644 index 0000000000000..360affd805592 --- /dev/null +++ b/metadata-ingestion/docs/sources/superset/superset_recipe.yml @@ -0,0 +1,13 @@ +source: + type: superset + config: + # Coordinates + connect_uri: http://localhost:8088 + + # Credentials + username: user + password: pass + provider: ldap + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/tableau/tableau.md b/metadata-ingestion/docs/sources/tableau/tableau.md new file mode 100644 index 0000000000000..d9a96cf6cf2d2 --- /dev/null +++ b/metadata-ingestion/docs/sources/tableau/tableau.md @@ -0,0 +1,413 @@ +### Prerequisites + +In order to ingest metadata from tableau, you will need: + +- Python 3.6+ +- Tableau Server Version 2021.1.10 and above. It may also work for older versions. +- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if its not already enabled. 
+- Tableau Credentials (Username/Password or [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens)) + +## Integration Details + +This plugin extracts Sheets, Dashboards, Embedded and Published Data sources metadata within Workbooks in a given project +on a Tableau site. This plugin is in beta and has only been tested on PostgreSQL database and sample workbooks +on Tableau online. Tableau's GraphQL interface is used to extract metadata information. Queries used to extract metadata are located +in `metadata-ingestion/src/datahub/ingestion/source/tableau_common.py` + +### Concept Mapping + +This ingestion source maps the following Source System Concepts to DataHub Concepts: + +| Source Concept | DataHub Concept | Notes | +| -- | -- | -- | +| `Tableau` | [Data Platform](../../metamodel/entities/dataPlatform.md) | | +| Embedded DataSource | [Dataset](../../metamodel/entities/dataset.md) | | +| Published DataSource | [Dataset](../../metamodel/entities/dataset.md) | | +| Custom SQL Table | [Dataset](../../metamodel/entities/dataset.md) | | +| Embedded or External Tables | [Dataset](../../metamodel/entities/dataset.md) | | +| Sheet | [Chart](../../metamodel/entities/chart.md) | | +| Dashboard | [Dashboard](../../metamodel/entities/dashboard.md) | | +| User | [User (a.k.a CorpUser)](../../metamodel/entities/corpuser.md) | | +| Workbook | [Container](../../metamodel/entities/container.md) | | +| Tag | [Tag](../../metamodel/entities/tag.md) | | + + +- [Workbook](#Workbook) +- [Dashboard](#Dashboard) +- [Sheet](#Sheet) +- [Embedded Data source](#Embedded-Data-Source) +- [Published Data source](#Published-Data-Source) +- [Custom SQL Data source](#Custom-SQL-Data-Source) + +#### Workbook +Workbooks from Tableau are ingested as Container in datahub.
+- GraphQL query
+```graphql +{ + workbooksConnection(first: 15, offset: 0, filter: {projectNameWithin: ["default", "Project 2"]}) { + nodes { + id + name + luid + uri + projectName + owner { + username + } + description + uri + createdAt + updatedAt + } + pageInfo { + hasNextPage + endCursor + } + totalCount + } +} +``` + +#### Dashboard +Dashboards from Tableau are ingested as Dashboard in datahub.
+- GraphQL query
+```graphql +{ + workbooksConnection(first: 15, offset: 0, filter: {projectNameWithin: ["default", "Project 2"]}) { + nodes { + ..... + dashboards { + id + name + path + createdAt + updatedAt + sheets { + id + name + } + } + } + pageInfo { + hasNextPage + endCursor + } + totalCount + } +} + +``` + +#### Sheet +Sheets from Tableau are ingested as charts in datahub.
+- GraphQL query
+```graphql +{ + workbooksConnection(first: 10, offset: 0, filter: {projectNameWithin: ["default"]}) { + ..... + sheets { + id + name + path + createdAt + updatedAt + tags { + name + } + containedInDashboards { + name + path + } + upstreamDatasources { + id + name + } + datasourceFields { + __typename + id + name + description + upstreamColumns { + name + } + ... on ColumnField { + dataCategory + role + dataType + aggregation + } + ... on CalculatedField { + role + dataType + aggregation + formula + } + ... on GroupField { + role + dataType + } + ... on DatasourceField { + remoteField { + __typename + id + name + description + folderName + ... on ColumnField { + dataCategory + role + dataType + aggregation + } + ... on CalculatedField { + role + dataType + aggregation + formula + } + ... on GroupField { + role + dataType + } + } + } + } + } + } + ..... + } +} +``` + +#### Embedded Data Source +Embedded Data source from Tableau is ingested as a Dataset in datahub. + +- GraphQL query
+```graphql +{ + workbooksConnection(first: 15, offset: 0, filter: {projectNameWithin: ["default"]}) { + nodes { + .... + embeddedDatasources { + __typename + id + name + hasExtracts + extractLastRefreshTime + extractLastIncrementalUpdateTime + extractLastUpdateTime + upstreamDatabases { + id + name + connectionType + isEmbedded + } + upstreamTables { + name + schema + columns { + name + remoteType + } + } + fields { + __typename + id + name + description + isHidden + folderName + ... on ColumnField { + dataCategory + role + dataType + defaultFormat + aggregation + columns { + table { + ... on CustomSQLTable { + id + name + } + } + } + } + ... on CalculatedField { + role + dataType + defaultFormat + aggregation + formula + } + ... on GroupField { + role + dataType + } + } + upstreamDatasources { + id + name + } + workbook { + name + projectName + } + } + } + .... + } +} +``` + +#### Published Data Source +Published Data source from Tableau is ingested as a Dataset in datahub. + +- GraphQL query
+```graphql +{ + publishedDatasourcesConnection(filter: {idWithin: ["00cce29f-b561-bb41-3557-8e19660bb5dd", "618c87db-5959-338b-bcc7-6f5f4cc0b6c6"]}) { + nodes { + __typename + id + name + hasExtracts + extractLastRefreshTime + extractLastIncrementalUpdateTime + extractLastUpdateTime + downstreamSheets { + id + name + } + upstreamTables { + name + schema + fullName + connectionType + description + contact { + name + } + } + fields { + __typename + id + name + description + isHidden + folderName + ... on ColumnField { + dataCategory + role + dataType + defaultFormat + aggregation + columns { + table { + ... on CustomSQLTable { + id + name + } + } + } + } + ... on CalculatedField { + role + dataType + defaultFormat + aggregation + formula + } + ... on GroupField { + role + dataType + } + } + owner { + username + } + description + uri + projectName + } + pageInfo { + hasNextPage + endCursor + } + totalCount + } +} +``` + +#### Custom SQL Data Source +For custom sql data sources, the query is viewable in UI under View Definition tab.
+- GraphQL query
+```graphql +{ + customSQLTablesConnection(filter: {idWithin: ["22b0b4c3-6b85-713d-a161-5a87fdd78f40"]}) { + nodes { + id + name + query + columns { + id + name + remoteType + description + referencedByFields { + datasource { + id + name + upstreamDatabases { + id + name + } + upstreamTables { + id + name + schema + connectionType + columns { + id + } + } + ... on PublishedDatasource { + projectName + } + ... on EmbeddedDatasource { + workbook { + name + projectName + } + } + } + } + } + tables { + id + name + schema + connectionType + } + } + } +} +``` + +#### Lineage +Lineage is emitted as received from Tableau's metadata API for +- Sheets contained in Dashboard +- Embedded or Published datasources upstream to Sheet +- Published datasources upstream to Embedded datasource +- Tables upstream to Embedded or Published datasource +- Custom SQL datasources upstream to Embedded or Published datasource +- Tables upstream to Custom SQL datasource + + +#### Caveats +- Tableau metadata API might return incorrect schema name for tables for some databases, leading to incorrect metadata in DataHub. This source attempts to extract correct schema from databaseTable's fully qualified name, wherever possible. Read [Using the databaseTable object in query](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_model.html#schema_attribute) for caveats in using schema attribute. + +## Troubleshooting + +### Why are only some workbooks ingested from the specified project? + +This happens when the Tableau API returns NODE_LIMIT_EXCEEDED error and returns partial results with message "Showing partial results. , The request exceeded the ‘n’ node limit. Use pagination, additional filtering, or both in the query to adjust results." To resolve this, reduce the page size using the `workbooks_page_size` config param (Defaults to 10). 
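As a rough illustration of the pagination suggested in the troubleshooting note, the sketch below pages through `workbooksConnection` in the same spirit as the source's `workbooks_page_size` option. It assumes the Metadata API is exposed at `/api/metadata/graphql` and that a sign-in token is already available for the `X-Tableau-Auth` header; the server URL, project filter, and page size are placeholders.

```python
import requests

TABLEAU_SERVER = "https://prod-ca-a.online.tableau.com"  # placeholder
AUTH_TOKEN = "..."  # obtained via the Tableau REST API sign-in, not shown here

QUERY = """
query pagedWorkbooks($first: Int, $offset: Int) {
  workbooksConnection(first: $first, offset: $offset,
                      filter: {projectNameWithin: ["default"]}) {
    nodes { id name projectName }
    pageInfo { hasNextPage }
    totalCount
  }
}
"""

def fetch_all_workbooks(page_size: int = 10) -> list:
    """Collect workbook nodes page by page to stay under the node limit."""
    offset, nodes = 0, []
    while True:
        resp = requests.post(
            f"{TABLEAU_SERVER}/api/metadata/graphql",
            json={"query": QUERY, "variables": {"first": page_size, "offset": offset}},
            headers={"X-Tableau-Auth": AUTH_TOKEN},
        )
        resp.raise_for_status()
        connection = resp.json()["data"]["workbooksConnection"]
        nodes.extend(connection["nodes"])
        if not connection["pageInfo"]["hasNextPage"]:
            return nodes
        offset += page_size
```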
diff --git a/metadata-ingestion/docs/sources/tableau/tableau_recipe.yml b/metadata-ingestion/docs/sources/tableau/tableau_recipe.yml new file mode 100644 index 0000000000000..6596132c94f5a --- /dev/null +++ b/metadata-ingestion/docs/sources/tableau/tableau_recipe.yml @@ -0,0 +1,21 @@ +source: + type: tableau + config: + # Coordinates + connect_uri: https://prod-ca-a.online.tableau.com + site: acryl + projects: ["default", "Project 2"] + + # Credentials + username: "${TABLEAU_USER}" + password: "${TABLEAU_PASSWORD}" + + # Options + ingest_tags: True + ingest_owner: True + default_schema_map: + mydatabase: public + anotherdatabase: anotherschema + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/trino/starburst-trino-usage_recipe.yml b/metadata-ingestion/docs/sources/trino/starburst-trino-usage_recipe.yml new file mode 100644 index 0000000000000..eeed0ec6898ea --- /dev/null +++ b/metadata-ingestion/docs/sources/trino/starburst-trino-usage_recipe.yml @@ -0,0 +1,18 @@ +source: + type: starburst-trino-usage + config: + # Coordinates + host_port: yourtrinohost:port + # The name of the catalog from getting the usage + database: hive + # Credentials + username: trino_username + password: trino_password + email_domain: test.com + audit_catalog: audit + audit_schema: audit_schema + +sink: + type: "datahub-rest" + config: + server: "http://localhost:8080" \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/trino/trino_recipe.yml b/metadata-ingestion/docs/sources/trino/trino_recipe.yml new file mode 100644 index 0000000000000..df13661d7277b --- /dev/null +++ b/metadata-ingestion/docs/sources/trino/trino_recipe.yml @@ -0,0 +1,13 @@ +source: + type: trino + config: + # Coordinates + host_port: localhost:5300 + database: dbname + + # Credentials + username: foo + password: datahub + +sink: + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/examples/recipes/kafka_to_console.yml b/metadata-ingestion/examples/recipes/kafka_to_console.yml index fc46a5f2c833d..2cbfe2f251b14 100644 --- a/metadata-ingestion/examples/recipes/kafka_to_console.yml +++ b/metadata-ingestion/examples/recipes/kafka_to_console.yml @@ -4,7 +4,7 @@ source: type: "kafka" config: connection: - bootstrap: "broker:9092" + bootstrap: "localhost:9092" # see https://datahubproject.io/docs/metadata-ingestion/sink_docs/console for complete documentation sink: diff --git a/metadata-ingestion/scripts/docgen.py b/metadata-ingestion/scripts/docgen.py new file mode 100644 index 0000000000000..3d2c358bc6b14 --- /dev/null +++ b/metadata-ingestion/scripts/docgen.py @@ -0,0 +1,751 @@ +import glob +import json +import logging +import os +import re +import textwrap +from importlib.metadata import metadata, requires +from typing import Any, Dict, List, Optional + +import click +from pydantic import Field +from pydantic.dataclasses import dataclass + +from datahub.configuration.common import ConfigModel +from datahub.ingestion.api.decorators import ( + CapabilitySetting, + SourceCapability, + SupportStatus, +) +from datahub.ingestion.api.registry import PluginRegistry +from datahub.ingestion.api.source import Source + +logger = logging.getLogger(__name__) + + +@dataclass +class FieldRow: + path: str + type_name: str + required: bool + default: str + description: str + inner_fields: List["FieldRow"] = Field(default_factory=list) + + @staticmethod + def get_checkbox(enabled: bool) -> str: + return "✅" if enabled else "" + + def to_md_line(self) -> str: + return ( + 
f"| {self.path} | {self.get_checkbox(self.required)} | {self.type_name} | {self.description} | {self.default} |\n" + + "".join([inner_field.to_md_line() for inner_field in self.inner_fields]) + ) + + +class FieldHeader(FieldRow): + def to_md_line(self) -> str: + return "\n".join( + [ + "| Field | Required | Type | Description | Default |", + "| --- | --- | --- | --- | -- |", + "", + ] + ) + + def __init__(self): + pass + + +def get_definition_dict_from_definition( + definitions_dict: Dict[str, Any], definition_name: str +) -> Dict[str, Any]: + import re + + m = re.search("#/definitions/(.*)$", definition_name) + if m: + definition_term: str = m.group(1) + definition_dict = definitions_dict[definition_term] + return definition_dict + + raise Exception("Failed to find a definition for " + definition_name) + + +def get_prefixed_name(field_prefix: Optional[str], field_name: Optional[str]) -> str: + assert ( + field_prefix or field_name + ), "One of field_prefix or field_name should be present" + return ( + f"{field_prefix}.{field_name}" # type: ignore + if field_prefix and field_name + else field_name + if not field_prefix + else field_prefix + ) + + +def gen_md_table_from_struct(schema_dict: Dict[str, Any]) -> List[str]: + table_md_str: List[FieldRow] = [] + # table_md_str = [ + # "\n\nType\n" + # ] + gen_md_table(schema_dict, schema_dict.get("definitions", {}), md_str=table_md_str) + # table_md_str.append("\n
\nField\nDefaultDescription
\n") + + table_md_str = [field for field in table_md_str if len(field.inner_fields) == 0] + [ + field for field in table_md_str if len(field.inner_fields) > 0 + ] + + # table_md_str.sort(key=lambda x: "z" if len(x.inner_fields) else "" + x.path) + return ( + [FieldHeader().to_md_line()] + + [row.to_md_line() for row in table_md_str] + + ["\n"] + ) + + +def get_enum_description( + authored_description: Optional[str], enum_symbols: List[str] +) -> str: + description = authored_description or "" + missed_symbols = [symbol for symbol in enum_symbols if symbol not in description] + if missed_symbols: + description = ( + description + "." + if description + else "" + " Allowed symbols are " + ",".join(enum_symbols) + ) + + return description + + +def gen_md_table( + field_dict: Dict[str, Any], + definitions_dict: Dict[str, Any], + md_str: List[FieldRow], + field_prefix: str = None, +) -> None: + if "enum" in field_dict: + md_str.append( + FieldRow( + path=get_prefixed_name(field_prefix, None), + type_name="Enum", + required=field_dict.get("required") or False, + description=f"one of {','.join(field_dict['enum'])}", + default=field_dict.get("default") or "None", + ) + ) + # md_str.append( + # f"| {get_prefixed_name(field_prefix, None)} | Enum | {field_dict['type']} | one of {','.join(field_dict['enum'])} |\n" + # ) + + elif "properties" in field_dict: + for field_name, value in field_dict["properties"].items(): + required_field: bool = field_name in field_dict.get("required", []) + + if "allOf" in value: + for sub_schema in value["allOf"]: + reference = sub_schema["$ref"] + def_dict = get_definition_dict_from_definition( + definitions_dict, reference + ) + # special case for enum reference, we don't split up the rows + if "enum" in def_dict: + row = FieldRow( + path=get_prefixed_name(field_prefix, field_name), + type_name=f"enum({reference.split('/')[-1]})", + description=get_enum_description( + value.get("description"), def_dict["enum"] + ), + default=str(value.get("default")) or "", + required=required_field, + ) + md_str.append(row) + else: + # object reference + row = FieldRow( + path=get_prefixed_name(field_prefix, field_name), + type_name=f"{reference.split('/')[-1]} (see below for fields)", + description=value.get("description") or "", + default=str(value.get("default")) or "", + required=required_field, + ) + md_str.append(row) + # md_str.append( + # f"| {get_prefixed_name(field_prefix, field_name)} | {reference.split('/')[-1]} (see below for fields) | {value.get('description') or ''} | {value.get('default') or ''} | \n" + # ) + gen_md_table( + def_dict, + definitions_dict, + field_prefix=get_prefixed_name(field_prefix, field_name), + md_str=row.inner_fields, + ) + elif "type" in value and value["type"] == "enum": + # enum + enum_definition = value["allOf"][0]["$ref"] + def_dict = get_definition_dict_from_definition( + definitions_dict, enum_definition + ) + print(value) + print(def_dict) + md_str.append( + FieldRow( + path=get_prefixed_name(field_prefix, field_name), + type_name="Enum", + description=f"one of {','.join(def_dict['enum'])}", + required=required_field, + default=value.get("default") or "None", + ) + # f"| {get_prefixed_name(field_prefix, field_name)} | Enum | one of {','.join(def_dict['enum'])} | {def_dict['type']} | \n" + ) + + elif "type" in value and value["type"] == "object": + # struct + if "$ref" not in value: + if ( + "additionalProperties" in value + and "$ref" in value["additionalProperties"] + ): + # breakpoint() + value_ref = value["additionalProperties"]["$ref"] 
+ def_dict = get_definition_dict_from_definition( + definitions_dict, value_ref + ) + + row = FieldRow( + path=get_prefixed_name(field_prefix, field_name), + type_name=f"Dict[str, {value_ref.split('/')[-1]}]", + description=value.get("description") or "", + default=value.get("default") or "", + required=required_field, + ) + md_str.append(row) + gen_md_table( + def_dict, + definitions_dict, + field_prefix=get_prefixed_name( + field_prefix, f"{field_name}.`key`" + ), + md_str=row.inner_fields, + ) + else: + value_type = value.get("additionalProperties", {}).get("type") + md_str.append( + FieldRow( + path=get_prefixed_name(field_prefix, field_name), + type_name=f"Dict[str,{value_type}]" + if value_type + else "Dict", + description=value.get("description") or "", + default=value.get("default") or "", + required=required_field, + ) + ) + else: + object_definition = value["$ref"] + row = FieldRow( + path=get_prefixed_name(field_prefix, field_name), + type_name=f"{object_definition.split('/')[-1]} (see below for fields)", + description=value.get("description") or "", + default=value.get("default") or "", + required=required_field, + ) + + md_str.append( + row + # f"| {get_prefixed_name(field_prefix, field_name)} | {object_definition.split('/')[-1]} (see below for fields) | {value.get('description') or ''} | {value.get('default') or ''} | \n" + ) + def_dict = get_definition_dict_from_definition( + definitions_dict, object_definition + ) + gen_md_table( + def_dict, + definitions_dict, + field_prefix=get_prefixed_name(field_prefix, field_name), + md_str=row.inner_fields, + ) + elif "type" in value and value["type"] == "array": + # array + items_type = value["items"].get("type", "object") + md_str.append( + FieldRow( + path=get_prefixed_name(field_prefix, field_name), + type_name=f"Array of {items_type}", + description=value.get("description") or "", + default=str(value.get("default")) or "None", + required=required_field, + ) + # f"| {get_prefixed_name(field_prefix, field_name)} | Array of {items_type} | {value.get('description') or ''} | {value.get('default')} | \n" + ) + # TODO: Array of structs + elif "type" in value: + md_str.append( + FieldRow( + path=get_prefixed_name(field_prefix, field_name), + type_name=value["type"], + description=value.get("description") or "", + default=value.get("default") or "None", + required=required_field, + ) + # f"| {get_prefixed_name(field_prefix, field_name)} | {value['type']} | {value.get('description') or ''} | {value.get('default')} | \n" + ) + elif "$ref" in value: + object_definition = value["$ref"] + def_dict = get_definition_dict_from_definition( + definitions_dict, object_definition + ) + row = FieldRow( + path=get_prefixed_name(field_prefix, field_name), + type_name=f"{object_definition.split('/')[-1]} (see below for fields)", + description=value.get("description") or "", + default=value.get("default") or "", + required=required_field, + ) + + md_str.append( + row + # f"| {get_prefixed_name(field_prefix, field_name)} | {object_definition.split('/')[-1]} (see below for fields) | {value.get('description') or ''} | {value.get('default') or ''} | \n" + ) + gen_md_table( + def_dict, + definitions_dict, + field_prefix=get_prefixed_name(field_prefix, field_name), + md_str=row.inner_fields, + ) + else: + # print(md_str, field_prefix, field_name, value) + md_str.append( + FieldRow( + path=get_prefixed_name(field_prefix, field_name), + type_name="Generic dict", + description=value.get("description", ""), + default=value.get("default", "None"), + 
required=required_field, + ) + # f"| {get_prefixed_name(field_prefix, field_name)} | Any dict | {value.get('description') or ''} | {value.get('default')} |\n" + ) + + +def get_snippet(long_string: str, max_length: int = 100) -> str: + snippet = "" + if len(long_string) > max_length: + snippet = long_string[:max_length].strip() + "... " + else: + snippet = long_string.strip() + + snippet = snippet.replace("\n", " ") + snippet = snippet.strip() + " " + return snippet + + +def get_support_status_badge(support_status: SupportStatus) -> str: + if support_status == SupportStatus.CERTIFIED: + return "![Certified](https://img.shields.io/badge/support%20status-certified-brightgreen)" + if support_status == SupportStatus.INCUBATING: + return "![Incubating](https://img.shields.io/badge/support%20status-incubating-blue)" + if support_status == SupportStatus.TESTING: + return "![Testing](https://img.shields.io/badge/support%20status-testing-lightgrey)" + + return "" + + +def get_capability_supported_badge(supported: bool) -> str: + return "✅" if supported else "❌" + + +def get_capability_text(src_capability: SourceCapability) -> str: + """ + Returns markdown format cell text for a capability, hyperlinked to capability feature page if known + """ + capability_docs_mapping: Dict[SourceCapability, str] = { + SourceCapability.DELETION_DETECTION: "../../../../metadata-ingestion/docs/dev_guides/stateful.md#removal-of-stale-tables-and-views", + SourceCapability.DOMAINS: "../../../domains.md", + SourceCapability.PLATFORM_INSTANCE: "../../../platform-instances.md", + SourceCapability.DATA_PROFILING: "../../../../metadata-ingestion/docs/dev_guides/sql_profiles.md", + } + + capability_doc = capability_docs_mapping.get(src_capability) + return ( + src_capability.value + if not capability_doc + else f"[{src_capability.value}]({capability_doc})" + ) + + +def create_or_update( + something: Dict[Any, Any], path: List[str], value: Any +) -> Dict[Any, Any]: + dict_under_operation = something + for p in path[:-1]: + if p not in dict_under_operation: + dict_under_operation[p] = {} + dict_under_operation = dict_under_operation[p] + + dict_under_operation[path[-1]] = value + return something + + +def does_extra_exist(extra_name: str) -> bool: + for key, value in metadata("acryl-datahub").items(): + if key == "Provides-Extra" and value == extra_name: + return True + return False + + +def get_additional_deps_for_extra(extra_name: str) -> List[str]: + all_requirements = requires("acryl-datahub") or [] + # filter for base dependencies + base_deps = set([x.split(";")[0] for x in all_requirements if "extra ==" not in x]) + # filter for dependencies for this extra + extra_deps = set( + [x.split(";")[0] for x in all_requirements if f'extra == "{extra_name}"' in x] + ) + # calculate additional deps that this extra adds + delta_deps = extra_deps - base_deps + return list(delta_deps) + + +def relocate_path(orig_path: str, relative_path: str, relocated_path: str) -> str: + + newPath = os.path.join(os.path.dirname(orig_path), relative_path) + assert os.path.exists(newPath) + + newRelativePath = os.path.relpath(newPath, os.path.dirname(relocated_path)) + return newRelativePath + + +def rewrite_markdown(file_contents: str, path: str, relocated_path: str) -> str: + def new_url(original_url: str, file_path: str) -> str: + if original_url.startswith(("http://", "https://", "#")): + return original_url + import pathlib + + file_ext = pathlib.Path(original_url).suffix + if file_ext.startswith(".md"): + return original_url + elif file_ext in 
[".png", ".svg", ".gif", ".pdf"]: + new_url = relocate_path(path, original_url, relocated_path) + return new_url + return original_url + + # Look for the [text](url) syntax. Note that this will also capture images. + # + # We do a little bit of parenthesis matching here to account for parens in URLs. + # See https://stackoverflow.com/a/17759264 for explanation of the second capture group. + new_content = re.sub( + r"\[(.*?)\]\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)", + lambda x: f"[{x.group(1)}]({new_url(x.group(2).strip(),path)})", # type: ignore + file_contents, + ) + + new_content = re.sub( + # Also look for the [text]: url syntax. + r"^\[(.+?)\]\s*:\s*(.+?)\s*$", + lambda x: f"[{x.group(1)}]: {new_url(x.group(2), path)}", + new_content, + ) + return new_content + + +@click.command() +@click.option("--out-dir", type=str, required=True) +@click.option("--extra-docs", type=str, required=False) +@click.option("--source", type=str, required=False) +def generate( + out_dir: str, extra_docs: Optional[str] = None, source: Optional[str] = None +) -> None: # noqa: C901 + source_documentation: Dict[str, Any] = {} + metrics = {} + metrics["source_platforms"] = {"discovered": 0, "generated": 0, "warnings": []} + metrics["plugins"] = {"discovered": 0, "generated": 0, "failed": 0} + + if extra_docs: + for path in glob.glob(f"{extra_docs}/**/*[.md|.yaml|.yml]", recursive=True): + # breakpoint() + + m = re.search("/docs/sources/(.*)/(.*).md", path) + if m: + platform_name = m.group(1).lower() + file_name = m.group(2) + destination_md: str = ( + f"../docs/generated/ingestion/sources/{platform_name}.md" + ) + + with open(path, "r") as doc_file: + file_contents = doc_file.read() + final_markdown = rewrite_markdown( + file_contents, path, destination_md + ) + + if file_name == "README": + # README goes as platform level docs + # all other docs are assumed to be plugin level + create_or_update( + source_documentation, + [platform_name, "custom_docs"], + final_markdown, + ) + else: + create_or_update( + source_documentation, + [platform_name, "plugins", file_name, "custom_docs"], + final_markdown, + ) + else: + yml_match = re.search("/docs/sources/(.*)/(.*)_recipe.yml", path) + if yml_match: + platform_name = yml_match.group(1).lower() + plugin_name = yml_match.group(2) + with open(path, "r") as doc_file: + file_contents = doc_file.read() + create_or_update( + source_documentation, + [platform_name, "plugins", plugin_name, "recipe"], + file_contents, + ) + + source_registry = PluginRegistry[Source]() + source_registry.register_from_entrypoint("datahub.ingestion.source.plugins") + + # This source is always enabled + for plugin_name in sorted(source_registry._mapping.keys()): + if source and source != plugin_name: + continue + + metrics["plugins"]["discovered"] = metrics["plugins"]["discovered"] + 1 + # We want to attempt to load all plugins before printing a summary. 
+ source_type = None + try: + # output = subprocess.check_output( + # ["/bin/bash", "-c", f"pip install -e '.[{key}]'"] + # ) + + source_registry._ensure_not_lazy(plugin_name) + logger.debug(f"Processing {plugin_name}") + source_type = source_registry.get(plugin_name) + logger.debug(f"Source class is {source_type}") + extra_plugin = plugin_name if does_extra_exist(plugin_name) else None + extra_deps = ( + get_additional_deps_for_extra(extra_plugin) if extra_plugin else [] + ) + + except Exception as e: + print(f"Failed to process {plugin_name} due to {e}") + metrics["plugins"]["failed"] = metrics["plugins"].get["failed"] + 1 + + if source_type and hasattr(source_type, "get_config_class"): + try: + source_config_class: ConfigModel = source_type.get_config_class() + support_status = SupportStatus.UNKNOWN + capabilities = [] + if hasattr(source_type, "__doc__"): + source_doc = textwrap.dedent(source_type.__doc__ or "") + if hasattr(source_type, "get_platform_name"): + platform_name = source_type.get_platform_name() + else: + platform_name = ( + plugin_name.title() + ) # we like platform names to be human readable + + if hasattr(source_type, "get_platform_id"): + platform_id = source_type.get_platform_id() + + source_documentation[platform_id] = ( + source_documentation.get(platform_id) or {} + ) + # breakpoint() + + create_or_update( + source_documentation, + [platform_id, "plugins", plugin_name, "classname"], + ".".join([source_type.__module__, source_type.__name__]), + ) + plugin_file_name = "src/" + "/".join(source_type.__module__.split(".")) + if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name): + plugin_file_name = plugin_file_name + "/__init__.py" + else: + plugin_file_name = plugin_file_name + ".py" + if os.path.exists(plugin_file_name): + create_or_update( + source_documentation, + [platform_id, "plugins", plugin_name, "filename"], + plugin_file_name, + ) + else: + logger.info( + f"Failed to locate filename for {plugin_name}. 
Guessed {plugin_file_name}" + ) + + if hasattr(source_type, "get_support_status"): + support_status = source_type.get_support_status() + + if hasattr(source_type, "get_capabilities"): + capabilities = list(source_type.get_capabilities()) + capabilities.sort(key=lambda x: x.capability.value) + + create_or_update( + source_documentation, + [platform_id, "plugins", plugin_name, "capabilities"], + capabilities, + ) + + create_or_update( + source_documentation, [platform_id, "name"], platform_name + ) + + create_or_update( + source_documentation, + [platform_id, "plugins", plugin_name, "extra_deps"], + extra_deps, + ) + + config_dir = f"{out_dir}/config_schemas" + os.makedirs(config_dir, exist_ok=True) + with open(f"{config_dir}/{plugin_name}_config.json", "w") as f: + f.write(source_config_class.schema_json(indent=2)) + + table_md = gen_md_table_from_struct(source_config_class.schema()) + create_or_update( + source_documentation, + [platform_id, "plugins", plugin_name, "source_doc"], + source_doc or "", + ) + create_or_update( + source_documentation, + [platform_id, "plugins", plugin_name, "config"], + table_md, + ) + create_or_update( + source_documentation, + [platform_id, "plugins", plugin_name, "support_status"], + support_status, + ) + + except Exception as e: + raise e + + sources_dir = f"{out_dir}/sources" + os.makedirs(sources_dir, exist_ok=True) + + for platform_id, platform_docs in source_documentation.items(): + if source and platform_id != source: + continue + metrics["source_platforms"]["discovered"] = ( + metrics["source_platforms"]["discovered"] + 1 + ) + platform_doc_file = f"{sources_dir}/{platform_id}.md" + if "name" not in platform_docs: + # We seem to have discovered written docs that corresponds to a platform, but haven't found linkage to it from the source classes + warning_msg = f"Failed to find source classes for platform {platform_id}. Did you remember to annotate your source class with @platform_name({platform_id})?" + logger.error(warning_msg) + metrics["source_platforms"]["warnings"].append(warning_msg) + + with open(platform_doc_file, "w") as f: + if "name" in platform_docs: + f.write(f"# {platform_docs['name']}\n") + if len(platform_docs["plugins"].keys()) > 1: + # More than one plugin used to provide integration with this platform + f.write( + f"There are {len(platform_docs['plugins'].keys())} sources that provide integration with {platform_docs['name']}\n" + ) + f.write("\n") + f.write("\n") + f.write("") + for col_header in ["Source Module", "Documentation"]: + f.write(f"") + f.write("") + + # f.write("| Source Module | Documentation |\n") + # f.write("| ------ | ---- |\n") + for plugin in platform_docs["plugins"]: + f.write("\n") + f.write(f"\n") + f.write( + f"\n" + ) + f.write("\n") + # f.write( + # f"| `{plugin}` | {get_snippet(platform_docs['plugins'][plugin]['source_doc'])}[Read more...](#module-{plugin}) |\n" + # ) + f.write("
{col_header}
\n\n`{plugin}`\n\n\n\n\n{platform_docs['plugins'][plugin].get('source_doc') or ''} [Read more...](#module-{plugin})\n\n\n
\n") + # insert platform level custom docs before plugin section + f.write(platform_docs.get("custom_docs") or "") + for plugin, plugin_docs in platform_docs["plugins"].items(): + f.write(f"\n## Module `{plugin}`\n") + if "support_status" in plugin_docs: + f.write( + get_support_status_badge(plugin_docs["support_status"]) + "\n\n" + ) + if "capabilities" in plugin_docs and len(plugin_docs["capabilities"]): + f.write("\n### Important Capabilities\n") + f.write("| Capability | Status | Notes |\n") + f.write("| ---------- | ------ | ----- |\n") + plugin_capabilities: List[CapabilitySetting] = plugin_docs[ + "capabilities" + ] + for cap_setting in plugin_capabilities: + f.write( + f"| {get_capability_text(cap_setting.capability)} | {get_capability_supported_badge(cap_setting.supported)} | {cap_setting.description} |\n" + ) + f.write("\n") + + f.write(f"{plugin_docs.get('source_doc') or ''}\n") + if "extra_deps" in plugin_docs: + f.write("### Install the Plugin\n") + if plugin_docs["extra_deps"] != []: + f.write("```shell\n") + f.write(f"pip install 'acryl-datahub[{plugin}]`\n") + f.write("```\n") + else: + f.write( + f"The `{plugin}` source works out of the box with `acryl-datahub`.\n" + ) + if "recipe" in plugin_docs: + f.write("\n### Quickstart Recipe\n") + f.write( + "Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.\n\n\n" + ) + f.write( + "For general pointers on writing and running a recipe, see our [main recipe guide](../../../../metadata-ingestion/README.md#recipes)\n" + ) + f.write("```yaml\n") + f.write(plugin_docs["recipe"]) + f.write("\n```\n") + if "config" in plugin_docs: + f.write("\n### Config Details\n") + f.write( + "Note that a `.` is used to denote nested fields in the YAML recipe.\n\n" + ) + f.write( + "\n
\nView All Configuration Options\n\n" + ) + for doc in plugin_docs["config"]: + f.write(doc) + f.write("\n
\n\n") + # insert custom plugin docs after config details + f.write(plugin_docs.get("custom_docs", "")) + if "classname" in plugin_docs: + f.write("\n### Code Coordinates\n") + f.write(f"- Class Name: `{plugin_docs['classname']}`\n") + if "filename" in plugin_docs: + f.write( + f"- Browse on [GitHub](../../../../metadata-ingestion/{plugin_docs['filename']})\n\n" + ) + metrics["plugins"]["generated"] = metrics["plugins"]["generated"] + 1 + + f.write("\n## Questions\n") + f.write( + f"If you've got any questions on configuring ingestion for {platform_docs.get('name',platform_id)}, feel free to ping us on [our Slack](https://slack.datahubproject.io)\n" + ) + metrics["source_platforms"]["generated"] = ( + metrics["source_platforms"]["generated"] + 1 + ) + print("Ingestion Documentation Generation Complete") + print("############################################") + print(json.dumps(metrics, indent=2)) + print("############################################") + + +if __name__ == "__main__": + logger.setLevel("INFO") + generate() diff --git a/metadata-ingestion/scripts/docgen.sh b/metadata-ingestion/scripts/docgen.sh new file mode 100755 index 0000000000000..affb87f2e70a9 --- /dev/null +++ b/metadata-ingestion/scripts/docgen.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -euo pipefail + +# Note: this assumes that datahub has already been built with `./gradlew build`. +DATAHUB_ROOT=.. +DOCS_OUT_DIR=$DATAHUB_ROOT/docs/generated/ingestion +EXTRA_DOCS_DIR=$DATAHUB_ROOT/metadata-ingestion/docs/sources + +rm -r $DOCS_OUT_DIR || true +python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --extra-docs ${EXTRA_DOCS_DIR} $@ diff --git a/metadata-ingestion/sink_docs/file.md b/metadata-ingestion/sink_docs/file.md index 640f10f6a8790..f4685730039b7 100644 --- a/metadata-ingestion/sink_docs/file.md +++ b/metadata-ingestion/sink_docs/file.md @@ -10,7 +10,7 @@ Works with `acryl-datahub` out of the box. Outputs metadata to a file. This can be used to decouple metadata sourcing from the process of pushing it into DataHub, and is particularly useful for debugging purposes. -Note that the [file source](../source_docs/file.md) can read files generated by this sink. +Note that the [file source](../../docs/generated/ingestion/sources/file.md) can read files generated by this sink. ## Quickstart recipe diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 02710928a18d1..b2fa5650d6149 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -3,6 +3,7 @@ from typing import IO, Any, Dict, List, Optional, Pattern, cast from pydantic import BaseModel +from pydantic.fields import Field class ConfigModel(BaseModel): @@ -11,10 +12,15 @@ class Config: class DynamicTypedConfig(ConfigModel): - type: str + type: str = Field( + description="The type of the dynamic object", + ) # This config type is declared Optional[Any] here. The eventual parser for the # specified type is responsible for further validation. - config: Optional[Any] + config: Optional[Any] = Field( + default=None, + description="The configuration required for initializing the state provider. Default: The datahub_api config if set at pipeline level. Otherwise, the default DatahubClientConfig. 
See the defaults (https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/graph/client.py#L19).", + ) class MetaError(Exception): @@ -72,12 +78,21 @@ def load_config(self, config_fp: IO) -> dict: class AllowDenyPattern(ConfigModel): """A class to store allow deny regexes""" - allow: List[str] = [".*"] - deny: List[str] = [] - ignoreCase: Optional[ - bool - ] = True # Name comparisons should default to ignoring case - alphabet: str = "[A-Za-z0-9 _.-]" + allow: List[str] = Field( + default=[".*"], + description="List of regex patterns for process groups to include in ingestion", + ) + deny: List[str] = Field( + default=[], + description="List of regex patterns for process groups to exclude from ingestion.", + ) + ignoreCase: Optional[bool] = Field( + default=True, + description="Whether to ignore case sensitivity during pattern matching.", + ) # Name comparisons should default to ignoring case + alphabet: str = Field( + default="[A-Za-z0-9 _.-]", description="Allowed alphabets pattern" + ) @property def alphabet_pattern(self) -> Pattern: diff --git a/metadata-ingestion/src/datahub/configuration/github.py b/metadata-ingestion/src/datahub/configuration/github.py index e6dcb36197605..31d73f8b24f73 100644 --- a/metadata-ingestion/src/datahub/configuration/github.py +++ b/metadata-ingestion/src/datahub/configuration/github.py @@ -1,12 +1,17 @@ -from pydantic import validator +from pydantic import Field, validator from datahub.configuration.common import ConfigModel, ConfigurationError class GitHubInfo(ConfigModel): - repo: str - branch: str = "main" - base_url: str = "https://github.com" + repo: str = Field( + description="Name of your github repo. e.g. repo for https://github.com/datahub-project/datahub is `datahub-project/datahub`." + ) + branch: str = Field( + "main", + description="Branch on which your files live by default. Typically main or master.", + ) + base_url: str = Field("https://github.com", description="Base url for Github") @validator("repo") def repo_should_be_org_slash_repo(cls, repo: str) -> str: diff --git a/metadata-ingestion/src/datahub/configuration/time_window_config.py b/metadata-ingestion/src/datahub/configuration/time_window_config.py index 34df9d640f22c..2ea6663e48488 100644 --- a/metadata-ingestion/src/datahub/configuration/time_window_config.py +++ b/metadata-ingestion/src/datahub/configuration/time_window_config.py @@ -3,6 +3,7 @@ from typing import Any, Dict import pydantic +from pydantic.fields import Field from datahub.configuration.common import ConfigModel from datahub.metadata.schema_classes import CalendarIntervalClass @@ -31,13 +32,16 @@ def get_bucket_duration_delta(bucketing: BucketDuration) -> timedelta: class BaseTimeWindowConfig(ConfigModel): - bucket_duration: BucketDuration = BucketDuration.DAY + bucket_duration: BucketDuration = Field( + default=BucketDuration.DAY, + description="Size of the time window to aggregate usage stats.", + ) # `start_time` and `end_time` will be populated by the pre-validators. # However, we must specify a "default" value here or pydantic will complain # if those fields are not set by the user. - end_time: datetime = None # type: ignore - start_time: datetime = None # type: ignore + end_time: datetime = Field(default=None, description="Latest date of usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`)") # type: ignore + start_time: datetime = Field(default=None, description="Earliest date of usage to consider. 
Default: Last full day in UTC (or hour, depending on `bucket_duration`)") # type: ignore @pydantic.validator("end_time", pre=True, always=True) def default_end_time( diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index 5aadaf3765258..0e013668fb560 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -39,7 +39,7 @@ def _make_curl_command( return " ".join(shlex.quote(fragment) for fragment in fragments) -class DatahubRestEmitter: +class DataHubRestEmitter: DEFAULT_CONNECT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect DEFAULT_READ_TIMEOUT_SEC = ( 30 # Any ingest call taking longer than 30 seconds should be abandoned @@ -253,3 +253,9 @@ def _emit_generic(self, url: str, payload: str) -> None: raise OperationalError( "Unable to emit metadata to DataHub GMS", {"message": str(e)} ) from e + + +class DatahubRestEmitter(DataHubRestEmitter): + """This class exists as a pass-through for backwards compatibility""" + + pass diff --git a/metadata-ingestion/src/datahub/ingestion/api/decorators.py b/metadata-ingestion/src/datahub/ingestion/api/decorators.py new file mode 100644 index 0000000000000..9b3f35ae9d811 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/api/decorators.py @@ -0,0 +1,121 @@ +from dataclasses import dataclass +from enum import Enum, auto +from typing import Callable, Dict, Optional, Type + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.source import Source + + +def config_class(config_cls: Type) -> Callable[[Type], Type]: + """Adds a get_config_class method to the decorated class""" + + def default_create(cls: Type, config_dict: Dict, ctx: PipelineContext) -> Type: + config = config_cls.parse_obj(config_dict) + return cls(config=config, ctx=ctx) + + def wrapper(cls: Type) -> Type: + # add a get_config_class method + setattr(cls, "get_config_class", lambda: config_cls) + if not hasattr(cls, "create") or ( + getattr(cls, "create").__func__ == getattr(Source, "create").__func__ + ): + # add the create method only if it has not been overridden from the base Source.create method + setattr(cls, "create", classmethod(default_create)) + + return cls + + return wrapper + + +def platform_name( + platform_name: str, id: Optional[str] = None +) -> Callable[[Type], Type]: + """Adds a get_platform_name method to the decorated class""" + + def wrapper(cls: Type) -> Type: + setattr(cls, "get_platform_name", lambda: platform_name) + setattr( + cls, + "get_platform_id", + lambda: id if id else platform_name.lower().replace(" ", "-"), + ) + return cls + + if id and " " in id: + raise Exception( + f'Platform id "{id}" contains white-space, please use a platform id without spaces.' + ) + + return wrapper + + +class SupportStatus(Enum): + CERTIFIED = auto() + """ + Certified Sources are well-tested & widely-adopted by the DataHub Community. We expect the integration to be stable with few user-facing issues. + """ + INCUBATING = auto() + """ + Incubating Sources are ready for DataHub Community adoption but have not been tested for a wide variety of edge-cases. We eagerly solicit feedback from the Community to streghten the connector; minor version changes may arise in future releases. + """ + TESTING = auto() + """ + Testing Sources are available for experimentation by DataHub Community members, but may change without notice. 
+ """ + UNKNOWN = auto() + """ + System-default value for when the connector author has declined to provide a status on this connector. + """ + + +def support_status( + support_status: SupportStatus, +) -> Callable[[Type], Type]: + """Adds a get_support_status method to the decorated class""" + + def wrapper(cls: Type) -> Type: + setattr(cls, "get_support_status", lambda: support_status) + return cls + + return wrapper + + +class SourceCapability(Enum): + PLATFORM_INSTANCE = "Platform Instance" + DOMAINS = "Domains" + DATA_PROFILING = "Data Profiling" + USAGE_STATS = "Dataset Usage" + PARTITION_SUPPORT = "Partition Support" + DESCRIPTIONS = "Descriptions" + LINEAGE_COARSE = "Table-Level Lineage" + LINEAGE_FINE = "Column-level Lineage" + OWNERSHIP = "Extract Ownership" + DELETION_DETECTION = "Detect Deleted Entities" + TAGS = "Extract Tags" + + +@dataclass +class CapabilitySetting: + capability: SourceCapability + description: str + supported: bool + + +def capability( + capability_name: SourceCapability, description: str, supported: bool = True +) -> Callable[[Type], Type]: + """ + A decorator to mark a source as having a certain capability + """ + + def wrapper(cls: Type) -> Type: + if not hasattr(cls, "__capabilities"): + setattr(cls, "__capabilities", {}) + setattr(cls, "get_capabilities", lambda: cls.__capabilities.values()) + + cls.__capabilities[capability_name] = CapabilitySetting( + capability=capability_name, description=description, supported=supported + ) + return cls + + return wrapper diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index 4b5c5068757be..2230e81585441 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -57,7 +57,6 @@ class Source(Closeable, metaclass=ABCMeta): ctx: PipelineContext @classmethod - @abstractmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source": pass diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py index fcbf74c1f0487..06cabe0be291c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py @@ -5,10 +5,10 @@ from boto3.session import Session from botocore.config import Config from botocore.utils import fix_s3_host +from pydantic.fields import Field -from datahub.configuration import ConfigModel from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.source_common import DEFAULT_ENV +from datahub.configuration.source_common import EnvBasedSourceConfigBase if TYPE_CHECKING: @@ -35,7 +35,7 @@ def assume_role( return assumed_role_object["Credentials"] -class AwsSourceConfig(ConfigModel): +class AwsSourceConfig(EnvBasedSourceConfigBase): """ Common AWS credentials config. 
@@ -44,19 +44,44 @@ class AwsSourceConfig(ConfigModel): - SageMaker source """ - env: str = DEFAULT_ENV - - database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() + database_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for databases to filter in ingestion.", + ) + table_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for tables to filter in ingestion.", + ) - aws_access_key_id: Optional[str] = None - aws_secret_access_key: Optional[str] = None - aws_session_token: Optional[str] = None - aws_role: Optional[Union[str, List[str]]] = None - aws_profile: Optional[str] = None - aws_region: str - aws_endpoint_url: Optional[str] = None - aws_proxy: Optional[Dict[str, str]] = None + aws_access_key_id: Optional[str] = Field( + default=None, + description="Autodetected. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html", + ) + aws_secret_access_key: Optional[str] = Field( + default=None, + description="Autodetected. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html", + ) + aws_session_token: Optional[str] = Field( + default=None, + description="Autodetected. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html", + ) + aws_role: Optional[Union[str, List[str]]] = Field( + default=None, + description="Autodetected. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html", + ) + aws_profile: Optional[str] = Field( + default=None, + description="Named AWS profile to use, if not set the default will be used", + ) + aws_region: str = Field(description="AWS region code.") + aws_endpoint_url: Optional[str] = Field( + default=None, + description="Autodetected. See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html", + ) + aws_proxy: Optional[Dict[str, str]] = Field( + default=None, + description="Autodetected. 
See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html", + ) def get_session(self) -> Session: if ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 32b1e44673de8..3aeeebd26e76b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -8,6 +8,7 @@ import yaml from pydantic import validator +from pydantic.fields import Field from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.configuration.source_common import PlatformSourceConfigBase @@ -27,6 +28,14 @@ gen_containers, ) from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws import s3_util @@ -70,14 +79,36 @@ class GlueSourceConfig(AwsSourceConfig, PlatformSourceConfigBase): - extract_owners: Optional[bool] = True - extract_transforms: Optional[bool] = True - underlying_platform: Optional[str] = None - ignore_unsupported_connectors: Optional[bool] = True - emit_s3_lineage: bool = False - glue_s3_lineage_direction: str = "upstream" - domain: Dict[str, AllowDenyPattern] = dict() - catalog_id: Optional[str] = None + extract_owners: Optional[bool] = Field( + default=True, + description="When enabled, extracts ownership from Glue directly and overwrites existing owners. When disabled, ownership is left empty for datasets.", + ) + extract_transforms: Optional[bool] = Field( + default=True, description="Whether to extract Glue transform jobs." + ) + underlying_platform: Optional[str] = Field( + default=None, + description="@deprecated(Use `platform`) Override for platform name. Allowed values - `glue`, `athena`", + ) + ignore_unsupported_connectors: Optional[bool] = Field( + default=True, + description="Whether to ignore unsupported connectors. If disabled, an error will be raised.", + ) + emit_s3_lineage: bool = Field( + default=False, description=" Whether to emit S3-to-Glue lineage." + ) + glue_s3_lineage_direction: str = Field( + default="upstream", + description="If `upstream`, S3 is upstream to Glue. If `downstream` S3 is downstream to Glue.", + ) + domain: Dict[str, AllowDenyPattern] = Field( + default=dict(), + description="regex patterns for tables to filter to assign domain_key. ", + ) + catalog_id: Optional[str] = Field( + default=None, + description="The aws account id where the target glue catalog lives. If None, datahub will ingest glue in aws caller's account.", + ) use_s3_bucket_tags: Optional[bool] = False use_s3_object_tags: Optional[bool] = False @@ -129,7 +160,56 @@ def report_table_dropped(self, table: str) -> None: self.filtered.append(table) +@platform_name("Glue") +@config_class(GlueSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") class GlueSource(Source): + """ + Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. 
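The config conversions above, and throughout the rest of this change, follow one pattern: defaults are preserved and a `description` is attached via `pydantic.Field`, so it surfaces in the model's JSON schema for the generated config reference. A small self-contained sketch of that pattern using a stand-in class (not one of the real config models):

```python
from typing import Optional

from pydantic import BaseModel, Field


class ExampleAwsConfig(BaseModel):
    aws_region: str = Field(description="AWS region code.")
    aws_profile: Optional[str] = Field(
        default=None,
        description="Named AWS profile to use, if not set the default will be used",
    )


# The descriptions land in the pydantic (v1) JSON schema, which is what automated
# config documentation can read:
schema = ExampleAwsConfig.schema()
print(schema["properties"]["aws_region"]["description"])  # AWS region code.

# Parsing a recipe dict behaves exactly as before the Field(...) conversion:
config = ExampleAwsConfig.parse_obj({"aws_region": "us-east-1"})
print(config.aws_profile)  # None (the default still applies)
```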
See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. + + This plugin extracts the following: + + - Tables in the Glue catalog + - Column types associated with each table + - Table metadata, such as owner, description and parameters + - Jobs and their component transformations, data sources, and data sinks + + ## IAM permissions + + For ingesting datasets, the following IAM permissions are required: + ```json + { + "Effect": "Allow", + "Action": [ + "glue:GetDatabases", + "glue:GetTables" + ], + "Resource": [ + "arn:aws:glue:$region-id:$account-id:catalog", + "arn:aws:glue:$region-id:$account-id:database/*", + "arn:aws:glue:$region-id:$account-id:table/*" + ] + } + ``` + + For ingesting jobs (`extract_transforms: True`), the following additional permissions are required: + ```json + { + "Effect": "Allow", + "Action": [ + "glue:GetDataflowGraph", + "glue:GetJobs", + ], + "Resource": "*" + } + ``` + + plus `s3:GetObject` for the job script locations. + + """ + source_config: GlueSourceConfig report = GlueSourceReport() diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker.py b/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker.py index 28b40d947b0a8..317030795db8e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker.py @@ -2,6 +2,14 @@ from typing import DefaultDict, Dict, Iterable from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.sagemaker_processors.common import ( @@ -20,7 +28,18 @@ from datahub.ingestion.source.aws.sagemaker_processors.models import ModelProcessor +@platform_name("SageMaker") +@config_class(SagemakerSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") class SagemakerSource(Source): + """ + This plugin extracts the following: + + - Feature groups + - Models, jobs, and lineage between the two (e.g. when jobs output a model or a model is used by a job) + """ + source_config: SagemakerSourceConfig report = SagemakerSourceReport() diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/common.py b/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/common.py index 9d295a472d823..efb743d3c93cb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/sagemaker_processors/common.py @@ -1,15 +1,23 @@ from dataclasses import dataclass from typing import Dict, Optional, Union +from pydantic.fields import Field + from datahub.ingestion.api.source import SourceReport from datahub.ingestion.source.aws.aws_common import AwsSourceConfig class SagemakerSourceConfig(AwsSourceConfig): - extract_feature_groups: Optional[bool] = True - extract_models: Optional[bool] = True - extract_jobs: Optional[Union[Dict[str, str], bool]] = True + extract_feature_groups: Optional[bool] = Field( + default=True, description="Whether to extract feature groups." + ) + extract_models: Optional[bool] = Field( + default=True, description="Whether to extract models." 
+ ) + extract_jobs: Optional[Union[Dict[str, str], bool]] = Field( + default=True, description="Whether to extract AutoML jobs." + ) @property def sagemaker_client(self): diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py index 259b36889c988..d07248f78837c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py @@ -36,6 +36,14 @@ from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import is_s3_uri, make_s3_urn, strip_s3_prefix @@ -143,7 +151,67 @@ def get_column_type( S3_PREFIXES = ["s3://", "s3n://", "s3a://"] +@platform_name("Data lake files") +@config_class(DataLakeSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") class DataLakeSource(Source): + """ + This plugin extracts: + + - Row and column counts for each table + - For each column, if profiling is enabled: + - null counts and proportions + - distinct counts and proportions + - minimum, maximum, mean, median, standard deviation, some quantile values + - histograms or frequencies of unique values + + This connector supports both local files as well as those stored on AWS S3 (which must be identified using the prefix `s3://`). Supported file types are as follows: + + - CSV + - TSV + - JSON + - Parquet + - Apache Avro + + Schemas for Parquet and Avro files are extracted as provided. + + Schemas for schemaless formats (CSV, TSV, JSON) are inferred. For CSV and TSV files, we consider the first 100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details)) + JSON file schemas are inferred on the basis of the entire file (given the difficulty in extracting only the first few objects of the file), which may impact performance. + We are working on using iterator-based JSON parsers to avoid reading in the entire JSON object. + + :::caution + + If you are ingesting datasets from AWS S3, we recommend running the ingestion on a server in the same region to avoid high egress costs. + + ::: + + ## Setup + + To install this plugin, run `pip install 'acryl-datahub[data-lake]'`. Note that because the profiling is run with PySpark, we require Spark 3.0.3 with Hadoop 3.2 to be installed (see [compatibility](#compatibility) for more details). If profiling, make sure that permissions for **s3a://** access are set because Spark and Hadoop use the s3a:// protocol to interface with AWS (schema inference outside of profiling requires s3:// access). + + The data lake connector extracts schemas and profiles from a variety of file formats (see below for an exhaustive list). + Individual files are ingested as tables, and profiles are computed similar to the [SQL profiler](../../../../metadata-ingestion/docs/dev_guides/sql_profiles.md). + + Enabling profiling will slow down ingestion runs. 
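One way to keep that cost down is to profile only a subset of tables and to limit the per-table work, using the `DataLakeSourceConfig` and `DataLakeProfilerConfig` fields defined later in this change. A hedged sketch of the `source` section of a recipe; the source `type` string, bucket path, and patterns here are assumptions rather than values taken from this change:

```python
# Assumed source type string and placeholder paths; adjust for your deployment.
data_lake_source = {
    "type": "data-lake",
    "config": {
        "base_path": "s3://my-bucket/landing/",
        "profile_patterns": {"allow": ["folder_1\\..*"]},  # only profile these tables
        "spark_driver_memory": "8g",
        "profiling": {
            "enabled": True,
            "max_number_of_fields_to_profile": 20,  # cap column-level profiling per table
            "include_field_histogram": False,       # skip one of the more expensive metrics
        },
    },
}
```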
+ + :::caution + + Running profiling against many tables or over many rows can run up significant costs. + While we've done our best to limit the expensiveness of the queries the profiler runs, you + should be prudent about the set of tables profiling is enabled on or the frequency + of the profiling runs. + + ::: + + Because data lake files often have messy paths, we provide the built-in option to transform names into a more readable format via the `path_spec` option. This option extracts identifiers from paths through a format string specifier where extracted components are denoted as `{name[index]}`. + + For instance, suppose we wanted to extract the files `/base_folder/folder_1/table_a.csv` and `/base_folder/folder_2/table_b.csv`. To ingest, we could set `base_path` to `/base_folder/` and `path_spec` to `./{name[0]}/{name[1]}.csv`, which would extract tables with names `folder_1.table_a` and `folder_2.table_b`. You could also ignore the folder component by using a `path_spec` such as `./{folder_name}/{name[0]}.csv`, which would just extract tables with names `table_a` and `table_b` – note that any component without the form `{name[index]}` is ignored. + + If you would like to write a more complicated function for resolving file names, then a {transformer} would be a good fit. + + """ + source_config: DataLakeSourceConfig report: DataLakeSourceReport profiling_times_taken: List[float] diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake/config.py index 73425ad5d2c52..2e3ca3cc01075 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake/config.py @@ -2,35 +2,64 @@ import parse import pydantic +from pydantic.fields import Field -from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.emitter.mce_builder import DEFAULT_ENV +from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.source_common import EnvBasedSourceConfigBase from datahub.ingestion.source.aws.aws_common import AwsSourceConfig from datahub.ingestion.source.aws.s3_util import is_s3_uri from datahub.ingestion.source.data_lake.profiling import DataLakeProfilerConfig -class DataLakeSourceConfig(ConfigModel): - - env: str = DEFAULT_ENV - base_path: str - platform: str = "" # overwritten by validator below - - use_relative_path: bool = False - ignore_dotfiles: bool = True - - aws_config: Optional[AwsSourceConfig] = None - - schema_patterns: AllowDenyPattern = AllowDenyPattern.allow_all() - profile_patterns: AllowDenyPattern = AllowDenyPattern.allow_all() - - path_spec: Optional[str] = None - - profiling: DataLakeProfilerConfig = DataLakeProfilerConfig() - - spark_driver_memory: str = "4g" - - max_rows: int = 100 +class DataLakeSourceConfig(EnvBasedSourceConfigBase): + + base_path: str = Field( + description="Path of the base folder to crawl. Unless `schema_patterns` and `profile_patterns` are set, the connector will ingest all files in this folder." + ) + platform: str = Field( + default="", + description="Autodetected. Platform to use in namespace when constructing URNs. If left blank, local paths will correspond to `file` and S3 paths will correspond to `s3`.", + ) # overwritten by validator below + + use_relative_path: bool = Field( + default=False, + description="Whether to use the relative path when constructing URNs. 
Has no effect when a `path_spec` is provided.", + ) + ignore_dotfiles: bool = Field( + default=True, + description="Whether to ignore files that start with `.`. For instance, `.DS_Store`, `.bash_profile`, etc.", + ) + + aws_config: Optional[AwsSourceConfig] = Field( + default=None, description="AWS details" + ) + + schema_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for tables to filter for ingestion.", + ) + profile_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for tables to profile ", + ) + + path_spec: Optional[str] = Field( + default=None, + description="Format string for constructing table identifiers from the relative path. See the above setup section for details.", + ) + + profiling: DataLakeProfilerConfig = Field( + default=DataLakeProfilerConfig(), description="Profiling configurations" + ) + + spark_driver_memory: str = Field( + default="4g", description="Max amount of memory to grant Spark." + ) + + max_rows: int = Field( + default=100, + description="Maximum number of rows to use when inferring schemas for TSV and CSV files.", + ) @pydantic.root_validator() def ensure_profiling_pattern_is_passed_to_profiling( diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake/profiling.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake/profiling.py index 845fe8f1cb48e..a1fdb19e45004 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake/profiling.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake/profiling.py @@ -4,6 +4,7 @@ import pydantic from pandas import DataFrame +from pydantic.fields import Field from pydeequ.analyzers import ( AnalysisRunBuilder, AnalysisRunner, @@ -60,26 +61,68 @@ def null_str(value: Any) -> Optional[str]: class DataLakeProfilerConfig(ConfigModel): - enabled: bool = False - - spark_cluster_manager: Optional[str] = None + enabled: bool = Field( + default=False, description="Whether profiling should be done." + ) + + spark_cluster_manager: Optional[str] = Field( + default=None, + description="Spark master URL. See [Spark docs](https://spark.apache.org/docs/latest/submitting-applications.html#master-urls) for details.", + ) # These settings will override the ones below. - profile_table_level_only: bool = False - - allow_deny_patterns: AllowDenyPattern = AllowDenyPattern.allow_all() - - max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = None - - include_field_null_count: bool = True - include_field_min_value: bool = True - include_field_max_value: bool = True - include_field_mean_value: bool = True - include_field_median_value: bool = True - include_field_stddev_value: bool = True - include_field_quantiles: bool = True - include_field_distinct_value_frequencies: bool = True - include_field_histogram: bool = True - include_field_sample_values: bool = True + profile_table_level_only: bool = Field( + default=False, + description="Whether to perform profiling at table-level only or include column-level profiling as well.", + ) + + allow_deny_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), description="" + ) + + max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field( + default=None, + description="A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. 
The cost of profiling goes up significantly as the number of columns to profile goes up.", + ) + + include_field_null_count: bool = Field( + default=True, + description="Whether to profile for the number of nulls for each column.", + ) + include_field_min_value: bool = Field( + default=True, + description="Whether to profile for the min value of numeric columns.", + ) + include_field_max_value: bool = Field( + default=True, + description="Whether to profile for the max value of numeric columns.", + ) + include_field_mean_value: bool = Field( + default=True, + description="Whether to profile for the mean value of numeric columns.", + ) + include_field_median_value: bool = Field( + default=True, + description="Whether to profile for the median value of numeric columns.", + ) + include_field_stddev_value: bool = Field( + default=True, + description="Whether to profile for the standard deviation of numeric columns.", + ) + include_field_quantiles: bool = Field( + default=True, + description="Whether to profile for the quantiles of numeric columns.", + ) + include_field_distinct_value_frequencies: bool = Field( + default=True, description="Whether to profile for distinct value frequencies." + ) + include_field_histogram: bool = Field( + default=True, + description="Whether to profile for the histogram for numeric fields.", + ) + include_field_sample_values: bool = Field( + default=True, + description="Whether to profile for the sample values for all columns.", + ) @pydantic.root_validator() def ensure_field_level_settings_are_normalized( diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt.py b/metadata-ingestion/src/datahub/ingestion/source/dbt.py index 7ddaef9922cf7..2a2854face0eb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt.py @@ -7,11 +7,20 @@ import dateutil.parser import requests from pydantic import validator +from pydantic.fields import Field from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.emitter import mce_builder from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.ingestion_job_state_provider import JobId from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql.sql_types import ( @@ -95,27 +104,79 @@ def report_stale_entity_soft_deleted(self, urn: str) -> None: class DBTConfig(StatefulIngestionConfigBase): - manifest_path: str - catalog_path: str - sources_path: Optional[str] - env: str = mce_builder.DEFAULT_ENV - target_platform: str - load_schemas: bool = True - use_identifiers: bool = False - node_type_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - tag_prefix: str = f"{DBT_PLATFORM}:" - node_name_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - disable_dbt_node_creation = False - meta_mapping: Dict = {} - enable_meta_mapping = True - query_tag_mapping: Dict = {} - enable_query_tag_mapping = True - write_semantics: str = "PATCH" - strip_user_ids_from_email: bool = False - owner_extraction_pattern: Optional[str] + manifest_path: str = Field( + description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json Note this can be a local file or a URI." 
+ ) + catalog_path: str = Field( + description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json Note this can be a local file or a URI." + ) + sources_path: Optional[str] = Field( + default=None, + description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. If not specified, last-modified fields will not be populated. Note this can be a local file or a URI.", + ) + env: str = Field( + default=mce_builder.DEFAULT_ENV, + description="Environment to use in namespace when constructing URNs.", + ) + target_platform: str = Field( + description="The platform that dbt is loading onto. (e.g. bigquery / redshift / postgres etc.)" + ) + load_schemas: bool = Field( + default=True, + description="This flag is only consulted when disable_dbt_node_creation is set to True. Load schemas for target_platform entities from dbt catalog file, not necessary when you are already ingesting this metadata from the data platform directly. If set to False, table schema details (e.g. columns) will not be ingested.", + ) + use_identifiers: bool = Field( + default=False, + description="Use model identifier instead of model name if defined (if not, default to model name).", + ) + node_type_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for dbt nodes to filter in ingestion.", + ) + tag_prefix: str = Field( + default=f"{DBT_PLATFORM}:", description="Prefix added to tags during ingestion." + ) + node_name_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for dbt model names to filter in ingestion.", + ) + disable_dbt_node_creation = Field( + default=False, + description="Whether to suppress dbt dataset metadata creation. When set to True, this flag applies the dbt metadata to the target_platform entities (e.g. populating schema and column descriptions from dbt into the postgres / bigquery table metadata in DataHub) and generates lineage between the platform entities.", + ) + meta_mapping: Dict = Field( + default={}, + description="mapping rules that will be executed against dbt meta properties. Refer to the section below on dbt meta automated mappings.", + ) + enable_meta_mapping = Field( + default=True, + description="When enabled, applies the mappings that are defined through the meta_mapping directives.", + ) + query_tag_mapping: Dict = Field( + default={}, + description="mapping rules that will be executed against dbt query_tag meta properties. Refer to the section below on dbt meta automated mappings.", + ) + enable_query_tag_mapping = Field( + default=True, + description="When enabled, applies the mappings that are defined through the `query_tag_mapping` directives.", + ) + write_semantics: str = Field( + default="PATCH", + description='Whether the new tags, terms and owners to be added will override the existing ones added only by this source or not. Value for this config can be "PATCH" or "OVERRIDE"', + ) + strip_user_ids_from_email: bool = Field( + default=False, + description="Whether or not to strip email id while adding owners using dbt meta actions.", + ) + owner_extraction_pattern: Optional[str] = Field( + default=None, + description='Regex string to extract owner from the dbt node using the `(?P...) syntax` of the [match object](https://docs.python.org/3/library/re.html#match-objects), where the group name must be `owner`. 
Examples: (1)`r"(?P(.*)): (\w+) (\w+)"` will extract `jdoe` as the owner from `"jdoe: John Doe"` (2) `r"@(?P(.*))"` will extract `alice` as the owner from `"@alice"`.', # noqa: W605 + ) # Custom Stateful Ingestion settings - stateful_ingestion: Optional[DBTStatefulIngestionConfig] = None + stateful_ingestion: Optional[DBTStatefulIngestionConfig] = Field( + default=None, description="" + ) @validator("target_platform") def validate_target_platform_value(cls, target_platform: str) -> str: @@ -603,8 +664,39 @@ def get_schema_metadata( ) +@platform_name("dbt") +@config_class(DBTConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +@capability(SourceCapability.USAGE_STATS, "", supported=False) class DBTSource(StatefulIngestionSourceBase): - """Extract DBT metadata for ingestion to Datahub""" + """ + This plugin pulls metadata from dbt's artifact files and generates: + - dbt Tables: for nodes in the dbt manifest file that are models materialized as tables + - dbt Views: for nodes in the dbt manifest file that are models materialized as views + - dbt Ephemeral: for nodes in the dbt manifest file that are ephemeral models + - dbt Sources: for nodes that are sources on top of the underlying platform tables + - dbt Seed: for seed entities + - dbt Test: for dbt test entities + + Note: + 1. It also generates lineage between the `dbt` nodes (e.g. ephemeral nodes that depend on other dbt sources) as well as lineage between the `dbt` nodes and the underlying (target) platform nodes (e.g. BigQuery Table -> dbt Source, dbt View -> BigQuery View). + 2. The previous version of this source (`acryl_datahub<=0.8.16.2`) did not generate `dbt` entities and lineage between `dbt` entities and platform entities. For backwards compatibility with the previous version of this source, there is a config flag `disable_dbt_node_creation` that falls back to the old behavior. + 3. We also support automated actions (like add a tag, term or owner) based on properties defined in dbt meta. + + The artifacts used by this source are: + - [dbt manifest file](https://docs.getdbt.com/reference/artifacts/manifest-json) + - This file contains model, source and lineage data. + - [dbt catalog file](https://docs.getdbt.com/reference/artifacts/catalog-json) + - This file contains schema data. + - dbt does not record schema data for Ephemeral models, as such datahub will show Ephemeral models in the lineage, however there will be no associated schema for Ephemeral models + - [dbt sources file](https://docs.getdbt.com/reference/artifacts/sources-json) + - This file contains metadata for sources with freshness checks. + - We transfer dbt's freshness checks to DataHub's last-modified fields. + - Note that this file is optional – if not specified, we'll use time of ingestion instead as a proxy for time last-modified. 
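A minimal way to exercise this configuration end to end is a programmatic recipe. This is a sketch with placeholder paths, assuming a local `datahub-rest` sink:

```python
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": "./target/manifest.json",
                "catalog_path": "./target/catalog.json",
                # sources_path is optional; last-modified fields are skipped without it.
                "target_platform": "bigquery",
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
```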
+ + """ @classmethod def create(cls, config_dict, ctx): diff --git a/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py b/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py index 99a469356627c..45c439d7b61a0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py +++ b/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py @@ -8,6 +8,7 @@ from elasticsearch import Elasticsearch from pydantic import validator +from pydantic.fields import Field from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.configuration.source_common import DatasetSourceConfigBase @@ -18,6 +19,14 @@ ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass @@ -165,12 +174,22 @@ def report_dropped(self, index: str) -> None: class ElasticsearchSourceConfig(DatasetSourceConfigBase): - host: str = "localhost:9200" - username: Optional[str] = None - password: Optional[str] = None - url_prefix: str = "" - index_pattern: AllowDenyPattern = AllowDenyPattern( - allow=[".*"], deny=["^_.*", "^ilm-history.*"] + host: str = Field( + default="localhost:9200", description="The elastic search host URI." + ) + username: Optional[str] = Field( + default=None, description="The username credential." + ) + password: Optional[str] = Field( + default=None, description="The password credential." + ) + url_prefix: str = Field( + default="", + description="There are cases where an enterprise would have multiple elastic search clusters. 
One way for them to manage is to have a single endpoint for all the elastic search clusters and use url_prefix for routing requests to different clusters.", + ) + index_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern(allow=[".*"], deny=["^_.*", "^ilm-history.*"]), + description="regex patterns for indexes to filter in ingestion.", ) @validator("host") @@ -208,7 +227,19 @@ def http_auth(self) -> Optional[Tuple[str, str]]: return self.username, self.password or "" +@platform_name("Elastic Search") +@config_class(ElasticsearchSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") class ElasticsearchSource(Source): + + """ + This plugin extracts the following: + + - Metadata for indexes + - Column types associated with each index field + """ + def __init__(self, config: ElasticsearchSourceConfig, ctx: PipelineContext): super().__init__(ctx) self.source_config = config diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast.py b/metadata-ingestion/src/datahub/ingestion/source/feast.py index 9241165b1a916..c17ae5c14a85f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/feast.py +++ b/metadata-ingestion/src/datahub/ingestion/source/feast.py @@ -2,6 +2,8 @@ from dataclasses import dataclass from typing import Dict, Iterable, List, Tuple, Union +from pydantic import Field + if sys.version_info >= (3, 7): from feast import ( BigQuerySource, @@ -23,6 +25,14 @@ from datahub.configuration.common import ConfigModel from datahub.emitter.mce_builder import DEFAULT_ENV from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType @@ -65,12 +75,28 @@ class FeastRepositorySourceConfig(ConfigModel): - path: str - environment: str = DEFAULT_ENV + path: str = Field(description="Path to Feast repository") + environment: str = Field( + default=DEFAULT_ENV, description="Environment to use when constructing URNs" + ) +@platform_name("Feast") +@config_class(FeastRepositorySourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") @dataclass class FeastRepositorySource(Source): + """ + This plugin extracts: + + - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey) + - Features as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature) + - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable) + - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset) + - Column types associated with each entity and feature + """ + source_config: FeastRepositorySourceConfig report: SourceReport feature_store: FeatureStore diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast_legacy.py b/metadata-ingestion/src/datahub/ingestion/source/feast_legacy.py index 92cce6b305675..35b1a90163a39 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/feast_legacy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/feast_legacy.py @@ -6,11 +6,19 @@ from typing import Dict, Iterable, List import docker +from pydantic import Field import 
datahub.emitter.mce_builder as builder -from datahub.configuration.common import ConfigModel -from datahub.emitter.mce_builder import DEFAULT_ENV +from datahub.configuration.source_common import EnvBasedSourceConfigBase from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType @@ -51,10 +59,15 @@ HOSTED_FEAST_IMAGE = "acryldata/datahub-ingestion-feast-wrapper" -class FeastConfig(ConfigModel): - core_url: str = "localhost:6565" - env: str = DEFAULT_ENV - use_local_build: bool = False +class FeastConfig(EnvBasedSourceConfigBase): + core_url: str = Field( + default="localhost:6565", description="URL of Feast Core instance." + ) + + use_local_build: bool = Field( + default=False, + description="Whether to build Feast ingestion Docker image locally.", + ) @dataclass @@ -65,8 +78,25 @@ def report_dropped(self, name: str) -> None: self.filtered.append(name) +@platform_name("Feast") +@config_class(FeastConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") @dataclass class FeastSource(Source): + """ + This plugin extracts the following: + + - List of feature tables (modeled as [`MLFeatureTable`](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureTableProperties.pdl)s), + features ([`MLFeature`](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl)s), + and entities ([`MLPrimaryKey`](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLPrimaryKeyProperties.pdl)s) + - Column types associated with each feature and entity + + Note: this uses a separate Docker container to extract Feast's metadata into a JSON file, which is then + parsed to DataHub's native objects. This separation was performed because of a dependency conflict in the `feast` module. + + """ + config: FeastConfig report: FeastSourceReport diff --git a/metadata-ingestion/src/datahub/ingestion/source/file.py b/metadata-ingestion/src/datahub/ingestion/source/file.py index d5e07ac50fd9b..d6cff3d43c839 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/file.py +++ b/metadata-ingestion/src/datahub/ingestion/source/file.py @@ -2,7 +2,15 @@ from dataclasses import dataclass, field from typing import Iterable, Iterator, Union +from pydantic.fields import Field + from datahub.configuration.common import ConfigModel +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( @@ -45,11 +53,18 @@ def iterate_generic_file( class FileSourceConfig(ConfigModel): - filename: str + filename: str = Field(description="Path to file to ingest.") +@platform_name("File") +@config_class(FileSourceConfig) +@support_status(SupportStatus.CERTIFIED) @dataclass class GenericFileSource(Source): + """ + This plugin pulls metadata from a previously generated file. 
The [file sink](../../../../metadata-ingestion/sink_docs/file.md) can produce such files, and a number of samples are included in the [examples/mce_files](../../../../metadata-ingestion/examples/mce_files) directory. + """ + config: FileSourceConfig report: SourceReport = field(default_factory=SourceReport) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index eb843587020ac..6d7e8ea80d02d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -3,49 +3,111 @@ from typing import Any, Dict, List, Optional import pydantic +from pydantic.fields import Field from datahub.configuration.common import AllowDenyPattern, ConfigModel class GEProfilingConfig(ConfigModel): - enabled: bool = False - limit: Optional[int] = None - offset: Optional[int] = None + enabled: bool = Field( + default=False, description="Whether profiling should be done." + ) + limit: Optional[int] = Field( + default=None, + description="Max number of documents to profile. By default, profiles all documents.", + ) + offset: Optional[int] = Field( + default=None, + description="Offset in documents to profile. By default, uses no offset.", + ) # These settings will override the ones below. - turn_off_expensive_profiling_metrics: bool = False - profile_table_level_only: bool = False + turn_off_expensive_profiling_metrics: bool = Field( + default=False, + description="Whether to turn off expensive profiling or not. This turns off profiling for quantiles, distinct_value_frequencies, histogram & sample_values. This also limits maximum number of fields being profiled to 10.", + ) + profile_table_level_only: bool = Field( + default=False, + description="Whether to perform profiling at table-level only, or include column-level profiling as well.", + ) - include_field_null_count: bool = True - include_field_min_value: bool = True - include_field_max_value: bool = True - include_field_mean_value: bool = True - include_field_median_value: bool = True - include_field_stddev_value: bool = True - include_field_quantiles: bool = False - include_field_distinct_value_frequencies: bool = False - include_field_histogram: bool = False - include_field_sample_values: bool = True + include_field_null_count: bool = Field( + default=True, + description="Whether to profile for the number of nulls for each column.", + ) + include_field_min_value: bool = Field( + default=True, + description="Whether to profile for the min value of numeric columns.", + ) + include_field_max_value: bool = Field( + default=True, + description="Whether to profile for the max value of numeric columns.", + ) + include_field_mean_value: bool = Field( + default=True, + description="Whether to profile for the mean value of numeric columns.", + ) + include_field_median_value: bool = Field( + default=True, + description="Whether to profile for the median value of numeric columns.", + ) + include_field_stddev_value: bool = Field( + default=True, + description="Whether to profile for the standard deviation of numeric columns.", + ) + include_field_quantiles: bool = Field( + default=False, + description="Whether to profile for the quantiles of numeric columns.", + ) + include_field_distinct_value_frequencies: bool = Field( + default=False, description="Whether to profile for distinct value frequencies." 
+ ) + include_field_histogram: bool = Field( + default=False, + description="Whether to profile for the histogram for numeric fields.", + ) + include_field_sample_values: bool = Field( + default=True, + description="Whether to profile for the sample values for all columns.", + ) - allow_deny_patterns: AllowDenyPattern = AllowDenyPattern.allow_all() - max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = None + allow_deny_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for filtering of tables or table columns to profile.", + ) + max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field( + default=None, + description="A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.", + ) # The default of (5 * cpu_count) is adopted from the default max_workers # parameter of ThreadPoolExecutor. Given that profiling is often an I/O-bound # task, it may make sense to increase this default value in the future. # https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor - max_workers: int = 5 * (os.cpu_count() or 4) + max_workers: int = Field( + default=5 * (os.cpu_count() or 4), + description="Number of worker threads to use for profiling. Set to 1 to disable.", + ) # The query combiner enables us to combine multiple queries into a single query, # reducing the number of round-trips to the database and speeding up profiling. - query_combiner_enabled: bool = True + query_combiner_enabled: bool = Field( + default=True, + description="*This feature is still experimental and can be disabled if it causes issues.* Reduces the total number of queries issued and speeds up profiling by dynamically combining SQL queries where possible.", + ) # Hidden option - used for debugging purposes. - catch_exceptions: bool = True + catch_exceptions: bool = Field(default=True, description="") - partition_profiling_enabled: bool = True - bigquery_temp_table_schema: Optional[str] = None - partition_datetime: Optional[datetime.datetime] + partition_profiling_enabled: bool = Field(default=True, description="") + bigquery_temp_table_schema: Optional[str] = Field( + default=None, + description="On bigquery for profiling partitioned tables needs to create temporary views. You have to define a schema where these will be created. Views will be cleaned up after profiler runs. (Great expectation tech details about this (https://legacy.docs.greatexpectations.io/en/0.9.0/reference/integrations/bigquery.html#custom-queries-with-sql-datasource).", + ) + partition_datetime: Optional[datetime.datetime] = Field( + default=None, + description="For partitioned datasets profile only the partition which matches the datetime or profile the latest one if not set. 
Only Bigquery supports this.", + ) @pydantic.root_validator() def ensure_field_level_settings_are_normalized( diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py index bb5c6718b1d8e..1fef58e2bafad 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py @@ -8,11 +8,18 @@ import click import requests +from pydantic.fields import Field from datahub.configuration import ConfigModel from datahub.configuration.common import AllowDenyPattern from datahub.emitter.mce_builder import make_group_urn, make_user_urn from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( # SourceCapability,; capability, + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import ( @@ -33,41 +40,94 @@ class AzureADConfig(ConfigModel): """Config to create a token and connect to Azure AD instance""" # Required - client_id: str - tenant_id: str - client_secret: str - authority: str - token_url: str - + client_id: str = Field( + description="Application ID. Found in your app registration on Azure AD Portal" + ) + tenant_id: str = Field( + description="Directory ID. Found in your app registration on Azure AD Portal" + ) + client_secret: str = Field( + description="Client secret. Found in your app registration on Azure AD Portal" + ) + authority: str = Field( + description="The authority (https://docs.microsoft.com/en-us/azure/active-directory/develop/msal-client-application-configuration) is a URL that indicates a directory that MSAL can request tokens from." + ) + token_url: str = Field( + description="The token URL that acquires a token from Azure AD for authorizing requests. This source will only work with v1.0 endpoint." + ) # Optional: URLs for redirect and hitting the Graph API - redirect: str = "https://login.microsoftonline.com/common/oauth2/nativeclient" - graph_url: str = "https://graph.microsoft.com/v1.0" + redirect: str = Field( + "https://login.microsoftonline.com/common/oauth2/nativeclient", + description="Redirect URI. 
Found in your app registration on Azure AD Portal.", + ) + + graph_url: str = Field( + "https://graph.microsoft.com/v1.0", + description="[Microsoft Graph API endpoint](https://docs.microsoft.com/en-us/graph/use-the-api)", + ) # Optional: Customize the mapping to DataHub Username from an attribute in the REST API response # Reference: https://docs.microsoft.com/en-us/graph/api/user-list?view=graph-rest-1.0&tabs=http#response-1 - azure_ad_response_to_username_attr: str = "userPrincipalName" - azure_ad_response_to_username_regex: str = "(.*)" + azure_ad_response_to_username_attr: str = Field( + default="userPrincipalName", + description="Which Azure AD User Response attribute to use as input to DataHub username mapping.", + ) + azure_ad_response_to_username_regex: str = Field( + default="(.*)", + description="A regex used to parse the DataHub username from the attribute specified in `azure_ad_response_to_username_attr`.", + ) # Optional: Customize the mapping to DataHub Groupname from an attribute in the REST API response # Reference: https://docs.microsoft.com/en-us/graph/api/group-list?view=graph-rest-1.0&tabs=http#response-1 - azure_ad_response_to_groupname_attr: str = "displayName" - azure_ad_response_to_groupname_regex: str = "(.*)" + azure_ad_response_to_groupname_attr: str = Field( + default="displayName", + description="Which Azure AD Group Response attribute to use as input to DataHub group name mapping.", + ) + azure_ad_response_to_groupname_regex: str = Field( + default="(.*)", + description="A regex used to parse the DataHub group name from the attribute specified in `azure_ad_response_to_groupname_attr`.", + ) # Optional: to ingest users, groups or both - ingest_users: bool = True - ingest_groups: bool = True - ingest_group_membership: bool = True - - ingest_groups_users: bool = True - users_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - groups_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() + ingest_users: bool = Field( + default=True, description="Whether users should be ingested into DataHub." + ) + ingest_groups: bool = Field( + default=True, description="Whether groups should be ingested into DataHub." + ) + ingest_group_membership: bool = Field( + default=True, + description="Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True.", + ) + + ingest_groups_users: bool = Field( + default=True, + description="This option is useful only when `ingest_users` is set to False and `ingest_group_membership` to True. As effect, only the users which belongs to the selected groups will be ingested.", + ) + users_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for users to filter in ingestion.", + ) + groups_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for groups to include in ingestion.", + ) # If enabled, report will contain names of filtered users and groups. - filtered_tracking: bool = True + filtered_tracking: bool = Field( + default=True, + description="If enabled, report will contain names of filtered users and groups.", + ) # Optional: Whether to mask sensitive information from workunit ID's. On by default. 
- mask_group_id: bool = True - mask_user_id: bool = True + mask_group_id: bool = Field( + True, + description="Whether workunit ID's for groups should be masked to avoid leaking sensitive information.", + ) + mask_user_id: bool = Field( + True, + description="Whether workunit ID's for users should be masked to avoid leaking sensitive information.", + ) @dataclass @@ -85,8 +145,81 @@ def report_filtered(self, name: str) -> None: # Source that extracts Azure AD users, groups and group memberships using Microsoft Graph REST API +@platform_name("Azure AD") +@config_class(AzureADConfig) +@support_status(SupportStatus.CERTIFIED) class AzureADSource(Source): - """Ingest Azure AD Users and Groups into DataHub""" + """ + This plugin extracts the following: + + - Users + - Groups + - Group Membership + + from your Azure AD instance. + + ### Extracting DataHub Users + + #### Usernames + + Usernames serve as unique identifiers for users on DataHub. This connector extracts usernames using the + "userPrincipalName" field of an [Azure AD User Response](https://docs.microsoft.com/en-us/graph/api/user-list?view=graph-rest-1.0&tabs=http#response-1), + which is the unique identifier for your Azure AD users. + + If this is not how you wish to map to DataHub usernames, you can provide a custom mapping using the configurations options detailed below. Namely, `azure_ad_response_to_username_attr` + and `azure_ad_response_to_username_regex`. + + #### Responses + + This connector also extracts basic user response information from Azure. The following fields of the Azure User Response are extracted + and mapped to the DataHub `CorpUserInfo` aspect: + + - display name + - first name + - last name + - email + - title + - country + + ### Extracting DataHub Groups + + #### Group Names + + Group names serve as unique identifiers for groups on DataHub. This connector extracts group names using the "name" attribute of an Azure Group Response. + By default, a URL-encoded version of the full group name is used as the unique identifier (CorpGroupKey) and the raw "name" attribute is mapped + as the display name that will appear in DataHub's UI. + + If this is not how you wish to map to DataHub group names, you can provide a custom mapping using the configurations options detailed below. Namely, `azure_ad_response_to_groupname_attr` + and `azure_ad_response_to_groupname_regex`. + + #### Responses + + This connector also extracts basic group information from Azure. The following fields of the [Azure AD Group Response](https://docs.microsoft.com/en-us/graph/api/group-list?view=graph-rest-1.0&tabs=http#response-1) are extracted and mapped to the + DataHub `CorpGroupInfo` aspect: + + - name + - description + + ### Extracting Group Membership + + This connector additional extracts the edges between Users and Groups that are stored in [Azure AD](https://docs.microsoft.com/en-us/graph/api/group-list-members?view=graph-rest-1.0&tabs=http#response-1). It maps them to the `GroupMembership` aspect + associated with DataHub users (CorpUsers). Today this has the unfortunate side effect of **overwriting** any Group Membership information that + was created outside of the connector. That means if you've used the DataHub REST API to assign users to groups, this information will be overridden + when the Azure AD Source is executed. If you intend to *always* pull users, groups, and their relationships from your Identity Provider, then + this should not matter. 
+ + This is a known limitation in our data model that is being tracked by [this ticket](https://github.com/datahub-project/datahub/issues/3065). + + ### Prerequisite + + [Create a DataHub Application](https://docs.microsoft.com/en-us/graph/toolkit/get-started/add-aad-app-registration) within the Azure AD Portal with the permissions + to read your organization's Users and Groups. The following permissions are required, with the `Application` permission type: + + - `Group.Read.All` + - `GroupMember.Read.All` + - `User.Read.All` + + """ @classmethod def create(cls, config_dict, ctx): diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index 6c3b28b2ec868..af1ed371cab3d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -10,10 +10,19 @@ from okta.exceptions import OktaAPIException from okta.models import Group, GroupProfile, User, UserProfile, UserStatus from pydantic import validator +from pydantic.fields import Field from datahub.configuration import ConfigModel from datahub.configuration.common import ConfigurationError from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import ( @@ -31,44 +40,92 @@ class OktaConfig(ConfigModel): - # Required: Domain of the Okta deployment. Example: dev-33231928.okta.com - okta_domain = "dev-44231988.okta.com" + okta_domain: str = Field( + default="dev-33231928.okta.com", + description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console.", + ) # Required: An API token generated from Okta. - okta_api_token = "00be4R_M2MzDqXawbWgfKGpKee0kuEOfX1RCQSRx00" + okta_api_token: str = Field( + default="00be4R_M2MzDqXawbWgfKGpKee0kuEOfX1RCQSRx00", + description="An API token generated for the DataHub application inside your Okta Developer Console.", + ) # Optional: Whether to ingest users, groups, or both. - ingest_users: bool = True - ingest_groups: bool = True - ingest_group_membership: bool = True + ingest_users: bool = Field( + default=True, description="Whether users should be ingested into DataHub." + ) + ingest_groups: bool = Field( + default=True, description="Whether groups should be ingested into DataHub." + ) + ingest_group_membership: bool = Field( + default=True, + description="Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True.", + ) # Optional: Customize the mapping to DataHub Username from an attribute appearing in the Okta User # profile. 
Reference: https://developer.okta.com/docs/reference/api/users/ - okta_profile_to_username_attr: str = "login" - okta_profile_to_username_regex: str = "([^@]+)" + okta_profile_to_username_attr: str = Field( + default="login", + description="Which Okta User Profile attribute to use as input to DataHub username mapping.", + ) + okta_profile_to_username_regex: str = Field( + default="([^@]+)", + description="A regex used to parse the DataHub username from the attribute specified in `okta_profile_to_username_attr`.", + ) # Optional: Customize the mapping to DataHub Group from an attribute appearing in the Okta Group # profile. Reference: https://developer.okta.com/docs/reference/api/groups/ - okta_profile_to_group_name_attr: str = "name" - okta_profile_to_group_name_regex: str = "(.*)" + okta_profile_to_group_name_attr: str = Field( + default="name", + description="Which Okta Group Profile attribute to use as input to DataHub group name mapping.", + ) + okta_profile_to_group_name_regex: str = Field( + default="(.*)", + description="A regex used to parse the DataHub group name from the attribute specified in `okta_profile_to_group_name_attr`.", + ) # Optional: Include deprovisioned or suspended Okta users in the ingestion. - include_deprovisioned_users = False - include_suspended_users = False + include_deprovisioned_users: bool = Field( + default=False, + description="Whether to ingest users in the DEPROVISIONED state from Okta.", + ) + include_suspended_users: bool = Field( + default=False, + description="Whether to ingest users in the SUSPENDED state from Okta.", + ) # Optional: Page size for reading groups and users from Okta API. - page_size = 100 + page_size: int = Field( + default=100, + description="The number of entities requested from Okta's REST APIs in one request.", + ) # Optional: Set the delay for fetching batches of entities from Okta. Okta has rate limiting in place. - delay_seconds = 0.01 + delay_seconds: Union[float, int] = Field( + default=0.01, + description="Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms.", + ) # Optional: Filter and search expression for ingesting a subset of users. Only one can be specified at a time. - okta_users_filter: Optional[str] = None - okta_users_search: Optional[str] = None + okta_users_filter: Optional[str] = Field( + default=None, + description="Okta filter expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See (https://developer.okta.com/docs/reference/api/users/#list-users-with-a-filter) for more info.", + ) + okta_users_search: Optional[str] = Field( + default=None, + description="Okta search expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See (https://developer.okta.com/docs/reference/api/users/#list-users-with-search) for more info.", + ) # Optional: Filter and search expression for ingesting a subset of groups. Only one can be specified at a time. - okta_groups_filter: Optional[str] = None - okta_groups_search: Optional[str] = None + okta_groups_filter: Optional[str] = Field( + default=None, + description="Okta filter expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See (https://developer.okta.com/docs/reference/api/groups/#filters) for more info.", + ) + okta_groups_search: Optional[str] = Field( + default=None, + description="Okta search expression (not regex) for ingesting groups. 
Only one of `okta_groups_filter` and `okta_groups_search` can be set. See (https://developer.okta.com/docs/reference/api/groups/#list-groups-with-search) for more info.", + ) # Optional: Whether to mask sensitive information from workunit ID's. On by default. mask_group_id: bool = True @@ -111,8 +168,83 @@ def report_filtered(self, name: str) -> None: # - Group Membership Edges: 1000 (1 per User) # - Run Time (Wall Clock): 2min 7sec # + + +@platform_name("Okta") +@config_class(OktaConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration") class OktaSource(Source): - """Ingest Okta Users & Groups into Datahub""" + """ + This plugin extracts the following: + + - Users + - Groups + - Group Membership + + from your Okta instance. + + ### Extracting DataHub Users + + #### Usernames + + Usernames serve as unique identifiers for users on DataHub. This connector extracts usernames using the + "login" field of an [Okta User Profile](https://developer.okta.com/docs/reference/api/users/#profile-object). + By default, the 'login' attribute, which contains an email, is parsed to extract the text before the "@" and map that to the DataHub username. + + If this is not how you wish to map to DataHub usernames, you can provide a custom mapping using the configuration options detailed below. Namely, `okta_profile_to_username_attr` + and `okta_profile_to_username_regex`. + + #### Profiles + + This connector also extracts basic user profile information from Okta. The following fields of the Okta User Profile are extracted + and mapped to the DataHub `CorpUserInfo` aspect: + + - display name + - first name + - last name + - email + - title + - department + - country code + + ### Extracting DataHub Groups + + #### Group Names + + Group names serve as unique identifiers for groups on DataHub. This connector extracts group names using the "name" attribute of an Okta Group Profile. + By default, a URL-encoded version of the full group name is used as the unique identifier (CorpGroupKey) and the raw "name" attribute is mapped + as the display name that will appear in DataHub's UI. + + If this is not how you wish to map to DataHub group names, you can provide a custom mapping using the configuration options detailed below. Namely, `okta_profile_to_group_name_attr` + and `okta_profile_to_group_name_regex`. + + #### Profiles + + This connector also extracts basic group information from Okta. The following fields of the Okta Group Profile are extracted and mapped to the + DataHub `CorpGroupInfo` aspect: + + - name + - description + + ### Extracting Group Membership + + This connector additionally extracts the edges between Users and Groups that are stored in Okta. It maps them to the `GroupMembership` aspect + associated with DataHub users (CorpUsers). Today this has the unfortunate side effect of **overwriting** any Group Membership information that + was created outside of the connector. That means if you've used the DataHub REST API to assign users to groups, this information will be overridden + when the Okta source is executed. If you intend to *always* pull users, groups, and their relationships from your Identity Provider, then + this should not matter. + + This is a known limitation in our data model that is being tracked by [this ticket](https://github.com/datahub-project/datahub/issues/3065).
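To make the attribute-plus-regex mapping concrete, here is a minimal sketch (illustrative only, not the connector's actual code) of how a profile attribute such as `login` is reduced to a DataHub username with the default `([^@]+)` pattern; the same idea applies to group names via `okta_profile_to_group_name_attr` and `okta_profile_to_group_name_regex`:

```python
import re

def map_profile_attribute(value: str, regex: str = "([^@]+)") -> str:
    # Apply the configured regex to the chosen profile attribute and keep the
    # first capture group; fall back to the raw value if the pattern does not match.
    match = re.search(regex, value)
    return match.group(1) if match else value

print(map_profile_attribute("jdoe@example.com"))  # -> "jdoe"
```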
+ + ### Filtering and Searching + You can also choose to ingest a subset of users or groups to Datahub by adding flags for filtering or searching. For + users, set either the `okta_users_filter` or `okta_users_search` flag (only one can be set at a time). For groups, set + either the `okta_groups_filter` or `okta_groups_search` flag. Note that these are not regular expressions. See [below](#config-details) for full configuration + options. + + + """ @classmethod def create(cls, config_dict, ctx): diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka.py index be2f627e19de5..0217c384bf1e6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka.py @@ -5,6 +5,7 @@ from typing import Dict, Iterable, List, Optional, Tuple, Type, cast import confluent_kafka +import pydantic from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.configuration.kafka import KafkaConsumerConnectionConfig @@ -19,6 +20,12 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import add_domain_to_entity_wu from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase from datahub.ingestion.source.state.checkpoint import Checkpoint @@ -59,8 +66,14 @@ class KafkaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigBase): # TODO: inline the connection config connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig() topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"]) - domain: Dict[str, AllowDenyPattern] = dict() - topic_subject_map: Dict[str, str] = dict() + domain: Dict[str, AllowDenyPattern] = pydantic.Field( + default_factory=dict, + description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).", + ) + topic_subject_map: Dict[str, str] = pydantic.Field( + default_factory=dict, + description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `-key`:`` and `-value`:`` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. 
NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.", + ) # Custom Stateful Ingestion settings stateful_ingestion: Optional[KafkaSourceStatefulIngestionConfig] = None schema_registry_class: str = ( @@ -84,8 +97,16 @@ def report_stale_entity_soft_deleted(self, urn: str) -> None: self.soft_deleted_stale_entities.append(urn) -@dataclass +@platform_name("Kafka") +@config_class(KafkaSourceConfig) +@support_status(SupportStatus.CERTIFIED) class KafkaSource(StatefulIngestionSourceBase): + """ + This plugin extracts the following: + - Topics from the Kafka broker + - Schemas associated with each topic from the schema registry (only Avro schemas are currently supported) + """ + source_config: KafkaSourceConfig consumer: confluent_kafka.Consumer report: KafkaSourceReport @@ -165,10 +186,10 @@ def get_platform_instance_id(self) -> str: assert self.source_config.platform_instance is not None return self.source_config.platform_instance - @classmethod - def create(cls, config_dict: Dict, ctx: PipelineContext) -> "KafkaSource": - config: KafkaSourceConfig = KafkaSourceConfig.parse_obj(config_dict) - return cls(config, ctx) + # @classmethod + # def create(cls, config_dict: Dict, ctx: PipelineContext) -> "KafkaSource": + # config: KafkaSourceConfig = KafkaSourceConfig.parse_obj(config_dict) + # return cls(config, ctx) def gen_removed_entity_workunits(self) -> Iterable[MetadataWorkUnit]: last_checkpoint = self.get_last_checkpoint( diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index 696318cefb29c..5377cf16d3f2c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -7,6 +7,7 @@ import jpype import jpype.imports import requests +from pydantic.fields import Field from sqlalchemy.engine.url import make_url import datahub.emitter.mce_builder as builder @@ -15,6 +16,14 @@ from datahub.configuration.source_common import DatasetLineageProviderConfigBase from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql.sql_common import get_platform_from_sqlalchemy_uri @@ -31,14 +40,33 @@ class ProvidedConfig(ConfigModel): class KafkaConnectSourceConfig(DatasetLineageProviderConfigBase): # See the Connect REST Interface for details # https://docs.confluent.io/platform/current/connect/references/restapi.html# - connect_uri: str = "http://localhost:8083/" - username: Optional[str] = None - password: Optional[str] = None - cluster_name: Optional[str] = "connect-cluster" - construct_lineage_workunits: bool = True - connector_patterns: AllowDenyPattern = AllowDenyPattern.allow_all() - provided_configs: Optional[List[ProvidedConfig]] = None - connect_to_platform_map: Optional[dict] = None + connect_uri: str = Field( + default="http://localhost:8083/", description="URI to connect to." 
+ ) + username: Optional[str] = Field(default=None, description="Kafka Connect username.") + password: Optional[str] = Field(default=None, description="Kafka Connect password.") + cluster_name: Optional[str] = Field( + default="connect-cluster", description="Cluster to ingest from." + ) + construct_lineage_workunits: bool = Field( + default=True, + description="Whether to create the input and output Dataset entities", + ) + connector_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for connectors to filter for ingestion.", + ) + provided_configs: Optional[List[ProvidedConfig]] = Field( + default=None, description="Provided Configurations" + ) + connect_to_platform_map: Optional[dict] = Field( + default=None, + description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', + ) + platform_instance_map: Optional[Dict[str, str]] = Field( + default=None, + description='Platform instance mapping to use when constructing URNs. e.g.`platform_instance_map: { "hive": "warehouse" }`', + ) @dataclass @@ -792,13 +820,23 @@ def transform_connector_config( connector_config[k] = v.replace(key, value) +@platform_name("Kafka Connect") +@config_class(KafkaConnectSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") class KafkaConnectSource(Source): - """The class for Kafka Connect source. + """ + This plugin extracts the following: + + - Kafka Connect connector as individual `DataFlowSnapshotClass` entity + - Creating individual `DataJobSnapshotClass` entity using `{connector_name}:{source_dataset}` naming + - Lineage information between source database to Kafka topic - Attributes: - config (KafkaConnectSourceConfig): Kafka Connect cluster REST API configurations. - report (KafkaConnectSourceReport): Kafka Connect source ingestion report. + Current limitations: + - works only for + - JDBC and Debezium source connectors + - BigQuery sink connector """ config: KafkaConnectSourceConfig diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 2a7e4cc03ccb1..633651c7b171a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -4,9 +4,16 @@ import ldap from ldap.controls import SimplePagedResultsControl +from pydantic.fields import Field from datahub.configuration.common import ConfigModel, ConfigurationError from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent @@ -62,31 +69,45 @@ class LDAPSourceConfig(ConfigModel): """Config used by the LDAP Source.""" # Server configuration. - ldap_server: str - ldap_user: str - ldap_password: str + ldap_server: str = Field(description="LDAP server URL.") + ldap_user: str = Field(description="LDAP user.") + ldap_password: str = Field(description="LDAP password.") # Extraction configuration. 
- base_dn: str - filter: str = "(objectClass=*)" + base_dn: str = Field(description="LDAP DN.") + filter: str = Field(default="(objectClass=*)", description="LDAP extractor filter.") # If set to true, any users without first and last names will be dropped. - drop_missing_first_last_name: bool = True + drop_missing_first_last_name: bool = Field( + default=True, + description="If set to true, any users without first and last names will be dropped.", + ) - page_size: int = 20 + page_size: int = Field( + default=20, description="Size of each page to fetch when extracting metadata." + ) @dataclasses.dataclass class LDAPSourceReport(SourceReport): + dropped_dns: List[str] = dataclasses.field(default_factory=list) def report_dropped(self, dn: str) -> None: self.dropped_dns.append(dn) +@platform_name("LDAP") +@config_class(LDAPSourceConfig) +@support_status(SupportStatus.CERTIFIED) @dataclasses.dataclass class LDAPSource(Source): - """LDAP Source Class.""" + """ + This plugin extracts the following: + - People + - Names, emails, titles, and manager information for each person + - List of groups + """ config: LDAPSourceConfig report: LDAPSourceReport diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker.py b/metadata-ingestion/src/datahub/ingestion/source/looker.py index 815164763d27f..e30c6a6766a8a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker.py @@ -30,13 +30,19 @@ Query, User, ) -from pydantic import validator +from pydantic import Field, validator import datahub.emitter.mce_builder as builder from datahub.configuration import ConfigModel from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.looker_common import ( @@ -72,10 +78,15 @@ def get_transport_options(self) -> TransportOptions: class LookerAPIConfig(ConfigModel): - client_id: str - client_secret: str - base_url: str - transport_options: Optional[TransportOptionsConfig] + client_id: str = Field(description="Looker API client id.") + client_secret: str = Field(description="Looker API client secret.") + base_url: str = Field( + description="Url to your Looker instance: `https://company.looker.com:19999` or `https://looker.company.com`, or similar. Used for making API calls to Looker and constructing clickable dashboard and chart urls." 
+ ) + transport_options: Optional[TransportOptionsConfig] = Field( + None, + description="Populates the [TransportOptions](https://github.com/looker-open-source/sdk-codegen/blob/94d6047a0d52912ac082eb91616c1e7c379ab262/python/looker_sdk/rtl/transport.py#L70) struct for looker client", + ) class LookerAPI: @@ -107,15 +118,41 @@ def get_client(self) -> Looker31SDK: class LookerDashboardSourceConfig(LookerAPIConfig, LookerCommonConfig): - actor: Optional[str] - dashboard_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - chart_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - include_deleted: bool = False - extract_owners: bool = True - strip_user_ids_from_email: bool = False - skip_personal_folders: bool = False - max_threads: int = os.cpu_count() or 40 - external_base_url: Optional[str] + dashboard_pattern: AllowDenyPattern = Field( + AllowDenyPattern.allow_all(), + description="Patterns for selecting dashboard ids that are to be included", + ) + chart_pattern: AllowDenyPattern = Field( + AllowDenyPattern.allow_all(), + description="Patterns for selecting chart ids that are to be included", + ) + include_deleted: bool = Field( + False, description="Whether to include deleted dashboards." + ) + extract_owners: bool = Field( + True, + description="When enabled, extracts ownership from Looker directly. When disabled, ownership is left empty for dashboards and charts.", + ) + actor: Optional[str] = Field( + None, + description="This config is deprecated in favor of `extract_owners`. Previously, was the actor to use in ownership properties of ingested metadata.", + ) + strip_user_ids_from_email: bool = Field( + False, + description="When enabled, converts Looker user emails of the form name@domain.com to urn:li:corpuser:name when assigning ownership", + ) + skip_personal_folders: bool = Field( + False, + description="Whether to skip ingestion of dashboards in personal folders. Setting this to True will only ingest dashboards in the Shared folder space.", + ) + max_threads: int = Field( + os.cpu_count() or 40, + description="Max parallelism for Looker API calls. Defaults to cpuCount or 40", + ) + external_base_url: Optional[str] = Field( + None, + description="Optional URL to use when constructing external URLs to Looker if the `base_url` is not the correct one to use. For example, `https://looker-public.company.com`. If not provided, the external base URL will default to `base_url`.", + ) @validator("external_base_url", pre=True, always=True) def external_url_defaults_to_api_config_base_url( @@ -275,7 +312,22 @@ def get_urn_dashboard_id(self): return f"dashboards.{self.id}" +@platform_name("Looker") +@support_status(SupportStatus.CERTIFIED) +@config_class(LookerDashboardSourceConfig) class LookerDashboardSource(Source): + """ + This plugin extracts the following: + - Looker dashboards, dashboard elements (charts) and explores + - Names, descriptions, URLs, chart types, input explores for the charts + - Schemas and input views for explores + - Owners of dashboards + + :::note + To get complete Looker metadata integration (including Looker views and lineage to the underlying warehouse tables), you must ALSO use the `lookml` module. 
+ ::: + """ + source_config: LookerDashboardSourceConfig reporter: LookerDashboardSourceReport client: Looker31SDK @@ -912,10 +964,5 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: self.reporter.report_workunit(workunit) yield workunit - @classmethod - def create(cls, config_dict, ctx): - config = LookerDashboardSourceConfig.parse_obj(config_dict) - return cls(config, ctx) - def get_report(self) -> SourceReport: return self.reporter diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker_common.py index a36d5aa4ab289..7f717d0efc82d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker_common.py @@ -4,9 +4,11 @@ from enum import Enum from typing import Dict, Iterable, List, Optional, Tuple, Union +import pydantic from looker_sdk.error import SDKError from looker_sdk.rtl.transport import TransportOptions from looker_sdk.sdk.api31.methods import Looker31SDK +from pydantic import BaseModel, Field from pydantic.class_validators import validator import datahub.emitter.mce_builder as builder @@ -62,13 +64,13 @@ logger = logging.getLogger(__name__) -@dataclass -class NamingPattern: +# @dataclass +class NamingPattern(BaseModel): allowed_vars: List[str] pattern: str variables: Optional[List[str]] = None - def validate(self, at_least_one: bool) -> bool: + def validate_pattern(self, at_least_one: bool) -> bool: variables = re.findall("({[^}{]+})", self.pattern) self.variables = [v[1:-1] for v in variables] for v in variables: @@ -93,8 +95,11 @@ def validate(self, at_least_one: bool) -> bool: class LookerExploreNamingConfig(ConfigModel): - explore_naming_pattern: NamingPattern = NamingPattern( - allowed_vars=naming_pattern_variables, pattern="{model}.explore.{name}" + explore_naming_pattern: NamingPattern = pydantic.Field( + description="Pattern for providing dataset names to explores. Allowed variables are {project}, {model}, {name}. Default is `{model}.explore.{name}`", + default=NamingPattern( + allowed_vars=naming_pattern_variables, pattern="{model}.explore.{name}" + ), ) explore_browse_pattern: NamingPattern = NamingPattern( allowed_vars=naming_pattern_variables, @@ -115,17 +120,23 @@ def init_naming_pattern(cls, v): @validator("explore_naming_pattern", "explore_browse_pattern", always=True) def validate_naming_pattern(cls, v): assert isinstance(v, NamingPattern) - v.validate(at_least_one=True) + v.validate_pattern(at_least_one=True) return v class LookerViewNamingConfig(ConfigModel): - view_naming_pattern: NamingPattern = NamingPattern( - allowed_vars=naming_pattern_variables, pattern="{project}.view.{name}" + view_naming_pattern: NamingPattern = Field( + NamingPattern( + allowed_vars=naming_pattern_variables, pattern="{project}.view.{name}" + ), + description="Pattern for providing dataset names to views. Allowed variables are `{project}`, `{model}`, `{name}`", ) - view_browse_pattern: NamingPattern = NamingPattern( - allowed_vars=naming_pattern_variables, - pattern="/{env}/{platform}/{project}/views/{name}", + view_browse_pattern: NamingPattern = Field( + NamingPattern( + allowed_vars=naming_pattern_variables, + pattern="/{env}/{platform}/{project}/views/{name}", + ), + description="Pattern for providing browse paths to views. 
Allowed variables are `{project}`, `{model}`, `{name}`, `{platform}` and `{env}`", ) @validator("view_naming_pattern", "view_browse_pattern", pre=True) @@ -142,16 +153,24 @@ def init_naming_pattern(cls, v): @validator("view_naming_pattern", "view_browse_pattern", always=True) def validate_naming_pattern(cls, v): assert isinstance(v, NamingPattern) - v.validate(at_least_one=True) + v.validate_pattern(at_least_one=True) return v class LookerCommonConfig( LookerViewNamingConfig, LookerExploreNamingConfig, DatasetSourceConfigBase ): - tag_measures_and_dimensions: bool = True - platform_name: str = "looker" - github_info: Optional[GitHubInfo] = None + tag_measures_and_dimensions: bool = Field( + True, + description="When enabled, attaches tags to measures, dimensions and dimension groups to make them more discoverable. When disabled, adds this information to the description of the column.", + ) + platform_name: str = Field( + "looker", description="Default platform name. Don't change." + ) + github_info: Optional[GitHubInfo] = Field( + None, + description="Reference to your github location to enable easy navigation from DataHub to your LookML files", + ) @dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/source/lookml.py b/metadata-ingestion/src/datahub/ingestion/source/lookml.py index b01b772421f58..cbe5d4d89b22d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/lookml.py +++ b/metadata-ingestion/src/datahub/ingestion/source/lookml.py @@ -20,6 +20,12 @@ from datahub.configuration.source_common import EnvBasedSourceConfigBase from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.source.looker_common import ( LookerCommonConfig, LookerUtil, @@ -140,15 +146,34 @@ def from_looker_connection( class LookMLSourceConfig(LookerCommonConfig): - base_folder: pydantic.DirectoryPath - connection_to_platform_map: Optional[Dict[str, LookerConnectionDefinition]] - model_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - parse_table_names_from_sql: bool = False - sql_parser: str = "datahub.utilities.sql_parser.DefaultSQLParser" + base_folder: pydantic.DirectoryPath = Field( + description="Local filepath where the root of the LookML repo lives. This is typically the root folder where the `*.model.lkml` and `*.view.lkml` files are stored. e.g. If you have checked out your LookML repo under `/Users/jdoe/workspace/my-lookml-repo`, then set `base_folder` to `/Users/jdoe/workspace/my-lookml-repo`." + ) + connection_to_platform_map: Optional[Dict[str, LookerConnectionDefinition]] = Field( + None, + description="A mapping of [Looker connection names](https://docs.looker.com/reference/model-params/connection-for-model) to DataHub platform, database, and schema values.", + ) + model_pattern: AllowDenyPattern = Field( + AllowDenyPattern.allow_all(), + description="List of regex patterns for LookML models to include in the extraction.", + ) + view_pattern: AllowDenyPattern = Field( + AllowDenyPattern.allow_all(), + description="List of regex patterns for LookML views to include in the extraction.", + ) + parse_table_names_from_sql: bool = Field(False, description="See note below.") + sql_parser: str = Field( + "datahub.utilities.sql_parser.DefaultSQLParser", description="See note below." 
+ ) api: Optional[LookerAPIConfig] - project_name: Optional[str] - transport_options: Optional[TransportOptionsConfig] + project_name: Optional[str] = Field( + None, + description="Required if you don't specify the `api` section. The project name within which all the model files live. See (https://docs.looker.com/data-modeling/getting-started/how-project-works) to understand what the Looker project name should be. The simplest way to see your projects is to click on `Develop` followed by `Manage LookML Projects` in the Looker application.", + ) + transport_options: Optional[TransportOptionsConfig] = Field( + None, + description="Populates the [TransportOptions](https://github.com/looker-open-source/sdk-codegen/blob/94d6047a0d52912ac082eb91616c1e7c379ab262/python/looker_sdk/rtl/transport.py#L70) struct for looker client", + ) @validator("platform_instance") def platform_instance_not_supported(cls, v: str) -> str: @@ -761,7 +786,21 @@ def get_including_extends( return None +@platform_name("Looker") +@config_class(LookMLSourceConfig) +@support_status(SupportStatus.CERTIFIED) class LookMLSource(Source): + """ + This plugin extracts the following: + - LookML views from model files in a project + - Name, upstream table names, metadata for dimensions, measures, and dimension groups attached as tags + - If API integration is enabled (recommended), resolves table and view names by calling the Looker API, otherwise supports offline resolution of these names. + + :::note + To get complete Looker metadata integration (including Looker dashboards and charts and lineage to the underlying Looker views), you must ALSO use the `looker` source module. + ::: + """ + source_config: LookMLSourceConfig reporter: LookMLSourceReport looker_client: Optional[Looker31SDK] = None @@ -780,11 +819,6 @@ def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext): "Failed to retrieve connections from looker client. Please check to ensure that you have manage_models permission enabled on this API key."
) - @classmethod - def create(cls, config_dict, ctx): - config = LookMLSourceConfig.parse_obj(config_dict) - return cls(config, ctx) - def _load_model(self, path: str) -> LookerModel: with open(path, "r") as file: logger.debug(f"Loading model from file {path}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/metabase.py b/metadata-ingestion/src/datahub/ingestion/source/metabase.py index e4e21a4b41158..74b4033a8bd53 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metabase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metabase.py @@ -5,12 +5,21 @@ import dateutil.parser as dp import requests from pydantic import validator +from pydantic.fields import Field from requests.models import HTTPError from sqllineage.runner import LineageRunner import datahub.emitter.mce_builder as builder from datahub.configuration.source_common import DatasetLineageProviderConfigBase from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.common import ( @@ -38,19 +47,68 @@ class MetabaseConfig(DatasetLineageProviderConfigBase): # See the Metabase /api/session endpoint for details # https://www.metabase.com/docs/latest/api-documentation.html#post-apisession - connect_uri: str = "localhost:3000" - username: Optional[str] = None - password: Optional[str] = None - database_alias_map: Optional[dict] = None - engine_platform_map: Optional[Dict[str, str]] = None - default_schema: str = "public" + connect_uri: str = Field(default="localhost:3000", description="Metabase host URL.") + username: str = Field(default=None, description="Metabase username.") + password: str = Field(default=None, description="Metabase password.") + database_alias_map: Optional[dict] = Field( + default=None, + description="Database name map to use when constructing dataset URN.", + ) + engine_platform_map: Optional[Dict[str, str]] = Field( + default=None, + description="Custom mappings between metabase database engines and DataHub platforms", + ) + default_schema: str = Field( + default="public", + description="Default schema name to use when schema is not provided in an SQL query", + ) @validator("connect_uri") def remove_trailing_slash(cls, v): return config_clean.remove_trailing_slashes(v) +@platform_name("Metabase") +@config_class(MetabaseConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") class MetabaseSource(Source): + """ + This plugin extracts Charts, dashboards, and associated metadata. This plugin is in beta and has only been tested + on PostgreSQL and H2 database. + ### Dashboard + + [/api/dashboard](https://www.metabase.com/docs/latest/api-documentation.html#dashboard) endpoint is used to + retrieve the following dashboard information. + + - Title and description + - Last edited by + - Owner + - Link to the dashboard in Metabase + - Associated charts + + ### Chart + + [/api/card](https://www.metabase.com/docs/latest/api-documentation.html#card) endpoint is used to + retrieve the following information. + + - Title and description + - Last edited by + - Owner + - Link to the chart in Metabase + - Datasource and lineage + + The following properties for a chart are ingested in DataHub. 
+ + | Name | Description | + | ------------- | ----------------------------------------------- | + | `Dimensions` | Column names | + | `Filters` | Any filters applied to the chart | + | `Metrics` | All columns that are being used for aggregation | + + + """ + config: MetabaseConfig report: SourceReport platform = "metabase" diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py index d04184f4b101f..28df9565d4a86 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py @@ -3,11 +3,18 @@ from typing import Any, Dict, Iterable, List, Optional, Union from pydantic import validator +from pydantic.fields import Field import datahub.metadata.schema_classes as models from datahub.configuration.common import ConfigModel from datahub.configuration.config_loader import load_config_file from datahub.emitter.mce_builder import get_sys_time, make_group_urn, make_user_urn +from datahub.ingestion.api.decorators import ( # SourceCapability,; capability, + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit @@ -58,7 +65,7 @@ class DefaultConfig(ConfigModel): class BusinessGlossarySourceConfig(ConfigModel): - file: str + file: str = Field(description="Path to business glossary file to ingest.") class BusinessGlossaryConfig(DefaultConfig): @@ -244,8 +251,15 @@ def get_mces_from_term( return [get_mce_from_snapshot(term_snapshot)] +@platform_name("Business Glossary") +@config_class(BusinessGlossarySourceConfig) +@support_status(SupportStatus.CERTIFIED) @dataclass class BusinessGlossaryFileSource(Source): + """ + This plugin pulls business glossary metadata from a yaml-formatted file. An example of one such file is located in the examples directory [here](../examples/bootstrap_data/business_glossary.yml). + """ + config: BusinessGlossarySourceConfig report: SourceReport = field(default_factory=SourceReport) diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py index e35de45c1a714..034964d2ea92f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py @@ -3,6 +3,7 @@ from typing import Any, Dict, Iterable, List, Optional, Union from pydantic import validator +from pydantic.fields import Field import datahub.metadata.schema_classes as models from datahub.cli.cli_utils import get_aspects_for_entity @@ -19,6 +20,12 @@ ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit @@ -55,8 +62,11 @@ class EntityNodeConfig(ConfigModel): class LineageFileSourceConfig(ConfigModel): - file: str - preserve_upstream: bool = True + file: str = Field(description="Path to lineage file to ingest.") + preserve_upstream: bool = Field( + default=True, + description="Whether we want to query datahub-gms for upstream data. 
False means it will hard replace upstream data for a given entity. True means it will query the backend for existing upstreams and include it in the ingestion run", + ) class LineageConfig(VersionedConfig): @@ -68,8 +78,15 @@ def version_must_be_1(cls, v): raise ValueError("Only version 1 is supported") +@platform_name("File Based Lineage") +@config_class(LineageFileSourceConfig) +@support_status(SupportStatus.CERTIFIED) @dataclass class LineageFileSource(Source): + """ + This plugin pulls lineage metadata from a yaml-formatted file. An example of one such file is located in the examples directory [here](../../../../metadata-ingestion/examples/bootstrap_data/file_lineage.yml). + """ + config: LineageFileSourceConfig report: SourceReport = field(default_factory=SourceReport) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index 37c48540d503c..0ef1aeb6de182 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -7,6 +7,7 @@ import requests import tenacity from pydantic import validator +from pydantic.fields import Field from requests.models import HTTPBasicAuth, HTTPError from sqllineage.runner import LineageRunner from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential @@ -15,6 +16,14 @@ from datahub.configuration.common import ConfigModel from datahub.configuration.source_common import DatasetLineageProviderConfigBase from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.common import ( @@ -41,21 +50,38 @@ class ModeAPIConfig(ConfigModel): - retry_backoff_multiplier: Union[int, float] = 2 - max_retry_interval: Union[int, float] = 10 - max_attempts: int = 5 + retry_backoff_multiplier: Union[int, float] = Field( + default=2, + description="Multiplier for exponential backoff when waiting to retry", + ) + max_retry_interval: Union[int, float] = Field( + default=10, description="Maximum interval to wait when retrying" + ) + max_attempts: int = Field( + default=5, description="Maximum number of attempts to retry before failing" + ) class ModeConfig(DatasetLineageProviderConfigBase): # See https://mode.com/developer/api-reference/authentication/ # for authentication - connect_uri: str = "https://app.mode.com" - token: Optional[str] = None - password: Optional[str] = None - workspace: Optional[str] = None - default_schema: str = "public" - owner_username_instead_of_email: Optional[bool] = True - api_options: ModeAPIConfig = ModeAPIConfig() + connect_uri: str = Field( + default="https://app.mode.com", description="Mode host URL." 
+ ) + token: str = Field(default=None, description="Mode user token.") + password: str = Field(default=None, description="Mode password for authentication.") + workspace: Optional[str] = Field(default=None, description="") + default_schema: str = Field( + default="public", + description="Default schema to use when schema is not provided in an SQL query", + ) + owner_username_instead_of_email: Optional[bool] = Field( + default=True, description="Use username for owner URN instead of Email" + ) + api_options: ModeAPIConfig = Field( + default=ModeAPIConfig(), + description='Retry/Wait settings for Mode API to avoid "Too many Requests" error. See Mode API Options below', + ) @validator("connect_uri") def remove_trailing_slash(cls, v): @@ -66,7 +92,69 @@ class HTTPError429(HTTPError): pass +@platform_name("Mode") +@config_class(ModeConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") class ModeSource(Source): + """ + + This plugin extracts Charts, Reports, and associated metadata from a given Mode workspace. This plugin is in beta and has only been tested + on a PostgreSQL database. + + ### Report + + [/api/{account}/reports/{report}](https://mode.com/developer/api-reference/analytics/reports/) endpoint is used to + retrieve the following report information. + + - Title and description + - Last edited by + - Owner + - Link to the Report in Mode for exploration + - Associated charts within the report + + ### Chart + + [/api/{workspace}/reports/{report}/queries/{query}/charts](https://mode.com/developer/api-reference/analytics/charts/#getChart) endpoint is used to + retrieve the following information. + + - Title and description + - Last edited by + - Owner + - Link to the chart in Mode + - Datasource and lineage information from Report queries. + + The following properties for a chart are ingested in DataHub. 
+ + #### Chart Information + | Name | Description | + |-----------|----------------------------------------| + | `Filters` | Filters applied to the chart | + | `Metrics` | Fields or columns used for aggregation | + | `X` | Fields used in X-axis | + | `X2` | Fields used in second X-axis | + | `Y` | Fields used in Y-axis | + | `Y2` | Fields used in second Y-axis | + + + #### Table Information + | Name | Description | + |-----------|------------------------------| + | `Columns` | Column names in a table | + | `Filters` | Filters applied to the table | + + + + #### Pivot Table Information + | Name | Description | + |-----------|----------------------------------------| + | `Columns` | Column names in a table | + | `Filters` | Filters applied to the table | + | `Metrics` | Fields or columns used for aggregation | + | `Rows` | Row names in a table | + + """ + config: ModeConfig report: SourceReport tool = "mode" diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index 7a6782b552f01..8d6201867dd8b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -6,11 +6,20 @@ import pymongo from packaging import version from pydantic import PositiveInt, validator +from pydantic.fields import Field from pymongo.mongo_client import MongoClient -from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.emitter.mce_builder import DEFAULT_ENV +from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.source_common import EnvBasedSourceConfigBase from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.schema_inference.object import ( @@ -45,25 +54,46 @@ DENY_DATABASE_LIST = set(["admin", "config", "local"]) -class MongoDBConfig(ConfigModel): +class MongoDBConfig(EnvBasedSourceConfigBase): # See the MongoDB authentication docs for details and examples. # https://pymongo.readthedocs.io/en/stable/examples/authentication.html - connect_uri: str = "mongodb://localhost" - username: Optional[str] = None - password: Optional[str] = None - authMechanism: Optional[str] = None - options: dict = {} - enableSchemaInference: bool = True - schemaSamplingSize: Optional[PositiveInt] = 1000 - useRandomSampling: bool = True - maxSchemaSize: Optional[PositiveInt] = 300 + connect_uri: str = Field( + default="mongodb://localhost", description="MongoDB connection URI." + ) + username: Optional[str] = Field(default=None, description="MongoDB username.") + password: Optional[str] = Field(default=None, description="MongoDB password.") + authMechanism: Optional[str] = Field( + default=None, description="MongoDB authentication mechanism." + ) + options: dict = Field( + default={}, description="Additional options to pass to `pymongo.MongoClient()`." + ) + enableSchemaInference: bool = Field( + default=True, description="Whether to infer schemas. " + ) + schemaSamplingSize: Optional[PositiveInt] = Field( + default=1000, + description="Number of documents to use when inferring schema size. 
If set to `0`, all documents will be scanned.", + ) + useRandomSampling: bool = Field( + default=True, + description="If documents for schema inference should be randomly selected. If `False`, documents will be selected from start.", + ) + maxSchemaSize: Optional[PositiveInt] = Field( + default=300, description="Maximum number of fields to include in the schema." + ) # mongodb only supports 16MB as max size for documents. However, if we try to retrieve a larger document it # errors out with "16793600" as the maximum size supported. - maxDocumentSize: Optional[PositiveInt] = 16793600 - env: str = DEFAULT_ENV + maxDocumentSize: Optional[PositiveInt] = Field(default=16793600, description="") - database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - collection_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() + database_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for databases to filter in ingestion.", + ) + collection_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for collections to filter in ingestion.", + ) @validator("maxDocumentSize") def check_max_doc_size_filter_is_valid(cls, doc_size_filter_value): @@ -97,7 +127,6 @@ def report_dropped(self, name: str) -> None: "mixed": "mixed", } - # map PyMongo types to DataHub classes _field_type_mapping: Dict[Union[Type, str], Type] = { list: ArrayTypeClass, @@ -166,8 +195,27 @@ def construct_schema_pymongo( return construct_schema(list(documents), delimiter) +@platform_name("MongoDB") +@config_class(MongoDBConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") @dataclass class MongoDBSource(Source): + """ + This plugin extracts the following: + + - Databases and associated metadata + - Collections in each database and schemas for each collection (via schema inference) + + By default, schema inference samples 1,000 documents from each collection. Setting `schemaSamplingSize: null` will scan the entire collection. + Moreover, setting `useRandomSampling: False` will sample the first documents found without random selection, which may be faster for large collections. + + Note that `schemaSamplingSize` has no effect if `enableSchemaInference: False` is set. + + Really large schemas will be further truncated to a maximum of 300 schema fields. This is configurable using the `maxSchemaSize` parameter. + + """ + config: MongoDBConfig report: MongoDBSourceReport mongo_client: MongoClient @@ -189,7 +237,7 @@ def __init__(self, ctx: PipelineContext, config: MongoDBConfig): **self.config.options, } - self.mongo_client = pymongo.MongoClient(self.config.connect_uri, **options) + self.mongo_client = pymongo.MongoClient(self.config.connect_uri, **options) # type: ignore # This cheaply tests the connection. 
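As an aside, a minimal standalone sketch of that connection check, assuming a reachable MongoDB server and using only documented `pymongo` calls (this is not the connector's own initialization code):

```python
import pymongo

def build_client(connect_uri: str = "mongodb://localhost", **options) -> pymongo.MongoClient:
    client = pymongo.MongoClient(connect_uri, **options)
    # A cheap round trip that confirms the server is reachable before any
    # databases or collections are scanned.
    client.admin.command("ping")
    return client
```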
For details, see # https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py index 4a6c05b5a0965..cadb34e6a9035 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py @@ -11,12 +11,20 @@ import requests from dateutil import parser from packaging import version +from pydantic.fields import Field from requests.adapters import HTTPAdapter import datahub.emitter.mce_builder as builder -from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.source_common import EnvBasedSourceConfigBase from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.schema_classes import ( @@ -55,32 +63,59 @@ class NifiAuthType(Enum): CLIENT_CERT = "CLIENT_CERT" -class NifiSourceConfig(ConfigModel): - site_url: str +class NifiSourceConfig(EnvBasedSourceConfigBase): + site_url: str = Field(description="URI to connect") - auth: NifiAuthType = NifiAuthType.NO_AUTH + auth: NifiAuthType = Field( + default=NifiAuthType.NO_AUTH, + description="Nifi authentication. must be one of : NO_AUTH, SINGLE_USER, CLIENT_CERT", + ) - provenance_days: int = 7 # Fetch provenance events for past 1 week - process_group_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() + provenance_days: int = Field( + default=7, + description="time window to analyze provenance events for external datasets", + ) # Fetch provenance events for past 1 week + process_group_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for filtering process groups", + ) # Required for nifi deployments using Remote Process Groups - site_name: str = "default" - site_url_to_site_name: Dict[str, str] = {} + site_name: str = Field( + default="default", + description="Site name to identify this site with, useful when using input and output ports receiving remote connections", + ) + site_url_to_site_name: Dict[str, str] = Field( + default={}, + description="Lookup to find site_name for site_url, required if using remote process groups in nifi flow", + ) # Required to be set if auth is of type SINGLE_USER - username: Optional[str] - password: Optional[str] + username: Optional[str] = Field( + default=None, description='Nifi username, must be set for auth = "SINGLE_USER"' + ) + password: Optional[str] = Field( + default=None, description='Nifi password, must be set for auth = "SINGLE_USER"' + ) # Required to be set if auth is of type CLIENT_CERT - client_cert_file: Optional[str] - client_key_file: Optional[str] - client_key_password: Optional[str] + client_cert_file: Optional[str] = Field( + default=None, + description='Path to PEM file containing the public certificates for the user/client identity, must be set for auth = "CLIENT_CERT"', + ) + client_key_file: Optional[str] = Field( + default=None, description="Path to PEM file containing the client’s secret key" + ) + client_key_password: Optional[str] = Field( + default=None, description="The password to 
decrypt the client_key_file" + ) # Required to be set if nifi server certificate is not signed by # root CA trusted by client system, e.g. self-signed certificates - ca_file: Optional[str] - - env: str = builder.DEFAULT_ENV + ca_file: Optional[str] = Field( + default=None, + description="Path to PEM file containing certs for the root CA(s) for the NiFi", + ) TOKEN_ENDPOINT = "/nifi-api/access/token" @@ -268,7 +303,26 @@ def report_dropped(self, ent_name: str) -> None: # allowRemoteAccess +@platform_name("Nifi") +@config_class(NifiSourceConfig) +@support_status(SupportStatus.CERTIFIED) class NifiSource(Source): + """ + This plugin extracts the following: + + - Nifi flow as `DataFlow` entity + - Ingress, egress processors, remote input and output ports as `DataJob` entity + - Input and output ports receiving remote connections as `Dataset` entity + - Lineage information between external datasets and ingress/egress processors by analyzing provenance events + + Current limitations: + + - Limited ingress/egress processors are supported + - S3: `ListS3`, `FetchS3Object`, `PutS3Object` + - SFTP: `ListSFTP`, `FetchSFTP`, `GetSFTP`, `PutSFTP` + + """ + config: NifiSourceConfig report: NifiSourceReport diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi.py b/metadata-ingestion/src/datahub/ingestion/source/openapi.py index 8a02db79cb359..b71cb363b96e4 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi.py @@ -4,9 +4,19 @@ from abc import ABC from typing import Dict, Generator, Iterable, Optional, Tuple +from pydantic.fields import Field + from datahub.configuration.common import ConfigModel from datahub.emitter.mce_builder import make_tag_urn from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.openapi_parser import ( @@ -36,15 +46,15 @@ class OpenApiConfig(ConfigModel): - name: str - url: str - swagger_file: str - ignore_endpoints: list = [] - username: str = "" - password: str = "" - forced_examples: dict = {} - token: Optional[str] = None - get_token: dict = {} + name: str = Field(description="") + url: str = Field(description="") + swagger_file: str = Field(description="") + ignore_endpoints: list = Field(default=[], description="") + username: str = Field(default="", description="") + password: str = Field(default="", description="") + forced_examples: dict = Field(default={}, description="") + token: Optional[str] = Field(default=None, description="") + get_token: dict = Field(default={}, description="") def get_swagger(self) -> Dict: if self.get_token or self.token is not None: @@ -101,7 +111,32 @@ class ApiWorkUnit(MetadataWorkUnit): pass +@platform_name("OpenAPI", id="openapi") +@config_class(OpenApiConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, supported=False, description="") class APISource(Source, ABC): + """ + + This plugin is meant to gather dataset-like informations about OpenApi Endpoints. 
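A rough sketch of the idea, using a hypothetical helper rather than the plugin's actual code: call a GET endpoint, take the first object in the JSON response, and treat its keys as the dataset's fields.

```python
import requests

def endpoint_fields(base_url: str, endpoint: str) -> list:
    # Hypothetical helper: fetch one page of results and read the field names
    # off the first JSON object that comes back.
    response = requests.get(f"{base_url.rstrip('/')}/{endpoint.strip('/')}/")
    response.raise_for_status()
    payload = response.json()
    first = payload[0] if isinstance(payload, list) and payload else payload
    return sorted(first.keys()) if isinstance(first, dict) else []
```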
+ + As example, if by calling GET at the endpoint at `https://test_endpoint.com/api/users/` you obtain as result: + ```JSON + [{"user": "albert_physics", + "name": "Albert Einstein", + "job": "nature declutterer", + "is_active": true}, + {"user": "phytagoras", + "name": "Phytagoras of Kroton", + "job": "Phylosopher on steroids", + "is_active": true} + ] + ``` + + in Datahub you will see a dataset called `test_endpoint/users` which contains as fields `user`, `name` and `job`. + + """ + def __init__(self, config: OpenApiConfig, ctx: PipelineContext, platform: str): super().__init__(ctx) self.config = config diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py index 41e00ad14c85b..5cfba5fa2ec14 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py @@ -15,12 +15,21 @@ import msal import requests from orderedset import OrderedSet +from pydantic.fields import Field import datahub.emitter.mce_builder as builder from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.configuration.source_common import EnvBasedSourceConfigBase from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.common import ( @@ -99,17 +108,22 @@ class Constant: class PowerBiAPIConfig(EnvBasedSourceConfigBase): # Organsation Identifier - tenant_id: str + tenant_id: str = Field(description="Power BI tenant identifier.") # PowerBi workspace identifier - workspace_id: str + workspace_id: str = Field(description="Power BI workspace identifier.") # Dataset type mapping - dataset_type_mapping: Dict[str, str] + dataset_type_mapping: Dict[str, str] = Field( + description="Mapping of Power BI datasource type to Datahub dataset." + ) # Azure app client identifier - client_id: str + client_id: str = Field(description="Azure AD App client identifier.") # Azure app client secret - client_secret: str + client_secret: str = Field(description="Azure AD App client secret.") # timeout for meta-data scanning - scan_timeout: int = 60 + scan_timeout: int = Field( + default=60, + description="time in seconds to wait for Power BI metadata scan result.", + ) scope: str = "https://analysis.windows.net/powerbi/api/.default" base_url: str = "https://api.powerbi.com/v1.0/myorg/groups" @@ -1320,9 +1334,26 @@ def report_charts_dropped(self, view: str) -> None: self.filtered_charts.append(view) +@platform_name("PowerBI") +@config_class(PowerBiDashboardSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.OWNERSHIP, "Enabled by default") class PowerBiDashboardSource(Source): """ - Datahub PowerBi plugin main class. This class extends Source to become PowerBi data ingestion source for Datahub + This plugin extracts the following: + + - Power BI dashboards, tiles, datasets + - Names, descriptions and URLs of dashboard and tile + - Owners of dashboards + + ## Configuration Notes + + See the + 1. 
[Microsoft AD App Creation doc](https://docs.microsoft.com/en-us/power-bi/developer/embedded/embed-service-principal) for the steps to create an app client ID and secret. + 2. Log in to Power BI as Admin and from `Tenant settings` allow the following permissions. + - Allow service principals to use Power BI APIs + - Allow service principals to use read-only Power BI admin APIs + - Enhance admin APIs responses with detailed metadata """ source_config: PowerBiDashboardSourceConfig diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py index 0a58a1759b4b3..45651369fca42 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redash.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py @@ -6,6 +6,7 @@ from typing import Dict, Iterable, List, Optional, Type import dateutil.parser as dp +from pydantic.fields import Field from redash_toolbelt import Redash from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry @@ -14,6 +15,12 @@ from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.emitter.mce_builder import DEFAULT_ENV from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( # SourceCapability,; capability, + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.common import ( @@ -223,17 +230,39 @@ def get_full_qualified_name(platform: str, database_name: str, table_name: str) class RedashConfig(ConfigModel): # See the Redash API for details # https://redash.io/help/user-guide/integrations-and-api/api - connect_uri: str = "http://localhost:5000" - api_key: str = "REDASH_API_KEY" - env: str = DEFAULT_ENV + connect_uri: str = Field( + default="http://localhost:5000", description="Redash base URL." + ) + api_key: str = Field(default="REDASH_API_KEY", description="Redash user API key.") # Optionals - dashboard_patterns: AllowDenyPattern = AllowDenyPattern.allow_all() - chart_patterns: AllowDenyPattern = AllowDenyPattern.allow_all() - skip_draft: bool = True - api_page_limit: int = sys.maxsize - parse_table_names_from_sql: bool = False - sql_parser: str = "datahub.utilities.sql_parser.DefaultSQLParser" + dashboard_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for dashboards to filter for ingestion.", + ) + chart_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for charts to filter for ingestion.", + ) + skip_draft: bool = Field( + default=True, description="Only ingest published dashboards and charts." + ) + api_page_limit: int = Field( + default=sys.maxsize, + description="Limit on ingested dashboards and charts API pagination.", + ) + parse_table_names_from_sql: bool = Field( + default=False, description="See note below." + ) + sql_parser: str = Field( + default="datahub.utilities.sql_parser.DefaultSQLParser", + description="custom SQL parser. 
See note below for details.", + ) + + env: str = Field( + default=DEFAULT_ENV, + description="Environment to use in namespace when constructing URNs.", + ) @dataclass @@ -248,7 +277,17 @@ def report_dropped(self, item: str) -> None: self.filtered.append(item) +@platform_name("Redash") +@config_class(RedashConfig) +@support_status(SupportStatus.CERTIFIED) class RedashSource(Source): + """ + This plugin extracts the following: + + - Redash dashboards and queries/visualization + - Redash chart table lineages (disabled by default) + """ + config: RedashConfig report: RedashSourceReport platform = "redash" diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/s3/__init__.py index 5a787d05572ef..32b39420b6b49 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/__init__.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/__init__.py @@ -50,6 +50,14 @@ gen_containers, ) from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import ( @@ -184,7 +192,39 @@ class TableData: table_path: str +@platform_name("S3 Data Lake", id="s3") +@config_class(DataLakeSourceConfig) +@support_status(SupportStatus.INCUBATING) +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") class S3Source(Source): + """ + This plugin extracts: + + - Row and column counts for each table + - For each column, if profiling is enabled: + - null counts and proportions + - distinct counts and proportions + - minimum, maximum, mean, median, standard deviation, some quantile values + - histograms or frequencies of unique values + + This connector supports both local files as well as those stored on AWS S3 (which must be identified using the prefix `s3://`). Supported file types are as follows: + + - CSV + - TSV + - JSON + - Parquet + - Apache Avro + + Schemas for Parquet and Avro files are extracted as provided. + + Schemas for schemaless formats (CSV, TSV, JSON) are inferred. For CSV and TSV files, we consider the first 100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details)) + JSON file schemas are inferred on the basis of the entire file (given the difficulty in extracting only the first few objects of the file), which may impact performance. + We are working on using iterator-based JSON parsers to avoid reading in the entire JSON object. + + Note that because the profiling is run with PySpark, we require Spark 3.0.3 with Hadoop 3.2 to be installed (see [compatibility](#compatibility) for more details). If profiling, make sure that permissions for **s3a://** access are set because Spark and Hadoop use the s3a:// protocol to interface with AWS (schema inference outside of profiling requires s3:// access). + Enabling profiling will slow down ingestion runs. 
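The `max_rows` sampling mentioned above is easy to picture with a small, self-contained sketch. This is an illustration only, not the connector's internal code; it assumes pandas is installed and uses a hypothetical local file `events.csv`.

```python
# Illustration only: not the S3 source's actual implementation.
# Assumes pandas is installed; "events.csv" is a hypothetical local file.
import pandas as pd

# Reading just the first 100 rows mirrors the `max_rows` idea: column types can
# be inferred from a small sample instead of scanning the whole file.
sample = pd.read_csv("events.csv", nrows=100)
print(sample.dtypes)  # inferred type for each column in the sample
```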
+    """
+
     source_config: DataLakeSourceConfig
     report: DataLakeSourceReport
     profiling_times_taken: List[float]
diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py
index ce2bdde96d529..040b6abc9c413 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py
@@ -5,10 +5,14 @@
 import parse
 import pydantic
+from pydantic.fields import Field
 from wcmatch import pathlib

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.emitter.mce_builder import DEFAULT_ENV
+from datahub.configuration.source_common import (
+    EnvBasedSourceConfigBase,
+    PlatformSourceConfigBase,
+)
 from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.s3.profiling import DataLakeProfilerConfig
@@ -24,10 +28,21 @@ class PathSpec(ConfigModel):
     class Config:
         arbitrary_types_allowed = True

-    include: str
-    exclude: Optional[List[str]]
-    file_types: List[str] = SUPPORTED_FILE_TYPES
-    table_name: Optional[str]
+    include: str = Field(
+        description="Path to table (s3 or local file system). The name variable {table} is used to mark the folder containing the dataset. In the absence of {table}, a file-level dataset will be created. Check the examples below for more details."
+    )
+    exclude: Optional[List[str]] = Field(
+        default=None,
+        description="List of paths in glob pattern which will be excluded while scanning for datasets.",
+    )
+    file_types: List[str] = Field(
+        default=SUPPORTED_FILE_TYPES,
+        description="Only files with the extensions specified here (a subset of the default value) will be scanned to create datasets. Other files will be omitted.",
+    )
+    table_name: Optional[str] = Field(
+        default=None,
+        description="Display name of the dataset. Combination of named variables from the include path and literal strings.",
+    )

     # to be set internally
     _parsable_include: str
@@ -132,25 +147,45 @@ def validate_path_spec(cls, values: Dict) -> Dict[str, Any]:
         return values

-class DataLakeSourceConfig(ConfigModel):
-    path_spec: PathSpec
-    env: str = DEFAULT_ENV
-    platform_instance: Optional[str] = None
-    platform: str = ""  # overwritten by validator below
-
-    aws_config: Optional[AwsSourceConfig] = None
+class DataLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase):
+    path_spec: PathSpec = Field(description="")
+    platform: str = Field(
+        default="", description="The platform that this source connects to"
+    )
+    platform_instance: Optional[str] = Field(
+        default=None,
+        description="The instance of the platform that all assets produced by this recipe belong to",
+    )
+    aws_config: Optional[AwsSourceConfig] = Field(
+        default=None, description="AWS configuration"
+    )

     # Whether or not to create in datahub from the s3 bucket
-    use_s3_bucket_tags: Optional[bool] = None
+    use_s3_bucket_tags: Optional[bool] = Field(
+        None, description="Whether or not to create tags in datahub from the s3 bucket"
+    )
     # Whether or not to create in datahub from the s3 object
-    use_s3_object_tags: Optional[bool] = None
-
-    profile_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
-    profiling: DataLakeProfilerConfig = DataLakeProfilerConfig()
-
-    spark_driver_memory: str = "4g"
-
-    max_rows: int = 100
+    use_s3_object_tags: Optional[bool] = Field(
+        None,
+        description="Whether or not to create tags in datahub from the s3 object",
+    )
+
+    profile_patterns: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(), + description="regex patterns for tables to profile ", + ) + profiling: DataLakeProfilerConfig = Field( + default=DataLakeProfilerConfig(), description="Data profiling configuration" + ) + + spark_driver_memory: str = Field( + default="4g", description="Max amount of memory to grant Spark." + ) + + max_rows: int = Field( + default=100, + description="Maximum number of rows to use when inferring schemas for TSV and CSV files.", + ) @pydantic.root_validator(pre=False) def validate_platform(cls, values: Dict) -> Dict: diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/profiling.py b/metadata-ingestion/src/datahub/ingestion/source/s3/profiling.py index b2afdd97f1cbd..f6d3b3c4af374 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/profiling.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/profiling.py @@ -4,6 +4,7 @@ import pydantic from pandas import DataFrame +from pydantic.fields import Field from pydeequ.analyzers import ( AnalysisRunBuilder, AnalysisRunner, @@ -60,25 +61,64 @@ def null_str(value: Any) -> Optional[str]: class DataLakeProfilerConfig(ConfigModel): - enabled: bool = False + enabled: bool = Field( + default=False, description="Whether profiling should be done." + ) # These settings will override the ones below. - profile_table_level_only: bool = False - - allow_deny_patterns: AllowDenyPattern = AllowDenyPattern.allow_all() - - max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = None - - include_field_null_count: bool = True - include_field_min_value: bool = True - include_field_max_value: bool = True - include_field_mean_value: bool = True - include_field_median_value: bool = True - include_field_stddev_value: bool = True - include_field_quantiles: bool = True - include_field_distinct_value_frequencies: bool = True - include_field_histogram: bool = True - include_field_sample_values: bool = True + profile_table_level_only: bool = Field( + default=False, + description="Whether to perform profiling at table-level only or include column-level profiling as well.", + ) + + allow_deny_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), description="" + ) + + max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field( + default=None, + description="A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. 
The cost of profiling goes up significantly as the number of columns to profile goes up.", + ) + + include_field_null_count: bool = Field( + default=True, + description="Whether to profile for the number of nulls for each column.", + ) + include_field_min_value: bool = Field( + default=True, + description="Whether to profile for the min value of numeric columns.", + ) + include_field_max_value: bool = Field( + default=True, + description="Whether to profile for the max value of numeric columns.", + ) + include_field_mean_value: bool = Field( + default=True, + description="Whether to profile for the mean value of numeric columns.", + ) + include_field_median_value: bool = Field( + default=True, + description="Whether to profile for the median value of numeric columns.", + ) + include_field_stddev_value: bool = Field( + default=True, + description="Whether to profile for the standard deviation of numeric columns.", + ) + include_field_quantiles: bool = Field( + default=True, + description="Whether to profile for the quantiles of numeric columns.", + ) + include_field_distinct_value_frequencies: bool = Field( + default=True, description="Whether to profile for distinct value frequencies." + ) + include_field_histogram: bool = Field( + default=True, + description="Whether to profile for the histogram for numeric fields.", + ) + include_field_sample_values: bool = Field( + default=True, + description="Whether to profile for the sample values for all columns.", + ) @pydantic.root_validator() def ensure_field_level_settings_are_normalized( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index 66becd9830d75..b89a078233508 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -3,11 +3,18 @@ import typing from typing import Dict, List, Optional, Tuple +import pydantic from pyathena.common import BaseCursor from pyathena.model import AthenaTableMetadata from sqlalchemy.engine.reflection import Inspector from datahub.emitter.mcp_builder import DatabaseKey, gen_containers +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import make_s3_urn from datahub.ingestion.source.sql.sql_common import ( @@ -19,8 +26,13 @@ class AthenaConfig(SQLAlchemyConfig): scheme: str = "awsathena+rest" - username: Optional[str] = None - password: Optional[str] = None + username: Optional[str] = pydantic.Field( + default=None, + description="Username credential. If not specified, detected with boto3 rules. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html", + ) + password: Optional[str] = pydantic.Field( + default=None, description="Same detection scheme as username" + ) database: Optional[str] = None aws_region: str s3_staging_dir: str @@ -42,7 +54,22 @@ def get_sql_alchemy_url(self): ) +@platform_name("Athena") +@support_status(SupportStatus.CERTIFIED) +@config_class(AthenaConfig) class AthenaSource(SQLAlchemySource): + """ + This plugin supports extracting the following metadata from Athena + - Tables, schemas etc. + - Profiling when enabled. + + :::note + + Athena source only works with python 3.7+. 
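As the `username`/`password` descriptions above note, missing credentials fall back to the standard boto3 resolution rules. The sketch below only illustrates that default provider chain; it assumes boto3 is installed and that some credential source (environment variables, `~/.aws/credentials`, or an instance profile) exists, and it is not part of the Athena source itself.

```python
# Sketch of the boto3 default credential chain referenced by the config docs above.
import boto3

creds = boto3.Session().get_credentials()
if creds is None:
    print("No AWS credentials found by the default provider chain.")
else:
    print("Credentials resolved; access key starts with:", creds.access_key[:4])
```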
+ + ::: + """ + def __init__(self, config, ctx): super().__init__(config, ctx, "athena") self.cursor: Optional[BaseCursor] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py index e135dcc24339a..f6207818617f3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py @@ -26,6 +26,14 @@ ProjectIdKey, gen_containers, ) +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql.sql_common import ( SQLAlchemyConfig, @@ -284,7 +292,33 @@ def cleanup(config: BigQueryConfig) -> None: os.unlink(config._credentials_path) +@config_class(BigQueryConfig) +@platform_name("BigQuery") +@support_status(SupportStatus.CERTIFIED) +@capability( + SourceCapability.PLATFORM_INSTANCE, + "BigQuery doesn't need platform instances because project ids in BigQuery are globally unique.", + supported=False, +) +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +@capability( + SourceCapability.USAGE_STATS, + "Not provided by this module, use `bigquery-usage` for that.", + supported=False, +) +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class BigQuerySource(SQLAlchemySource): + """ + This plugin extracts the following: + - Metadata for databases, schemas, and tables + - Column types associated with each table + - Table, row, and column statistics via optional SQL profiling + - Table level lineage. 
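The `@platform_name`, `@config_class`, `@support_status`, and `@capability` decorators applied above feed the generated source docs. The toy sketch below shows only the general mechanism such class decorators rely on, attaching metadata that a generator can introspect later; it is a simplified stand-in, not DataHub's actual decorator implementation, and `ExampleSource` is hypothetical.

```python
# Simplified sketch of the decorator mechanism; not DataHub's real implementation.
from typing import Callable, Type, TypeVar

T = TypeVar("T")


def platform_name(name: str) -> Callable[[Type[T]], Type[T]]:
    """Toy class decorator that records a platform name on the decorated class."""

    def wrapper(cls: Type[T]) -> Type[T]:
        cls._platform_name = name  # metadata a docs generator could read back
        return cls

    return wrapper


@platform_name("BigQuery")
class ExampleSource:
    """Docstring that a generator can combine with the attached metadata."""


print(ExampleSource._platform_name)  # -> "BigQuery"
```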
+ """ + def __init__(self, config, ctx): super().__init__(config, ctx, "bigquery") self.config: BigQueryConfig = config diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py index e14c1ee29f0d0..c4edbb2d34792 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py @@ -9,6 +9,7 @@ import clickhouse_sqlalchemy.types as custom_types from clickhouse_sqlalchemy.drivers import base from clickhouse_sqlalchemy.drivers.base import ClickHouseDialect +from pydantic.fields import Field from sqlalchemy import create_engine, text from sqlalchemy.engine import reflection from sqlalchemy.sql import sqltypes @@ -19,6 +20,14 @@ from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.emitter import mce_builder from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql.sql_common import ( BasicSQLAlchemyConfig, @@ -109,14 +118,16 @@ class ClickHouseConfig( BasicSQLAlchemyConfig, BaseTimeWindowConfig, DatasetLineageProviderConfigBase ): # defaults - host_port = "localhost:8123" - scheme = "clickhouse" + host_port = Field(default="localhost:8123", description="ClickHouse host URL.") + scheme = Field(default="clickhouse", description="", exclude=True) - secure: Optional[bool] - protocol: Optional[str] + secure: Optional[bool] = Field(default=None, description="") + protocol: Optional[str] = Field(default=None, description="") - include_table_lineage: Optional[bool] = True - include_materialized_views: Optional[bool] = True + include_table_lineage: Optional[bool] = Field( + default=True, description="Whether table lineage should be ingested." + ) + include_materialized_views: Optional[bool] = Field(default=True, description="") def get_sql_alchemy_url(self, database=None): uri_opts = None @@ -307,7 +318,28 @@ def get_columns(self, connection, table_name, schema=None, **kw): clickhouse_datetime_format = "%Y-%m-%d %H:%M:%S" +@platform_name("ClickHouse") +@config_class(ClickHouseConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") class ClickHouseSource(SQLAlchemySource): + """ + This plugin extracts the following: + + - Metadata for tables, views, materialized views and dictionaries + - Column types associated with each table(except *AggregateFunction and DateTime with timezone) + - Table, row, and column statistics via optional SQL profiling. + - Table, view, materialized view and dictionary(with CLICKHOUSE source_type) lineage + + :::tip + + You can also get fine-grained usage statistics for ClickHouse using the `clickhouse-usage` source described below. 
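The `description=` strings added to `ClickHouseConfig` and the other config classes in this change are ordinary pydantic field metadata, so they are machine readable. Below is a minimal sketch of how they surface through pydantic's JSON schema, which is presumably what the docs generation builds on; `ToyConfig` is a stand-in rather than a real DataHub class, and pydantic v1 (the version this code base uses) is assumed.

```python
from pydantic import BaseModel, Field


class ToyConfig(BaseModel):
    # Stand-in for a source config class; not part of DataHub.
    host_port: str = Field(default="localhost:8123", description="ClickHouse host URL.")
    include_table_lineage: bool = Field(
        default=True, description="Whether table lineage should be ingested."
    )


props = ToyConfig.schema()["properties"]
print(props["host_port"]["description"])              # -> ClickHouse host URL.
print(props["include_table_lineage"]["description"])  # -> Whether table lineage should be ingested.
```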
+ + ::: + + """ + config: ClickHouseConfig def __init__(self, config, ctx): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py b/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py index 6e9f10526b42e..06dbcf05de7b0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py @@ -1,7 +1,16 @@ # This import verifies that the dependencies are available. import pydruid # noqa: F401 +from pydantic.fields import Field from datahub.configuration.common import AllowDenyPattern +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.source.sql.sql_common import ( BasicSQLAlchemyConfig, SQLAlchemySource, @@ -11,7 +20,10 @@ class DruidConfig(BasicSQLAlchemyConfig): # defaults scheme = "druid" - schema_pattern: AllowDenyPattern = AllowDenyPattern(deny=["^(lookup|sys).*"]) + schema_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern(deny=["^(lookup|sys).*"]), + description="regex patterns for schemas to filter in ingestion.", + ) def get_sql_alchemy_url(self): return f"{super().get_sql_alchemy_url()}/druid/v2/sql/" @@ -32,7 +44,20 @@ def get_identifier(self, schema: str, table: str) -> str: ) +@platform_name("Druid") +@config_class(DruidConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") class DruidSource(SQLAlchemySource): + """ + This plugin extracts the following: + - Metadata for databases, schemas, and tables + - Column types associated with each table + - Table, row, and column statistics via optional SQL profiling. + + **Note**: It is important to explicitly define the deny schema pattern for internal Druid databases (lookup & sys) if adding a schema pattern. Otherwise, the crawler may crash before processing relevant databases. This deny pattern is defined by default but is overriden by user-submitted configurations. + """ + def __init__(self, config, ctx): super().__init__(config, ctx, "druid") diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index ac02376531820..5989cf6221f86 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -3,11 +3,20 @@ from typing import Any, Dict, List, Optional from pydantic.class_validators import validator +from pydantic.fields import Field # This import verifies that the dependencies are available. from pyhive import hive # noqa: F401 from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveTimestamp +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.extractor import schema_util from datahub.ingestion.source.sql.sql_common import ( BasicSQLAlchemyConfig, @@ -31,19 +40,37 @@ class HiveConfig(BasicSQLAlchemyConfig): # defaults - scheme = "hive" + scheme = Field(default="hive", exclude=True) # Hive SQLAlchemy connector returns views as tables. # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273. # Disabling views helps us prevent this duplication. 
- include_views = False + include_views = Field( + default=False, + exclude=True, + description="Hive SQLAlchemy connector returns views as tables. See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273. Disabling views helps us prevent this duplication.", + ) @validator("host_port") def clean_host_port(cls, v): return config_clean.remove_protocol(v) +@platform_name("Hive") +@config_class(HiveConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") class HiveSource(SQLAlchemySource): + """ + This plugin extracts the following: + + - Metadata for databases, schemas, and tables + - Column types associated with each table + - Detailed table and storage information + - Table, row, and column statistics via optional SQL profiling. + + """ _COMPLEX_TYPE = re.compile("^(struct|map|array|uniontype)") diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mariadb.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mariadb.py index 11a2148ff195b..4276ad6570417 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mariadb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mariadb.py @@ -1,6 +1,22 @@ -from datahub.ingestion.source.sql.mysql import MySQLSource +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.source.sql.mysql import MySQLConfig, MySQLSource +@platform_name("MariaDB") +@config_class(MySQLConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class MariaDBSource(MySQLSource): def get_platform(self): return "mariadb" diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py index 5101165dca1fb..f3f7b243cfa17 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py @@ -5,11 +5,20 @@ # This import verifies that the dependencies are available. 
import sqlalchemy_pytds # noqa: F401 +from pydantic.fields import Field from sqlalchemy.engine.base import Connection from sqlalchemy.engine.reflection import Inspector from sqlalchemy.engine.result import ResultProxy, RowProxy from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.source.sql.sql_common import ( BasicSQLAlchemyConfig, SQLAlchemySource, @@ -18,10 +27,16 @@ class SQLServerConfig(BasicSQLAlchemyConfig): # defaults - host_port: str = "localhost:1433" - scheme: str = "mssql+pytds" - use_odbc: bool = False - uri_args: Dict[str, str] = {} + host_port: str = Field(default="localhost:1433", description="MSSQL host URL.") + scheme: str = Field(default="mssql+pytds", description="", exclude=True) + use_odbc: bool = Field( + default=False, + description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.", + ) + uri_args: Dict[str, str] = Field( + default={}, + desscription="Arguments to URL-encode when connecting. See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.", + ) @pydantic.validator("uri_args") def passwords_match(cls, v, values, **kwargs): @@ -52,7 +67,31 @@ def get_identifier(self, schema: str, table: str) -> str: return regular +@platform_name("Microsoft SQL Server", id="mssql") +@config_class(SQLServerConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +@capability( + SourceCapability.USAGE_STATS, + "Not provided by this module, use `bigquery-usage` for that.", + supported=False, +) +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class SQLServerSource(SQLAlchemySource): + """ + This plugin extracts the following: + + - Metadata for databases, schemas, views and tables + - Column types associated with each table/view + - Table, row, and column statistics via optional SQL profiling + + We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. + """ + def __init__(self, config: SQLServerConfig, ctx: PipelineContext): super().__init__(config, ctx, "mssql") diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py index 50a108d53b5f0..3e035a891b6c4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py @@ -1,8 +1,17 @@ # This import verifies that the dependencies are available. 
import pymysql # noqa: F401 +from pydantic.fields import Field from sqlalchemy.dialects.mysql import base +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.source.sql.sql_common import ( BasicSQLAlchemyConfig, SQLAlchemySource, @@ -31,7 +40,7 @@ class MySQLConfig(BasicSQLAlchemyConfig): # defaults - host_port = "localhost:3306" + host_port = Field(default="localhost:3306", description="MySQL host URL.") scheme = "mysql+pymysql" def get_identifier(self, *, schema: str, table: str) -> str: @@ -42,7 +51,23 @@ def get_identifier(self, *, schema: str, table: str) -> str: return regular +@platform_name("MySQL") +@config_class(MySQLConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class MySQLSource(SQLAlchemySource): + """ + This plugin extracts the following: + + Metadata for databases, schemas, and tables + Column types and schema associated with each table + Table, row, and column statistics via optional SQL profiling + """ + def __init__(self, config, ctx): super().__init__(config, ctx, self.get_platform()) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py index 296a46d4c4be8..611f2f4b39a40 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py @@ -4,10 +4,19 @@ # This import verifies that the dependencies are available. import cx_Oracle # noqa: F401 import pydantic +from pydantic.fields import Field from sqlalchemy import event from sqlalchemy.dialects.oracle.base import OracleDialect from sqlalchemy.engine.reflection import Inspector +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.source.sql.sql_common import ( BasicSQLAlchemyConfig, SQLAlchemySource, @@ -38,9 +47,17 @@ def before_cursor_execute(conn, cursor, statement, parameters, context, executem class OracleConfig(BasicSQLAlchemyConfig): # defaults - scheme = "oracle+cx_oracle" - - service_name: Optional[str] + scheme: str = Field( + default="oracle+cx_oracle", + description="Will be set automatically to default value.", + ) + + service_name: Optional[str] = Field( + default=None, description="Oracle service name. If using, omit `database`." + ) + database: Optional[str] = Field( + default=None, description="If using, omit `service_name`." 
+ ) @pydantic.validator("service_name") def check_service_name(cls, v, values): @@ -58,7 +75,22 @@ def get_sql_alchemy_url(self): return url +@platform_name("Oracle") +@config_class(OracleConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.DOMAINS, "Enabled by default") class OracleSource(SQLAlchemySource): + """ + This plugin extracts the following: + + - Metadata for databases, schemas, and tables + - Column types associated with each table + - Table, row, and column statistics via optional SQL profiling + + Using the Oracle source requires that you've also installed the correct drivers; see the [cx_Oracle docs](https://cx-oracle.readthedocs.io/en/latest/user_guide/installation.html). The easiest one is the [Oracle Instant Client](https://www.oracle.com/database/technologies/instant-client.html). + + """ + def __init__(self, config, ctx): super().__init__(config, ctx, "oracle") diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py index d18a4ce3f7382..c21177ebd9117 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py @@ -8,8 +8,17 @@ # effects of the import. For more details, see here: # https://geoalchemy-2.readthedocs.io/en/latest/core_tutorial.html#reflecting-tables. from geoalchemy2 import Geometry # noqa: F401 +from pydantic.fields import Field from datahub.configuration.common import AllowDenyPattern +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.source.sql.sql_common import ( BasicSQLAlchemyConfig, SQLAlchemySource, @@ -29,8 +38,8 @@ class PostgresConfig(BasicSQLAlchemyConfig): # defaults - scheme = "postgresql+psycopg2" - schema_pattern = AllowDenyPattern(deny=["information_schema"]) + scheme = Field(default="postgresql+psycopg2", description="database scheme") + schema_pattern = Field(default=AllowDenyPattern(deny=["information_schema"])) def get_identifier(self: BasicSQLAlchemyConfig, schema: str, table: str) -> str: regular = f"{schema}.{table}" @@ -41,7 +50,23 @@ def get_identifier(self: BasicSQLAlchemyConfig, schema: str, table: str) -> str: return regular +@platform_name("Postgres") +@config_class(PostgresConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.DOMAINS, "Enabled by default") +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") class PostgresSource(SQLAlchemySource): + """ + This plugin extracts the following: + + - Metadata for databases, schemas, views, and tables + - Column types associated with each table + - Also supports PostGIS extensions + - database_alias (optional) can be used to change the name of database to be ingested + - Table, row, and column statistics via optional SQL profiling + """ + def __init__(self, config, ctx): super().__init__(config, ctx, "postgres") diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py index b525da058cb4d..50268910244f0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py @@ -5,6 +5,8 @@ from itertools import groupby from typing import Any, Dict, 
Iterable, List, Optional, Tuple, Union +from pydantic.fields import Field + # This import verifies that the dependencies are available. from pyhive import hive # noqa: F401 from sqlalchemy import create_engine, text @@ -13,6 +15,14 @@ from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance from datahub.emitter.mcp_builder import PlatformKey, gen_containers from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql.sql_common import ( BasicSQLAlchemyConfig, @@ -35,14 +45,38 @@ class PrestoOnHiveConfig(BasicSQLAlchemyConfig): - views_where_clause_suffix: str = "" - tables_where_clause_suffix: str = "" - schemas_where_clause_suffix: str = "" - host_port: str = "localhost:3306" - scheme: str = "mysql+pymysql" - - + views_where_clause_suffix: str = Field( + default="", + description="Where clause to specify what Presto views should be ingested.", + ) + tables_where_clause_suffix: str = Field( + default="", + description="Where clause to specify what Hive tables should be ingested.", + ) + schemas_where_clause_suffix: str = Field( + default="", + description="Where clause to specify what Hive schemas should be ingested.", + ) + host_port: str = Field( + default="localhost:3306", + description="Host URL and port to connect to. Example: localhost:3306", + ) + scheme: str = Field(default="mysql+pymysql", description="", exclude=True) + + +@platform_name("Presto on Hive") +@config_class(PrestoOnHiveConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") class PrestoOnHiveSource(SQLAlchemySource): + """ + This plugin extracts the following: + + - Metadata for Presto views and Hive tables (external / managed) + - Column types associated with each table / view + - Detailed table / view property info + """ _TABLES_SQL_STATEMENT = """ SELECT source.* FROM diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py index 12c6449ce70f8..b6b1cd4d0d417 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py @@ -10,6 +10,7 @@ import pydantic # noqa: F401 import sqlalchemy import sqlalchemy_redshift # noqa: F401 +from pydantic.fields import Field from sqlalchemy import create_engine, inspect from sqlalchemy.engine import Connection, reflection from sqlalchemy.engine.reflection import Inspector @@ -22,6 +23,14 @@ from datahub.emitter import mce_builder from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql.postgres import PostgresConfig from datahub.ingestion.source.sql.sql_common import ( @@ -103,15 +112,32 @@ class RedshiftConfig( # Because of this behavior, it uses dramatically fewer round trips for # large Redshift warehouses. 
As an example, see this query for the columns: # https://github.com/sqlalchemy-redshift/sqlalchemy-redshift/blob/60b4db04c1d26071c291aeea52f1dcb5dd8b0eb0/sqlalchemy_redshift/dialect.py#L745. - scheme = "redshift+psycopg2" + scheme = Field( + default="redshift+psycopg2", + description="", + exclude=True, + ) - default_schema: str = "public" + default_schema: str = Field( + default="public", + description="The default schema to use if the sql parser fails to parse the schema with `sql_based` lineage collector", + ) - include_table_lineage: Optional[bool] = True - include_copy_lineage: Optional[bool] = True - capture_lineage_query_parser_failures: Optional[bool] = False + include_table_lineage: Optional[bool] = Field( + default=True, description="Whether table lineage should be ingested." + ) + include_copy_lineage: Optional[bool] = Field( + default=True, + description="Whether lineage should be collected from copy commands", + ) + capture_lineage_query_parser_failures: Optional[bool] = Field( + default=False, description="" + ) - table_lineage_mode: Optional[LineageMode] = LineageMode.STL_SCAN_BASED + table_lineage_mode: Optional[LineageMode] = Field( + default=LineageMode.STL_SCAN_BASED, + description="Which table lineage collector mode to use", + ) @pydantic.validator("platform") def platform_is_always_redshift(cls, v): @@ -364,7 +390,98 @@ class RedshiftReport(SQLSourceReport): upstream_lineage: Dict[str, List[str]] = field(default_factory=dict) +@platform_name("Redshift") +@config_class(RedshiftConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration") +@capability( + SourceCapability.USAGE_STATS, + "Not provided by this module, use `bigquery-usage` for that.", + supported=False, +) +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class RedshiftSource(SQLAlchemySource): + """ + This plugin extracts the following: + + - Metadata for databases, schemas, views and tables + - Column types associated with each table + - Also supports PostGIS extensions + - Table, row, and column statistics via optional SQL profiling + - Table lineage + + :::tip + + You can also get fine-grained usage statistics for Redshift using the `redshift-usage` source described below. + + ::: + + ### Prerequisites + + This source needs to access system tables that require extra permissions. + To grant these permissions, please alter your datahub Redshift user the following way: + ```sql + ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED; + GRANT SELECT ON pg_catalog.svv_table_info to datahub_user; + GRANT SELECT ON pg_catalog.svl_user_info to datahub_user; + ``` + :::note + + Giving a user unrestricted access to system tables gives the user visibility to data generated by other users. For example, STL_QUERY and STL_QUERYTEXT contain the full text of INSERT, UPDATE, and DELETE statements. + + ::: + + ### Lineage + + There are multiple lineage collector implementations as Redshift does not support table lineage out of the box. 
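Before the individual collectors are described, here is a hedged sketch of how the choice shows up in practice. The dict mirrors the `source.config` block of a recipe; the connection values are placeholders, the commented `table_lineage_mode` literal is assumed to mirror the collector names below, and running the snippet requires the Redshift plugin (extra name assumed to be `acryl-datahub[redshift]`) to be installed.

```python
# Requires the redshift plugin to be installed (assumption: acryl-datahub[redshift]).
from datahub.ingestion.source.sql.redshift import RedshiftConfig

# Hypothetical recipe fragment; connection values are placeholders, not real credentials.
config_dict = {
    "host_port": "my-cluster.example.com:5439",
    "database": "dev",
    "username": "datahub_user",
    "password": "...",
    "include_table_lineage": True,
    # "table_lineage_mode": "sql_based",  # assumed literal; see the collectors below
}

config = RedshiftConfig.parse_obj(config_dict)
print(config.table_lineage_mode)  # defaults to the stl_scan based collector
```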
+ + #### stl_scan_based + The stl_scan based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) and [stl_scan](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_SCAN.html) system tables to + discover lineage between tables. + Pros: + - Fast + - Reliable + + Cons: + - Does not work with Spectrum/external tables because those scans do not show up in stl_scan table. + - If a table is depending on a view then the view won't be listed as dependency. Instead the table will be connected with the view's dependencies. + + #### sql_based + The sql_based based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) to discover all the insert queries + and uses sql parsing to discover the dependecies. + + Pros: + - Works with Spectrum tables + - Views are connected properly if a table depends on it + + Cons: + - Slow. + - Less reliable as the query parser can fail on certain queries + + #### mixed + Using both collector above and first applying the sql based and then the stl_scan based one. + + Pros: + - Works with Spectrum tables + - Views are connected properly if a table depends on it + - A bit more reliable than the sql_based one only + + Cons: + - Slow + - May be incorrect at times as the query parser can fail on certain queries + + :::note + + The redshift stl redshift tables which are used for getting data lineage only retain approximately two to five days of log history. This means you cannot extract lineage from queries issued outside that window. + + ::: + + """ + eskind_to_platform = {1: "glue", 2: "hive", 3: "postgres", 4: "redshift"} def __init__(self, config: RedshiftConfig, ctx: PipelineContext): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source/sql/snowflake.py index b6dc6a3ad1350..7c549408380cb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/snowflake.py @@ -16,6 +16,12 @@ import datahub.emitter.mce_builder as builder from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import make_s3_urn from datahub.ingestion.source.sql.sql_common import ( @@ -46,6 +52,9 @@ snowdialect.ischema_names["GEOGRAPHY"] = sqltypes.NullType +@platform_name("Snowflake") +@config_class(SnowflakeConfig) +@support_status(SupportStatus.CERTIFIED) class SnowflakeSource(SQLAlchemySource): def __init__(self, config: SnowflakeConfig, ctx: PipelineContext): super().__init__(config, ctx, "snowflake") diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 58c744e60ff2c..b156b6cb4d21f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -22,6 +22,7 @@ from urllib.parse import quote_plus import pydantic +from pydantic.fields import Field from sqlalchemy import create_engine, dialects, inspect from sqlalchemy.engine.reflection import Inspector from sqlalchemy.exc import ProgrammingError @@ -221,7 +222,10 @@ class SQLAlchemyStatefulIngestionConfig(StatefulIngestionConfig): in the 
SQLAlchemyConfig.
    """

-    remove_stale_metadata: bool = True
+    remove_stale_metadata: bool = Field(
+        default=True,
+        description="Soft-deletes the tables and views that were found in the last successful run but are missing in the current run when stateful_ingestion is enabled.",
+    )


 class SQLAlchemyConfig(StatefulIngestionConfigBase):
@@ -230,14 +234,33 @@ class SQLAlchemyConfig(StatefulIngestionConfigBase):
     # having another option to allow/deny on schema level is an optimization for the case when there is a large number
     # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
     # them out afterwards via the table_pattern.
-    schema_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
-    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
-    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
-    profile_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
-    domain: Dict[str, AllowDenyPattern] = dict()
+    schema_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="regex patterns for schemas to filter in ingestion.",
+    )
+    table_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="regex patterns for tables to filter in ingestion.",
+    )
+    view_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="regex patterns for views to filter in ingestion.",
+    )
+    profile_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="regex patterns for profiles to filter in ingestion.",
+    )
+    domain: Dict[str, AllowDenyPattern] = Field(
+        default=dict(),
+        description='regex patterns for tables/schemas used to decide the domain key (a domain key can be any string, like "sales"). Multiple domain keys can be specified.',
+    )

-    include_views: Optional[bool] = True
-    include_tables: Optional[bool] = True
+    include_views: Optional[bool] = Field(
+        default=True, description="Whether views should be ingested."
+    )
+    include_tables: Optional[bool] = Field(
+        default=True, description="Whether tables should be ingested."
+    )

     from datahub.ingestion.source.ge_data_profiler import GEProfilingConfig
@@ -260,13 +283,18 @@ def get_sql_alchemy_url(self):

 class BasicSQLAlchemyConfig(SQLAlchemyConfig):
-    username: Optional[str] = None
-    password: Optional[pydantic.SecretStr] = None
-    host_port: Optional[str] = None
-    database: Optional[str] = None
-    database_alias: Optional[str] = None
-    scheme: Optional[str] = None
-    sqlalchemy_uri: Optional[str] = None
+    username: Optional[str] = Field(default=None, description="username")
+    password: Optional[pydantic.SecretStr] = Field(default=None, description="password")
+    host_port: str = Field(description="host URL")
+    database: Optional[str] = Field(default=None, description="database (catalog)")
+    database_alias: Optional[str] = Field(
+        default=None, description="Alias to apply to database when ingesting."
+    )
+    scheme: str = Field(description="scheme")
+    sqlalchemy_uri: Optional[str] = Field(
+        default=None,
+        description="URI of database to connect to. See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls. 
Takes precedence over other connection parameters.", + ) def get_sql_alchemy_url(self, uri_opts: Optional[Dict[str, Any]] = None) -> str: if not ((self.host_port and self.scheme) or self.sqlalchemy_uri): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py index 47fb103b85248..01fd5d1ebbb03 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py @@ -1,16 +1,48 @@ +from pydantic.fields import Field + from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.source.sql.sql_common import SQLAlchemyConfig, SQLAlchemySource class SQLAlchemyGenericConfig(SQLAlchemyConfig): - platform: str - connect_uri: str + + platform: str = Field( + description="Name of platform being ingested, used in constructing URNs." + ) + connect_uri: str = Field( + description="URI of database to connect to. See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls" + ) def get_sql_alchemy_url(self): return self.connect_uri +@platform_name("Other SQLAlchemy databases", id="sqlalchemy") +@config_class(SQLAlchemyGenericConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") class SQLAlchemyGenericSource(SQLAlchemySource): + """ + The `sqlalchemy` source is useful if we don't have a pre-built source for your chosen + database system, but there is an [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/14/dialects/) + defined elsewhere. In order to use this, you must `pip install` the required dialect packages yourself. + + This plugin extracts the following: + + - Metadata for databases, schemas, views, and tables + - Column types associated with each table + - Table, row, and column statistics via optional SQL profiling. + """ + def __init__(self, config: SQLAlchemyGenericConfig, ctx: PipelineContext): super().__init__(config, ctx, config.platform) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py index 730734f5adc4a..1afee15b51a05 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py @@ -7,6 +7,7 @@ # This import verifies that the dependencies are available. 
import trino.sqlalchemy  # noqa: F401
+from pydantic.fields import Field
 from sqlalchemy import exc, sql
 from sqlalchemy.engine import reflection
 from sqlalchemy.engine.reflection import Inspector
@@ -16,6 +17,14 @@
 from trino.sqlalchemy import datatype, error
 from trino.sqlalchemy.dialect import TrinoDialect

+from datahub.ingestion.api.decorators import (
+    SourceCapability,
+    SupportStatus,
+    capability,
+    config_class,
+    platform_name,
+    support_status,
+)
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.sql.sql_common import (
     BasicSQLAlchemyConfig,
@@ -115,7 +124,7 @@ def _get_columns(self, connection, table_name, schema: str = None, **kw):  # typ
 class TrinoConfig(BasicSQLAlchemyConfig):
     # defaults
-    scheme = "trino"
+    scheme = Field(default="trino", description="", exclude=True)

     def get_identifier(self: BasicSQLAlchemyConfig, schema: str, table: str) -> str:
         regular = f"{schema}.{table}"
@@ -131,7 +140,22 @@ def get_identifier(self: BasicSQLAlchemyConfig, schema: str, table: str) -> str:
     )

+@platform_name("Trino")
+@config_class(TrinoConfig)
+@support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
+@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 class TrinoSource(SQLAlchemySource):
+    """
+
+    This plugin extracts the following:
+
+    - Metadata for databases, schemas, and tables
+    - Column types and schema associated with each table
+    - Table, row, and column statistics via optional SQL profiling
+
+    """
+
     config: TrinoConfig

     def __init__(self, config, ctx):
diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
index 9e2d83f4076fe..066c1676eff91 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
@@ -6,6 +6,7 @@
 import psutil
 import pydantic
+from pydantic.fields import Field

 from datahub.configuration.common import (
     ConfigModel,
@@ -35,24 +36,50 @@
 logger: logging.Logger = logging.getLogger(__name__)

+class DynamicTypedStateProviderConfig(DynamicTypedConfig):
+    # Respecifying the base class just to override field-level docs.
+
+    type: str = Field(
+        description="The type of the state provider to use. For DataHub use `datahub`",
+    )
+    # This config type is declared Optional[Any] here. The eventual parser for the
+    # specified type is responsible for further validation.
+    config: Optional[Any] = Field(
+        default=None,
+        description="The configuration required for initializing the state provider. Default: The datahub_api config if set at pipeline level. Otherwise, the default DatahubClientConfig. See the defaults (https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/graph/client.py#L19).",
+    )
+
+
 class StatefulIngestionConfig(ConfigModel):
     """
     Basic Stateful Ingestion Specific Configuration for any source.
     """

-    enabled: bool = False
+    enabled: bool = Field(
+        default=False,
+        description="Whether or not to enable stateful ingestion for the source.",
+    )
     # fmt: off
-    max_checkpoint_state_size: pydantic.PositiveInt = 2**24  # 16MB
+    # 16MB
+    max_checkpoint_state_size: pydantic.PositiveInt = Field(default=2**24, description="The maximum size of the checkpoint state in bytes. 
Default is 16MB") # 16MB # fmt: on - state_provider: Optional[DynamicTypedConfig] = None - ignore_old_state: bool = False - ignore_new_state: bool = False + state_provider: Optional[DynamicTypedStateProviderConfig] = Field( + default=None, description="The ingestion state provider configuration." + ) + ignore_old_state: bool = Field( + default=False, + description="If set to True, ignores the previous checkpoint state.", + ) + ignore_new_state: bool = Field( + default=False, + description="If set to True, ignores the current checkpoint state.", + ) @pydantic.root_validator() def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: if values.get("enabled"): if values.get("state_provider") is None: - values["state_provider"] = DynamicTypedConfig( + values["state_provider"] = DynamicTypedStateProviderConfig( type="datahub", config=None ) return values diff --git a/metadata-ingestion/src/datahub/ingestion/source/superset.py b/metadata-ingestion/src/datahub/ingestion/source/superset.py index ce49f0c39e085..8e0dd6f90a086 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/superset.py +++ b/metadata-ingestion/src/datahub/ingestion/source/superset.py @@ -5,10 +5,17 @@ import dateutil.parser as dp import requests from pydantic.class_validators import validator +from pydantic.fields import Field from datahub.configuration.common import ConfigModel from datahub.emitter.mce_builder import DEFAULT_ENV from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql import sql_common @@ -51,13 +58,19 @@ class SupersetConfig(ConfigModel): # See the Superset /security/login endpoint for details # https://superset.apache.org/docs/rest-api - connect_uri: str = "localhost:8088" - username: Optional[str] = None - password: Optional[str] = None - provider: str = "db" - options: Dict = {} - env: str = DEFAULT_ENV - database_alias: Dict[str, str] = {} + connect_uri: str = Field(default="localhost:8088", description="Superset host URL.") + username: Optional[str] = Field(default=None, description="Superset username.") + password: Optional[str] = Field(default=None, description="Superset password.") + provider: str = Field(default="db", description="Superset provider.") + options: Dict = Field(default={}, description="") + env: str = Field( + default=DEFAULT_ENV, + description="Environment to use in namespace when constructing URNs", + ) + database_alias: Dict[str, str] = Field( + default={}, + description="Can be used to change mapping for database names in superset to what you have in datahub", + ) @validator("connect_uri") def remove_trailing_slash(cls, v): @@ -87,7 +100,17 @@ def get_filter_name(filter_obj): return f"{clause} {column} {operator} {comparator}" +@platform_name("Superset") +@config_class(SupersetConfig) +@support_status(SupportStatus.CERTIFIED) class SupersetSource(Source): + """ + This plugin extracts the following: + - Charts, dashboards, and associated metadata + + See documentation for superset's /security/login at https://superset.apache.org/docs/rest-api for more details on superset's login api. 
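A small, hedged example of how this config behaves when parsed from a recipe. It assumes the Superset plugin (extra name assumed to be `acryl-datahub[superset]`) is installed and uses placeholder credentials; the `connect_uri` validator shown below should, as its name suggests, normalize away a trailing slash, and unset fields fall back to the documented defaults.

```python
# Assumes the superset plugin is installed; values are placeholders.
from datahub.ingestion.source.superset import SupersetConfig

config = SupersetConfig.parse_obj(
    {
        "connect_uri": "http://localhost:8088/",  # trailing slash expected to be stripped
        "username": "admin",
        "password": "admin",
    }
)
print(config.connect_uri)  # expected: http://localhost:8088
print(config.provider)     # default: db
```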
+ """ + config: SupersetConfig report: SourceReport platform = "superset" diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 60e21693a8a57..0c3b300d1bbf7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -6,6 +6,7 @@ import dateutil.parser as dp from pydantic import validator +from pydantic.fields import Field from tableauserverclient import ( PersonalAccessTokenAuth, Server, @@ -22,6 +23,14 @@ gen_containers, ) from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.tableau_common import ( @@ -81,21 +90,56 @@ class TableauConfig(ConfigModel): - connect_uri: str - username: Optional[str] = None - password: Optional[str] = None - token_name: Optional[str] = None - token_value: Optional[str] = None - - site: str = "" - projects: Optional[List] = ["default"] - default_schema_map: dict = {} - ingest_tags: Optional[bool] = False - ingest_owner: Optional[bool] = False - ingest_tables_external: bool = False - - workbooks_page_size: int = 10 - env: str = builder.DEFAULT_ENV + connect_uri: str = Field(description="Tableau host URL.") + username: Optional[str] = Field( + default=None, + description="Tableau username, must be set if authenticating using username/password.", + ) + password: Optional[str] = Field( + default=None, + description="Tableau password, must be set if authenticating using username/password.", + ) + token_name: Optional[str] = Field( + default=None, + description="Tableau token name, must be set if authenticating using a personal access token.", + ) + token_value: Optional[str] = Field( + default=None, + description="Tableau token value, must be set if authenticating using a personal access token.", + ) + + site: str = Field( + default="", + description="Tableau Site. Always required for Tableau Online. Use emptystring " + " to connect with Default site on Tableau Server.", + ) + projects: Optional[List[str]] = Field( + default=["default"], description="List of projects" + ) + default_schema_map: dict = Field( + default={}, description="Default schema to use when schema is not found." + ) + ingest_tags: Optional[bool] = Field( + default=False, + description="Ingest Tags from source. This will override Tags entered from UI", + ) + ingest_owner: Optional[bool] = Field( + default=False, + description="Ingest Owner from source. 
+    ingest_tables_external: bool = Field(
+        default=False,
+        description="Ingest details for tables external to (not embedded in) Tableau as entities.",
+    )
+
+    workbooks_page_size: int = Field(
+        default=10,
+        description="Number of workbooks to query at a time using the Tableau API.",
+    )
+    env: str = Field(
+        default=builder.DEFAULT_ENV,
+        description="Environment to use in namespace when constructing URNs.",
+    )

     @validator("connect_uri")
     def remove_trailing_slash(cls, v):
@@ -106,6 +150,29 @@ class WorkbookKey(PlatformKey):
     workbook_id: str


+@platform_name("Tableau")
+@config_class(TableauConfig)
+@support_status(SupportStatus.INCUBATING)
+@capability(
+    SourceCapability.PLATFORM_INSTANCE,
+    "Not applicable to source",
+    supported=False,
+)
+@capability(SourceCapability.DOMAINS, "Requires transformer", supported=False)
+@capability(SourceCapability.DATA_PROFILING, "", supported=False)
+@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
+@capability(
+    SourceCapability.USAGE_STATS,
+    "",
+    supported=False,
+)
+@capability(SourceCapability.DELETION_DETECTION, "", supported=False)
+@capability(SourceCapability.OWNERSHIP, "Requires recipe configuration")
+@capability(SourceCapability.TAGS, "Requires recipe configuration")
+@capability(
+    SourceCapability.PARTITION_SUPPORT, "Not applicable to source", supported=False
+)
+@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 class TableauSource(Source):
     config: TableauConfig
     report: SourceReport
@@ -116,7 +183,11 @@ class TableauSource(Source):
     def __hash__(self):
         return id(self)

-    def __init__(self, ctx: PipelineContext, config: TableauConfig):
+    def __init__(
+        self,
+        config: TableauConfig,
+        ctx: PipelineContext,
+    ):
         super().__init__(ctx)

         self.config = config
@@ -1069,7 +1140,7 @@ def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
         config = TableauConfig.parse_obj(config_dict)
-        return cls(ctx, config)
+        return cls(config, ctx)

     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         if self.server is None or not self.server.is_signed_in():
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/bigquery_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/bigquery_usage.py
index 6e024cdbf1ea5..2427c172f5980 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/bigquery_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/bigquery_usage.py
@@ -19,6 +19,12 @@
 from datahub.configuration.time_window_config import get_time_bucket
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
 from datahub.ingestion.api.source import Source
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.usage.usage_common import GenericAggregatedDataset
@@ -608,7 +614,21 @@ def cleanup(config: BigQueryUsageConfig) -> None:
         os.unlink(config._credentials_path)


+@platform_name("BigQuery")
+@support_status(SupportStatus.CERTIFIED)
+@config_class(BigQueryUsageConfig)
 class BigQueryUsageSource(Source):
+    """
+    This plugin extracts the following:
+    * Statistics on queries issued and tables and columns accessed (excludes views)
+    * Aggregation of these statistics into buckets, by day or hour granularity
+
+    :::note
+    1. This source only does usage statistics. To get the tables, views, and schemas in your BigQuery project, use the `bigquery` plugin.
+    2. Depending on the compliance policies set up for the BigQuery instance, the logging.read permission alone is sometimes not sufficient. In that case, use either the admin or the private log viewer permission.
+    :::
+    """
+
     def __init__(self, config: BigQueryUsageConfig, ctx: PipelineContext):
         super().__init__(ctx)
         self.config: BigQueryUsageConfig = config
@@ -620,6 +640,10 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "BigQueryUsageSource
         config = BigQueryUsageConfig.parse_obj(config_dict)
         return cls(config, ctx)

+    # @staticmethod
+    # def get_config_class() -> Type[ConfigModel]:
+    #     return BigQueryUsageConfig
+
     def add_config_to_report(self):
         self.report.start_time = self.config.start_time
         self.report.end_time = self.config.end_time
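Aside, illustrative only: the decorators added throughout this patch are what attach a source's documentation metadata (platform name, config class, support status, capabilities) to the class itself. A hedged sketch of how a hypothetical custom source could follow the same pattern; the class and platform names below are made up, while the imports mirror the ones added above:

```python
from typing import Iterable

from pydantic.fields import Field

from datahub.configuration.common import ConfigModel
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
    SourceCapability,
    SupportStatus,
    capability,
    config_class,
    platform_name,
    support_status,
)
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit


class MyPlatformConfig(ConfigModel):
    # Field descriptions feed the generated config reference, as in the sources above.
    connect_uri: str = Field(description="MyPlatform host URL.")


@platform_name("MyPlatform")
@config_class(MyPlatformConfig)
@support_status(SupportStatus.INCUBATING)
@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
class MyPlatformSource(Source):
    """Overview prose for the source, in the same style as the docstrings added above."""

    def __init__(self, config: MyPlatformConfig, ctx: PipelineContext):
        super().__init__(ctx)
        self.config = config
        self.report = SourceReport()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "MyPlatformSource":
        return cls(MyPlatformConfig.parse_obj(config_dict), ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        return []

    def get_report(self) -> SourceReport:
        return self.report
```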
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py
index 8763a4c164bc5..d16b782611c11 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py
@@ -5,12 +5,22 @@
 from typing import Dict, Iterable, List

 from dateutil import parser
+from pydantic.fields import Field
 from pydantic.main import BaseModel
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Engine

 import datahub.emitter.mce_builder as builder
+from datahub.configuration.source_common import EnvBasedSourceConfigBase
 from datahub.configuration.time_window_config import get_time_bucket
+from datahub.ingestion.api.decorators import (
+    SourceCapability,
+    SupportStatus,
+    capability,
+    config_class,
+    platform_name,
+    support_status,
+)
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.clickhouse import ClickHouseConfig
@@ -58,18 +68,42 @@ class ClickHouseJoinedAccessEvent(BaseModel):
     endtime: datetime


-class ClickHouseUsageConfig(ClickHouseConfig, BaseUsageConfig):
-    env: str = builder.DEFAULT_ENV
-    email_domain: str
-    options: dict = {}
-    query_log_table: str = "system.query_log"
+class ClickHouseUsageConfig(
+    ClickHouseConfig, BaseUsageConfig, EnvBasedSourceConfigBase
+):
+    email_domain: str = Field(description="")
+    options: dict = Field(default={}, description="")
+    query_log_table: str = Field(default="system.query_log", exclude=True)

     def get_sql_alchemy_url(self):
         return super().get_sql_alchemy_url()


+@platform_name("ClickHouse")
+@config_class(ClickHouseUsageConfig)
+@support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @dataclasses.dataclass
 class ClickHouseUsageSource(Source):
+    """
+    This plugin has the following functionality:
+    1. For a specific dataset, it ingests the following statistics:
+       1. top n queries.
+       2. top users.
+       3. usage of each column in the dataset.
+    2. Aggregation of these statistics into buckets, by day or hour granularity.
+
+    Usage information is computed by querying the system.query_log table. If you have a cluster, or need to apply additional transformations/filters, you can create a view and point the `query_log_table` setting at it.
+
+    :::note
+
+    This source only does usage statistics. To get the tables, views, and schemas in your ClickHouse warehouse, ingest using the `clickhouse` source described above.
+
+    :::
+
+    """
+
     config: ClickHouseUsageConfig
     report: SourceReport = dataclasses.field(default_factory=SourceReport)
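Aside, illustrative only: in a recipe, the options documented in the ClickHouse usage docstring above would appear roughly as follows. The connection keys are inherited from ClickHouseConfig/BaseUsageConfig, and every value below is a placeholder:

```python
# Hypothetical recipe fragment for the clickhouse-usage source, expressed as a
# Python dict; it could be handed to DataHub's Pipeline API or written as YAML instead.
clickhouse_usage_recipe = {
    "source": {
        "type": "clickhouse-usage",
        "config": {
            "host_port": "localhost:9000",   # placeholder connection details
            "username": "datahub",
            "password": "***",
            "email_domain": "example.com",   # appended to usernames when building user URNs
            # Point this at a view if the logs need extra filtering or a cluster-wide
            # union, as suggested in the docstring above.
            "query_log_table": "system.query_log",
            "top_n_queries": 10,             # from BaseUsageConfig
        },
    },
    "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
}
```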
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py
index b08a6cec3f0db..acdeb9f8787b8 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py
@@ -4,16 +4,25 @@
 from datetime import datetime
 from typing import Dict, Iterable, List, Optional, Set

-from pydantic import Field
+from pydantic.fields import Field
 from pydantic.main import BaseModel
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Engine
 from sqlalchemy.engine.result import ResultProxy, RowProxy

 import datahub.emitter.mce_builder as builder
+from datahub.configuration.source_common import EnvBasedSourceConfigBase
 from datahub.configuration.time_window_config import get_time_bucket
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SourceCapability,
+    SupportStatus,
+    capability,
+    config_class,
+    platform_name,
+    support_status,
+)
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.redshift import RedshiftConfig
@@ -133,10 +142,9 @@ class RedshiftAccessEvent(BaseModel):
     endtime: datetime


-class RedshiftUsageConfig(RedshiftConfig, BaseUsageConfig):
-    env: str = builder.DEFAULT_ENV
-    email_domain: str
-    options: Dict = {}
+class RedshiftUsageConfig(RedshiftConfig, BaseUsageConfig, EnvBasedSourceConfigBase):
+    email_domain: str = Field(description="")
+    options: Dict = Field(default={}, description="")

     def get_sql_alchemy_url(self):
         return super().get_sql_alchemy_url()
@@ -152,7 +160,39 @@ def report_dropped(self, key: str) -> None:
         self.filtered.add(key)


+@platform_name("Redshift")
+@config_class(RedshiftUsageConfig)
+@support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 class RedshiftUsageSource(Source):
+    """
+    This plugin extracts usage statistics for datasets in Amazon Redshift.
+
+    Note: Usage information is computed by querying the following system tables -
+    1. stl_scan
+    2. svv_table_info
+    3. stl_query
+    4. svl_user_info
+
+    To grant this plugin access to all system tables, please alter your DataHub Redshift user as follows:
+    ```sql
+    ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED;
+    ```
+    This plugin has the following functionality:
+    1. For a specific dataset, it ingests the following statistics:
+       1. top n queries.
+       2. top users.
+       3. usage of each column in the dataset.
+    2. Aggregation of these statistics into buckets, by day or hour granularity.
+
+    :::note
+
+    This source only does usage statistics. To get the tables, views, and schemas in your Redshift warehouse, ingest using the `redshift` source described above.
+
+    :::
+
+    """
+
     def __init__(self, config: RedshiftUsageConfig, ctx: PipelineContext):
         super().__init__(ctx)
         self.config: RedshiftUsageConfig = config
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/snowflake_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/snowflake_usage.py
index 7484ec6b8df9c..410879c523d9f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/snowflake_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/snowflake_usage.py
@@ -14,6 +14,12 @@
 from datahub.configuration.time_window_config import get_time_bucket
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.state.checkpoint import Checkpoint
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -120,6 +126,9 @@ class SnowflakeJoinedAccessEvent(PermissiveModel):
     role_name: str


+@platform_name("Snowflake")
+@support_status(SupportStatus.CERTIFIED)
+@config_class(SnowflakeUsageConfig)
 class SnowflakeUsageSource(StatefulIngestionSourceBase):
     def __init__(self, config: SnowflakeUsageConfig, ctx: PipelineContext):
         super(SnowflakeUsageSource, self).__init__(config, ctx)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
index d87a1744c21ee..a2d7bcc320b67 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
@@ -7,13 +7,21 @@
 from typing import Dict, Iterable, List

 from dateutil import parser
-from pydantic import Field
+from pydantic.fields import Field
 from pydantic.main import BaseModel
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Engine

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.time_window_config import get_time_bucket
+from datahub.ingestion.api.decorators import (
+    SourceCapability,
+    SupportStatus,
+    capability,
+    config_class,
+    platform_name,
+    support_status,
+)
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.trino import TrinoConfig
@@ -75,19 +83,48 @@ class TrinoJoinedAccessEvent(BaseModel):
     endtime: datetime = Field(None, alias="end_time")


-class TrinoUsageConfig(TrinoConfig, BaseUsageConfig):
-    env: str = builder.DEFAULT_ENV
-    email_domain: str
-    audit_catalog: str
-    audit_schema: str
-    options: dict = {}
+class EnvBasedSourceBaseConfig:
+    pass
+
+
+class TrinoUsageConfig(TrinoConfig, BaseUsageConfig, EnvBasedSourceBaseConfig):
+    email_domain: str = Field(
+        description="The email domain which will be appended to the usernames."
+    )
+    audit_catalog: str = Field(
+        description="The catalog name where the audit table can be found."
+    )
+    audit_schema: str = Field(
+        description="The schema name where the audit table can be found."
+    )
+    options: dict = Field(default={}, description="")
+    database: str = Field(description="The name of the catalog to get usage from.")

     def get_sql_alchemy_url(self):
         return super().get_sql_alchemy_url()


+@platform_name("Trino")
+@config_class(TrinoUsageConfig)
+@support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
+@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @dataclasses.dataclass
 class TrinoUsageSource(Source):
+    """
+    If you are using Starburst Trino, you can collect usage stats as follows.
+
+    #### Prerequisites
+    1. You need to set up the Event Logger, which saves audit logs into a Postgres db, and set this db up as a catalog in Trino.
+       Here you can find more info about how to set it up:
+       https://docs.starburst.io/354-e/security/event-logger.html#security-event-logger--page-root
+       https://docs.starburst.io/354-e/security/event-logger.html#analyzing-the-event-log
+
+    2. Install the starburst-trino-usage plugin:
+       Run pip install 'acryl-datahub[starburst-trino-usage]'.
+
+    """
+
     config: TrinoUsageConfig
     report: SourceReport = dataclasses.field(default_factory=SourceReport)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py
index 25de9efcaa46f..94e30033ddf38 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py
@@ -5,6 +5,7 @@
 from typing import Callable, Counter, Generic, List, Optional, TypeVar

 import pydantic
+from pydantic.fields import Field

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
@@ -130,10 +131,19 @@ def make_usage_workunit(


 class BaseUsageConfig(BaseTimeWindowConfig):
-    top_n_queries: pydantic.PositiveInt = 10
-    user_email_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
-    include_operational_stats: bool = True
-    format_sql_queries: bool = False
+    top_n_queries: pydantic.PositiveInt = Field(
+        default=10, description="Number of top queries to save to each table."
+    )
+    user_email_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for user emails to filter in usage.",
+    )
+    include_operational_stats: bool = Field(
+        default=True, description="Whether to display operational stats."
+    )
+    format_sql_queries: bool = Field(
+        default=False, description="Whether to format SQL queries."
+    )

     @pydantic.validator("top_n_queries")
     def ensure_top_n_queries_is_not_too_big(cls, v: int) -> int:
diff --git a/metadata-ingestion/tests/unit/test_kafka_source.py b/metadata-ingestion/tests/unit/test_kafka_source.py
index 895ebc58dff54..8de1de1380e6d 100644
--- a/metadata-ingestion/tests/unit/test_kafka_source.py
+++ b/metadata-ingestion/tests/unit/test_kafka_source.py
@@ -15,7 +15,8 @@
     make_dataset_urn_with_platform_instance,
 )
 from datahub.ingestion.api.common import PipelineContext
-from datahub.ingestion.source.kafka import KafkaSource
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.kafka import KafkaSource, KafkaSourceConfig
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
@@ -29,8 +30,9 @@ class KafkaSourceTest(unittest.TestCase):
     @patch("datahub.ingestion.source.kafka.confluent_kafka.Consumer", autospec=True)
     def test_kafka_source_configuration(self, mock_kafka):
         ctx = PipelineContext(run_id="test")
-        kafka_source = KafkaSource.create(
-            {"connection": {"bootstrap": "foobar:9092"}}, ctx
+        kafka_source = KafkaSource(
+            KafkaSourceConfig.parse_obj({"connection": {"bootstrap": "foobar:9092"}}),
+            ctx,
         )
         kafka_source.close()
         assert mock_kafka.call_count == 1
@@ -43,8 +45,11 @@ def test_kafka_source_workunits_wildcard_topic(self, mock_kafka):
         mock_kafka_instance.list_topics.return_value = mock_cluster_metadata

         ctx = PipelineContext(run_id="test")
-        kafka_source = KafkaSource.create(
-            {"connection": {"bootstrap": "localhost:9092"}}, ctx
+        kafka_source = KafkaSource(
+            KafkaSourceConfig.parse_obj(
+                {"connection": {"bootstrap": "localhost:9092"}}
+            ),
+            ctx,
         )
         workunits = list(kafka_source.get_workunits())

@@ -111,6 +116,7 @@ def test_kafka_source_workunits_with_platform_instance(self, mock_kafka):

         # We should only have 1 topic + sub-type wu.
         assert len(workunits) == 2
+        assert isinstance(workunits[0], MetadataWorkUnit)
         assert isinstance(workunits[0].metadata, MetadataChangeEvent)
         proposed_snap = workunits[0].metadata.proposedSnapshot
         assert proposed_snap.urn == make_dataset_urn_with_platform_instance(
@@ -303,6 +309,7 @@ def mock_get_latest_version(subject_name: str) -> Optional[RegisteredSchema]:
         assert len(workunits) == 8
         i: int = -1
         for wu in workunits:
+            assert isinstance(wu, MetadataWorkUnit)
             if not isinstance(wu.metadata, MetadataChangeEvent):
                 continue
             mce: MetadataChangeEvent = wu.metadata
diff --git a/metadata-ingestion/transformers.md b/metadata-ingestion/transformers.md
index 2f3e4c2b01508..059d1995e904f 100644
--- a/metadata-ingestion/transformers.md
+++ b/metadata-ingestion/transformers.md
@@ -92,7 +92,7 @@ Finally, you can install and use your custom transformer as [shown here](#instal

 ### Adding a set of glossary terms

-We can use a similar convention to associate [Glossary Terms](https://datahubproject.io/docs/metadata-ingestion/source_docs/business_glossary) to datasets. We can use the `simple_add_dataset_terms` module that’s included in the ingestion framework.
+We can use a similar convention to associate [Glossary Terms](../docs/generated/ingestion/sources/business-glossary.md) with datasets. We can use the `simple_add_dataset_terms` module that’s included in the ingestion framework.

 The config, which we’d append to our ingestion recipe YAML, would look like this:
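Aside, illustrative only, and not the YAML snippet the document goes on to show: the same `simple_add_dataset_terms` configuration can also be expressed as a Python dict and run through the ingestion Pipeline API. The file paths and the glossary term URN below are placeholders:

```python
from datahub.ingestion.run.pipeline import Pipeline

recipe = {
    "source": {
        "type": "file",
        "config": {"filename": "./mces.json"},  # placeholder input file of MCEs
    },
    "transformers": [
        {
            "type": "simple_add_dataset_terms",
            "config": {
                # Placeholder glossary term URN to attach to every dataset.
                "term_urns": ["urn:li:glossaryTerm:Classification.Sensitive"],
            },
        }
    ],
    "sink": {
        "type": "file",
        "config": {"filename": "./mces_with_terms.json"},  # placeholder output path
    },
}

if __name__ == "__main__":
    pipeline = Pipeline.create(recipe)
    pipeline.run()
    pipeline.raise_from_status()
```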