Skip to content

Commit

Permalink
feat(ingest): use entry point for registering transformers
Browse files Browse the repository at this point in the history
  • Loading branch information
Masterchen09 committed Dec 4, 2022
1 parent 5e971ce commit b41430e
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 71 deletions.
5 changes: 4 additions & 1 deletion metadata-ingestion/docs/transformer/dataset_transformer.md
Original file line number Diff line number Diff line change
Expand Up @@ -1145,7 +1145,7 @@ def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventCla
Now that we've defined the transformer, we need to make it visible to DataHub. The easiest way to do this is to just place it in the same directory as your recipe, in which case the module name is the same as the file – in this case, `custom_transform_example`.

<details>
<summary>Advanced: installing as a package</summary>
<summary>Advanced: Installing as a package and enabling discoverability</summary>
Alternatively, create a `setup.py` in the same directory as our transform script to make it visible globally. After installing this package (e.g. with `python setup.py install` or `pip install -e .`), our module will be installed and importable as `custom_transform_example`.

```python
Expand All @@ -1160,6 +1160,9 @@ setup(
)
```

Additionally, declare the transformer under the `datahub.ingestion.transformer.plugins` group of the `entry_points` variable in the [setup script](../../setup.py). This enables the transformer to be
listed when running `datahub check plugins`, and sets up the transformer's shortened alias for use in recipes.

</details>

### Running the transform
Expand Down
21 changes: 21 additions & 0 deletions metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,27 @@ def get_long_description():
"salesforce = datahub.ingestion.source.salesforce:SalesforceSource",
"unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource",
],
"datahub.ingestion.transformer.plugins": [
"simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership",
"mark_dataset_status = datahub.ingestion.transformer.mark_dataset_status:MarkDatasetStatus",
"set_dataset_browse_path = datahub.ingestion.transformer.add_dataset_browse_path:AddDatasetBrowsePathTransformer",
"add_dataset_ownership = datahub.ingestion.transformer.add_dataset_ownership:AddDatasetOwnership",
"simple_add_dataset_ownership = datahub.ingestion.transformer.add_dataset_ownership:SimpleAddDatasetOwnership",
"pattern_add_dataset_ownership = datahub.ingestion.transformer.add_dataset_ownership:PatternAddDatasetOwnership",
"add_dataset_domain = datahub.ingestion.transformer.dataset_domain:AddDatasetDomain",
"simple_add_dataset_domain = datahub.ingestion.transformer.dataset_domain:SimpleAddDatasetDomain",
"pattern_add_dataset_domain = datahub.ingestion.transformer.dataset_domain:PatternAddDatasetDomain",
"add_dataset_tags = datahub.ingestion.transformer.add_dataset_tags:AddDatasetTags",
"simple_add_dataset_tags = datahub.ingestion.transformer.add_dataset_tags:SimpleAddDatasetTags",
"pattern_add_dataset_tags = datahub.ingestion.transformer.add_dataset_tags:PatternAddDatasetTags",
"add_dataset_terms = datahub.ingestion.transformer.add_dataset_terms:AddDatasetTerms",
"simple_add_dataset_terms = datahub.ingestion.transformer.add_dataset_terms:SimpleAddDatasetTerms",
"pattern_add_dataset_terms = datahub.ingestion.transformer.add_dataset_terms:PatternAddDatasetTerms",
"add_dataset_properties = datahub.ingestion.transformer.add_dataset_properties:AddDatasetProperties",
"simple_add_dataset_properties = datahub.ingestion.transformer.add_dataset_properties:SimpleAddDatasetProperties",
"pattern_add_dataset_schema_terms = datahub.ingestion.transformer.add_dataset_schema_terms:PatternAddDatasetSchemaTerms",
"pattern_add_dataset_schema_tags = datahub.ingestion.transformer.add_dataset_schema_tags:PatternAddDatasetSchemaTags",
],
"datahub.ingestion.sink.plugins": [
"file = datahub.ingestion.sink.file:FileSink",
"console = datahub.ingestion.sink.console:ConsoleSink",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,74 +1,26 @@
from datahub.ingestion.api.registry import PluginRegistry
from datahub.ingestion.api.transform import Transformer
from datahub.ingestion.transformer import dataset_domain
from datahub.ingestion.transformer.add_dataset_browse_path import (
AddDatasetBrowsePathTransformer,
)
from datahub.ingestion.transformer.add_dataset_ownership import (
AddDatasetOwnership,
PatternAddDatasetOwnership,
SimpleAddDatasetOwnership,
)
from datahub.ingestion.transformer.add_dataset_properties import (
AddDatasetProperties,
SimpleAddDatasetProperties,
)
from datahub.ingestion.transformer.add_dataset_schema_tags import (
PatternAddDatasetSchemaTags,
)
from datahub.ingestion.transformer.add_dataset_schema_terms import (
PatternAddDatasetSchemaTerms,
)
from datahub.ingestion.transformer.add_dataset_tags import (
AddDatasetTags,
PatternAddDatasetTags,
SimpleAddDatasetTags,
)
from datahub.ingestion.transformer.add_dataset_terms import (
AddDatasetTerms,
PatternAddDatasetTerms,
SimpleAddDatasetTerms,
)
from datahub.ingestion.transformer.mark_dataset_status import MarkDatasetStatus
from datahub.ingestion.transformer.remove_dataset_ownership import (
SimpleRemoveDatasetOwnership,
)

transform_registry = PluginRegistry[Transformer]()

transform_registry.register(
"simple_remove_dataset_ownership", SimpleRemoveDatasetOwnership
)
transform_registry.register("mark_dataset_status", MarkDatasetStatus)
transform_registry.register("set_dataset_browse_path", AddDatasetBrowsePathTransformer)

transform_registry.register("add_dataset_ownership", AddDatasetOwnership)
transform_registry.register("simple_add_dataset_ownership", SimpleAddDatasetOwnership)
transform_registry.register("pattern_add_dataset_ownership", PatternAddDatasetOwnership)

transform_registry.register("add_dataset_domain", dataset_domain.AddDatasetDomain)
transform_registry.register(
"simple_add_dataset_domain", dataset_domain.SimpleAddDatasetDomain
)
transform_registry.register(
"pattern_add_dataset_domain", dataset_domain.PatternAddDatasetDomain
)


transform_registry.register("add_dataset_tags", AddDatasetTags)
transform_registry.register("simple_add_dataset_tags", SimpleAddDatasetTags)
transform_registry.register("pattern_add_dataset_tags", PatternAddDatasetTags)

transform_registry.register("add_dataset_terms", AddDatasetTerms)
transform_registry.register("simple_add_dataset_terms", SimpleAddDatasetTerms)
transform_registry.register("pattern_add_dataset_terms", PatternAddDatasetTerms)

transform_registry.register("add_dataset_properties", AddDatasetProperties)
transform_registry.register("simple_add_dataset_properties", SimpleAddDatasetProperties)

transform_registry.register(
"pattern_add_dataset_schema_terms", PatternAddDatasetSchemaTerms
)
transform_registry.register(
"pattern_add_dataset_schema_tags", PatternAddDatasetSchemaTags
)
transform_registry.register_from_entrypoint("datahub.ingestion.transformer.plugins")

# These transformers are always enabled: each name below is looked up in the
# registry when this module is imported, and the lookup result is asserted.
_ALWAYS_ENABLED_TRANSFORMERS = (
    "simple_remove_dataset_ownership",
    "mark_dataset_status",
    "set_dataset_browse_path",
    "add_dataset_ownership",
    "simple_add_dataset_ownership",
    "pattern_add_dataset_ownership",
    "add_dataset_domain",
    "simple_add_dataset_domain",
    "pattern_add_dataset_domain",
    "add_dataset_tags",
    "simple_add_dataset_tags",
    "pattern_add_dataset_tags",
    "add_dataset_terms",
    "simple_add_dataset_terms",
    "pattern_add_dataset_terms",
    "add_dataset_properties",
    "simple_add_dataset_properties",
    "pattern_add_dataset_schema_terms",
    "pattern_add_dataset_schema_tags",
)

# Checked in the same order as the original one-assert-per-line form.
for _transformer_name in _ALWAYS_ENABLED_TRANSFORMERS:
    assert transform_registry.get(_transformer_name)

0 comments on commit b41430e

Please sign in to comment.