Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(browse): Fixing browse path to remove requirement for simple name suffix #5634

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
fbebc46
SAVE
jjoyce0510 Aug 9, 2022
e360050
Fixing browse paths
jjoyce0510 Aug 10, 2022
921027f
Fixing the Lineage Viz bugs
jjoyce0510 Aug 10, 2022
30f25a2
Officially fixing Browse Backend
jjoyce0510 Aug 11, 2022
4f0a68a
Adding legacy browse path
jjoyce0510 Aug 12, 2022
2d7512c
Adding step for migrating browse paths
jjoyce0510 Aug 12, 2022
400228e
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 12, 2022
36c8d84
Reset bootstrap
jjoyce0510 Aug 12, 2022
4ffe242
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 12, 2022
272dc7f
Remove unnecessary change
jjoyce0510 Aug 12, 2022
39dbed2
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 12, 2022
e8a6f59
Adding basic unit test
jjoyce0510 Aug 12, 2022
8537930
Adding docs and more to support Browse Path changes
jjoyce0510 Aug 12, 2022
33fe769
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 12, 2022
c03cc49
Remove JSON requirement
jjoyce0510 Aug 12, 2022
96ec3c3
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 12, 2022
f78f795
Restore test utils
jjoyce0510 Aug 12, 2022
da0f2c1
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 12, 2022
2637d70
Updating description
jjoyce0510 Aug 12, 2022
b54fe4c
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 12, 2022
2985eb5
Adding some unit tests
jjoyce0510 Aug 15, 2022
caf7d01
Changing infos to debugs
jjoyce0510 Aug 15, 2022
07b61d6
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 15, 2022
130da25
fix checkstyle
jjoyce0510 Aug 15, 2022
18e0c59
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 15, 2022
c05fd39
Fix test checkstyle
jjoyce0510 Aug 15, 2022
2255c2c
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 15, 2022
3176ccc
Fixing failing test
jjoyce0510 Aug 15, 2022
98ef364
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 15, 2022
09deb5e
Adding details on impacted sources
jjoyce0510 Aug 16, 2022
473b815
Finishing docs
jjoyce0510 Aug 16, 2022
648d10c
Merge remote-tracking branch 'acryl/jj--fix-browse-name-split' into j…
jjoyce0510 Aug 16, 2022
087c56f
Merge branch 'master' into jj--fix-browse-name-split
jjoyce0510 Aug 19, 2022
fb2a06e
Merge branch 'master' into jj--fix-browse-name-split
jjoyce0510 Aug 30, 2022
50e22ff
Merge branch 'master' into jj--fix-browse-name-split
jjoyce0510 Sep 2, 2022
a40f2f0
Merge branch 'master' into jj--fix-browse-name-split
jjoyce0510 Sep 6, 2022
a2cc7cc
Update browse-paths-upgrade.md
jjoyce0510 Sep 6, 2022
8a2fe6d
Merge branch 'master' into jj--fix-browse-name-split
jjoyce0510 Sep 7, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import com.linkedin.metadata.entity.ebean.EbeanAspectV1;
import com.linkedin.metadata.entity.ebean.EbeanAspectV2;
import com.linkedin.metadata.models.EntitySpec;
import com.linkedin.metadata.search.utils.BrowsePathUtils;
import io.ebean.EbeanServer;
import io.ebean.PagedList;
import java.net.URISyntaxException;
Expand Down Expand Up @@ -151,7 +150,7 @@ public Function<UpgradeContext, UpgradeStepResult> executable() {
// Emit a browse path aspect.
final BrowsePaths browsePaths;
try {
browsePaths = BrowsePathUtils.buildBrowsePath(urn, _entityService.getEntityRegistry());
browsePaths = _entityService.buildDefaultBrowsePath(urn);

final AuditStamp browsePathsStamp = new AuditStamp();
browsePathsStamp.setActor(Urn.createFromString(Constants.SYSTEM_ACTOR));
Expand Down
4 changes: 4 additions & 0 deletions docker/datahub-gms/env/docker-without-neo4j.env
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@ ENTITY_SERVICE_ENABLE_RETENTION=true
# set ELASTICSEARCH_USE_SSL=true and uncomment:
# ELASTICSEARCH_USERNAME=
# ELASTICSEARCH_PASSWORD=

# Uncomment to run a one-time upgrade to migrate legacy default browse path format to latest format
# More details can be found at https://datahubproject.io/docs/advanced/browse-paths-upgrade
# UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED=true
4 changes: 4 additions & 0 deletions docker/datahub-gms/env/docker.env
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,7 @@ UI_INGESTION_DEFAULT_CLI_VERSION=0.8.42

# Uncomment to increase concurrency across Kafka consumers
# KAFKA_LISTENER_CONCURRENCY=2

# Uncomment to run a one-time upgrade to migrate legacy default browse path format to latest format
# More details can be found at https://datahubproject.io/docs/advanced/browse-paths-upgrade
# UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED=true
1 change: 1 addition & 0 deletions docs-website/sidebars.js
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,7 @@ module.exports = {
"metadata-ingestion/adding-source",
"docs/how/add-custom-ingestion-source",
"docs/how/add-custom-data-platform",
"docs/advanced/browse-paths-upgrade",
],
},
],
Expand Down
137 changes: 137 additions & 0 deletions docs/advanced/browse-paths-upgrade.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Browse Paths Upgrade (August 2022)

## Background

Up to this point, there's been a historical constraint on all entity browse paths. Namely, each browse path has been
required to end with a path component that represents "simple name" for an entity. For example, a Browse Path for a
Snowflake Table called "test_table" may look something like this:

```
/prod/snowflake/warehouse1/db1/test_table
```

In the UI, we artificially truncate the final path component when you are browsing the Entity hierarchy, so your browse experience
would be:

`prod` > `snowflake` > `warehouse1`> `db1` > `Click Entity`

As you can see, the final path component `test_table` is effectively ignored. It could have any value, and we would still ignore
it in the UI. This behavior serves as a workaround to the historical requirement that all browse paths end with a simple name.

This data constraint stands in opposition the original intention of Browse Paths: to provide a simple mechanism for organizing
assets into a hierarchical folder structure. For this reason, we've changed the semantics of Browse Paths to better align with the original intention.
Going forward, you will not be required to provide a final component detailing the "name". Instead, you will be able to provide a simpler path that
omits this final component:

```
/prod/snowflake/warehouse1/db1
```

and the browse experience from the UI will continue to work as you would expect:

`prod` > `snowflake` > `warehouse1`> `db1` > `Click Entity`.

With this change comes a fix to a longstanding bug where multiple browse paths could not be attached to a single URN. Going forward,
we will support producing multiple browse paths for the same entity, and allow you to traverse via multiple paths. For example

```python
browse_path = BrowsePathsClass(
paths=["/powerbi/my/custom/path", "/my/other/custom/path"]
)
return MetadataChangeProposalWrapper(
entityType="dataset",
changeType="UPSERT",
entityUrn="urn:li:dataset:(urn:li:dataPlatform:custom,MyFileName,PROD),
aspectName="browsePaths",
aspect=browse_path,
)
```
*Using the Python Emitter SDK to produce multiple Browse Paths for the same entity*

We've received multiple bug reports, such as [this issue](https://github.com/datahub-project/datahub/issues/5525), and requests to address these issues with Browse, and thus are deciding
to do it now before more workarounds are created.

## What this means for you

Once you upgrade to DataHub `v0.8.45` you will immediately notice that traversing your Browse Path hierarchy will require
one extra click to find the entity. This is because we are correctly displaying the FULL browse path, including the simple name mentioned above.

There will be 2 ways to upgrade to the new browse path format. Depending on your ingestion sources, you may want to use one or both:

1. Migrate default browse paths to the new format by restarting DataHub
2. Upgrade your version of the `datahub` CLI to push new browse path format (version `v0.8.45`)

Each step will be discussed in detail below.

### 1. Migrating default browse paths to the new format

To migrate those Browse Paths that are generated by DataHub by default (when no path is provided), simply restart the `datahub-gms` container / pod with a single
additional environment variable:

```
UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED=true
```

And restart the `datahub-gms` instance. This will cause GMS to perform a boot-time migration of all your existing Browse Paths
to the new format, removing the unnecessarily name component at the very end.

If the migration is successful, you'll see the following in your GMS logs:

```
18:58:17.414 [main] INFO c.l.m.b.s.UpgradeDefaultBrowsePathsStep:60 - Successfully upgraded all browse paths!
```

After this one-time migration is complete, you should be able to navigate the Browse hierarchy exactly as you did previously.

> Note that certain ingestion sources actively produce their own Browse Paths, which overrides the default path
> computed by DataHub.
>
> In these cases, getting the updated Browse Path will require re-running your ingestion process with the updated
> version of the connector. This is discussed in more detail in the next section.

### 2. Upgrading the `datahub` CLI to push new browse paths

If you are actively ingesting metadata from one or more of following sources

1. Sagemaker
2. Looker / LookML
3. Feast
4. Kafka
5. Mode
6. PowerBi
7. Pulsar
8. Tableau
9. Business Glossary

You will need to upgrade the DataHub CLI to >= `v0.8.45` and re-run metadata ingestion. This will generate the new browse path format
and overwrite the existing paths for entities that were extracted from these sources.

### If you are producing custom Browse Paths

If you've decided to produce your own custom Browse Paths to organize your assets (e.g. via the Python Emitter SDK), you'll want to change the code to produce those paths
to truncate the final path component. For example, if you were previously emitting a browse path like this:

```
"my/custom/browse/path/suffix"
```

You can simply remove the final "suffix" piece:

```
"my/custom/browse/path"
```

Your users will be able to find the entity by traversing through these folders in the UI:

`my` > `custom` > `browse`> `path` > `Click Entity`.


> Note that if you are using the Browse Path Transformer you *will* be impacted in the same way. It is recommended that you revisit the
> paths that you are producing, and update them to the new format.

## Support

The Acryl team will be on standby to assist you in your migration. Please
join [#release-0_8_0](https://datahubspace.slack.com/archives/C0244FHMHJQ) channel and reach out to us if you find
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks like this channel has been deleted. should we just pick one that already exists?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lol yes

trouble with the upgrade or have feedback on the process. We will work closely to make sure you can continue to operate
DataHub smoothly.
3 changes: 3 additions & 0 deletions docs/how/updating-datahub.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

### Breaking Changes

- Browse Paths have been upgraded to a new format to align more closely with the intention of the feature.
Learn more about the changes, including steps on upgrading, here: https://datahubproject.io/docs/advanced/browse-paths-upgrade

### Potential Downtime

### Deprecations
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
import com.linkedin.data.schema.TyperefDataSchema;
import com.linkedin.data.template.DataTemplateUtil;
import com.linkedin.data.template.RecordTemplate;
import com.linkedin.data.template.StringArray;
import com.linkedin.data.template.UnionTemplate;
import com.linkedin.dataplatform.DataPlatformInfo;
import com.linkedin.entity.AspectType;
import com.linkedin.entity.Entity;
import com.linkedin.entity.EntityResponse;
Expand All @@ -38,7 +40,6 @@
import com.linkedin.metadata.models.registry.EntityRegistry;
import com.linkedin.metadata.query.ListUrnsResult;
import com.linkedin.metadata.run.AspectRowSummary;
import com.linkedin.metadata.search.utils.BrowsePathUtils;
import com.linkedin.metadata.snapshot.Snapshot;
import com.linkedin.metadata.utils.DataPlatformInstanceUtils;
import com.linkedin.metadata.utils.EntityKeyUtils;
Expand Down Expand Up @@ -73,6 +74,7 @@
import lombok.extern.slf4j.Slf4j;

import static com.linkedin.metadata.Constants.*;
import static com.linkedin.metadata.search.utils.BrowsePathUtils.*;
import static com.linkedin.metadata.utils.PegasusUtils.*;


Expand Down Expand Up @@ -142,7 +144,6 @@ public static class IngestProposalResult {
public static final String BROWSE_PATHS = "browsePaths";
public static final String DATA_PLATFORM_INSTANCE = "dataPlatformInstance";
protected static final int MAX_KEYS_PER_QUERY = 500;
public static final String STATUS = "status";

public EntityService(
@Nonnull final AspectDao aspectDao,
Expand Down Expand Up @@ -1180,10 +1181,8 @@ public List<Pair<String, RecordTemplate>> generateDefaultAspectsIfMissing(@Nonnu

if (shouldCheckBrowsePath && latestAspects.get(BROWSE_PATHS) == null) {
try {
BrowsePaths generatedBrowsePath = BrowsePathUtils.buildBrowsePath(urn, getEntityRegistry());
if (generatedBrowsePath != null) {
aspects.add(Pair.of(BROWSE_PATHS, generatedBrowsePath));
}
BrowsePaths generatedBrowsePath = buildDefaultBrowsePath(urn);
aspects.add(Pair.of(BROWSE_PATHS, generatedBrowsePath));
} catch (URISyntaxException e) {
log.error("Failed to parse urn: {}", urn);
}
Expand Down Expand Up @@ -1762,4 +1761,54 @@ private RecordTemplate updateAspect(

return newValue;
}

/**
* Builds the default browse path aspects for a subset of well-supported entities.
*
* This method currently supports datasets, charts, dashboards, data flows, data jobs, and glossary terms.
*/
@Nonnull
public BrowsePaths buildDefaultBrowsePath(final @Nonnull Urn urn) throws URISyntaxException {
Character dataPlatformDelimiter = getDataPlatformDelimiter(urn);
String defaultBrowsePath = getDefaultBrowsePath(urn, this.getEntityRegistry(), dataPlatformDelimiter);
StringArray browsePaths = new StringArray();
browsePaths.add(defaultBrowsePath);
BrowsePaths browsePathAspect = new BrowsePaths();
browsePathAspect.setPaths(browsePaths);
return browsePathAspect;
}

/**
* Returns a delimiter on which the name of an asset may be split.
*/
private Character getDataPlatformDelimiter(Urn urn) {
// Attempt to construct the appropriate Data Platform URN
Urn dataPlatformUrn = buildDataPlatformUrn(urn, this.getEntityRegistry());
if (dataPlatformUrn != null) {
// Attempt to resolve the delimiter from Data Platform Info
DataPlatformInfo dataPlatformInfo = getDataPlatformInfo(dataPlatformUrn);
if (dataPlatformInfo != null && dataPlatformInfo.hasDatasetNameDelimiter()) {
return dataPlatformInfo.getDatasetNameDelimiter().charAt(0);
}
}
// Else, fallback to a default delimiter (period) if one cannot be resolved.
return '.';
}

@Nullable
private DataPlatformInfo getDataPlatformInfo(Urn urn) {
try {
final EntityResponse entityResponse = getEntityV2(
Constants.DATA_PLATFORM_ENTITY_NAME,
urn,
ImmutableSet.of(Constants.DATA_PLATFORM_INFO_ASPECT_NAME)
);
if (entityResponse != null && entityResponse.hasAspects() && entityResponse.getAspects().containsKey(Constants.DATA_PLATFORM_INFO_ASPECT_NAME)) {
return new DataPlatformInfo(entityResponse.getAspects().get(Constants.DATA_PLATFORM_INFO_ASPECT_NAME).getValue().data());
}
} catch (Exception e) {
log.warn(String.format("Failed to find Data Platform Info for urn %s", urn));
}
return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,12 @@ public Map<String, Long> aggregateByValue(@Nullable String entityName, @Nonnull

@Nonnull
@Override
public BrowseResult browse(@Nonnull String entityName, @Nonnull String path, @Nullable Filter requestParams, int from,
public BrowseResult browse(@Nonnull String entityName, @Nonnull String path, @Nullable Filter filters, int from,
int size) {
log.debug(
String.format("Browsing entities entityName: %s, path: %s, requestParams: %s, from: %s, size: %s", entityName,
path, requestParams, from, size));
return esBrowseDAO.browse(entityName, path, requestParams, from, size);
String.format("Browsing entities entityName: %s, path: %s, filters: %s, from: %s, size: %s", entityName,
path, filters, from, size));
return esBrowseDAO.browse(entityName, path, filters, from, size);
}

@Nonnull
Expand Down
Loading