diff --git a/datahub-kubernetes/README.md b/datahub-kubernetes/README.md index 9283c6e6e6780d..15b7338f198702 100644 --- a/datahub-kubernetes/README.md +++ b/datahub-kubernetes/README.md @@ -7,7 +7,7 @@ title: "Deploying with Kubernetes" ## Introduction [This directory](https://github.com/linkedin/datahub/tree/master/datahub-kubernetes) provides the Kubernetes [Helm](https://helm.sh/) charts for deploying [Datahub](https://github.com/linkedin/datahub/tree/master/datahub-kubernetes/datahub) and it's [dependencies](https://github.com/linkedin/datahub/tree/master/datahub-kubernetes/prerequisites) -(Elasticsearch, Neo4j, MySQL, and Kafka) on a Kubernetes cluster. +(Elasticsearch, optionally Neo4j, MySQL, and Kafka) on a Kubernetes cluster. ## Setup 1. Set up a kubernetes cluster diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 6ce8bc8aeca204..ab0cb14f916b34 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -109,6 +109,7 @@ module.exports = { "docs/advanced/high-cardinality", "docs/how/scsi-onboarding-guide", "docs/advanced/no-code-upgrade", + "docs/how/migrating-graph-service-implementation", // WIP "docs/advanced/backfilling", // WIP "docs/advanced/derived-aspects", // WIP "docs/advanced/entity-hierarchy", diff --git a/docs/how/migrating-graph-service-implementation.md b/docs/how/migrating-graph-service-implementation.md new file mode 100644 index 00000000000000..f9636d464cbae9 --- /dev/null +++ b/docs/how/migrating-graph-service-implementation.md @@ -0,0 +1,51 @@ +# Migrate Graph Service Implementation to Elasticsearch + +We currently support either Elasticsearch or Neo4j as backend implementations for the graph service. We recommend +Elasticsearch for those looking for a lighter deployment or who do not want to manage a Neo4j database. +If you started using Neo4j as your graph service backend, here is how you can migrate to Elasticsearch.
## Docker-compose

If you are running your instance through docker locally, you will want to spin up your DataHub instance with
Elasticsearch as the backend. On a clean start, this happens by default. However, if you've written data to
Neo4j you need to explicitly ask DataHub to start in Elastic mode.

```shell
datahub docker quickstart --graph-service-impl=elasticsearch
```

Next, run the following command from root to rebuild your graph index.

```shell
./docker/datahub-upgrade/datahub-upgrade.sh -u RestoreIndices
```

After this command completes, you should be migrated. Open up the DataHub UI and verify your relationships are
visible.

Once you confirm the migration is successful, you must remove your neo4j volume by running

```shell
docker volume rm datahub_neo4jdata
```

This prevents your DataHub instance from coming up in neo4j mode in the future.

## Helm

First, adjust your helm variables to turn off neo4j and set your graph_service_impl to elasticsearch.

To turn off neo4j in your prerequisites file, set `neo4j-community`'s `enabled` property to `false`
in [datahub-kubernetes/prerequisites/values.yaml](https://github.com/linkedin/datahub/blob/69dc2682ec36d68d2ac2d88b86cbd9c84d766b0b/datahub-kubernetes/prerequisites/values.yaml#L54).

Then, set `graph_service_impl` to `elasticsearch` in
[datahub-kubernetes/datahub/values.yaml](https://github.com/linkedin/datahub/blob/eb60da975dce1f855883e628541cdbcef89717af/datahub-kubernetes/datahub/values.yaml#L63).


See the [deployment helm guide](../../datahub-kubernetes/README.md#components) for more details on how to
set up your helm deployment.

Finally, follow the [restore-indices helm guide](./restore-indices.md) to re-build
your graph index.

Once the job completes, your data will be migrated.
diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker.py index 37cf6386cbef4f..71aeb1484c3174 100644 --- a/metadata-ingestion/src/datahub/cli/docker.py +++ b/metadata-ingestion/src/datahub/cli/docker.py @@ -68,7 +68,21 @@ def check() -> None: docker_check_impl() -def check_neo4j_volume_exists(): +def should_use_neo4j_for_graph_service(graph_service_override): + if graph_service_override is not None: + if graph_service_override == "elasticsearch": + click.echo("Starting with elasticsearch due to graph-service-impl param\n") + return False + if graph_service_override == "neo4j": + click.echo("Starting with neo4j due to graph-service-impl param\n") + return True + else: + click.secho( + graph_service_override + + " is not a valid graph service option. Choose either `neo4j` or " + "`elasticsearch`\n", + fg="red", + ) with get_client_with_error() as (client, error): if error: click.secho( @@ -120,11 +134,19 @@ def check_neo4j_volume_exists(): default=False, help="If true, the docker-compose logs will be printed to console if something fails", ) +@click.option( + "--graph-service-impl", + type=str, + is_flag=False, + default=None, + help="If set, forces docker-compose to use that graph service implementation", +) def quickstart( version: str, build_locally: bool, quickstart_compose_file: List[pathlib.Path], dump_logs_on_failure: bool, + graph_service_impl: str, ) -> None: """Start an instance of DataHub locally using docker-compose. @@ -151,7 +173,7 @@ def quickstart( # Download the quickstart docker-compose file from GitHub. quickstart_download_response = requests.get( GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL - if check_neo4j_volume_exists() + if should_use_neo4j_for_graph_service(graph_service_impl) else GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL ) quickstart_download_response.raise_for_status()