diff --git a/docker/mysql-setup/init.sql b/docker/mysql-setup/init.sql
index 30b458ec87337e..6bd7133a359a89 100644
--- a/docker/mysql-setup/init.sql
+++ b/docker/mysql-setup/init.sql
@@ -16,6 +16,7 @@ create table if not exists metadata_aspect_v2 (
 );
 
 -- create default records for datahub user if not exists
+DROP TABLE if exists temp_metadata_aspect_v2;
 CREATE TABLE temp_metadata_aspect_v2 LIKE metadata_aspect_v2;
 INSERT INTO temp_metadata_aspect_v2 (urn, aspect, version, metadata, createdon, createdby) VALUES(
   'urn:li:corpuser:datahub',
diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
index 75668c5df81756..80928f2f1118b7 100644
--- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
+++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
@@ -127,7 +127,7 @@ services:
     hostname: kafka-setup
     image: linkedin/datahub-kafka-setup:${DATAHUB_VERSION:-head}
   mysql:
-    command: --character-set-server=utf8mb4 --collation-server=utf8mb4_bin
+    command: --character-set-server=utf8mb4 --collation-server=utf8mb4_bin --default-authentication-plugin=mysql_native_password
     container_name: mysql
     environment:
     - MYSQL_DATABASE=datahub
@@ -136,6 +136,7 @@ services:
     - MYSQL_ROOT_PASSWORD=datahub
     hostname: mysql
     image: mariadb:10.5.8
+    # image: mysql:8
     ports:
     - ${DATAHUB_MAPPED_MYSQL_PORT:-3306}:3306
     volumes:
diff --git a/docs/how/backup-datahub.md b/docs/how/backup-datahub.md
index 3fccfd26349719..6a9d7287aaa45c 100644
--- a/docs/how/backup-datahub.md
+++ b/docs/how/backup-datahub.md
@@ -1,5 +1,11 @@
 # Taking backup of DataHub
 
+## Production
+
 The recommended backup strategy is to periodically dump the database `datahub.metadata_aspect_v2` so it can be recreated from the dump which most managed DB services will support (e.g. AWS RDS). Then run [restore indices](./restore-indices.md) to recreate the indices.
 
 In order to back up Time Series Aspects (which power usage and dataset profiles), you'd have to do a backup of Elasticsearch, which is possible via AWS OpenSearch. Otherwise, you'd have to reingest dataset profiles from your sources in the event of a disaster scenario!
+
+## Quickstart
+
+To back up your quickstart instance, see [this guide](../quickstart.md#backing-up-your-datahub-quickstart-experimental).
\ No newline at end of file
diff --git a/docs/how/restore-indices.md b/docs/how/restore-indices.md
index a5c0fd2b782d51..a9e2cd9799d19f 100644
--- a/docs/how/restore-indices.md
+++ b/docs/how/restore-indices.md
@@ -7,9 +7,18 @@ When a new version of the aspect gets ingested, GMS initiates an MAE event for t
 the search and graph indices. As such, we can fetch the latest version of each aspect in the local database and produce
 MAE events corresponding to the aspects to restore the search and graph indices.
 
+## Quickstart
+
+If you're using the quickstart images, you can use the `datahub` CLI to restore indices.
+
+```
+datahub docker quickstart --restore-indices
+```
+See [this section](../quickstart.md#restoring-only-the-index-use-with-care) for more information.
+
 ## Docker-compose
 
-Run the following command from root to send MAE for each aspect in the Local DB.
+If you are on a custom docker-compose deployment, check out [the source repository](https://github.com/datahub-project/datahub) and run the following command from the root of the repo to send an MAE for each aspect in the local database.
 
 ```
 ./docker/datahub-upgrade/datahub-upgrade.sh -u RestoreIndices
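
Both commands rely on the mechanism this doc describes: read the latest aspect rows from the primary store and replay them as MAEs. A minimal, hypothetical Python sketch of that loop (the `metadata_aspect_v2` table and its version-0-is-latest convention are DataHub's; the `emit_mae` callback is an assumption, not a real DataHub API):

```python
import json

def restore_indices(conn, emit_mae):
    """Replay the latest version of every aspect as an MAE so the
    search and graph indices can be rebuilt from the primary store."""
    cursor = conn.execute(
        # In metadata_aspect_v2, version = 0 always holds the latest value.
        "SELECT urn, aspect, metadata FROM metadata_aspect_v2 WHERE version = 0"
    )
    for urn, aspect, metadata in cursor:
        # emit_mae is an assumed hook that publishes a MetadataAuditEvent
        # to Kafka; the real implementation lives in datahub-upgrade.
        emit_mae(urn=urn, aspect=aspect, value=json.loads(metadata))
```
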
diff --git a/docs/quickstart.md b/docs/quickstart.md
index 4118314a099240..00c72ead2b44e7 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -150,7 +150,7 @@ To stop DataHub's quickstart, you can issue the following command.
 datahub docker quickstart --stop
 ```
 
-### Resetting DataHub
+### Resetting DataHub (a.k.a. factory reset)
 
 To cleanse DataHub of all of its state (e.g. before ingesting your own), you can use the CLI `nuke` command.
 
@@ -158,6 +158,51 @@ To cleanse DataHub of all of its state (e.g. before ingesting your own), you can
 datahub docker nuke
 ```
 
+### Backing up your DataHub Quickstart (experimental)
+
+The quickstart image is not recommended for use as a production instance. See [Moving to production](#move-to-production) for recommendations on setting up your production cluster. However, if you want to take a backup of your current quickstart state (e.g. you have a demo to your company coming up and you want to create a copy of the quickstart data so you can restore it at a future date), you can supply the `--backup` flag to quickstart.
+
+```
+datahub docker quickstart --backup
+```
+
+This will take a backup of your MySQL database and write it by default to your `~/.datahub/quickstart/` directory as the file `backup.sql`. You can customize this by passing a `--backup-file` argument, e.g.
+
+```
+datahub docker quickstart --backup --backup-file /home/my_user/datahub_backups/quickstart_backup_2002_22_01.sql
+```
+
+:::note
+
+The quickstart backup does not include any time series data (dataset statistics, profiles, etc.), so you will lose that information if you delete all your indexes and restore from this backup.
+
+:::
+
+### Restoring your DataHub Quickstart (experimental)
+
+As you might imagine, these backups are restorable. The following sections describe the options you have for restoring a backup.
+
+#### Restoring a backup (primary + index) [most common]
+
+To restore a previous backup, run the following command:
+
+```
+datahub docker quickstart --restore
+```
+
+This command will pick up the `backup.sql` file located under `~/.datahub/quickstart` and restore your primary database as well as the Elasticsearch indexes with it.
+
+To supply a specific backup file, use the `--restore-file` option.
+
+```
+datahub docker quickstart --restore --restore-file /home/my_user/datahub_backups/quickstart_backup_2002_22_01.sql
+```
+
+#### Restoring only the index [to deal with index out of sync / corruption issues]
+
+Another situation that can come up is that the index gets corrupted or misses some updates. In order to re-bootstrap the index from the primary store, you can run this command to sync the index with the primary store.
+
+```
+datahub docker quickstart --restore-indices
+```
+
+#### Restoring a backup (primary but NO index) [rarely used]
+
+Sometimes you might want to restore just the state of your primary database (MySQL) without re-indexing the data. To do this, you have to explicitly disable the restore-indices capability.
+
+```
+datahub docker quickstart --restore --no-restore-indices
+```
+
 ### Upgrading your local DataHub
 
 If you have been testing DataHub locally, a new version of DataHub got released and you want to try the new version then you can just issue the quickstart command again. It will pull down newer images and restart your instance without losing any data.
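
The three restore modes above reduce to one tri-state decision inside the CLI: rebuild indices, skip them, or fall back to the default. A small sketch of that mapping, using the flag names from the `docker.py` change below (`None` means "use the default for this operation"):

```python
from typing import Optional

def resolve_restore_indices(
    restore_indices: bool, no_restore_indices: bool
) -> Optional[bool]:
    # --restore-indices               -> True  (rebuild indices, even without --restore)
    # --restore --no-restore-indices  -> False (primary only)
    # --restore alone                 -> None  (default: indices are rebuilt too)
    if restore_indices:
        return True
    if no_restore_indices:
        return False
    return None
```
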
diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg
index 4bd5f1771dde40..2467c61983e5d8 100644
--- a/metadata-ingestion/setup.cfg
+++ b/metadata-ingestion/setup.cfg
@@ -1,5 +1,5 @@
 [flake8]
-max-complexity = 15
+max-complexity = 20
 ignore =
     # Ignore: line length issues, since black's formatter will take care of them.
    E501,
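
The `docker.py` patch that follows imports the third-party `expandvars` package to resolve shell-style `${VAR:-default}` references on the host before writing an env file for the upgrade container. A quick illustration of the behavior it depends on (the port variable name comes from the compose files above):

```python
import os
from expandvars import expandvars

# ${VAR:-default} expands like shell parameter expansion: the default is
# used when the variable is unset, so the generated env file gets
# concrete values from the host environment.
os.environ.pop("DATAHUB_MAPPED_MYSQL_PORT", None)
print(expandvars("mysql:${DATAHUB_MAPPED_MYSQL_PORT:-3306}"))  # mysql:3306

os.environ["DATAHUB_MAPPED_MYSQL_PORT"] = "3307"
print(expandvars("mysql:${DATAHUB_MAPPED_MYSQL_PORT:-3306}"))  # mysql:3307
```
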
diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker.py
index 8975990277f023..92f77bd4a6d0fe 100644
--- a/metadata-ingestion/src/datahub/cli/docker.py
+++ b/metadata-ingestion/src/datahub/cli/docker.py
@@ -14,6 +14,7 @@
 import click
 import pydantic
 import requests
+from expandvars import expandvars
 
 from datahub.cli.cli_utils import DATAHUB_ROOT_FOLDER
 from datahub.cli.docker_check import (
@@ -204,6 +205,148 @@ def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
         return
 
 
+def _backup(backup_file: str) -> int:
+    resolved_backup_file = os.path.expanduser(backup_file)
+    dirname = os.path.dirname(resolved_backup_file)
+    logger.info(f"Creating directory {dirname} if it doesn't exist")
+    os.makedirs(dirname, exist_ok=True)
+    logger.info("Executing backup command")
+    result = subprocess.run(
+        [
+            "bash",
+            "-c",
+            f"docker exec mysql mysqldump -u root -pdatahub datahub > {resolved_backup_file}",
+        ]
+    )
+    logger.info(
+        f"Backup written to {resolved_backup_file} with status {result.returncode}"
+    )
+    return result.returncode
+
+
+def _restore(
+    restore_primary: bool,
+    restore_indices: Optional[bool],
+    primary_restore_file: Optional[str],
+) -> int:
+    assert (
+        restore_primary or restore_indices
+    ), "Either restore_primary or restore_indices must be set"
+    msg = "datahub> "
+    if restore_primary:
+        msg += f"Will restore primary database from {primary_restore_file}. "
+    if restore_indices is not False:
+        msg += (
+            f"Will {'also ' if restore_primary else ''}re-build indexes from scratch. "
+        )
+    else:
+        msg += "Will not re-build indexes. "
+    msg += "Press y to continue."
+    click.confirm(msg, abort=True)
+    if restore_primary:
+        assert primary_restore_file
+        resolved_restore_file = os.path.expanduser(primary_restore_file)
+        logger.info(f"Restoring primary db from backup at {resolved_restore_file}")
+        assert os.path.exists(
+            resolved_restore_file
+        ), f"File {resolved_restore_file} does not exist"
+        with open(resolved_restore_file, "r") as fp:
+            result = subprocess.run(
+                [
+                    "bash",
+                    "-c",
+                    "docker exec -i mysql bash -c 'mysql -uroot -pdatahub datahub '",
+                ],
+                stdin=fp,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+        if result.returncode != 0:
+            logger.error("Failed to run MySQL restore")
+            return result.returncode
+        else:
+            logger.info("Successfully restored primary backup.")
+
+    # We run restore indices by default on primary restores, and also if the --restore-indices flag is explicitly set
+    if restore_indices is not False:
+        with tempfile.NamedTemporaryFile() as env_fp:
+            env_fp.write(
+                expandvars(
+                    """
+# Required Environment Variables
+EBEAN_DATASOURCE_USERNAME=datahub
+EBEAN_DATASOURCE_PASSWORD=datahub
+EBEAN_DATASOURCE_HOST=mysql:${DATAHUB_MAPPED_MYSQL_PORT:-3306}
+EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:${DATAHUB_MAPPED_MYSQL_PORT:-3306}/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8
+EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
+ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
+
+KAFKA_BOOTSTRAP_SERVER=broker:29092
+KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}
+
+ELASTICSEARCH_HOST=elasticsearch
+ELASTICSEARCH_PORT=${DATAHUB_MAPPED_ELASTIC_PORT:-9200}
+
+#NEO4J_HOST=http://:7474
+#NEO4J_URI=bolt://
+#NEO4J_USERNAME=neo4j
+#NEO4J_PASSWORD=datahub
+
+DATAHUB_GMS_HOST=datahub-gms
+DATAHUB_GMS_PORT=${DATAHUB_MAPPED_GMS_PORT:-8080}
+
+DATAHUB_MAE_CONSUMER_HOST=datahub-gms
+DATAHUB_MAE_CONSUMER_PORT=9091
+
+# Optional Arguments
+
+# Uncomment and set these to support SSL connection to Elasticsearch
+# ELASTICSEARCH_USE_SSL=
+# ELASTICSEARCH_SSL_PROTOCOL=
+# ELASTICSEARCH_SSL_SECURE_RANDOM_IMPL=
+# ELASTICSEARCH_SSL_TRUSTSTORE_FILE=
+# ELASTICSEARCH_SSL_TRUSTSTORE_TYPE=
+# ELASTICSEARCH_SSL_TRUSTSTORE_PASSWORD=
+# ELASTICSEARCH_SSL_KEYSTORE_FILE=
+# ELASTICSEARCH_SSL_KEYSTORE_TYPE=
+# ELASTICSEARCH_SSL_KEYSTORE_PASSWORD=
+"""
+                ).encode("utf-8")
+            )
+            env_fp.flush()
+            if logger.isEnabledFor(logging.DEBUG):
+                with open(env_fp.name, "r") as env_fp_reader:
+                    logger.debug(f"Env file contents: {env_fp_reader.read()}")
+
+            # continue to issue the restore indices command
+            command = (
+                "docker pull acryldata/datahub-upgrade:${DATAHUB_VERSION:-head}"
+                + f" && docker run --network datahub_network --env-file {env_fp.name} "
+                + "acryldata/datahub-upgrade:${DATAHUB_VERSION:-head} -u RestoreIndices -a clean"
+            )
+            logger.info(f"Running index restore command: {command}")
+            result = subprocess.run(
+                args=["bash", "-c", command],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+            logger.info(
+                f"Index restore command finished with status {result.returncode}"
+            )
+            if result.returncode != 0:
+                logger.info(result.stderr)
+            logger.debug(result.stdout)
+            return result.returncode
+    return 0
+
+
 @docker.command()
 @click.option(
     "--version",
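
`_restore` above streams the dump into the MySQL container over stdin rather than copying the file into it. A stripped-down sketch of the same pattern, with the container name and credentials taken from the quickstart compose file (no `bash -c` wrapper, which the patch uses but does not strictly need):

```python
import subprocess

def restore_dump(dump_path: str) -> int:
    # Feed the SQL dump to mysql inside the container via stdin.
    with open(dump_path, "r") as fp:
        result = subprocess.run(
            ["docker", "exec", "-i", "mysql",
             "mysql", "-uroot", "-pdatahub", "datahub"],
            stdin=fp,
        )
    return result.returncode
```
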
@@ -219,6 +362,7 @@ def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
     help="Attempt to build the containers locally before starting",
 )
 @click.option(
+    "-f",
     "--quickstart-compose-file",
     type=click.Path(exists=True, dir_okay=False, readable=True),
     default=[],
@@ -281,6 +425,49 @@ def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
     default=False,
     help="Use this flag to stop the running containers",
 )
+@click.option(
+    "--backup",
+    required=False,
+    is_flag=True,
+    default=False,
+    help="Run this to backup a running quickstart instance",
+)
+@click.option(
+    "--backup-file",
+    required=False,
+    type=click.Path(exists=False, dir_okay=False, readable=True, writable=True),
+    default=os.path.expanduser("~/.datahub/quickstart/backup.sql"),
+    show_default=True,
+    help="The path to write the backup file to (used with --backup)",
+)
+@click.option(
+    "--restore",
+    required=False,
+    is_flag=True,
+    default=False,
+    help="Run this to restore a running quickstart instance from a previous backup (see --backup)",
+)
+@click.option(
+    "--restore-file",
+    required=False,
+    type=str,
+    default="~/.datahub/quickstart/backup.sql",
+    help="Set this to provide a custom restore file",
+)
+@click.option(
+    "--restore-indices",
+    required=False,
+    is_flag=True,
+    default=False,
+    help="Enable the restoration of indices of a running quickstart instance. Note: Using --restore will automatically restore indices unless you use the --no-restore-indices flag.",
+)
+@click.option(
+    "--no-restore-indices",
+    required=False,
+    is_flag=True,
+    default=False,
+    help="Disables the restoration of indices of a running quickstart instance when used in conjunction with --restore.",
+)
 @upgrade.check_upgrade
 @telemetry.with_telemetry
 def quickstart(
@@ -295,6 +482,12 @@ def quickstart(
     schema_registry_port: Optional[pydantic.PositiveInt],
     elastic_port: Optional[pydantic.PositiveInt],
     stop: bool,
+    backup: bool,
+    backup_file: str,
+    restore: bool,
+    restore_file: str,
+    restore_indices: bool,
+    no_restore_indices: bool,
 ) -> None:
     """Start an instance of DataHub locally using docker-compose.
 
@@ -303,6 +496,24 @@
     There are options to override the docker-compose config file, build the containers
     locally, and dump logs to the console or to a file if something goes wrong.
     """
+    if backup:
+        _backup(backup_file)
+        return
+    if restore or restore_indices or no_restore_indices:
+        if not valid_restore_options(restore, restore_indices, no_restore_indices):
+            return
+        # execute restore
+        restore_indices_flag: Optional[bool] = None
+        if restore_indices:
+            restore_indices_flag = True
+        if no_restore_indices:
+            restore_indices_flag = False
+        _restore(
+            restore_primary=restore,
+            primary_restore_file=restore_file,
+            restore_indices=restore_indices_flag,
+        )
+        return
 
     running_on_m1 = is_m1()
     if running_on_m1:
@@ -458,6 +669,29 @@ def quickstart(
     )
 
 
+def valid_restore_options(
+    restore: bool, restore_indices: bool, no_restore_indices: bool
+) -> bool:
+    if no_restore_indices and not restore:
+        click.secho(
+            "Using --no-restore-indices without --restore is not supported", fg="red"
+        )
+        return False
+    if no_restore_indices and restore_indices:
+        click.secho(
+            "Using --restore-indices in conjunction with --no-restore-indices is not supported",
+            fg="red",
+        )
+        return False
+    if restore and restore_indices:
+        click.secho(
+            "Using --restore automatically implies --restore-indices; you don't need to set both. Continuing...",
+            fg="yellow",
+        )
+        return True
+    return True
+
+
 @docker.command()
 @click.option(
     "--path",
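
For reference, the flag-validation matrix that `valid_restore_options` implements can be pinned down with a few assertions; a hypothetical quick check (the import path is assumed from the file being patched):

```python
from datahub.cli.docker import valid_restore_options

assert valid_restore_options(restore=True, restore_indices=False, no_restore_indices=False)
assert valid_restore_options(restore=False, restore_indices=True, no_restore_indices=False)
# --restore with --restore-indices is redundant but allowed (warns, then continues).
assert valid_restore_options(restore=True, restore_indices=True, no_restore_indices=False)
# --no-restore-indices needs --restore, and conflicts with --restore-indices.
assert not valid_restore_options(restore=False, restore_indices=False, no_restore_indices=True)
assert not valid_restore_options(restore=False, restore_indices=True, no_restore_indices=True)
```
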