feat(quickstart): add experimental support for backup restore
shirshanka committed Jul 18, 2022
1 parent 024797d commit 04ab13b
Showing 4 changed files with 190 additions and 9 deletions.
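
The new flow shells out to two building blocks, both visible in the docker.py diff below: mysqldump against the quickstart MySQL container for the primary store, and the acryldata/datahub-upgrade image for rebuilding the search indices. Roughly, as a hedged sketch (container, network, and image names are taken from the diff; the env file is elided):

    # what --backup runs under the hood
    docker exec mysql mysqldump -u root -pdatahub datahub > ~/.datahub/quickstart/backup.sql

    # what --restore-indices runs under the hood
    docker run --network datahub_network --env-file <env-file> acryldata/datahub-upgrade:debug -u RestoreIndices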
32 changes: 24 additions & 8 deletions docker/datahub-upgrade/Dockerfile
@@ -1,16 +1,32 @@
 # Defining environment
 ARG APP_ENV=prod
 
-FROM adoptopenjdk/openjdk8:alpine-jre as base
+FROM alpine:3.14 AS base
 ENV DOCKERIZE_VERSION v0.6.1
-RUN apk --no-cache add curl tar \
-    && curl -L https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz | tar -C /usr/local/bin -xzv
+# Upgrade Alpine and base packages
+RUN apk --no-cache --update-cache --available upgrade \
+    && if [ $(arch) = "aarch64" ]; then \
+         DOCKERIZE_ARCH='aarch64'; \
+       elif [ $(arch) = "x86_64" ]; then \
+         DOCKERIZE_ARCH='amd64'; \
+       else \
+         echo >&2 "Unsupported architecture $(arch)" ; exit 1; \
+       fi \
+    && apk --no-cache add tar curl openjdk8-jre bash coreutils gcompat \
+    && curl https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-runner/9.4.46.v20220331/jetty-runner-9.4.46.v20220331.jar --output jetty-runner.jar \
+    && curl https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-jmx/9.4.46.v20220331/jetty-jmx-9.4.46.v20220331.jar --output jetty-jmx.jar \
+    && curl https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-util/9.4.46.v20220331/jetty-util-9.4.46.v20220331.jar --output jetty-util.jar \
+    && wget https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.4.1/opentelemetry-javaagent-all.jar \
+    && wget https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.16.1/jmx_prometheus_javaagent-0.16.1.jar -O jmx_prometheus_javaagent.jar \
+    && cp /usr/lib/jvm/java-1.8-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks \
+    && curl -L https://github.com/treff7es/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-${DOCKERIZE_ARCH}-$DOCKERIZE_VERSION.tar.gz | tar -C /usr/local/bin -xzv
 
-FROM openjdk:8 as prod-build
+FROM --platform=$BUILDPLATFORM alpine:3.14 AS prod-build
+
+# Upgrade Alpine and base packages
+RUN apk --no-cache --update-cache --available upgrade \
+    && apk --no-cache add openjdk8 perl
+
+# Workaround alpine issue with /lib64 not being in the ld library path
+# https://gitlab.alpinelinux.org/alpine/aports/-/issues/10140
+ENV LD_LIBRARY_PATH=/lib64
+
 COPY . datahub-src
 RUN cd datahub-src && ./gradlew :datahub-upgrade:build
 RUN cd datahub-src && cp datahub-upgrade/build/libs/datahub-upgrade.jar ../datahub-upgrade.jar
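The rebased Dockerfile above is what makes this image usable on Apple Silicon: the base stage picks a dockerize binary by runtime architecture, while --platform=$BUILDPLATFORM keeps the Gradle build stage running natively on the build host. A minimal multi-arch build sketch (the builder and tag names are illustrative, not part of this commit):

    # one-time: create and select a buildx builder
    docker buildx create --name datahub-builder --use

    # build the upgrade image for both architectures, from the repo root
    docker buildx build --platform linux/amd64,linux/arm64 \
      -f docker/datahub-upgrade/Dockerfile -t datahub-upgrade:local .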
1 change: 1 addition & 0 deletions docker/mysql-setup/init.sql
@@ -16,6 +16,7 @@ create table if not exists metadata_aspect_v2 (
 );
 
 -- create default records for datahub user if not exists
+DROP TABLE if exists temp_metadata_aspect_v2;
 CREATE TABLE temp_metadata_aspect_v2 LIKE metadata_aspect_v2;
 INSERT INTO temp_metadata_aspect_v2 (urn, aspect, version, metadata, createdon, createdby) VALUES(
 'urn:li:corpuser:datahub',
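The added DROP TABLE makes the bootstrap script idempotent: temp_metadata_aspect_v2 can survive an earlier run, in which case re-applying init.sql (for instance against a freshly restored database) would previously fail at CREATE TABLE. A quick hedged check, assuming the quickstart MySQL container from the compose file below (the manual re-run is illustrative):

    # re-running the script twice should now succeed both times
    docker exec -i mysql mysql -uroot -pdatahub datahub < docker/mysql-setup/init.sql
    docker exec -i mysql mysql -uroot -pdatahub datahub < docker/mysql-setup/init.sql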
@@ -129,7 +129,7 @@ services:
     hostname: kafka-setup
     image: linkedin/datahub-kafka-setup:${DATAHUB_VERSION:-head}
   mysql:
-    command: --character-set-server=utf8mb4 --collation-server=utf8mb4_bin
+    command: --character-set-server=utf8mb4 --collation-server=utf8mb4_bin --default-authentication-plugin=mysql_native_password
     container_name: mysql
     environment:
       - MYSQL_DATABASE=datahub
@@ -138,6 +138,7 @@
       - MYSQL_ROOT_PASSWORD=datahub
     hostname: mysql
     image: mariadb:10.5.8
+    # image: mysql:8
     ports:
       - ${DATAHUB_MAPPED_MYSQL_PORT:-3306}:3306
     volumes:
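The extra --default-authentication-plugin=mysql_native_password flag, together with the commented-out mysql:8 image, appears to anticipate swapping MariaDB for MySQL 8, whose default caching_sha2_password authentication is not supported by older JDBC and client stacks. One hedged way to inspect which plugin the server actually assigned:

    docker exec mysql mysql -uroot -pdatahub -e "SELECT user, host, plugin FROM mysql.user WHERE user='datahub';"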
163 changes: 163 additions & 0 deletions metadata-ingestion/src/datahub/cli/docker.py
@@ -204,6 +204,115 @@ def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
     return


+def _backup(backup_file: str) -> int:
+    resolved_backup_file = os.path.expanduser(backup_file)
+    dirname = os.path.dirname(resolved_backup_file)
+    logger.info(f"Creating directory {dirname} if it doesn't exist")
+    os.makedirs(dirname, exist_ok=True)
+    logger.info("Executing backup command")
+    result = subprocess.run(
+        [
+            "bash",
+            "-c",
+            f"docker exec mysql mysqldump -u root -pdatahub datahub > {resolved_backup_file}",
+        ]
+    )
+    logger.info(
+        f"Backup written to {resolved_backup_file} with status {result.returncode}"
+    )
+    return result.returncode
+
+
+def _restore(
+    restore_primary: bool, restore_indices: bool, primary_restore_file: Optional[str]
+) -> int:
+    if restore_primary:
+        assert primary_restore_file
+        resolved_restore_file = os.path.expanduser(primary_restore_file)
+        logger.info(f"Restoring primary db from backup at {resolved_restore_file}")
+        assert os.path.exists(
+            resolved_restore_file
+        ), f"File {resolved_restore_file} does not exist"
+        with open(resolved_restore_file, "r") as fp:
+            result = subprocess.run(
+                [
+                    "bash",
+                    "-c",
+                    "docker exec -i mysql bash -c 'mysql -uroot -pdatahub datahub '",
+                ],
+                stdin=fp,
+            )
+        if result.returncode != 0:
+            logger.error("Failed to run MySQL restore")
+            return result.returncode
+        else:
+            logger.info("Successfully restored primary backup.")
+
+    if restore_indices:
+        logger.info("Running Index restore command")
+        with tempfile.NamedTemporaryFile() as env_fp:
+            env_fp.write(
+                """
+# Required Environment Variables
+EBEAN_DATASOURCE_USERNAME=datahub
+EBEAN_DATASOURCE_PASSWORD=datahub
+EBEAN_DATASOURCE_HOST=mysql:3306
+EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8
+EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
+ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
+KAFKA_BOOTSTRAP_SERVER=broker:29092
+KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
+ELASTICSEARCH_HOST=elasticsearch
+ELASTICSEARCH_PORT=9200
+#NEO4J_HOST=http://<your-neo-host>:7474
+#NEO4J_URI=bolt://<your-neo-host>
+#NEO4J_USERNAME=neo4j
+#NEO4J_PASSWORD=datahub
+DATAHUB_GMS_HOST=datahub-gms
+DATAHUB_GMS_PORT=8080
+DATAHUB_MAE_CONSUMER_HOST=datahub-gms
+DATAHUB_MAE_CONSUMER_PORT=9091
+# Optional Arguments
+# Uncomment and set these to support SSL connection to Elasticsearch
+# ELASTICSEARCH_USE_SSL=
+# ELASTICSEARCH_SSL_PROTOCOL=
+# ELASTICSEARCH_SSL_SECURE_RANDOM_IMPL=
+# ELASTICSEARCH_SSL_TRUSTSTORE_FILE=
+# ELASTICSEARCH_SSL_TRUSTSTORE_TYPE=
+# ELASTICSEARCH_SSL_TRUSTSTORE_PASSWORD=
+# ELASTICSEARCH_SSL_KEYSTORE_FILE=
+# ELASTICSEARCH_SSL_KEYSTORE_TYPE=
+# ELASTICSEARCH_SSL_KEYSTORE_PASSWORD=
+""".encode(
+                    "utf-8"
+                )
+            )
+            env_fp.flush()
+            # continue to issue the restore indices command
+            result = subprocess.run(
+                [
+                    "bash",
+                    "-c",
+                    f"docker run --network datahub_network --env-file {env_fp.name} acryldata/datahub-upgrade:debug -u RestoreIndices",
+                ],
+                capture_output=True,
+            )
+            logger.info(
+                f"Index restore command finished with status {result.returncode}"
+            )
+            if result.returncode != 0:
+                logger.info(result.stderr)
+            logger.debug(result.stdout)
+            return result.returncode
+
+
 @docker.command()
 @click.option(
     "--version",
@@ -219,6 +328,7 @@ def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
     help="Attempt to build the containers locally before starting",
 )
 @click.option(
+    "-f",
     "--quickstart-compose-file",
     type=click.Path(exists=True, dir_okay=False, readable=True),
     default=[],
@@ -281,6 +391,42 @@ def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
     default=False,
     help="Use this flag to stop the running containers",
 )
+@click.option(
+    "--backup",
+    required=False,
+    is_flag=True,
+    default=False,
+    help="Run this to backup a running quickstart instance",
+)
+@click.option(
+    "--backup-file",
+    required=False,
+    type=click.Path(exists=False, dir_okay=False, readable=True, writable=True),
+    default=os.path.expanduser("~/.datahub/quickstart/backup.sql"),
+    show_default=True,
+    help="Run this to backup data from a running quickstart instance",
+)
+@click.option(
+    "--restore",
+    required=False,
+    is_flag=True,
+    default=False,
+    help="Run this to restore a running quickstart instance from a previous backup (see --backup)",
+)
+@click.option(
+    "--restore-file",
+    required=False,
+    type=str,
+    default="~/.datahub/quickstart/backup.sql",
+    help="Set this to provide a custom restore file",
+)
+@click.option(
+    "--restore-indices",
+    required=False,
+    is_flag=True,
+    default=False,
+    help="Run this to restore the indices of a running quickstart instance. Can be used in conjunction with a --restore or independently",
+)
 @upgrade.check_upgrade
 @telemetry.with_telemetry
 def quickstart(
@@ -295,6 +441,11 @@ def quickstart(
     schema_registry_port: Optional[pydantic.PositiveInt],
     elastic_port: Optional[pydantic.PositiveInt],
     stop: bool,
+    backup: bool,
+    backup_file: str,
+    restore: bool,
+    restore_file: str,
+    restore_indices: bool,
 ) -> None:
     """Start an instance of DataHub locally using docker-compose.
@@ -304,6 +455,18 @@ def quickstart(
     locally, and dump logs to the console or to a file if something goes wrong.
     """
 
+    if backup:
+        _backup(backup_file)
+        return
+    if restore or restore_indices:
+        # execute restore
+        _restore(
+            restore_primary=restore,
+            primary_restore_file=restore_file,
+            restore_indices=restore_indices,
+        )
+        return
+
     running_on_m1 = is_m1()
     if running_on_m1:
         click.secho("Detected M1 machine", fg="yellow")
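Put together, the new options give the quickstart CLI a simple but complete backup/restore story. A usage sketch (flag names and defaults come from the options above; the datahub docker quickstart command path follows from the module being patched):

    # snapshot a running quickstart instance (default: ~/.datahub/quickstart/backup.sql)
    datahub docker quickstart --backup

    # snapshot to a custom path
    datahub docker quickstart --backup --backup-file /tmp/datahub-backup.sql

    # restore the primary store and rebuild the search indices from it
    datahub docker quickstart --restore --restore-indices

    # rebuild only the indices, e.g. after wiping Elasticsearch
    datahub docker quickstart --restore-indices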
