fix(spark-lineage): smoke test fixes, M1 support (datahub-project#6372)

treff7es authored and cccs-Dustin committed Feb 1, 2023
1 parent 101ba7f commit 7264555

Showing 18 changed files with 102 additions and 63 deletions.

[File: Dockerfile for the smoke-test Spark image]

@@ -1,4 +1,4 @@
-FROM rappdw/docker-java-python:openjdk1.8.0_171-python3.6.6
+FROM python:3.9
 
 ARG shared_workspace=/opt/workspace
 
@@ -7,21 +7,32 @@ ENV SHARED_WORKSPACE=${shared_workspace}
 
 # -- Layer: Apache Spark
 
-ARG spark_version=2.4.8
+ARG spark_version=3.2.0
 ARG hadoop_version=2.7
 
 RUN apt-get update -y && \
-    apt-get install -y curl && \
+    apt-get install -y --no-install-recommends curl gnupg software-properties-common && \
+    apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 0xB1998361219BD9C9 && \
+    curl https://cdn.azul.com/zulu/bin/zulu-repo_1.0.0-3_all.deb -o /tmp/zulu-repo_1.0.0-3_all.deb && \
+    apt-get install /tmp/zulu-repo_1.0.0-3_all.deb && \
+    apt-get update && \
+    # apt-cache search zulu && \
+    apt-get install -y --no-install-recommends zulu11-jre && \
+    apt-get clean && \
     curl -sS https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz && \
     tar -xf spark.tgz && \
     mv spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/ && \
    mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs && \
-    rm spark.tgz
+    rm spark.tgz && \
+    rm -rf /var/tmp/* /tmp/* /var/lib/apt/lists/*
 
+RUN set -e; \
+    pip install JPype1
+
 ENV SPARK_HOME /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}
 ENV SPARK_MASTER_HOST spark-master
 ENV SPARK_MASTER_PORT 7077
-ENV PYSPARK_PYTHON python2.7
+ENV PYSPARK_PYTHON python3.9
 ENV PATH=$PATH:$SPARK_HOME/bin
 
 COPY workspace $SHARED_WORKSPACE
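
The base-image swap above is the heart of the M1 fix: python:3.9 and Azul's zulu11-jre package are both published for arm64 as well as amd64, unlike the old rappdw/docker-java-python base, so the image now builds natively on Apple-silicon hosts. A quick local sanity check might look like this (the image tag is an assumption for illustration, not part of the commit):

# Hypothetical build-and-check of the smoke-test Spark image.
docker build -t spark-smoke-base .
docker run --rm spark-smoke-base java -version      # expect a Zulu 11 JRE
docker run --rm spark-smoke-base python3 --version  # expect Python 3.9.x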

[File: script that resets the test workspace]

@@ -1,4 +1,4 @@
-
+#!/bin/bash -xe
 #Remove old configuration
 rm -rf workspace
 

[File: spark-docker-compose.yml]

@@ -1,5 +1,4 @@
-version: "3.6"
 
 services:
   spark-master:
     image: spark-master
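
Dropping the top-level version key is harmless here: Compose V2, the default on Apple-silicon Docker Desktop, treats the field as informational and ignores it. A quick way to confirm the file still parses (assumed invocation, run from the docker directory):

# Validate the compose file after the change.
docker-compose -f spark-docker-compose.yml config >/dev/null && echo OK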

[File: expected lineage JSON, JavaHdfsIn2HdfsOut1]

@@ -76,16 +76,16 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut1/out.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut1/out.csv,PROD)"
 ]
 }
 }
 ]
 }
 }
 }
 }
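
Every golden-file change in this commit follows the same pattern, here and in the files below: datasets read or written through local file:/ paths are now expected under the "file" data platform instead of "hdfs". For reference, a DataHub dataset URN has three parts, the platform URN, the dataset name, and the environment; only the platform id changes in these diffs:

# Shape of the URNs being updated (a sketch, not part of the commit):
#   urn:li:dataset:(urn:li:dataPlatform:<platform>,<name>,<env>)
echo 'urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)'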

[File: expected lineage JSON, JavaHdfsIn2HdfsOut2]

@@ -62,11 +62,11 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut2/out.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut2/out.csv,PROD)"
 ]
 }
 },

[File: expected lineage JSON, JavaHdfsIn2HiveCreateInsertTable]

@@ -50,8 +50,8 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
 "urn:li:dataset:(urn:li:dataPlatform:hive,JavaHdfsIn2HiveCreateInsertTable.foo4,PROD)"
@@ -114,8 +114,8 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
 "urn:li:dataset:(urn:li:dataPlatform:hive,javahdfsin2hivecreateinserttable.foo4,PROD)"
@@ -179,8 +179,8 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
 "urn:li:dataset:(urn:li:dataPlatform:hive,javahdfsin2hivecreateinserttable.foo4,PROD)"

[File: expected lineage JSON, JavaHdfsIn2HiveCreateTable]

@@ -76,11 +76,11 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hive,JavaHdfsIn2HiveCreateTable.foo3,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,JavaHdfsIn2HiveCreateTable.foo3,PROD)"
 ]
 }
 }

[File: expected lineage JSON, JavaHiveInHiveOut]

@@ -62,11 +62,11 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hive,JavaHiveInHiveOut.foo5,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,JavaHiveInHiveOut.foo5,PROD)"
 ]
 }
 },

[File: expected lineage JSON, PythonHdfsIn2HdfsOut1]

@@ -50,11 +50,11 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut1/out.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut1/out.csv,PROD)"
 ]
 }
 },

[File: expected lineage JSON, PythonHdfsIn2HdfsOut2]

@@ -102,11 +102,11 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut2/out2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut2/out2.csv,PROD)"
 ]
 }
 },

[File: expected lineage JSON, PythonHdfsIn2HiveCreateInsertTable]

@@ -55,8 +55,8 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
 "urn:li:dataset:(urn:li:dataPlatform:hive,PythonHdfsIn2HiveCreateInsertTable.foo4,PROD)"
@@ -123,8 +123,8 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
 "urn:li:dataset:(urn:li:dataPlatform:hive,pythonhdfsin2hivecreateinserttable.foo4,PROD)"
@@ -154,8 +154,8 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
 "urn:li:dataset:(urn:li:dataPlatform:hive,pythonhdfsin2hivecreateinserttable.foo4,PROD)"

[File: expected lineage JSON, PythonHdfsIn2HiveCreateTable]

@@ -76,8 +76,8 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
 "urn:li:dataset:(urn:li:dataPlatform:hive,PythonHdfsIn2HiveCreateTable.foo3,PROD)"

[File: expected lineage JSON, PythonHiveInHiveOut]

@@ -164,8 +164,8 @@
 {
 "com.linkedin.datajob.DataJobInputOutput": {
 "inputDatasets": [
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
 ],
 "outputDatasets": [
 "urn:li:dataset:(urn:li:dataPlatform:hive,PythonHiveInHiveOut.foo5,PROD)"

[File: shell runner for the Python test jobs]

@@ -7,25 +7,25 @@
 
 saluation () {
     echo "--------------------------------------------------------"
-    echo "Starting exectuion"
+    echo "Starting execution $1"
     echo "--------------------------------------------------------"
 
 }
 
-saluation
+saluation "HdfsIn2HdfsOut1.py"
 
 spark-submit --properties-file $2 HdfsIn2HdfsOut1.py
 
-saluation
+saluation "HdfsIn2HdfsOut2.py"
 spark-submit --properties-file $2 HdfsIn2HdfsOut2.py
 
-saluation
+saluation "HdfsIn2HiveCreateTable.py"
 spark-submit --properties-file $2 HdfsIn2HiveCreateTable.py
 
-saluation
+saluation "HdfsIn2HiveCreateInsertTable.py"
 spark-submit --properties-file $2 HdfsIn2HiveCreateInsertTable.py
 
-saluation
+saluation "HiveInHiveOut.py"
 spark-submit --properties-file $2 HiveInHiveOut.py
 
 
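
A hypothetical invocation of the Python runner above (the script name and first argument are assumptions; the visible hunk only uses "$2", the properties file that wires the DataHub lineage listener into each spark-submit run, and the helper's name, saluation, is spelled that way in the source):

# Illustrative only; the first argument is unused in the shown hunk.
./python_test_run.sh "" spark.properties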

[File: smoke-test setup script]

@@ -1,6 +1,9 @@
-#!/bin/bash
+#!/bin/bash -x
 
 set -e
+
+SMOKE_TEST_ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
 pip install -r requirements.txt
 
 echo "--------------------------------------------------------------------"
@@ -25,6 +28,7 @@ echo "--------------------------------------------------------------------"
 echo "Bringing up spark cluster"
 echo "--------------------------------------------------------------------"
 
+cd "${SMOKE_TEST_ROOT_DIR}"/docker
 #bring up spark cluster
 docker-compose -f spark-docker-compose.yml up -d
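
The added SMOKE_TEST_ROOT_DIR line uses the standard BASH_SOURCE idiom: it resolves the directory containing the script itself, so the cd into docker/ works no matter which working directory the script is launched from. A minimal sketch of the idiom (paths illustrative):

# Resolve the directory of the running script, then move relative to it.
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd "${ROOT}/docker" && docker-compose -f spark-docker-compose.yml up -d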

[File: metadata-integration/java/spark-lineage/spark-smoke-test/smoke.sh (21 additions, 7 deletions)]

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -x
 
 set -e
 # Script assumptions:
@@ -7,6 +7,24 @@ set -e
 # - pytest is installed
 # - requests is installed
 
+is_healthy() {
+    local service="$1"
+    local -r -i max_attempts="$2"; shift
+    local -i attempt_num=1
+
+    until [ -n "$(docker ps -f name="$service" -f "health=healthy"|tail -n +2)" ]
+    do
+        if (( attempt_num == max_attempts ))
+        then
+            echo "Attempt $attempt_num failed and there are no more attempts left!"
+            return 1
+        else
+            echo "Attempt $attempt_num failed! Trying again in $attempt_num seconds..."
+            sleep $(( attempt_num++ ))
+        fi
+    done
+}
+
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 cd "$DIR"
 
@@ -22,12 +40,8 @@ echo "--------------------------------------------------------------------"
 
 pwd ../../../
 
-datahub docker quickstart \
-    --build-locally \
-    --quickstart-compose-file ../../../../docker/docker-compose.yml \
-    --quickstart-compose-file ../../../../docker/docker-compose.override.yml \
-    --quickstart-compose-file ../../../../docker/docker-compose.dev.yml \
-    --dump-logs-on-failure
+../../../../docker/dev.sh -d
+is_healthy "datahub-gms" 60
 
 echo "--------------------------------------------------------------------"
 echo "Setup environment for pytest"
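
The new is_healthy helper replaces the datahub quickstart call with a plain poll of docker ps: it filters for a container whose health check reports healthy (tail -n +2 strips the header row, so the output is non-empty only on a match) and sleeps one second longer after each failed attempt. With is_healthy "datahub-gms" 60 the worst-case wait before giving up is the arithmetic series 1+2+...+59 seconds:

# Worst-case total wait for max_attempts=60 (it sleeps after attempts 1..59).
echo $(( 59 * 60 / 2 ))   # 1770 seconds, roughly 30 minutes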

[File: shell runner for the Java test jobs]

@@ -1,24 +1,24 @@
 saluation () {
     echo "--------------------------------------------------------"
-    echo "Starting exectuion"
+    echo "Starting execution $1"
     echo "--------------------------------------------------------"
 
 }
 
 
-saluation
+saluation "test.spark.lineage.HdfsIn2HdfsOut1"
 $1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HdfsOut1 build/libs/test-spark-lineage.jar
 
-saluation
+saluation "test.spark.lineage.HdfsIn2HdfsOut2"
 $1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HdfsOut2 build/libs/test-spark-lineage.jar
 
-saluation
+saluation "test.spark.lineage.HdfsIn2HiveCreateTable"
 $1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HiveCreateTable build/libs/test-spark-lineage.jar
 
-saluation
+saluation "test.spark.lineage.HdfsIn2HiveCreateInsertTable"
 $1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HiveCreateInsertTable build/libs/test-spark-lineage.jar
 
-saluation
+saluation "test.spark.lineage.HiveInHiveOut"
 $1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HiveInHiveOut build/libs/test-spark-lineage.jar
 
 
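
As with the Python runner, a hypothetical invocation with argument positions taken from the visible hunk: "$1" is the Spark distribution directory and "$2" the properties file for the lineage agent (the script name is assumed):

# Illustrative only; the Spark path matches the Dockerfile's SPARK_HOME.
./java_test_run.sh /usr/bin/spark-3.2.0-bin-hadoop2.7 spark.properties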