fix(spark-lineage): Smoke test fix + smoke test m1 support #6372

Merged 2 commits on Nov 5, 2022
@@ -1,4 +1,4 @@
-FROM rappdw/docker-java-python:openjdk1.8.0_171-python3.6.6
+FROM python:3.9

ARG shared_workspace=/opt/workspace

@@ -7,21 +7,32 @@ ENV SHARED_WORKSPACE=${shared_workspace}

# -- Layer: Apache Spark

-ARG spark_version=2.4.8
+ARG spark_version=3.2.0
ARG hadoop_version=2.7

RUN apt-get update -y && \
-    apt-get install -y curl && \
+    apt-get install -y --no-install-recommends curl gnupg software-properties-common && \
+    apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 0xB1998361219BD9C9 && \
+    curl https://cdn.azul.com/zulu/bin/zulu-repo_1.0.0-3_all.deb -o /tmp/zulu-repo_1.0.0-3_all.deb && \
+    apt-get install /tmp/zulu-repo_1.0.0-3_all.deb && \
+    apt-get update && \
+    # apt-cache search zulu && \
+    apt-get install -y --no-install-recommends zulu11-jre && \
+    apt-get clean && \
    curl -sS https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz && \
    tar -xf spark.tgz && \
    mv spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/ && \
    mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs && \
-    rm spark.tgz
+    rm spark.tgz && \
+    rm -rf /var/tmp/* /tmp/* /var/lib/apt/lists/*

+RUN set -e; \
+    pip install JPype1

ENV SPARK_HOME /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}
ENV SPARK_MASTER_HOST spark-master
ENV SPARK_MASTER_PORT 7077
-ENV PYSPARK_PYTHON python2.7
+ENV PYSPARK_PYTHON python3.9
ENV PATH=$PATH:$SPARK_HOME/bin

COPY workspace $SHARED_WORKSPACE
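Note on the M1 angle of this Dockerfile change: the old rappdw/docker-java-python base appears to ship x86_64-only, while python:3.9 and Azul's zulu11-jre both publish arm64 builds, which is what lets this image build and run on Apple-silicon hosts. A quick sanity check of the rebuilt image might look like this (the spark-base tag is an assumption for illustration, not part of the diff):

    # Hypothetical check; the "spark-base" tag is illustrative only.
    docker build -t spark-base .
    docker run --rm spark-base bash -c 'uname -m && java -version && python3 --version'
    # On an M1 host this should print aarch64, a Zulu 11 JRE, and Python 3.9.x.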
@@ -1,4 +1,4 @@
-
+#!/bin/bash -xe
#Remove old configuration
rm -rf workspace

@@ -1,5 +1,4 @@
version: "3.6"

services:
spark-master:
image: spark-master
@@ -76,16 +76,16 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut1/out.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut1/out.csv,PROD)"
    ]
  }
}
]
}
}
}
}
}
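The golden-file churn in this and the following JSON fixtures is one mechanical change: local file:/ paths are now expected under the file data platform rather than hdfs, presumably reflecting how the Spark 3 lineage listener classifies non-HDFS paths. A DataHub dataset URN is a (platform, name, fabric) triple, so only the platform segment moves:

    # Shape of a dataset URN, with the one changed segment marked:
    # urn:li:dataset:(urn:li:dataPlatform:<platform>,<name>,<fabric>)
    # before: urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)
    # after:  urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)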
@@ -62,11 +62,11 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut2/out.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut2/out.csv,PROD)"
    ]
  }
},
@@ -50,8 +50,8 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
      "urn:li:dataset:(urn:li:dataPlatform:hive,JavaHdfsIn2HiveCreateInsertTable.foo4,PROD)"
@@ -114,8 +114,8 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
      "urn:li:dataset:(urn:li:dataPlatform:hive,javahdfsin2hivecreateinserttable.foo4,PROD)"
@@ -179,8 +179,8 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
      "urn:li:dataset:(urn:li:dataPlatform:hive,javahdfsin2hivecreateinserttable.foo4,PROD)"
@@ -76,11 +76,11 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hive,JavaHdfsIn2HiveCreateTable.foo3,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,JavaHdfsIn2HiveCreateTable.foo3,PROD)"
    ]
  }
}
@@ -62,11 +62,11 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hive,JavaHiveInHiveOut.foo5,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,JavaHiveInHiveOut.foo5,PROD)"
    ]
  }
},
@@ -50,11 +50,11 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut1/out.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut1/out.csv,PROD)"
    ]
  }
},
@@ -102,11 +102,11 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut2/out2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut2/out2.csv,PROD)"
    ]
  }
},
@@ -55,8 +55,8 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
      "urn:li:dataset:(urn:li:dataPlatform:hive,PythonHdfsIn2HiveCreateInsertTable.foo4,PROD)"
@@ -123,8 +123,8 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
      "urn:li:dataset:(urn:li:dataPlatform:hive,pythonhdfsin2hivecreateinserttable.foo4,PROD)"
@@ -154,8 +154,8 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
      "urn:li:dataset:(urn:li:dataPlatform:hive,pythonhdfsin2hivecreateinserttable.foo4,PROD)"
@@ -76,8 +76,8 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
      "urn:li:dataset:(urn:li:dataPlatform:hive,PythonHdfsIn2HiveCreateTable.foo3,PROD)"
@@ -164,8 +164,8 @@
{
  "com.linkedin.datajob.DataJobInputOutput": {
    "inputDatasets": [
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-     "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+     "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
    ],
    "outputDatasets": [
      "urn:li:dataset:(urn:li:dataPlatform:hive,PythonHiveInHiveOut.foo5,PROD)"
@@ -7,25 +7,25 @@

saluation () {
    echo "--------------------------------------------------------"
-   echo "Starting exectuion"
+   echo "Starting execution $1"
    echo "--------------------------------------------------------"

}

-saluation
+saluation "HdfsIn2HdfsOut1.py"

spark-submit --properties-file $2 HdfsIn2HdfsOut1.py

-saluation
+saluation "HdfsIn2HdfsOut2.py"
spark-submit --properties-file $2 HdfsIn2HdfsOut2.py

-saluation
+saluation "HdfsIn2HiveCreateTable.py"
spark-submit --properties-file $2 HdfsIn2HiveCreateTable.py

-saluation
+saluation "HdfsIn2HiveCreateInsertTable.py"
spark-submit --properties-file $2 HdfsIn2HiveCreateInsertTable.py

-saluation
+saluation "HiveInHiveOut.py"
spark-submit --properties-file $2 HiveInHiveOut.py


@@ -1,6 +1,9 @@
-#!/bin/bash
+#!/bin/bash -x

set -e

+SMOKE_TEST_ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

pip install -r requirements.txt

echo "--------------------------------------------------------------------"
@@ -25,6 +28,7 @@ echo "--------------------------------------------------------------------"
echo "Bringing up spark cluster"
echo "--------------------------------------------------------------------"

cd "${SMOKE_TEST_ROOT_DIR}"/docker
#bring up spark cluster
docker-compose -f spark-docker-compose.yml up -d

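The added SMOKE_TEST_ROOT_DIR line is the common shell idiom for resolving the directory that contains the script itself, so the cd into docker/ works no matter what the caller's working directory is. A minimal sketch of the same idiom in isolation (the echo is illustrative):

    # Resolve the directory containing this script, independent of the caller's cwd.
    SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
    echo "running from: $SCRIPT_DIR"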
metadata-integration/java/spark-lineage/spark-smoke-test/smoke.sh (28 changes: 21 additions, 7 deletions)
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -x

set -e
# Script assumptions:
@@ -7,6 +7,24 @@ set -e
# - pytest is installed
# - requests is installed

+is_healthy() {
+    local service="$1"
+    local -r -i max_attempts="$2"; shift
+    local -i attempt_num=1
+
+    until [ -n "$(docker ps -f name="$service" -f "health=healthy"|tail -n +2)" ]
+    do
+        if (( attempt_num == max_attempts ))
+        then
+            echo "Attempt $attempt_num failed and there are no more attempts left!"
+            return 1
+        else
+            echo "Attempt $attempt_num failed! Trying again in $attempt_num seconds..."
+            sleep $(( attempt_num++ ))
+        fi
+    done
+}

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd "$DIR"

@@ -22,12 +40,8 @@ echo "--------------------------------------------------------------------"

pwd ../../../

-datahub docker quickstart \
-    --build-locally \
-    --quickstart-compose-file ../../../../docker/docker-compose.yml \
-    --quickstart-compose-file ../../../../docker/docker-compose.override.yml \
-    --quickstart-compose-file ../../../../docker/docker-compose.dev.yml \
-    --dump-logs-on-failure
+../../../../docker/dev.sh -d
+is_healthy "datahub-gms" 60

echo "--------------------------------------------------------------------"
echo "Setup environment for pytest"
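Two behavioral changes land in smoke.sh: DataHub is now brought up with the repo's dev compose script instead of datahub docker quickstart, and readiness is detected by polling Docker's health status. The is_healthy helper retries with a linearly growing sleep (1s, 2s, 3s, ...), so is_healthy "datahub-gms" 60 waits up to roughly half an hour of cumulative backoff before giving up; combined with set -e, a timeout aborts the whole smoke test. A hedged sketch of reusing it standalone (container name and attempt count are illustrative):

    # Illustrative reuse of the helper defined above.
    is_healthy "datahub-gms" 60 || { echo "datahub-gms never became healthy"; exit 1; }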
@@ -1,24 +1,24 @@
saluation () {
    echo "--------------------------------------------------------"
-   echo "Starting exectuion"
+   echo "Starting execution $1"
    echo "--------------------------------------------------------"

}


-saluation
+saluation "test.spark.lineage.HdfsIn2HdfsOut1"
$1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HdfsOut1 build/libs/test-spark-lineage.jar

-saluation
+saluation "test.spark.lineage.HdfsIn2HdfsOut2"
$1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HdfsOut2 build/libs/test-spark-lineage.jar

-saluation
+saluation "test.spark.lineage.HdfsIn2HiveCreateTable"
$1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HiveCreateTable build/libs/test-spark-lineage.jar

-saluation
+saluation "test.spark.lineage.HdfsIn2HiveCreateInsertTable"
$1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HiveCreateInsertTable build/libs/test-spark-lineage.jar

-saluation
+saluation "test.spark.lineage.HiveInHiveOut"
$1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HiveInHiveOut build/libs/test-spark-lineage.jar
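As before, this runner expects the Spark distribution directory as $1 and a spark-submit properties file as $2; the change just threads each class name into the log banner via the function's first argument. A hypothetical invocation (script name and paths are assumptions, not from the diff):

    # Hypothetical invocation; script name and paths are illustrative only.
    ./java_test_run.sh /usr/bin/spark-3.2.0-bin-hadoop2.7 spark.properties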

