From ce90ce70bb43b1edc795ab6cccc0d2df71aa29c2 Mon Sep 17 00:00:00 2001
From: Vikas Gupta
Date: Tue, 28 Mar 2023 18:00:42 +0530
Subject: [PATCH 1/3] documentation for issue #450

---
 docs/dataSourcesAndSinks/aws-s3.md | 35 ++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 docs/dataSourcesAndSinks/aws-s3.md

diff --git a/docs/dataSourcesAndSinks/aws-s3.md b/docs/dataSourcesAndSinks/aws-s3.md
new file mode 100644
index 000000000..9311dd23d
--- /dev/null
+++ b/docs/dataSourcesAndSinks/aws-s3.md
@@ -0,0 +1,35 @@
+---
+title: AWS S3
+parent: Data Sources and Sinks
+---
+
+# S3
+
+1. set a bucket e.g. zingg28032023 and a folder inside it e.g. zingg
+=> make it publicly accessible
+
+2. create aws access key and export via env vars:
+
+export AWS_ACCESS_KEY_ID=
+export AWS_SECRET_ACCESS_KEY=
+
+(if mfa is enabled AWS_SESSION_TOKEN env var would also be needed )
+
+3. Download hadoop-aws-3.1.0.jar and aws-java-sdk-bundle-1.11.271.jar via maven
+
+4. set above in zingg.conf :
+spark.jars=//hadoop-aws-3.1.0.jar,//aws-java-sdk-bundle-1.11.271.jar
+
+5. Run using:
+
+  ./scripts/zingg.sh --phase findTrainingData --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg
+  ./scripts/zingg.sh --phase label --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg
+  ./scripts/zingg.sh --phase train --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg
+  ./scripts/zingg.sh --phase match --properties-file config/zingg.conf --conf examples/febrl/config.json --zinggDir s3a://zingg28032023/zingg
+
+6. Models etc. would get saved in
+Amazon S3 > Buckets > zingg28032023 >zingg > 100
+
+References:
+
+1. https://spark.apache.org/docs/latest/cloud-integration.html

From 915aff2badea395f09e368ac6ff42bddbecf4b31 Mon Sep 17 00:00:00 2001
From: Vikas Gupta
Date: Tue, 28 Mar 2023 18:09:26 +0530
Subject: [PATCH 2/3] formatting

---
 docs/dataSourcesAndSinks/aws-s3.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/dataSourcesAndSinks/aws-s3.md b/docs/dataSourcesAndSinks/aws-s3.md
index 9311dd23d..bf28c8107 100644
--- a/docs/dataSourcesAndSinks/aws-s3.md
+++ b/docs/dataSourcesAndSinks/aws-s3.md
@@ -5,10 +5,10 @@ parent: Data Sources and Sinks
 
 # S3
 
-1. set a bucket e.g. zingg28032023 and a folder inside it e.g. zingg
+1. Set a bucket e.g. zingg28032023 and a folder inside it e.g. zingg
 => make it publicly accessible
 
-2. create aws access key and export via env vars:
+2. Create aws access key and export via env vars:
 
 export AWS_ACCESS_KEY_ID=
 export AWS_SECRET_ACCESS_KEY=
@@ -17,7 +17,7 @@ export AWS_SECRET_ACCESS_KEY=
 
 3. Download hadoop-aws-3.1.0.jar and aws-java-sdk-bundle-1.11.271.jar via maven
 
-4. set above in zingg.conf :
+4. Set above in zingg.conf :
 spark.jars=//hadoop-aws-3.1.0.jar,//aws-java-sdk-bundle-1.11.271.jar
 
 5. Run using:

From 0b748a2c5a2a3d6fc302db28b4b6f0d23d0b24d9 Mon Sep 17 00:00:00 2001
From: Vikas Gupta
Date: Tue, 28 Mar 2023 23:20:31 +0530
Subject: [PATCH 3/3] removed public access clause

---
 docs/dataSourcesAndSinks/aws-s3.md | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/docs/dataSourcesAndSinks/aws-s3.md b/docs/dataSourcesAndSinks/aws-s3.md
index bf28c8107..54b1d8168 100644
--- a/docs/dataSourcesAndSinks/aws-s3.md
+++ b/docs/dataSourcesAndSinks/aws-s3.md
@@ -6,9 +6,8 @@ parent: Data Sources and Sinks
 # S3
 
 1. Set a bucket e.g. zingg28032023 and a folder inside it e.g. zingg
-=> make it publicly accessible
 
-2. Create aws access key and export via env vars:
+2. Create aws access key and export via env vars (ensure that the user with below keys has read/write access to above):
 
 export AWS_ACCESS_KEY_ID=
 export AWS_SECRET_ACCESS_KEY=
@@ -29,7 +28,3 @@ spark.jars=//hadoop-aws-3.1.0.jar,//aws-java-sdk-bundle-1.11
 
 6. Models etc. would get saved in
 Amazon S3 > Buckets > zingg28032023 >zingg > 100
-
-References:
-
-1. https://spark.apache.org/docs/latest/cloud-integration.html
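Note: instead of exporting credentials as environment variables, the same values can be passed to Spark as properties in zingg.conf. A minimal sketch, assuming the standard hadoop-aws S3A property names (fs.s3a.*); the jar paths and angle-bracket values are placeholders, and this is illustrative rather than part of the patches above:

  spark.jars=/path/to/hadoop-aws-3.1.0.jar,/path/to/aws-java-sdk-bundle-1.11.271.jar
  spark.hadoop.fs.s3a.access.key=<your access key id>
  spark.hadoop.fs.s3a.secret.key=<your secret access key>
  # only needed when MFA / temporary credentials are in use
  spark.hadoop.fs.s3a.session.token=<your session token>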