From 50639e1d8a97962001be5596c2c18b6fd6a52c33 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Thu, 14 Mar 2024 15:41:06 +0100
Subject: [PATCH 001/103] attempt docker-registry

---
 dataplatform/.env                             |  2 ++
 .../docker-compose-registry.yaml              | 30 +++++++++++++++++++
 2 files changed, 32 insertions(+)
 create mode 100644 dataplatform/multiple_stacks/docker-compose-registry.yaml

diff --git a/dataplatform/.env b/dataplatform/.env
index 6070d3c..c75c2d6 100644
--- a/dataplatform/.env
+++ b/dataplatform/.env
@@ -40,3 +40,5 @@ RESOURCEMANAGERPORT=8088
 YARNHISTSERVERPORT=19888
 
 PORTAINERPORT=9443
+
+DOCKERREGISTRYPORT=5000
diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
new file mode 100644
index 0000000..40da726
--- /dev/null
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -0,0 +1,30 @@
+version: '3'
+
+services:
+  registry:
+    image: registry:2.8.3
+    networks:
+      - BIG-dataplatform-network
+    ports:
+      - "${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}"
+    environment:
+      REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
+    volumes:
+      - docker_registry_data:/data
+    deploy:
+      placement:
+        constraints:
+          - node.hostname != CB-Mass-Node1
+          - node.role == worker
+
+volumes:
+  docker_registry_data:
+    driver_opts:
+      type: nfs
+      o: addr=${NFSADDRESS},rw,nfsvers=4
+      device: :${NFSPATH}/dataplatform_config/docker_registry
+
+networks:
+  BIG-dataplatform-network:
+    external: true
+    name: BIG-dataplatform-network
\ No newline at end of file

From 0ea591f4fc7d7234b7b43e7970c84beff4c41c66 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Thu, 14 Mar 2024 17:25:41 +0100
Subject: [PATCH 002/103] attempt docker-registry

---
 dataplatform/multiple_stacks/docker-compose-registry.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
index 40da726..622bc0f 100644
--- a/dataplatform/multiple_stacks/docker-compose-registry.yaml
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -1,4 +1,4 @@
-version: '3'
+version: '3.9'
 
 services:
   registry:

From 4b9326c57072b3e410b4ebf90d005ac8f11aa1ea Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Thu, 14 Mar 2024 17:27:59 +0100
Subject: [PATCH 003/103] attempt docker-registry

---
 dataplatform/multiple_stacks/docker-compose-registry.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
index 622bc0f..ca9ad5e 100644
--- a/dataplatform/multiple_stacks/docker-compose-registry.yaml
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -6,7 +6,7 @@ services:
     networks:
       - BIG-dataplatform-network
     ports:
-      - "${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}"
+      - ${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}
     environment:
       REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
     volumes:

From 2800e1b6acf55f25bef25217c38244258f57f876 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Thu, 14 Mar 2024 17:36:53 +0100
Subject: [PATCH 004/103] attempt docker-registry

---
 dataplatform/multiple_stacks/docker-compose-registry.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
index ca9ad5e..37ba7f9 100644
--- a/dataplatform/multiple_stacks/docker-compose-registry.yaml
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -9,6 +9,7 @@ services:
       - ${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}
     environment:
       REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
+      REGISTRY_HTTP_ADDR: 0.0.0.0:${DOCKERREGISTRYPORT}
     volumes:
       - docker_registry_data:/data
     deploy:

From 7f8029ea731a3ef753e91c3dc2cd03b062ea7891 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Thu, 14 Mar 2024 17:37:38 +0100
Subject: [PATCH 005/103] attempt docker-registry

---
 dataplatform/multiple_stacks/docker-compose-registry.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
index 37ba7f9..3033a89 100644
--- a/dataplatform/multiple_stacks/docker-compose-registry.yaml
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -9,7 +9,7 @@ services:
       - ${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}
     environment:
       REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
-      REGISTRY_HTTP_ADDR: 0.0.0.0:${DOCKERREGISTRYPORT}
+      REGISTRY_HTTP_ADDR: '0.0.0.0:${DOCKERREGISTRYPORT}'
     volumes:
       - docker_registry_data:/data
     deploy:

From 12f66435bff18a724f8c18e616a44e7b3a2330de Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Thu, 14 Mar 2024 18:00:17 +0100
Subject: [PATCH 006/103] attempt docker-registry

---
 dataplatform/multiple_stacks/docker-compose-registry.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
index 3033a89..37ba7f9 100644
--- a/dataplatform/multiple_stacks/docker-compose-registry.yaml
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -9,7 +9,7 @@ services:
       - ${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}
     environment:
       REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
-      REGISTRY_HTTP_ADDR: '0.0.0.0:${DOCKERREGISTRYPORT}'
+      REGISTRY_HTTP_ADDR: 0.0.0.0:${DOCKERREGISTRYPORT}
     volumes:
       - docker_registry_data:/data
     deploy:

From 115f8f91b6fece5501c2e6b31e20fc6e26908a19 Mon Sep 17 00:00:00 2001
From: manuelepasini
Date: Fri, 15 Mar 2024 10:39:22 +0000
Subject: [PATCH 007/103] change docker registry compose

---
 dataplatform/multiple_stacks/docker-compose-registry.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
index 37ba7f9..29f3d79 100644
--- a/dataplatform/multiple_stacks/docker-compose-registry.yaml
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -9,7 +9,6 @@ services:
       - ${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}
     environment:
       REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
-      REGISTRY_HTTP_ADDR: 0.0.0.0:${DOCKERREGISTRYPORT}
     volumes:
       - docker_registry_data:/data
     deploy:
@@ -28,4 +27,4 @@ volumes:
 networks:
   BIG-dataplatform-network:
     external: true
-    name: BIG-dataplatform-network
\ No newline at end of file
+    name: BIG-dataplatform-network
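After patch 7 the registry is a plain `registry:2.8.3` service publishing `${DOCKERREGISTRYPORT}` on the swarm. A quick way to confirm it came up and is serving the Registry v2 API is to hit the catalog endpoint from a node; a minimal sketch, assuming the stack was deployed under the name `registry` and the port value from `.env` (5000):

```
# Deploy the stack and check that the v2 API answers (stack/service names are assumptions)
docker stack deploy -c docker-compose-registry.yaml registry
docker service ps registry_registry           # task placement should satisfy the constraints
curl -s http://127.0.0.1:5000/v2/_catalog     # a fresh registry returns {"repositories":[]}
```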
From e2a0d4df2b4bff6f10b180113b02711996e88195 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Fri, 15 Mar 2024 11:45:43 +0100
Subject: [PATCH 008/103] update ports for expose docker registry

---
 dataplatform/.env                                         | 2 +-
 dataplatform/multiple_stacks/docker-compose-registry.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataplatform/.env b/dataplatform/.env
index c75c2d6..28afa35 100644
--- a/dataplatform/.env
+++ b/dataplatform/.env
@@ -41,4 +41,4 @@ YARNHISTSERVERPORT=19888
 
 PORTAINERPORT=9443
 
-DOCKERREGISTRYPORT=5000
+DOCKERREGISTRYPORT=40123
diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
index 29f3d79..a0c8b60 100644
--- a/dataplatform/multiple_stacks/docker-compose-registry.yaml
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -6,7 +6,7 @@ services:
     networks:
       - BIG-dataplatform-network
     ports:
-      - ${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}
+      - 5000:${DOCKERREGISTRYPORT}
     environment:
       REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
     volumes:

From 702a6c1c1d18947fe720d114269441e2a94d182b Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Fri, 15 Mar 2024 11:49:07 +0100
Subject: [PATCH 009/103] upd

---
 dataplatform/.env                                         | 2 +-
 dataplatform/multiple_stacks/docker-compose-registry.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataplatform/.env b/dataplatform/.env
index 28afa35..c75c2d6 100644
--- a/dataplatform/.env
+++ b/dataplatform/.env
@@ -41,4 +41,4 @@ YARNHISTSERVERPORT=19888
 
 PORTAINERPORT=9443
 
-DOCKERREGISTRYPORT=40123
+DOCKERREGISTRYPORT=5000
diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
index a0c8b60..29f3d79 100644
--- a/dataplatform/multiple_stacks/docker-compose-registry.yaml
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -6,7 +6,7 @@ services:
     networks:
       - BIG-dataplatform-network
     ports:
-      - 5000:${DOCKERREGISTRYPORT}
+      - ${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}
     environment:
       REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
     volumes:

From 11dbeec0911435bf5c3ace258a7f69a0f9599109 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Fri, 15 Mar 2024 11:51:41 +0100
Subject: [PATCH 010/103] upd

---
 dataplatform/multiple_stacks/docker-compose-registry.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
index 29f3d79..63185cb 100644
--- a/dataplatform/multiple_stacks/docker-compose-registry.yaml
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -9,6 +9,7 @@ services:
       - ${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}
     environment:
       REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
+      REGISTRY_HTTP_ADDR: 0.0.0.0:5000
     volumes:
       - docker_registry_data:/data
     deploy:

From ff614ddcaf354702d95e8fcf5d69152669a62349 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Fri, 15 Mar 2024 11:51:59 +0100
Subject: [PATCH 011/103] upd

---
 dataplatform/multiple_stacks/docker-compose-registry.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/docker-compose-registry.yaml
index 63185cb..ca36bbe 100644
--- a/dataplatform/multiple_stacks/docker-compose-registry.yaml
+++ b/dataplatform/multiple_stacks/docker-compose-registry.yaml
@@ -9,7 +9,7 @@ services:
       - ${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}
     environment:
       REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
-      REGISTRY_HTTP_ADDR: 0.0.0.0:5000
+      #REGISTRY_HTTP_ADDR: 0.0.0.0:5000
     volumes:
       - docker_registry_data:/data
     deploy:

From cc65ad28949125c73344a5768255da4294866d50 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Fri, 15 Mar 2024 15:56:47 +0100
Subject: [PATCH 012/103] align with repo

---
 .../{docker-compose-registry.yaml => registry.yaml}       | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename dataplatform/multiple_stacks/{docker-compose-registry.yaml => registry.yaml} (100%)

diff --git a/dataplatform/multiple_stacks/docker-compose-registry.yaml b/dataplatform/multiple_stacks/registry.yaml
similarity index 100%
rename from dataplatform/multiple_stacks/docker-compose-registry.yaml
rename to dataplatform/multiple_stacks/registry.yaml

From 9f5cc9469920072370549020228c92aee827da21 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Fri, 15 Mar 2024 16:22:05 +0100
Subject: [PATCH 013/103] change volume folder

---
 dataplatform/multiple_stacks/registry.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/registry.yaml b/dataplatform/multiple_stacks/registry.yaml
index ca36bbe..9cbb72a 100644
--- a/dataplatform/multiple_stacks/registry.yaml
+++ b/dataplatform/multiple_stacks/registry.yaml
@@ -23,7 +23,7 @@ volumes:
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4
-      device: :${NFSPATH}/dataplatform_config/docker_registry
+      device: :${NFSPATH}/dataplatform_config/docker_registry/data
 
 networks:
   BIG-dataplatform-network:

From 2da971c21a799af9d892d837f886f52b0948096b Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Fri, 15 Mar 2024 17:30:05 +0100
Subject: [PATCH 014/103] change folder

---
 dataplatform/multiple_stacks/registry.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/registry.yaml b/dataplatform/multiple_stacks/registry.yaml
index 9cbb72a..ca36bbe 100644
--- a/dataplatform/multiple_stacks/registry.yaml
+++ b/dataplatform/multiple_stacks/registry.yaml
@@ -23,7 +23,7 @@ volumes:
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4
-      device: :${NFSPATH}/dataplatform_config/docker_registry/data
+      device: :${NFSPATH}/dataplatform_config/docker_registry
 
 networks:
   BIG-dataplatform-network:

From 7bce64e7241193499d85c9092764df76a48c18b2 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Fri, 15 Mar 2024 17:37:37 +0100
Subject: [PATCH 015/103] add comment

---
 dataplatform/multiple_stacks/registry.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dataplatform/multiple_stacks/registry.yaml b/dataplatform/multiple_stacks/registry.yaml
index ca36bbe..2d57896 100644
--- a/dataplatform/multiple_stacks/registry.yaml
+++ b/dataplatform/multiple_stacks/registry.yaml
@@ -1,6 +1,6 @@
 version: '3.9'
 
-services:
+services: #available services at 127.0.0.0:5000
   registry:
     image: registry:2.8.3
     networks:
@@ -9,7 +9,6 @@ services:
       - ${DOCKERREGISTRYPORT}:${DOCKERREGISTRYPORT}
     environment:
       REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY: /data
-      #REGISTRY_HTTP_ADDR: 0.0.0.0:5000
     volumes:
       - docker_registry_data:/data
     deploy:

From 306bca20976d632fc14285a125e4c2a66982d36a Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Mon, 18 Mar 2024 12:27:46 +0100
Subject: [PATCH 016/103] update docker file

---
 airflow_dags/docker_with_code/Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/airflow_dags/docker_with_code/Dockerfile b/airflow_dags/docker_with_code/Dockerfile
index 7d06b8f..d18916f 100644
--- a/airflow_dags/docker_with_code/Dockerfile
+++ b/airflow_dags/docker_with_code/Dockerfile
@@ -1,5 +1,7 @@
 FROM python:3.8
 
+ARG OPTION_1
+ARG OPTION_2
 # Set working directory
 WORKDIR /app
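Patch 16 declares two build arguments in the DAG image. `ARG` values exist only at build time (they are not visible at run time unless copied into an `ENV`), so they have to be supplied on the build command line; a sketch, with argument values and image tag purely illustrative:

```
# OPTION_1/OPTION_2 are declared with ARG, so pass them via --build-arg
docker build --build-arg OPTION_1=foo --build-arg OPTION_2=bar \
  -t my_image airflow_dags/docker_with_code
```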
From 7eee9c52561138bc62b4c26c759af91acfbffb32 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Mon, 18 Mar 2024 17:44:13 +0100
Subject: [PATCH 017/103] update readme and geoserver docker file

---
 airflow_dags/dockerpyscript/README.md       |  4 ++++
 dataplatform/multiple_stacks/geoserver.yaml | 12 +++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/airflow_dags/dockerpyscript/README.md b/airflow_dags/dockerpyscript/README.md
index 050b470..025c022 100644
--- a/airflow_dags/dockerpyscript/README.md
+++ b/airflow_dags/dockerpyscript/README.md
@@ -57,6 +57,10 @@ dag = my_dag()
 
 ```
 
+# Use local docker registry at 127.0.0.0:5000
+- remove from docker registry:
+  - registry garbage-collect -m /etc/docker/registry/config.yml
+
 # Use docker images on docker hub
 docker build -t my_image .
 docker tag my_image chiaraforresi/test:v0.0.1
diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index 2d0f8ac..1d916d1 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -3,6 +3,8 @@ version: '3.9'
 services:
   geoserver:
     image: docker.osgeo.org/geoserver:2.25.x
+    environment:
+      SKIP_DEMO_DATA: true
     networks:
       - BIG-dataplatform-network
     ports:
@@ -27,11 +29,11 @@ volumes:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4
       device: :${NFSPATH}/dataplatform_config/geoserver/data_dir
-  geoserver_conf:
-    driver_opts:
-      type: nfs
-      o: addr=${NFSADDRESS},rw,nfsvers=4
-      device: :${NFSPATH}/dataplatform_config/geoserver_conf/
+  #geoserver_conf:
+  #  driver_opts:
+  #    type: nfs
+  #    o: addr=${NFSADDRESS},rw,nfsvers=4
+  #    device: :${NFSPATH}/dataplatform_config/geoserver_conf/
 
 networks:
   BIG-dataplatform-network:

From 3423b6fac19ed9e1e6a0a60df24d89a65f1356fb Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Mon, 18 Mar 2024 17:44:58 +0100
Subject: [PATCH 018/103] update readme and geoserver docker file

---
 dataplatform/multiple_stacks/geoserver.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index 1d916d1..1f9b5f5 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -4,7 +4,7 @@ services:
   geoserver:
     image: docker.osgeo.org/geoserver:2.25.x
     environment:
-      SKIP_DEMO_DATA: true
+      SKIP_DEMO_DATA: "true"
     networks:
       - BIG-dataplatform-network
     ports:

From 6da610eb028ac9f6337c0b88d9e4572339b42ed8 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Mon, 18 Mar 2024 17:52:30 +0100
Subject: [PATCH 019/103] fix volumes

---
 dataplatform/multiple_stacks/geoserver.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index 1f9b5f5..d991ce2 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -10,8 +10,8 @@ services:
     ports:
       - 81:8080
     volumes:
-      - geoserver_data:/opt/geoserver_data
-      - geoserver_data_dir:/home/geoserver/data
+      - geoserver_data_dir:/opt/geoserver_data
+      - geoserver_data:/home/geoserver/data
     deploy:
       placement:
         constraints:

From d2ac997d40ddda70ac0711f6070e9b609b3aabcf Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Mon, 18 Mar 2024 18:04:02 +0100
Subject: [PATCH 020/103] fix port

---
 dataplatform/multiple_stacks/geoserver.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index d991ce2..201671c 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -8,7 +8,7 @@ services:
     networks:
       - BIG-dataplatform-network
     ports:
-      - 81:8080
+      - 40111:8080
     volumes:
       - geoserver_data_dir:/opt/geoserver_data
      - geoserver_data:/home/geoserver/data
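Patches 17-20 straighten out the two GeoServer mounts (data dir vs. published data) and move the published port to 40111. A smoke test for the web application from any machine that can reach a swarm node; the node address is an assumption:

```
# GeoServer's web UI is served under /geoserver; a 200 or 302 here means the container is up
curl -I http://$NODE:40111/geoserver/web/
```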
From a68162b6a9ffcd039149d00b91894cdb8618c1d7 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 10:43:17 +0100
Subject: [PATCH 021/103] refactor

---
 dataplatform/.env                           | 4 ++++
 dataplatform/multiple_stacks/geoserver.yaml | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/dataplatform/.env b/dataplatform/.env
index 1d6c54a..7b3b5c6 100644
--- a/dataplatform/.env
+++ b/dataplatform/.env
@@ -83,4 +83,8 @@ MOSQUITTO_PORT_EXT_TLS=48883
 MOSQUITTO_USER=foo
 MOSQUITTO_PWD=bar
 
+#DOCKER REGISTRY
 DOCKERREGISTRYPORT=5000
+
+#GEOSERVER
+GEOSERVER_PORT=40111
\ No newline at end of file
diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index 201671c..cc625f6 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -8,7 +8,7 @@ services:
     networks:
       - BIG-dataplatform-network
     ports:
-      - 40111:8080
+      - ${GEOSERVER_PORT}:8080
     volumes:
       - geoserver_data_dir:/opt/geoserver_data
       - geoserver_data:/home/geoserver/data

From 6f37cd7098536ddc678e5508fa66da8ddbb108b3 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 10:55:53 +0100
Subject: [PATCH 022/103] geoserver with hdfs client

---
 dataplatform/multiple_stacks/geoserver.yaml | 27 ++++++++++++++++++---
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index cc625f6..6c7cd42 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -1,6 +1,25 @@
 version: '3.9'
 
 services:
+  hdfs-client:
+    image: apache/hadoop:${HADOOPVERSION}
+    volumes:
+      - hadoop_config:${HADOOPCONFDIR}
+    command:
+      - /bin/bash
+      - -c
+      - |
+        sleep 20
+        hdfs dfs -get /geoserver-data .
+        scp -r data/* geoserver:/home/geoserver/data
+    deploy:
+      placement:
+        constraints:
+          - node.role != manager
+          - node.hostname != CB-Mass-Node1
+    networks:
+      - BIG-dataplatform-network
+
   geoserver:
     image: docker.osgeo.org/geoserver:2.25.x
     environment:
@@ -11,7 +30,6 @@ services:
       - ${GEOSERVER_PORT}:8080
     volumes:
       - geoserver_data_dir:/opt/geoserver_data
-      - geoserver_data:/home/geoserver/data
     deploy:
       placement:
         constraints:
@@ -19,11 +37,12 @@ volumes:
-  geoserver_data:
+  hadoop_config:
+    driver: local
     driver_opts:
       type: nfs
-      o: addr=${NFSADDRESS},rw,nfsvers=4
-      device: :${NFSPATH}/dataplatform_config/geoserver/data
+      o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
+      device: ":${NFSPATH}/dataplatform_config/hadoop_conf/"
   geoserver_data_dir:
     driver_opts:
       type: nfs

From da2d049ff3e6a63652e310f6d826fa9f8d1f088b Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 10:58:15 +0100
Subject: [PATCH 023/103] fix

---
 dataplatform/multiple_stacks/geoserver.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index 6c7cd42..f756688 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -11,7 +11,7 @@ services:
       - |
         sleep 20
         hdfs dfs -get /geoserver-data .
-        scp -r data/* geoserver:/home/geoserver/data
+        scp -r geoserver-data/data/* geoserver:/home/geoserver/data
     deploy:
       placement:
         constraints:

From 4c874c7d005416a669bed1fd69649760a62e16cb Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 11:11:31 +0100
Subject: [PATCH 024/103] fix

---
 dataplatform/multiple_stacks/geoserver.yaml | 27 +++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index f756688..c63e2fe 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -10,8 +10,31 @@ services:
       - -c
       - |
         sleep 20
-        hdfs dfs -get /geoserver-data .
-        scp -r geoserver-data/data/* geoserver:/home/geoserver/data
+        sudo yum install openssh-clients
+        hdfs dfs -get /geoserver-data . 2> /tmp/hdfs_error.log
+        if [ $? -ne 0 ]; then
+          echo "Error occurred while getting data from HDFS geoserver-data folder"
+          exit 1
+        fi
+        echo "End to get data from HDFS geoserver-data folder"
+        sleep 600
+        count=0
+        while ! ping -c 1 geoserver &> /dev/null && [ $count -lt 100000 ]; do
+        echo "Geoserver container is not reachable, retrying in 10 seconds..."
+        sleep 10
+        count=$((count+1))
+        done
+        if [ $count -eq 100000 ]; then
+        echo "Reached maximum retry limit, exiting..."
+        exit 1
+        fi
+        echo "Geoserver container is reachable, proceeding with scp command"
+        scp -r geoserver-data/data/* geoserver:/home/geoserver/data 2> /tmp/scp_error.log
+        if [ $? -ne 0 ]; then
+        echo "Error occurred while copying data to geoserver container"
+        exit 1
+        fi
+        echo "End scp data to geoserver container"
     deploy:
       placement:
         constraints:

From 77da86c87cfc6da6c08fe3e5af2bd2a688736e0e Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 11:14:08 +0100
Subject: [PATCH 025/103] fix

---
 dataplatform/multiple_stacks/geoserver.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index c63e2fe..aec4798 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -10,7 +10,7 @@ services:
       - |
         sleep 20
-        sudo yum install openssh-clients
+        sudo yum install -y openssh-clients
         hdfs dfs -get /geoserver-data . 2> /tmp/hdfs_error.log
         if [ $? -ne 0 ]; then
           echo "Error occurred while getting data from HDFS geoserver-data folder"
@@ -20,19 +20,19 @@ services:
         sleep 600
         count=0
         while ! ping -c 1 geoserver &> /dev/null && [ $count -lt 100000 ]; do
-        echo "Geoserver container is not reachable, retrying in 10 seconds..."
-        sleep 10
-        count=$((count+1))
+          echo "Geoserver container is not reachable, retrying in 10 seconds..."
+          sleep 10
+          count=$((count+1))
         done
         if [ $count -eq 100000 ]; then
-        echo "Reached maximum retry limit, exiting..."
-        exit 1
+          echo "Reached maximum retry limit, exiting..."
+          exit 1
         fi
         echo "Geoserver container is reachable, proceeding with scp command"
         scp -r geoserver-data/data/* geoserver:/home/geoserver/data 2> /tmp/scp_error.log
         if [ $? -ne 0 ]; then
-        echo "Error occurred while copying data to geoserver container"
-        exit 1
+          echo "Error occurred while copying data to geoserver container"
+          exit 1
         fi
         echo "End scp data to geoserver container"
     deploy:

From c322d486c91114fa69b5e02cbfb5d56b0945a401 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 12:55:43 +0100
Subject: [PATCH 026/103] WIP geoserver

---
 dataplatform/multiple_stacks/geoserver.yaml | 35 +++----------------
 1 file changed, 5 insertions(+), 30 deletions(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index aec4798..16741cb 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -5,41 +5,15 @@ services:
     image: apache/hadoop:${HADOOPVERSION}
     volumes:
       - hadoop_config:${HADOOPCONFDIR}
-    command:
-      - /bin/bash
-      - -c
-      - |
-        sleep 20
-        sudo yum install -y openssh-clients
-        hdfs dfs -get /geoserver-data . 2> /tmp/hdfs_error.log
-        if [ $? -ne 0 ]; then
-          echo "Error occurred while getting data from HDFS geoserver-data folder"
-          exit 1
-        fi
-        echo "End to get data from HDFS geoserver-data folder"
-        sleep 600
-        count=0
-        while ! ping -c 1 geoserver &> /dev/null && [ $count -lt 100000 ]; do
-          echo "Geoserver container is not reachable, retrying in 10 seconds..."
-          sleep 10
-          count=$((count+1))
-        done
-        if [ $count -eq 100000 ]; then
-          echo "Reached maximum retry limit, exiting..."
-          exit 1
-        fi
-        echo "Geoserver container is reachable, proceeding with scp command"
-        scp -r geoserver-data/data/* geoserver:/home/geoserver/data 2> /tmp/scp_error.log
-        if [ $? -ne 0 ]; then
-          echo "Error occurred while copying data to geoserver container"
-          exit 1
-        fi
-        echo "End scp data to geoserver container"
+      - ./geoserver-hdfs-client-script.sh:/opt/hadoop/geoserver-hdfs-client-script.sh
+      - /geoserver_data:/geoserver_data
+    #TODO command: ["geoserver-hdfs-client-script.sh"]
     deploy:
       placement:
         constraints:
           - node.role != manager
           - node.hostname != CB-Mass-Node1
+          - node.labels.running_geoserver==1 #run in the same machine
     networks:
       - BIG-dataplatform-network
@@ -53,6 +27,7 @@ services:
       - ${GEOSERVER_PORT}:8080
     volumes:
       - geoserver_data_dir:/opt/geoserver_data
+      - /geoserver_data:/home/geoserver/data
     deploy:
       placement:
         constraints:

From a9f5a5a48363a71c2516721f919ebc4afe7cdd76 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 12:56:46 +0100
Subject: [PATCH 027/103] WIP geoserver

---
 dataplatform/multiple_stacks/geoserver.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index 16741cb..8ee0271 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -8,6 +8,9 @@ services:
       - ./geoserver-hdfs-client-script.sh:/opt/hadoop/geoserver-hdfs-client-script.sh
       - /geoserver_data:/geoserver_data
     #TODO command: ["geoserver-hdfs-client-script.sh"]
+    command: [ "tail", "-f", "/dev/null" ]
+    stdin_open: true
+    tty: true
     deploy:
       placement:
         constraints:
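From patch 26 onward both services are pinned with `node.labels.running_geoserver==1` so the hdfs-client and GeoServer land on the same machine and can share a host path. Swarm node labels are not created automatically; a sketch of setting one up, with the node name a placeholder:

```
# Attach the placement label to one worker node, then verify it
docker node update --label-add running_geoserver=1 <worker-node>
docker node inspect <worker-node> --format '{{ .Spec.Labels }}'
```

Patch 27's `tail -f /dev/null` keeps the container alive for debugging; one can then `docker exec` into the running hdfs-client task to try the copy commands by hand.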
From 210d253c8f47c48eff80296d933d50f5e437b969 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 13:04:33 +0100
Subject: [PATCH 028/103] WIP geoserver

---
 dataplatform/multiple_stacks/geoserver.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index 8ee0271..cf72dfe 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -6,7 +6,7 @@ services:
     volumes:
       - hadoop_config:${HADOOPCONFDIR}
       - ./geoserver-hdfs-client-script.sh:/opt/hadoop/geoserver-hdfs-client-script.sh
-      - /geoserver_data:/geoserver_data
+      - /mnt/disk1/geoserver_data:/geoserver_data
     #TODO command: ["geoserver-hdfs-client-script.sh"]
     command: [ "tail", "-f", "/dev/null" ]
     stdin_open: true
@@ -30,12 +30,13 @@ services:
       - ${GEOSERVER_PORT}:8080
     volumes:
       - geoserver_data_dir:/opt/geoserver_data
-      - /geoserver_data:/home/geoserver/data
+      - /mnt/disk1/geoserver_data:/home/geoserver/data
     deploy:
       placement:
         constraints:
           - node.hostname != CB-Mass-Node1
           - node.role == worker
+          - node.labels.running_geoserver==1 #run in the same machine
 
 volumes:
   hadoop_config:

From 3350dfe8902c4812c95833ac26b0c58b0b02ccb7 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 13:09:32 +0100
Subject: [PATCH 029/103] WIP geoserver

---
 dataplatform/multiple_stacks/geoserver.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index cf72dfe..22001cc 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -5,7 +5,7 @@ services:
     image: apache/hadoop:${HADOOPVERSION}
     volumes:
       - hadoop_config:${HADOOPCONFDIR}
-      - ./geoserver-hdfs-client-script.sh:/opt/hadoop/geoserver-hdfs-client-script.sh
+      #- ./geoserver-hdfs-client-script.sh:/opt/hadoop/geoserver-hdfs-client-script.sh
       - /mnt/disk1/geoserver_data:/geoserver_data
     #TODO command: ["geoserver-hdfs-client-script.sh"]
     command: [ "tail", "-f", "/dev/null" ]

From 8aa76437607dc728c00c3fb612fee71f49b5b21f Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 14:28:50 +0100
Subject: [PATCH 030/103] airflow remove unused volume and geoserver add
 script

---
 dataplatform/multiple_stacks/airflow.yaml     |  6 -----
 .../geoserver-hdfs-client-script.sh           | 26 +++++++++++++++++++
 dataplatform/multiple_stacks/geoserver.yaml   |  7 ++---
 3 files changed, 28 insertions(+), 11 deletions(-)
 create mode 100644 dataplatform/multiple_stacks/geoserver-hdfs-client-script.sh

diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml
index 787ecde..6d875fb 100644
--- a/dataplatform/multiple_stacks/airflow.yaml
+++ b/dataplatform/multiple_stacks/airflow.yaml
@@ -224,12 +224,6 @@ networks:
     name: BIG-dataplatform-network
 
 volumes:
-  postgres_db_data:
-    driver: local
-    driver_opts:
-      type: nfs
-      o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
-      device: ":${NFSPATH}/dataplatform_config/airflow_postgres_data/"
 
   airflow_data:
     driver: local
diff --git a/dataplatform/multiple_stacks/geoserver-hdfs-client-script.sh b/dataplatform/multiple_stacks/geoserver-hdfs-client-script.sh
new file mode 100644
index 0000000..38e0ae8
--- /dev/null
+++ b/dataplatform/multiple_stacks/geoserver-hdfs-client-script.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+sleep 20
+hdfs dfs -get /geoserver-data . 2> /tmp/hdfs_error.log
+if [ $? -ne 0 ]; then
+  echo "Error occurred while getting data from HDFS geoserver-data folder"
+  exit 1
+fi
+echo "End to get data from HDFS geoserver-data folder"
+count=0
+while ! ping -c 1 geoserver &> /dev/null && [ $count -lt 100000 ]; do
+  echo "Geoserver container is not reachable, retrying in 10 seconds..."
+  sleep 10
+  count=$((count+1))
+done
+if [ $count -eq 100000 ]; then
+  echo "Reached maximum retry limit, exiting..."
+  exit 1
+fi
+echo "Geoserver container is reachable, proceeding with scp command"
+cp -r geoserver-data/data/* /geoserver_data 2> /tmp/scp_error.log
+if [ $? -ne 0 ]; then
+  echo "Error occurred while copying data to geoserver container"
+  exit 1
+fi
+echo "End scp data to geoserver container"
+exit 0
\ No newline at end of file
diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index 22001cc..51dcd08 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -5,12 +5,9 @@ services:
     image: apache/hadoop:${HADOOPVERSION}
     volumes:
       - hadoop_config:${HADOOPCONFDIR}
-      #- ./geoserver-hdfs-client-script.sh:/opt/hadoop/geoserver-hdfs-client-script.sh
+      - ./geoserver-hdfs-client-script.sh:/opt/hadoop/geoserver-hdfs-client-script.sh
       - /mnt/disk1/geoserver_data:/geoserver_data
-    #TODO command: ["geoserver-hdfs-client-script.sh"]
-    command: [ "tail", "-f", "/dev/null" ]
-    stdin_open: true
-    tty: true
+    command: ["geoserver-hdfs-client-script.sh"]
     deploy:
       placement:
         constraints:

From 7c0ce1dddf10ca1974efe5087a2d9ba310f4be6a Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 14:54:10 +0100
Subject: [PATCH 031/103] fix script mount

---
 dataplatform/multiple_stacks/geoserver.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index 51dcd08..e1c76ae 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -5,9 +5,9 @@ services:
     image: apache/hadoop:${HADOOPVERSION}
     volumes:
       - hadoop_config:${HADOOPCONFDIR}
-      - ./geoserver-hdfs-client-script.sh:/opt/hadoop/geoserver-hdfs-client-script.sh
+      - geoserver_conf:/opt/hadoop/script/geoserver-hdfs-client-script.sh
       - /mnt/disk1/geoserver_data:/geoserver_data
-    command: ["geoserver-hdfs-client-script.sh"]
+    command: ["script/geoserver-hdfs-client-script.sh"]
     deploy:
       placement:
         constraints:
@@ -47,11 +47,11 @@ volumes:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4
       device: :${NFSPATH}/dataplatform_config/geoserver/data_dir
-  #geoserver_conf:
-  #  driver_opts:
-  #    type: nfs
-  #    o: addr=${NFSADDRESS},rw,nfsvers=4
-  #    device: :${NFSPATH}/dataplatform_config/geoserver_conf/
+  geoserver_conf:
+    driver_opts:
+      type: nfs
+      o: addr=${NFSADDRESS},rw,nfsvers=4
+      device: :${NFSPATH}/dataplatform_config/geoserver_conf/
 
 networks:
   BIG-dataplatform-network:

From e00dc914137e27a6692055c8fcc019de413d7e3d Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 15:05:30 +0100
Subject: [PATCH 032/103] fix script mount

---
 dataplatform/multiple_stacks/geoserver.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index e1c76ae..bc86624 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -5,7 +5,7 @@ services:
     image: apache/hadoop:${HADOOPVERSION}
     volumes:
       - hadoop_config:${HADOOPCONFDIR}
-      - geoserver_conf:/opt/hadoop/script/geoserver-hdfs-client-script.sh
+      - geoserver_conf:/opt/hadoop/script
      - /mnt/disk1/geoserver_data:/geoserver_data
     command: ["script/geoserver-hdfs-client-script.sh"]
     deploy:
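Patches 30-32 move the copy logic into geoserver-hdfs-client-script.sh and mount its directory from the NFS `geoserver_conf` share. Because the container invokes the script by path, it has to be executable on that share; a quick check from any host with the export mounted (the mount path and stack name are assumptions):

```
# Make the script executable on the NFS share, then watch the one-shot service run it
chmod +x /mnt/nfs/dataplatform_config/geoserver_conf/geoserver-hdfs-client-script.sh
docker service logs -f geoserver_hdfs-client
```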
From 5897be79271c506afbdaf99719be0f6705e75b73 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Tue, 19 Mar 2024 15:21:37 +0100
Subject: [PATCH 033/103] change geoserver mount to nfs

---
 .../geoserver-hdfs-client-script.sh           |  0
 .../geoserver-hdfs/geoserver-hdfs.yaml        | 61 +++++++++++++++++++
 dataplatform/multiple_stacks/geoserver.yaml   | 36 +++--------
 3 files changed, 70 insertions(+), 27 deletions(-)
 rename dataplatform/multiple_stacks/{ => geoserver-hdfs}/geoserver-hdfs-client-script.sh (100%)
 create mode 100644 dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml

diff --git a/dataplatform/multiple_stacks/geoserver-hdfs-client-script.sh b/dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs-client-script.sh
similarity index 100%
rename from dataplatform/multiple_stacks/geoserver-hdfs-client-script.sh
rename to dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs-client-script.sh
diff --git a/dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml b/dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml
new file mode 100644
index 0000000..fc4d03b
--- /dev/null
+++ b/dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml
@@ -0,0 +1,61 @@
+version: '3.9'
+
+services:
+  hdfs-client:
+    image: apache/hadoop:${HADOOPVERSION}
+    volumes:
+      - hadoop_config:${HADOOPCONFDIR}
+      - geoserver_conf:/opt/hadoop/script
+      - /mnt/disk1/geoserver_data:/geoserver_data
+    command: ["script/geoserver-hdfs-client-script.sh"]
+    deploy:
+      restart_policy:
+        condition: none # TODO check if not restart after complete
+      placement:
+        constraints:
+          - node.role != manager
+          - node.hostname != CB-Mass-Node1
+          - node.labels.running_geoserver==1 #run in the same machine
+    networks:
+      - BIG-dataplatform-network
+
+  geoserver:
+    image: docker.osgeo.org/geoserver:2.25.x
+    environment:
+      SKIP_DEMO_DATA: "true"
+    networks:
+      - BIG-dataplatform-network
+    ports:
+      - ${GEOSERVER_PORT}:8080
+    volumes:
+      - geoserver_data_dir:/opt/geoserver_data
+      - /mnt/disk1/geoserver_data:/home/geoserver/data
+    deploy:
+      placement:
+        constraints:
+          - node.hostname != CB-Mass-Node1
+          - node.role == worker
+          - node.labels.running_geoserver==1 #run in the same machine
+
+volumes:
+  hadoop_config:
+    driver: local
+    driver_opts:
+      type: nfs
+      o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
+      device: ":${NFSPATH}/dataplatform_config/hadoop_conf/"
+  geoserver_data_dir:
+    driver_opts:
+      type: nfs
+      o: addr=${NFSADDRESS},rw,nfsvers=4
+      device: :${NFSPATH}/dataplatform_config/geoserver/data_dir
+  geoserver_conf:
+    driver_opts:
+      type: nfs
+      o: addr=${NFSADDRESS},rw,nfsvers=4
+      device: :${NFSPATH}/dataplatform_config/geoserver_conf/
+
+networks:
+  BIG-dataplatform-network:
+    external: true
+    name: BIG-dataplatform-network
diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index bc86624..cc625f6 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -1,22 +1,6 @@
 version: '3.9'
 
 services:
-  hdfs-client:
-    image: apache/hadoop:${HADOOPVERSION}
-    volumes:
-      - hadoop_config:${HADOOPCONFDIR}
-      - geoserver_conf:/opt/hadoop/script
-      - /mnt/disk1/geoserver_data:/geoserver_data
-    command: ["script/geoserver-hdfs-client-script.sh"]
-    deploy:
-      placement:
-        constraints:
-          - node.role != manager
-          - node.hostname != CB-Mass-Node1
-          - node.labels.running_geoserver==1 #run in the same machine
-    networks:
-      - BIG-dataplatform-network
-
   geoserver:
     image: docker.osgeo.org/geoserver:2.25.x
     environment:
@@ -27,31 +11,29 @@ services:
       - ${GEOSERVER_PORT}:8080
     volumes:
       - geoserver_data_dir:/opt/geoserver_data
-      - /mnt/disk1/geoserver_data:/home/geoserver/data
+      - geoserver_data:/home/geoserver/data
     deploy:
       placement:
         constraints:
           - node.hostname != CB-Mass-Node1
           - node.role == worker
-          - node.labels.running_geoserver==1 #run in the same machine
 
 volumes:
-  hadoop_config:
-    driver: local
+  geoserver_data:
     driver_opts:
       type: nfs
-      o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
-      device: ":${NFSPATH}/dataplatform_config/hadoop_conf/"
+      o: addr=${NFSADDRESS},rw,nfsvers=4
+      device: :${NFSPATH}/dataplatform_config/geoserver/data
   geoserver_data_dir:
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4
       device: :${NFSPATH}/dataplatform_config/geoserver/data_dir
-  geoserver_conf:
-    driver_opts:
-      type: nfs
-      o: addr=${NFSADDRESS},rw,nfsvers=4
-      device: :${NFSPATH}/dataplatform_config/geoserver_conf/
+  #geoserver_conf:
+  # driver_opts:
+  #   type: nfs
+  #   o: addr=${NFSADDRESS},rw,nfsvers=4
+  #   device: :${NFSPATH}/dataplatform_config/geoserver_conf/
 
 networks:
   BIG-dataplatform-network:

From 5a68964eef8b64e3f5c71bbfdd0ea4df30ed3d2f Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 20 Mar 2024 12:39:43 +0100
Subject: [PATCH 034/103] change geoserver to be a manager

---
 dataplatform/multiple_stacks/geoserver.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index cc625f6..0923121 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -16,7 +16,7 @@ services:
       placement:
         constraints:
           - node.hostname != CB-Mass-Node1
-          - node.role == worker
+          - node.role == manager
 
 volumes:
   geoserver_data:

From a5661413d5b0ff5aab6d1eb1e19ec8230b84016c Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 20 Mar 2024 12:52:20 +0100
Subject: [PATCH 035/103] manage airflow roles

---
 dataplatform/multiple_stacks/airflow.yaml | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml
index 6d875fb..33d2d14 100644
--- a/dataplatform/multiple_stacks/airflow.yaml
+++ b/dataplatform/multiple_stacks/airflow.yaml
@@ -101,6 +101,7 @@ services:
       placement:
         constraints:
           - node.hostname != CB-Mass-Node1
+          - node.labels.running_airflow_webserver == 1
 
   #https://stackoverflow.com/questions/69316093/how-to-add-new-user-to-docker-image-when-running-distributed-airflow-architectur
   docker-proxy:
@@ -116,6 +117,7 @@ services:
       placement:
        constraints:
          - node.hostname != CB-Mass-Node1
+         - node.labels.running_airflow_webserver == 1
 
   webserver:
     <<: *airflow-common
@@ -129,7 +131,8 @@ services:
       retries: 5
       start_period: 30s
     networks:
-      - BIG-dataplatform-network
+      - BIG-dataplatform-network
+      - node.role == manager
 
   scheduler:
     <<: *airflow-common
@@ -141,7 +144,8 @@ services:
       retries: 5
       start_period: 30s
     networks:
-      - BIG-dataplatform-network
+      - BIG-dataplatform-network
+      - node.labels.running_airflow_webserver == 1
 
   worker:
     <<: *airflow-common
@@ -161,7 +165,8 @@ services:
       # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
       DUMB_INIT_SETSID: "0"
     networks:
-      - BIG-dataplatform-network
+      - BIG-dataplatform-network
+      - node.role == worker
 
   triggerer:
     <<: *airflow-common
@@ -173,7 +178,8 @@ services:
       # retries: 5
       # start_period: 30s
     networks:
-      - BIG-dataplatform-network
+      - BIG-dataplatform-network
+      - node.labels.running_airflow_webserver == 1
 
 #  init:
 #    <<: *airflow-common
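Patch 35 accidentally appends placement constraints to the `networks:` lists (e.g. `- node.role == manager` as a network entry), which patch 36 then moves into proper `deploy.placement.constraints` blocks. Mistakes of this kind can usually be caught before deploying by rendering the file; a minimal sketch, assuming it is run from the dataplatform directory so `.env` is picked up:

```
# Render and validate the compose file; misplaced keys and undeclared networks surface here
docker compose -f multiple_stacks/airflow.yaml config > /dev/null && echo "compose file parses"
```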
From 69475ff2111591ab204bd7063bff74675874a5c5 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 20 Mar 2024 14:55:17 +0100
Subject: [PATCH 036/103] pass airflow installation

---
 dataplatform/.env                         |   5 +-
 dataplatform/multiple_stacks/airflow.yaml | 180 ++++++++++++++--------
 2 files changed, 122 insertions(+), 63 deletions(-)

diff --git a/dataplatform/.env b/dataplatform/.env
index 7b3b5c6..b5f022c 100644
--- a/dataplatform/.env
+++ b/dataplatform/.env
@@ -87,4 +87,7 @@ MOSQUITTO_PWD=bar
 DOCKERREGISTRYPORT=5000
 
 #GEOSERVER
-GEOSERVER_PORT=40111
\ No newline at end of file
+GEOSERVER_PORT=40111
+
+#AIRFLOW
+AIRFLOW_WEB_SERVER_PORT=48081
\ No newline at end of file
diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml
index 33d2d14..8f30119 100644
--- a/dataplatform/multiple_stacks/airflow.yaml
+++ b/dataplatform/multiple_stacks/airflow.yaml
@@ -28,7 +28,7 @@ x-airflow-common:
   # In order to add custom dependencies or upgrade provider packages you can use your extended image.
   # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
   # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
-  image: apache/airflow:latest
+  image: apache/airflow:2.8.3
   # build: .
   environment:
     &airflow-common-env
@@ -45,14 +45,15 @@ x-airflow-common:
     # Use simple http server on scheduler for health checks
     # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
     # yamllint enable rule:line-length
-    AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
+    #TODO Check AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
     # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks
     # for other purpose (development, test and especially production usage) build/extend Airflow image.
     _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker } #apache-airflow-providers-apache-spark
-    _AIRFLOW_DB_MIGRATE: 'true'
-    _AIRFLOW_WWW_USER_CREATE: 'false'
-    _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
-    _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
+    #TODO check the following
+    #_AIRFLOW_DB_MIGRATE: 'true'
+    #_AIRFLOW_WWW_USER_CREATE: 'false'
+    #_AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
+    #_AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
   volumes:
     - airflow_data:/opt/airflow/
   user: "${AIRFLOW_UID:-50000}:0"
@@ -71,11 +72,11 @@ services:
       POSTGRES_DB: airflow
     volumes:
       - /mnt/disk1/airflow_postgres_data:/var/lib/postgresql/data
-    healthcheck:
-      test: ["CMD", "pg_isready", "-U", "airflow"]
-      interval: 10s
-      retries: 5
-      start_period: 5s
+#    healthcheck:
+#      test: ["CMD", "pg_isready", "-U", "airflow"]
+#      interval: 10s
+#      retries: 5
+#      start_period: 5s
     networks:
       - BIG-dataplatform-network
@@ -89,12 +90,12 @@ services:
     image: redis:latest
     expose:
       - 6379
-    healthcheck:
-      test: ["CMD", "redis-cli", "ping"]
-      interval: 10s
-      timeout: 30s
-      retries: 50
-      start_period: 30s
+#    healthcheck:
+#      test: ["CMD", "redis-cli", "ping"]
+#      interval: 10s
+#      timeout: 30s
+#      retries: 50
+#      start_period: 30s
     networks:
       - BIG-dataplatform-network
@@ -123,29 +124,35 @@ services:
     <<: *airflow-common
     command: webserver
     ports:
-      - "48081:8080"
+      - ${AIRFLOW_WEB_SERVER_PORT}:8080
-    healthcheck:
-      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-      start_period: 30s
+#    healthcheck:
+#      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
+#      interval: 30s
+#      timeout: 10s
+#      retries: 5
+#      start_period: 30s
     networks:
       - BIG-dataplatform-network
-      - node.role == manager
+    deploy:
+      placement:
+        constraints:
+          - node.role == manager
 
   scheduler:
     <<: *airflow-common
     command: scheduler
-    healthcheck:
-      test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-      start_period: 30s
+#    healthcheck:
+#      test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
+#      interval: 30s
+#      timeout: 10s
+#      retries: 5
+#      start_period: 30s
     networks:
       - BIG-dataplatform-network
-      - node.labels.running_airflow_webserver == 1
+    deploy:
+      placement:
+        constraints:
+          - node.labels.running_airflow_webserver == 1
 
   worker:
     <<: *airflow-common
       # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
      DUMB_INIT_SETSID: "0"
     networks:
       - BIG-dataplatform-network
-      - node.role == worker
+    deploy:
+      placement:
+        constraints:
+          - node.role == worker
 
   triggerer:
     <<: *airflow-common
       # retries: 5
       # start_period: 30s
     networks:
       - BIG-dataplatform-network
-      - node.labels.running_airflow_webserver == 1
+    deploy:
+      placement:
+        constraints:
+          - node.labels.running_airflow_webserver == 1
 
+  init:
+    <<: *airflow-common
+    entrypoint: /bin/bash
+    # yamllint disable rule:line-length
+    command:
+      - -c
+      - |
+        if [[ -z "${AIRFLOW_UID}" ]]; then
+          echo
+          echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
+          echo "If you are on Linux, you SHOULD follow the instructions below to set "
+          echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
+          echo "For other operating systems you can get rid of the warning with manually created .env file:"
+          echo "  See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
+          echo
+        fi
+        one_meg=1048576
+        mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
+        cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
+        disk_available=$$(df / | tail -1 | awk '{print $$4}')
+        warning_resources="false"
+        if (( mem_available < 4000 )) ; then
+          echo
+          echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
+          echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
+          echo
+          warning_resources="true"
+        fi
+        if (( cpus_available < 2 )); then
+          echo
+          echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
+          echo "At least 2 CPUs recommended. You have $${cpus_available}"
+          echo
+          warning_resources="true"
+        fi
+        if (( disk_available < one_meg * 10 )); then
+          echo
+          echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
+          echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
+          echo
+          warning_resources="true"
+        fi
+        if [[ $${warning_resources} == "true" ]]; then
+          echo
+          echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
+          echo "Please follow the instructions to increase amount of resources available:"
+          echo "  https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
+          echo
+        fi
+        mkdir -p /sources/logs /sources/dags /sources/plugins
+        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
+        exec /entrypoint airflow version
+    # yamllint enable rule:line-length
+    environment:
+      <<: *airflow-common-env
+      _AIRFLOW_DB_MIGRATE: 'true'
+      _AIRFLOW_WWW_USER_CREATE: 'true'
+      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
+      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
+      _PIP_ADDITIONAL_REQUIREMENTS: ''
+    user: "0:0"
+    volumes:
+      - airflow_data:/sources
+    networks:
+      - BIG-dataplatform-network
+    deploy:
+      restart_policy:
+        condition: on-failure #TODO check
+      placement:
+        constraints:
+          - node.labels.running_airflow_webserver == 1
 
 #  cli:
 #    <<: *airflow-common
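Patch 36 pins the image to 2.8.3, parameterizes the web-server port, and turns the previously commented-out `init` service into a real one-shot bootstrap. Once the stack is up, the webserver exposes a health endpoint on the published port; a quick probe, with the stack name `airflow` and localhost access as assumptions:

```
# 48081 is AIRFLOW_WEB_SERVER_PORT from .env
curl -s http://localhost:48081/health
docker service logs airflow_init   # the bootstrap should end by printing the airflow version
```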
From de613835abb16efbbf5ddd1008a294ff068e3262 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 20 Mar 2024 15:04:07 +0100
Subject: [PATCH 037/103] pass airflow installation

---
 dataplatform/multiple_stacks/airflow.yaml | 33 -----------------------
 1 file changed, 33 deletions(-)

diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml
index 8f30119..8d4d5ab 100644
--- a/dataplatform/multiple_stacks/airflow.yaml
+++ b/dataplatform/multiple_stacks/airflow.yaml
@@ -210,39 +210,6 @@ services:
         echo "  See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
         echo
       fi
-      one_meg=1048576
-      mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
-      cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
-      disk_available=$$(df / | tail -1 | awk '{print $$4}')
-      warning_resources="false"
-      if (( mem_available < 4000 )) ; then
-        echo
-        echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
-        echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
-        echo
-        warning_resources="true"
-      fi
-      if (( cpus_available < 2 )); then
-        echo
-        echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
-        echo "At least 2 CPUs recommended. You have $${cpus_available}"
-        echo
-        warning_resources="true"
-      fi
-      if (( disk_available < one_meg * 10 )); then
-        echo
-        echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
-        echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
-        echo
-        warning_resources="true"
-      fi
-      if [[ $${warning_resources} == "true" ]]; then
-        echo
-        echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
-        echo "Please follow the instructions to increase amount of resources available:"
-        echo "  https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
-        echo
-      fi
       mkdir -p /sources/logs /sources/dags /sources/plugins
       chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
       exec /entrypoint airflow version

From fd9fbb7ad4edbf08284cd5bad6f74d406ae26444 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 20 Mar 2024 15:09:18 +0100
Subject: [PATCH 038/103] small updates

---
 dataplatform/multiple_stacks/airflow.yaml              | 3 ++-
 .../multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml
index 8d4d5ab..6688be8 100644
--- a/dataplatform/multiple_stacks/airflow.yaml
+++ b/dataplatform/multiple_stacks/airflow.yaml
@@ -177,6 +177,7 @@ services:
       placement:
         constraints:
           - node.role == worker
+          - node.hostname != CB-Mass-Node1
 
   triggerer:
     <<: *airflow-common
diff --git a/dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml b/dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml
index fc4d03b..5e5c44e 100644
--- a/dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml
+++ b/dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml
@@ -10,7 +10,7 @@ services:
     command: ["script/geoserver-hdfs-client-script.sh"]
     deploy:
       restart_policy:
-        condition: none # TODO check if not restart after complete
+        condition: on-failure
       placement:
         constraints:
           - node.role != manager

From c06a906becc012fd14e0832726ecb7bf2b10cbbd Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 20 Mar 2024 15:35:44 +0100
Subject: [PATCH 039/103] wip to change postgres db

---
 dataplatform/multiple_stacks/airflow.yaml | 45 ++++++++++-------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml
index 6688be8..d84d45e 100644
--- a/dataplatform/multiple_stacks/airflow.yaml
+++ b/dataplatform/multiple_stacks/airflow.yaml
@@ -33,8 +33,8 @@ x-airflow-common:
   environment:
     &airflow-common-env
     AIRFLOW__CORE__EXECUTOR: CeleryExecutor
-    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
-    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
+    #this is for copy data AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
+    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow #TODO github secret
     AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
     AIRFLOW__CORE__FERNET_KEY: ''
     AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
@@ -45,15 +45,10 @@ x-airflow-common:
     # Use simple http server on scheduler for health checks
     # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
     # yamllint enable rule:line-length
-    #TODO Check AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
+    # AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
     # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks
     # for other purpose (development, test and especially production usage) build/extend Airflow image.
-    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker } #apache-airflow-providers-apache-spark
-    #TODO check the following
-    #_AIRFLOW_DB_MIGRATE: 'true'
-    #_AIRFLOW_WWW_USER_CREATE: 'false'
-    #_AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
-    #_AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
+    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} #apache-airflow-providers-docker apache-airflow-providers-apache-spark
   volumes:
     - airflow_data:/opt/airflow/
   user: "${AIRFLOW_UID:-50000}:0"
@@ -64,26 +59,26 @@ x-airflow-common:
 
 services:
 
-  postgres:
-    image: postgres:13
-    environment:
-      POSTGRES_USER: airflow
-      POSTGRES_PASSWORD: airflow
-      POSTGRES_DB: airflow
-    volumes:
-      - /mnt/disk1/airflow_postgres_data:/var/lib/postgresql/data
+  # postgres:
+  #   image: postgres:13
+  #   environment:
+  #     POSTGRES_USER: airflow
+  #     POSTGRES_PASSWORD: airflow
+  #     POSTGRES_DB: airflow
+  #   volumes:
+  #     - /mnt/disk1/airflow_postgres_data:/var/lib/postgresql/data
#    healthcheck:
#      test: ["CMD", "pg_isready", "-U", "airflow"]
#      interval: 10s
#      retries: 5
#      start_period: 5s
-    networks:
-      - BIG-dataplatform-network
-    deploy:
-      placement:
-        constraints:
-          - node.hostname != CB-Mass-Node1
-          - node.labels.test_airflow_postgres==1
+  # networks:
+  #   - BIG-dataplatform-network
+  # deploy:
+  #   placement:
+  #     constraints:
+  #       - node.hostname != CB-Mass-Node1
+  #       - node.labels.test_airflow_postgres==1
 
   redis:
@@ -217,7 +212,7 @@ services:
     environment:
       <<: *airflow-common-env
-      _AIRFLOW_DB_MIGRATE: 'true'
+      _AIRFLOW_DB_MIGRATE: 'false' #this is for copy data
       _AIRFLOW_WWW_USER_CREATE: 'true'
       _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
       _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}

From 5330de4bcf01737b749a9d473038194419d4c9f7 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 20 Mar 2024 15:50:18 +0100
Subject: [PATCH 040/103] fix airflow with our postgres db

---
 dataplatform/multiple_stacks/airflow.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml
index d84d45e..ce23141 100644
--- a/dataplatform/multiple_stacks/airflow.yaml
+++ b/dataplatform/multiple_stacks/airflow.yaml
@@ -33,8 +33,8 @@ x-airflow-common:
   environment:
     &airflow-common-env
     AIRFLOW__CORE__EXECUTOR: CeleryExecutor
-    #this is for copy data AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
-    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow #TODO github secret
+    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow #TODO github secret
+    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow #TODO github secret
     AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
     AIRFLOW__CORE__FERNET_KEY: ''
     AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
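Patches 39-40 drop the bundled Postgres container and point SQLAlchemy and the Celery result backend at an external `postgres` host on the shared overlay network. Connectivity can be verified from inside that network before bringing the stack up; a sketch using a throwaway container, with host and user taken from the compose file:

```
# Run pg_isready against the external DB over the shared overlay network
docker run --rm --network BIG-dataplatform-network postgres:13 \
  pg_isready -h postgres -U airflow
```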
insertion(+), 1 deletion(-) diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index ce23141..fb1bea0 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -215,7 +215,7 @@ services: _AIRFLOW_DB_MIGRATE: 'false' #this is for copy data _AIRFLOW_WWW_USER_CREATE: 'true' _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} - _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} #TODO github secret _PIP_ADDITIONAL_REQUIREMENTS: '' user: "0:0" volumes: From 6ff5c0c59db98d3f0ab8677a1620e60054db8779 Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 20 Mar 2024 16:28:49 +0100 Subject: [PATCH 042/103] update docker airflow tentative script --- airflow_dags/{dockerpyscript => }/README.md | 3 +++ airflow_dags/docker_with_code/Dockerfile | 2 -- airflow_dags/docker_with_code/docker_include_python.py | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) rename airflow_dags/{dockerpyscript => }/README.md (91%) diff --git a/airflow_dags/dockerpyscript/README.md b/airflow_dags/README.md similarity index 91% rename from airflow_dags/dockerpyscript/README.md rename to airflow_dags/README.md index 025c022..8a445e7 100644 --- a/airflow_dags/dockerpyscript/README.md +++ b/airflow_dags/README.md @@ -21,6 +21,7 @@ - https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html - https://airflow.apache.org/docs/apache-airflow/1.10.1/scheduler.html - https://airflow.apache.org/docs/apache-airflow-providers-docker/1.0.2/_api/airflow/providers/docker/operators/docker/index.html +- [Installazione](https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html) # Trigger dag from python application @@ -60,6 +61,8 @@ dag = my_dag() # Use local docker registry at 127.0.0.0:5000 - remove from docker registry: - registry garbage-collect -m /etc/docker/registry/config.yml +- build image for registry `docker image build --tag 127.0.0.0:5000/test_img_3 .` +- push on registry `docker run 127.0.0.0:5000/test_img_3` # Use docker images on docker hub docker build -t my_image . 
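For a quick check that a build-and-push sequence like the one documented above actually reached the registry, the registry's standard HTTP API (v2) can be queried directly. The following is an illustrative sketch, not part of the repository; it assumes the registry answers on 127.0.0.0:5000 without authentication, as in the compose stack.

```python
# Hedged sketch: list every repository and its tags on the local registry
# via the Docker Registry HTTP API v2 (GET /v2/_catalog, GET /v2/<name>/tags/list).
# The address 127.0.0.0:5000 is the one used throughout this repo.
import requests

REGISTRY = "http://127.0.0.0:5000"

def list_images(registry: str = REGISTRY) -> dict:
    # _catalog returns {"repositories": [...]}; tags/list returns {"name": ..., "tags": [...]}
    repos = requests.get(f"{registry}/v2/_catalog", timeout=5).json()["repositories"]
    return {
        repo: requests.get(f"{registry}/v2/{repo}/tags/list", timeout=5).json().get("tags")
        for repo in repos
    }

if __name__ == "__main__":
    for repo, tags in list_images().items():
        print(repo, tags)
```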
diff --git a/airflow_dags/docker_with_code/Dockerfile b/airflow_dags/docker_with_code/Dockerfile index d18916f..7d06b8f 100644 --- a/airflow_dags/docker_with_code/Dockerfile +++ b/airflow_dags/docker_with_code/Dockerfile @@ -1,7 +1,5 @@ FROM python:3.8 -ARG OPTION_1 -ARG OPTION_2 # Set working directory WORKDIR /app diff --git a/airflow_dags/docker_with_code/docker_include_python.py b/airflow_dags/docker_with_code/docker_include_python.py index 5c7d54a..fb8c747 100644 --- a/airflow_dags/docker_with_code/docker_include_python.py +++ b/airflow_dags/docker_with_code/docker_include_python.py @@ -17,8 +17,9 @@ docker_task = DockerOperator( task_id='docker_task', - image='chiaraforresi/test:v0.0.2', - docker_conn_id='docker_hub_chiaraforresi', # Connection ID for Docker Hub + auto_remove=True, + image='127.0.0.0:5000/test_py:v0.0.1', + container_name='test_py_001', #name of container dag=dag, docker_url='tcp://docker-proxy:2375', # The connection to the Docker daemon, the socket should exist in the container network_mode='host', # The network mode for the container (internal network), if use "host" the container will share the host network From e5e70b281ad0a0c88d027a5f65bc787aca9da03b Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 20 Mar 2024 16:40:40 +0100 Subject: [PATCH 043/103] fix docker file and update readme --- airflow_dags/README.md | 2 +- airflow_dags/docker_with_code/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/airflow_dags/README.md b/airflow_dags/README.md index 8a445e7..8152ae7 100644 --- a/airflow_dags/README.md +++ b/airflow_dags/README.md @@ -62,7 +62,7 @@ dag = my_dag() - remove from docker registry: - registry garbage-collect -m /etc/docker/registry/config.yml - build image for registry `docker image build --tag 127.0.0.0:5000/test_img_3 .` -- push on registry `docker run 127.0.0.0:5000/test_img_3` +- push on registry `docker push 127.0.0.0:5000/test_img_3` # Use docker images on docker hub docker build -t my_image . 
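The build-and-push round trip corrected above can also be scripted from Python with the docker SDK (docker-py). This is a minimal sketch under the assumption that `pip install docker` is available and the Docker daemon socket is reachable; the tag mirrors the CLI commands in the README.

```python
# Hedged sketch: programmatic equivalent of
#   docker image build --tag 127.0.0.0:5000/test_img_3 .
#   docker push 127.0.0.0:5000/test_img_3
import docker

client = docker.from_env()

# Build the image from the Dockerfile in the current directory.
image, build_logs = client.images.build(path=".", tag="127.0.0.0:5000/test_img_3")

# Push it to the local registry, streaming the progress lines.
for line in client.images.push("127.0.0.0:5000/test_img_3", stream=True, decode=True):
    print(line)
```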
diff --git a/airflow_dags/docker_with_code/Dockerfile b/airflow_dags/docker_with_code/Dockerfile index 7d06b8f..d31bd85 100644 --- a/airflow_dags/docker_with_code/Dockerfile +++ b/airflow_dags/docker_with_code/Dockerfile @@ -10,4 +10,4 @@ COPY python_script.py /app/ #RUN pip install some_dependency # Set the command to execute the Python script with command-line options -CMD ["python", "python_script.py", "--option1", "${OPTION_1}", "--option2", "${OPTION_2}"] +CMD python python_script.py --option1 $OPTION_1 --option2 $OPTION_2 From 1415c590daba9efd81d0132fd0905e6e31e1ecc9 Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Fri, 22 Mar 2024 11:27:23 +0100 Subject: [PATCH 044/103] update the readme --- airflow_dags/README.md | 128 ++++++++++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 40 deletions(-) diff --git a/airflow_dags/README.md b/airflow_dags/README.md index 8152ae7..cd6bb74 100644 --- a/airflow_dags/README.md +++ b/airflow_dags/README.md @@ -1,29 +1,89 @@ -# DockerOperator Airflow -- Instead of using a bash operator can use a DockerOperator to run the docker container +# Airflow +[Installazione](https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html) + +## Docker registry +- stack `dataplatform/multiple_stacks/registry.yaml` +- contact locally in the cluster at 127.0.0.0:5000 +- Starting from a Docker file in a cluster machine + - `docker image build --tag 127.0.0.0:5000/IMAGE_NAME:VERSION -f PATH_DOCKERFILE .` + - `docker push 127.0.0.0:5000/IMAGE_NAME:VERSION` +- From any other cluster machine + - `docker pull 127.0.0.0:5000/IMAGE_NAME:VERSION` + +- Clean data in the registry (enter in the container) + - `registry garbage-collect -m /etc/docker/registry/config.yml` + +## Airflow +Start example: +- In the directory of dags that is `:${NFSPATH}/dataplatform_config/airflow_data/dags` + - create a directory for each project and put the file for generate the DAG (in a subdirectory) + - example of a file + ```python + from airflow import DAG + from airflow.operators.docker_operator import DockerOperator + from datetime import datetime + + version='v0.0.1' + img=f'127.0.0.0:5000/cimice_insert_weekly_passive_task:{version}' + #img is the image pushed in the docker registry + docker_env_vars = { + 'FROM_DATE': '2024-03-25', + 'DAYS_STEP': 7, + 'TO_DATE': '2024-05-31', + 'TASK_ID': 6 + } + + default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'email_on_failure': 'chiara.forresi@unibo.it', + 'start_date': datetime(2024, 3, 24) + } + + dag = DAG('cimice_insert_weekly_passive', default_args=default_args, + schedule_interval='30 0 * * 1', # This schedule runs at 00:30 on Mondays + catchup=False, # Set to False to skip any historical runs + tags=['cimice'] + ) + + + docker_task = DockerOperator( + task_id='cimice_insert_weekly_passive_task', + auto_remove=True, + image=img, + container_name='cimice_insert_weekly_passive_task', #name of container + dag=dag, + docker_url='tcp://docker-proxy:2375', # The connection to the Docker daemon, the socket should exist in the container + network_mode='host', # The network mode for the container (internal network), if use "host" the container will share the host network + environment=docker_env_vars, + mount_tmp_dir=False, + xcom_all=True # Enable XCom push for this task + ) + ``` + +- Examples and Dockerfiles are in __abds-bigdata__ project + - `cimice/src/main/resources/` various subfolders + - `ingestion-weather/src/main/resources/` +- 
[DAG](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html): A DAG (Directed Acyclic Graph) is the core concept of Airflow, collecting Tasks together, organized with dependencies and relationships to say how they should run. + - [scheduling options](https://airflow.apache.org/docs/apache-airflow/1.10.1/scheduler.html) +- Our dags are always of one task, that is a [Docker Operator](https://airflow.apache.org/docs/apache-airflow-providers-docker/1.0.2/_api/airflow/providers/docker/operators/docker/index.html) + - have a name has to be made of alphanumeric characters, dashes, dots and underscores exclusively + +### DockerOperator Airflow (some things) - Easier to use and test -- Install the [provider](https://airflow.apache.org/docs/apache-airflow-providers-docker/stable/index.html) -- registry for docker images in all the machines - last line returned by the docker container is in XComs - - If want logs all the things: `xcom_all=True` - - use pickle to not - - retrieve_output=True - - retrieve_output_path='/tmp/script.out' #for log and not in the std output -- cpus: to set the number of cpus -- mem_limit: to set the memory limit -- auto_remove=True, the docker rm -- mounts=[] Use volumes "source", "target", "type", "read_only" -- mettere uno spazio alla fine del command per dire che non è un template -- mount_tmp_dir=False, per non fare creare una directory temporanea -- api_version='auto', to use the latest version of the docker api -- container_name='name_of_task' + - If want logs all the things on the standard output: `xcom_all=True` +- **cpus**: to set the number of cpus +- **mem_limit**: to set the memory limit +- **auto_remove**=True, the docker rm +- **mounts**=[] Use volumes "source", "target", "type", "read_only" +- **command**="Command to be run in the container", overwrites the cmd, add a space at the end to tell that is not a template +- **mount_tmp_dir**=False, not mount a temporary directory +- **container_name**=similar to task name, +- For extra things refer to the official documentation -# Some doc -- https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html -- https://airflow.apache.org/docs/apache-airflow/1.10.1/scheduler.html -- https://airflow.apache.org/docs/apache-airflow-providers-docker/1.0.2/_api/airflow/providers/docker/operators/docker/index.html -- [Installazione](https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html) - -# Trigger dag from python application +### Trigger dag from python application +This is made in **abds-bigdata** project `ingestion-weather` module ```python from airflow.api.client.local_client import Client @@ -32,7 +92,7 @@ c = Client(None, None) c.trigger_dag(dag_id='test_dag_id', run_id='test_run_id', conf={}) ``` -Dove dentro conf passo i parametri al dag +Inside the conf I pass the parameters for the dag ```python from airflow.decorators import dag, task @@ -57,19 +117,7 @@ def my_dag(): dag = my_dag() ``` - -# Use local docker registry at 127.0.0.0:5000 -- remove from docker registry: - - registry garbage-collect -m /etc/docker/registry/config.yml -- build image for registry `docker image build --tag 127.0.0.0:5000/test_img_3 .` -- push on registry `docker push 127.0.0.0:5000/test_img_3` - -# Use docker images on docker hub -docker build -t my_image . 
-docker tag my_image chiaraforresi/test:v0.0.1 -docker login --username=chiaraforresi -docker push chiaraforresi/test:v0.0.1 - -set up a docker connection from the UI to docker with username and password for Docker Hub -- registry: registry.hub.docker.com -- registry può essere il registry locale: https://www.frakkingsweet.com/create-your-own-docker-registry/ \ No newline at end of file +### Common errors in the deploy +- not pass files that are in .gitignore in the build of the container +- set the link to services in the config to the new clusters (e.g., hdfs) +- Errors in dag import: enter inside airflow-scheduler container and launch `airflow scheduler` \ No newline at end of file From 6b7b7a44290f8a53f7c42546ee44427348700195 Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Mon, 25 Mar 2024 13:01:00 +0100 Subject: [PATCH 045/103] remove unuseful code and update readme to a final version --- airflow_dags/README.md | 104 +++--------------- .../bashscript/bash_script_example.py | 51 --------- .../bashscript/bash_script_example.sh | 3 - airflow_dags/docker_with_code/Dockerfile | 13 --- .../docker_with_code/docker_include_python.py | 27 ----- .../docker_with_code/python_script.py | 17 --- airflow_dags/dockerpyscript/Dockerfile | 10 -- .../docker_operation_example.py | 47 -------- .../dockerpyscript/python_docker_example.py | 34 ------ .../dockerpyscript/python_docker_example.sh | 24 ---- airflow_dags/dockerpyscript/python_script.py | 12 -- airflow_dags/dockerpyscript/requirements.txt | 1 - 12 files changed, 16 insertions(+), 327 deletions(-) delete mode 100644 airflow_dags/bashscript/bash_script_example.py delete mode 100644 airflow_dags/bashscript/bash_script_example.sh delete mode 100644 airflow_dags/docker_with_code/Dockerfile delete mode 100644 airflow_dags/docker_with_code/docker_include_python.py delete mode 100644 airflow_dags/docker_with_code/python_script.py delete mode 100644 airflow_dags/dockerpyscript/Dockerfile delete mode 100644 airflow_dags/dockerpyscript/docker_operation_example.py delete mode 100644 airflow_dags/dockerpyscript/python_docker_example.py delete mode 100644 airflow_dags/dockerpyscript/python_docker_example.sh delete mode 100644 airflow_dags/dockerpyscript/python_script.py delete mode 100644 airflow_dags/dockerpyscript/requirements.txt diff --git a/airflow_dags/README.md b/airflow_dags/README.md index cd6bb74..2dd9c13 100644 --- a/airflow_dags/README.md +++ b/airflow_dags/README.md @@ -17,107 +17,35 @@ Start example: - In the directory of dags that is `:${NFSPATH}/dataplatform_config/airflow_data/dags` - create a directory for each project and put the file for generate the DAG (in a subdirectory) - - example of a file - ```python - from airflow import DAG - from airflow.operators.docker_operator import DockerOperator - from datetime import datetime - - version='v0.0.1' - img=f'127.0.0.0:5000/cimice_insert_weekly_passive_task:{version}' - #img is the image pushed in the docker registry - docker_env_vars = { - 'FROM_DATE': '2024-03-25', - 'DAYS_STEP': 7, - 'TO_DATE': '2024-05-31', - 'TASK_ID': 6 - } - - default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': 'chiara.forresi@unibo.it', - 'start_date': datetime(2024, 3, 24) - } - - dag = DAG('cimice_insert_weekly_passive', default_args=default_args, - schedule_interval='30 0 * * 1', # This schedule runs at 00:30 on Mondays - catchup=False, # Set to False to skip any historical runs - tags=['cimice'] - ) - - - docker_task = DockerOperator( - 
task_id='cimice_insert_weekly_passive_task',
-        auto_remove=True,
-        image=img,
-        container_name='cimice_insert_weekly_passive_task', #name of container
-        dag=dag,
-        docker_url='tcp://docker-proxy:2375', # The connection to the Docker daemon, the socket should exist in the container
-        network_mode='host', # The network mode for the container (internal network), if use "host" the container will share the host network
-        environment=docker_env_vars,
-        mount_tmp_dir=False,
-        xcom_all=True # Enable XCom push for this task
-    )
-    ```
-
-- Examples and Dockerfiles are in __abds-bigdata__ project
-  - `cimice/src/main/resources/` various subfolders
-  - `ingestion-weather/src/main/resources/`
+  - examples of these files are in the __abds-bigdata__ project, under `\cimice\src\main\resources` and `\ingestion-weather\src\main\resources`
 - [DAG](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html): A DAG (Directed Acyclic Graph) is the core concept of Airflow, collecting Tasks together, organized with dependencies and relationships to say how they should run.
   - [scheduling options](https://airflow.apache.org/docs/apache-airflow/1.10.1/scheduler.html)
 - Our dags are always of one task, that is a [Docker Operator](https://airflow.apache.org/docs/apache-airflow-providers-docker/1.0.2/_api/airflow/providers/docker/operators/docker/index.html)
   - have a name has to be made of alphanumeric characters, dashes, dots and underscores exclusively
-
-### DockerOperator Airflow (some things)
-- Easier to use and test
+- In particular, we use a specialization of it, the [Docker swarm operator](https://airflow.apache.org/docs/apache-airflow-providers-docker/stable/_api/airflow/providers/docker/operators/docker_swarm/index.html#airflow.providers.docker.operators.docker_swarm.DockerSwarmOperator),
+  which is useful for placing constraints on where Docker containers are spawned (see the sketch below).
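A minimal sketch of such a swarm task follows. It assumes a recent `apache-airflow-providers-docker` release in which the operator accepts `networks` and `placement`; the DAG id, image name, network and node constraints are illustrative values that mirror this repository's conventions, not a real DAG.

```python
from datetime import datetime

from docker import types
from airflow import DAG
from airflow.providers.docker.operators.docker_swarm import DockerSwarmOperator

with DAG(
    dag_id="swarm_placement_example",  # hypothetical DAG, for illustration only
    start_date=datetime(2024, 3, 24),
    schedule_interval=None,
    catchup=False,
):
    DockerSwarmOperator(
        task_id="swarm_task",
        image="127.0.0.0:5000/IMAGE_NAME:VERSION",  # image pushed to the local registry
        docker_url="tcp://docker-proxy:2375",
        auto_remove="success",  # recent providers expect a string; older ones used True
        networks=["BIG-dataplatform-network"],
        # Spawn the container only on worker nodes and never on CB-Mass-Node1,
        # matching the constraints used in the stacks above.
        placement=types.Placement(
            constraints=["node.role == worker", "node.hostname != CB-Mass-Node1"]
        ),
    )
```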
+### DockerSwarmOperator Airflow (some things) - last line returned by the docker container is in XComs - If want logs all the things on the standard output: `xcom_all=True` -- **cpus**: to set the number of cpus -- **mem_limit**: to set the memory limit +- constraints in cpus and memory usage - **auto_remove**=True, the docker rm - **mounts**=[] Use volumes "source", "target", "type", "read_only" - **command**="Command to be run in the container", overwrites the cmd, add a space at the end to tell that is not a template - **mount_tmp_dir**=False, not mount a temporary directory -- **container_name**=similar to task name, +- **container_name**=similar to task name +- **placement** +- **network_mode** - For extra things refer to the official documentation -### Trigger dag from python application -This is made in **abds-bigdata** project `ingestion-weather` module - -```python -from airflow.api.client.local_client import Client - -c = Client(None, None) -c.trigger_dag(dag_id='test_dag_id', run_id='test_run_id', conf={}) -``` - -Inside the conf I pass the parameters for the dag - -```python -from airflow.decorators import dag, task -from airflow.utils.dates import days_ago - -@dag(schedule_interval=None, start_date=None) -def my_dag(): - - @task - def process_data(**kwargs): - # Access configuration values from the context - conf = kwargs['dag_run'].conf - key1_value = conf['key1'] - key2_value = conf['key2'] - - # Use the configuration values as needed - print(f"Configuration key1 value: {key1_value}") - print(f"Configuration key2 value: {key2_value}") - - process_data() - -dag = my_dag() -``` +### Trigger a dag from python application +This is made in **abds-bigdata** project `ingestion-weather` module, +through the `python-service-interaction-utils/src/main/python/airflow_interaction.py` service. 
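As a sketch of what such a trigger can look like over Airflow's stable REST API (this is not the actual content of `airflow_interaction.py`; the base URL, the credentials and the conf keys are placeholders, and it assumes the API's basic-auth backend is enabled):

```python
# Hedged sketch: trigger a DAG with run-time parameters from an external
# Python application via POST /api/v1/dags/{dag_id}/dagRuns.
import requests

AIRFLOW_URL = "http://airflow-webserver:8080/api/v1"  # service name and internal port
AUTH = ("airflow", "airflow")  # the _AIRFLOW_WWW_USER_* credentials

def trigger_dag(dag_id: str, conf: dict) -> dict:
    resp = requests.post(
        f"{AIRFLOW_URL}/dags/{dag_id}/dagRuns",
        json={"conf": conf},  # becomes dag_run.conf inside the DAG
        auth=AUTH,
        timeout=10,
    )
    resp.raise_for_status()
    return resp.json()

if __name__ == "__main__":
    print(trigger_dag("cimice_insert_weekly_passive", {"key1": "value1"}))
```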
### Common errors in the deploy - not pass files that are in .gitignore in the build of the container +- use public ip, we are inside the cluster :D - set the link to services in the config to the new clusters (e.g., hdfs) -- Errors in dag import: enter inside airflow-scheduler container and launch `airflow scheduler` \ No newline at end of file +- Errors in dag import: enter inside airflow-scheduler container and launch `airflow scheduler` + +### Possible updates +- configure an smpt server, for [send mails on failure](https://stackoverflow.com/questions/58736009/email-on-failure-retry-with-airflow-in-docker-container) \ No newline at end of file diff --git a/airflow_dags/bashscript/bash_script_example.py b/airflow_dags/bashscript/bash_script_example.py deleted file mode 100644 index c8b7900..0000000 --- a/airflow_dags/bashscript/bash_script_example.py +++ /dev/null @@ -1,51 +0,0 @@ -from airflow import DAG - -from airflow.operators.bash import BashOperator - -from datetime import datetime - - - -default_args = { - - 'owner': 'airflow', - - 'depends_on_past': False, - - 'start_date': datetime(2023, 7, 17), - - 'retries': 0, - -} - - - -test_dag = DAG( - - 'bashscript', - - default_args=default_args, - - schedule_interval="@weekly" - -) - - - -# Define the BashOperator task - -bash_task = BashOperator( - - task_id='bash_task_execute_script', - - bash_command='./bash_script_example.sh', - - dag=test_dag - -) - - - -# Set task dependencies - -bash_task \ No newline at end of file diff --git a/airflow_dags/bashscript/bash_script_example.sh b/airflow_dags/bashscript/bash_script_example.sh deleted file mode 100644 index b2fd133..0000000 --- a/airflow_dags/bashscript/bash_script_example.sh +++ /dev/null @@ -1,3 +0,0 @@ -#! /bin/bash -# print hello BashOperator message -echo "Hello BashOperator" \ No newline at end of file diff --git a/airflow_dags/docker_with_code/Dockerfile b/airflow_dags/docker_with_code/Dockerfile deleted file mode 100644 index d31bd85..0000000 --- a/airflow_dags/docker_with_code/Dockerfile +++ /dev/null @@ -1,13 +0,0 @@ -FROM python:3.8 - -# Set working directory -WORKDIR /app - -# Copy your Python script to the container -COPY python_script.py /app/ - -# Install any dependencies if needed -#RUN pip install some_dependency - -# Set the command to execute the Python script with command-line options -CMD python python_script.py --option1 $OPTION_1 --option2 $OPTION_2 diff --git a/airflow_dags/docker_with_code/docker_include_python.py b/airflow_dags/docker_with_code/docker_include_python.py deleted file mode 100644 index fb8c747..0000000 --- a/airflow_dags/docker_with_code/docker_include_python.py +++ /dev/null @@ -1,27 +0,0 @@ -from airflow import DAG -from airflow.operators.docker_operator import DockerOperator -from datetime import datetime - -default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'start_date': datetime(2024, 3, 14) -} - -dag = DAG('docker_include_python_example', default_args=default_args, schedule_interval=None) - -docker_env_vars = { - 'OPTION_1': 'value1', - 'OPTION_2': 'value2' -} - -docker_task = DockerOperator( - task_id='docker_task', - auto_remove=True, - image='127.0.0.0:5000/test_py:v0.0.1', - container_name='test_py_001', #name of container - dag=dag, - docker_url='tcp://docker-proxy:2375', # The connection to the Docker daemon, the socket should exist in the container - network_mode='host', # The network mode for the container (internal network), if use "host" the container will share the host network - environment=docker_env_vars 
-) \ No newline at end of file diff --git a/airflow_dags/docker_with_code/python_script.py b/airflow_dags/docker_with_code/python_script.py deleted file mode 100644 index 1064e19..0000000 --- a/airflow_dags/docker_with_code/python_script.py +++ /dev/null @@ -1,17 +0,0 @@ -import argparse - -# Define command-line arguments -parser = argparse.ArgumentParser(description='Description of your script') -parser.add_argument('--option1', help='Description of option 1') -parser.add_argument('--option2', help='Description of option 2') - -# Parse command-line arguments -args = parser.parse_args() - -# Access command-line options -option1_value = args.option1 -option2_value = args.option2 - -# Use the command-line options in your script -print("Option 1:", option1_value) -print("Option 2:", option2_value) \ No newline at end of file diff --git a/airflow_dags/dockerpyscript/Dockerfile b/airflow_dags/dockerpyscript/Dockerfile deleted file mode 100644 index 88315a7..0000000 --- a/airflow_dags/dockerpyscript/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -# python-script-example-0.0.1 - -FROM python:3.9-slim -WORKDIR /app - -# Copy the current directory contents into the container at /app -COPY . /app - -# Install any needed dependencies specified in requirements.txt -RUN pip install --no-cache-dir -r requirements.txt \ No newline at end of file diff --git a/airflow_dags/dockerpyscript/docker_operation_example.py b/airflow_dags/dockerpyscript/docker_operation_example.py deleted file mode 100644 index 389c211..0000000 --- a/airflow_dags/dockerpyscript/docker_operation_example.py +++ /dev/null @@ -1,47 +0,0 @@ -from airflow import DAG -from airflow.operators.docker_operator import DockerOperator -from airflow.utils.dates import datetime, timedelta - -# Define your default arguments -default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 1, - 'retry_delay': timedelta(minutes=5), - 'start_date': datetime(2024, 1, 1) # Start date of your DAG -} - -# Define your DAG -dag = DAG( - 'example_dag_docker_operator', - default_args=default_args, - description='A simple DAG example using DockerOperator with private image', - schedule_interval='@daily', # Runs daily - catchup=False # Prevents backfilling for past intervals -) - -task_arguments = { - 'name_argument': 'Chiara' -} -# Initialize the command string -command = "python python_script.py" - -# Add each argument to the command string -for arg_name, arg_value in task_arguments.items(): - command += f" --{arg_name.replace('_', '-')} {arg_value}" - -# Define your DockerOperator -run_task = DockerOperator( - task_id='test_python_script', - container_name='test_1', - image='chiaraforresi/test:v0.0.1', # Private Docker image - command=command, # Command to run in the Docker container - docker_conn_id='docker_hub_chiaraforresi', # Connection ID for Docker Hub - dag=dag, - docker_url='tcp://docker-proxy:2375', # The connection to the Docker daemon, the socket should exist in the container - network_mode='bridge', # The network mode for the container (internal network), if use "host" the container will share the host network -) - -run_task diff --git a/airflow_dags/dockerpyscript/python_docker_example.py b/airflow_dags/dockerpyscript/python_docker_example.py deleted file mode 100644 index ab6f789..0000000 --- a/airflow_dags/dockerpyscript/python_docker_example.py +++ /dev/null @@ -1,34 +0,0 @@ -from airflow import DAG -from airflow.providers.docker.operators.docker import DockerOperator -from 
airflow.utils.dates import days_ago - -default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 1, - # Define your parameters here - 'name_argument': 'default_name', - 'test_argument': 'test' -} - -dag = DAG( - 'docker_dag', - default_args=default_args, - description='A simple DAG example', - schedule_interval='0 14,18 * * 1,4', # Runs at 14:00 on Mondays and 18:00 on Thursdays - start_date=days_ago(1), - tags=['example'] -) - -task = DockerOperator ( - task_id='docker_task', - image='python:3.9-slim', # Use the Python 3.9 image - command='echo "Hello, World Docker container!"', # If there is an entrypoint in the Dockerfile, it will be overridden by this command - docker_url='tcp://docker-proxy:2375', # The connection to the Docker daemon, the socket should exist in the container - network_mode='bridge', # The network mode for the container (internal network), if use "host" the container will share the host network - dag=dag -) - -task \ No newline at end of file diff --git a/airflow_dags/dockerpyscript/python_docker_example.sh b/airflow_dags/dockerpyscript/python_docker_example.sh deleted file mode 100644 index dfad867..0000000 --- a/airflow_dags/dockerpyscript/python_docker_example.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -echo "All passed parameters: $@" -image_name=$(head -1 Dockerfile | cut -d' ' -f2) - -echo "Image name: ${image_name}" - -if [[ "$(docker images -q $image_name 2> /dev/null)" == "" ]]; then - # If the image does not exist, build it - docker build -t $image_name . -else - echo "Docker image $image_name already exists. Skipping build." -fi - -#!/bin/bash - -# Define the name of your Docker image -image_name="your_docker_image" - -# Get the name_argument value from the DAG run configuration -name_argument="{{ dag_run.conf['name_argument'] }}" - -# Run the Docker container with the appropriate command-line arguments -#docker run --rm "$image_name" python_script.py --name-argument "$name_argument" \ No newline at end of file diff --git a/airflow_dags/dockerpyscript/python_script.py b/airflow_dags/dockerpyscript/python_script.py deleted file mode 100644 index 264cbe4..0000000 --- a/airflow_dags/dockerpyscript/python_script.py +++ /dev/null @@ -1,12 +0,0 @@ -import argparse - -def main(): - parser = argparse.ArgumentParser(description='Example script with argument parsing') - parser.add_argument('--name-argument', help='Argument passed from Airflow DAG', required=True) - args = parser.parse_args() - - # Use the argument in your Python code - print("Hello,", args) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/airflow_dags/dockerpyscript/requirements.txt b/airflow_dags/dockerpyscript/requirements.txt deleted file mode 100644 index 4f5b899..0000000 --- a/airflow_dags/dockerpyscript/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -requests==2.26.0 \ No newline at end of file From f868ba745161e86b0245b2d3564e660726e43c45 Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Thu, 28 Mar 2024 15:31:22 +0100 Subject: [PATCH 046/103] add constraint to fix deploy --- dataplatform/multiple_stacks/airflow.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index fb1bea0..f624ce7 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -132,6 +132,7 @@ services: placement: constraints: - node.role == manager + - node.hostname != 
CB-Mass-Node1 scheduler: <<: *airflow-common From 31bc52d2eccdc7b61d8c337bfcad24c7d589a7b4 Mon Sep 17 00:00:00 2001 From: manuelepasini Date: Wed, 3 Apr 2024 10:10:09 +0000 Subject: [PATCH 047/103] modified spark hist server port --- dataplatform/.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataplatform/.env b/dataplatform/.env index f896975..4049660 100644 --- a/dataplatform/.env +++ b/dataplatform/.env @@ -41,7 +41,7 @@ SPARKBIN=/opt/spark/sbin/ SPARKCONFDIR=/opt/spark/conf/ SPARKMASTERHOST=spark-master SPARKMASTERPORT=7077 -SPARKHISTSERVERPORT=18080 +SPARKHISTSERVERPORT=48080 ######################## ##### YARN env variables From 42efcfe44b623ac69dbca8f585bd6b9c435f9616 Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Fri, 5 Apr 2024 12:38:31 +0200 Subject: [PATCH 048/103] update airflow --- dataplatform/.env | 2 +- dataplatform/multiple_stacks/airflow.yaml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dataplatform/.env b/dataplatform/.env index a38a569..29ceb6c 100644 --- a/dataplatform/.env +++ b/dataplatform/.env @@ -90,4 +90,4 @@ DOCKERREGISTRYPORT=5000 GEOSERVER_PORT=40111 #AIRFLOW -AIRFLOW_WEB_SERVER_PORT=48081 \ No newline at end of file +AIRFLOW_WEB_SERVER_PORT=48091 \ No newline at end of file diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index f624ce7..37ee95d 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -81,7 +81,7 @@ services: # - node.labels.test_airflow_postgres==1 - redis: + airflow-redis: image: redis:latest expose: - 6379 @@ -115,7 +115,7 @@ services: - node.hostname != CB-Mass-Node1 - node.labels.running_airflow_webserver == 1 - webserver: + airflow-webserver: <<: *airflow-common command: webserver ports: @@ -134,7 +134,7 @@ services: - node.role == manager - node.hostname != CB-Mass-Node1 - scheduler: + airflow-scheduler: <<: *airflow-common command: scheduler # healthcheck: @@ -150,7 +150,7 @@ services: constraints: - node.labels.running_airflow_webserver == 1 - worker: + airflow-worker: <<: *airflow-common command: celery worker # healthcheck: @@ -175,7 +175,7 @@ services: - node.role == worker - node.hostname != CB-Mass-Node1 - triggerer: + airflow-triggerer: <<: *airflow-common command: triggerer # healthcheck: @@ -191,7 +191,7 @@ services: constraints: - node.labels.running_airflow_webserver == 1 - init: + airflow-init: <<: *airflow-common entrypoint: /bin/bash # yamllint disable rule:line-length From dfd021d3d9bb85c508ab6c0a87ab791d91fcad1b Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Fri, 5 Apr 2024 12:50:12 +0200 Subject: [PATCH 049/103] update airflow --- dataplatform/multiple_stacks/airflow.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index 37ee95d..5c1f864 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -81,7 +81,7 @@ services: # - node.labels.test_airflow_postgres==1 - airflow-redis: + redis: image: redis:latest expose: - 6379 From 20eab33fd4f96e4205b67dd15c1b80fb488c8f79 Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Fri, 5 Apr 2024 15:58:51 +0200 Subject: [PATCH 050/103] update readme --- airflow_dags/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airflow_dags/README.md b/airflow_dags/README.md index 2dd9c13..476e4d9 100644 --- a/airflow_dags/README.md +++ 
b/airflow_dags/README.md @@ -34,7 +34,7 @@ Start example: - **mount_tmp_dir**=False, not mount a temporary directory - **container_name**=similar to task name - **placement** -- **network_mode** +- **network_mode** and **networks** use BIG-dataplatform-network - For extra things refer to the official documentation ### Trigger a dag from python application @@ -43,7 +43,7 @@ through the `python-service-interaction-utils/src/main/python/airflow_interactio ### Common errors in the deploy - not pass files that are in .gitignore in the build of the container -- use public ip, we are inside the cluster :D +- use service names and internal ports when refer to other services (not use exposed ports) - set the link to services in the config to the new clusters (e.g., hdfs) - Errors in dag import: enter inside airflow-scheduler container and launch `airflow scheduler` From e7d14b0691d658886c4629d65930cd646af2cc7f Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 10 Apr 2024 11:07:54 +0200 Subject: [PATCH 051/103] add airflow env and change deploy-swarm.sh --- .gitignore | 2 ++ dataplatform/.airflow.env.example | 8 +++++++ dataplatform/multiple_stacks/airflow.yaml | 8 +++---- dataplatform/multiple_stacks/deploy-swarm.sh | 25 ++++++++++++++++---- 4 files changed, 35 insertions(+), 8 deletions(-) create mode 100644 dataplatform/.airflow.env.example diff --git a/.gitignore b/.gitignore index d0b773d..a819446 100644 --- a/.gitignore +++ b/.gitignore @@ -639,3 +639,5 @@ fabric.propertyDataList /python-utils/src/main/python/gmail/credentials.json /python-utils/src/main/python/gmail/token.pickle *.png + +*.airflow.env \ No newline at end of file diff --git a/dataplatform/.airflow.env.example b/dataplatform/.airflow.env.example new file mode 100644 index 0000000..eb56356 --- /dev/null +++ b/dataplatform/.airflow.env.example @@ -0,0 +1,8 @@ +AIRFLOW_PASSWORD=airflow_psw +AIRFLOW_USER=airflow +AIRFLOW_DB=airflowDB:port/databaseName +AIRFLOW_SMTP_SMTP_HOST=smtp_mail +AIRFLOW_SMTP_SMTP_PORT=smtp_port +AIRFLOW_SMTP_SMTP_USER=mail_user +AIRFLOW_SMTP_SMTP_PASSWORD=mail_psw +AIRFLOW_SMTP_SMTP_MAIL_FROM=mail \ No newline at end of file diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index 5c1f864..169f016 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -33,8 +33,8 @@ x-airflow-common: environment: &airflow-common-env AIRFLOW__CORE__EXECUTOR: CeleryExecutor - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow #TODO github secret - AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow #TODO github secret + AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://$AIRFLOW_USER:$AIRFLOW_PASSWORD@$AIRFLOW_DB + AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://$AIRFLOW_USER:$AIRFLOW_PASSWORD@$AIRFLOW_DB AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 AIRFLOW__CORE__FERNET_KEY: '' AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' @@ -215,8 +215,8 @@ services: <<: *airflow-common-env _AIRFLOW_DB_MIGRATE: 'false' #this is for copy data _AIRFLOW_WWW_USER_CREATE: 'true' - _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} - _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} #TODO github secret + _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-$AIRFLOW_USER} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-$AIRFLOW_PASSWORD} _PIP_ADDITIONAL_REQUIREMENTS: '' user: "0:0" volumes: diff --git 
a/dataplatform/multiple_stacks/deploy-swarm.sh b/dataplatform/multiple_stacks/deploy-swarm.sh index 9559493..72d6545 100755 --- a/dataplatform/multiple_stacks/deploy-swarm.sh +++ b/dataplatform/multiple_stacks/deploy-swarm.sh @@ -7,20 +7,37 @@ set +o allexport mkdir -p runtime -files_matching_criteria="$@" +# Check if the substitution file path is provided as a command-line argument +if [ $# -eq 2 ]; then + env_file="$1" + substitution_file="$2" +else + env_file="./../.env" + substitution_file="" +fi + +files_matching_criteria="$1" +# Second argument for the substitution file +substitution_file="$2" if [ -z "$files_matching_criteria" ]; then # If $files_matching_criteria is empty, assign it all files matching the criteria files_matching_criteria=$(find . -type f -name "*.yaml") else - echo "Variable \$@ is not empty: $@" + echo "Variable \$1 is not empty: $1" fi for stack in $files_matching_criteria do stack="${stack%.yaml}" stack="${stack#./}" - # Altrimenti, effettua la sostituzione delle variabili di ambiente - envsubst < "${stack}.yaml" > "runtime/${stack}-subs.yaml" + # Perform substitution using both substitution file and .env file + if [ -f "$substitution_file" ]; then + # Use substitution from the provided file if present + envsubst < "${stack}.yaml" | envsubst "$(cat "$substitution_file")" > "runtime/${stack}-subs.yaml" + else + # Use only .env file for substitution + envsubst < "${stack}.yaml" > "runtime/${stack}-subs.yaml" + fi # Deploya lo stack docker stack deploy -c "runtime/${stack}-subs.yaml" "${ENVIRONMENTNAME}-${stack}" done From 36871047496a73be893f826d6c46d4b20bc5f7dd Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 10 Apr 2024 11:20:09 +0200 Subject: [PATCH 052/103] try to fix variable substitution --- dataplatform/multiple_stacks/deploy-swarm.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataplatform/multiple_stacks/deploy-swarm.sh b/dataplatform/multiple_stacks/deploy-swarm.sh index 72d6545..d61fd95 100755 --- a/dataplatform/multiple_stacks/deploy-swarm.sh +++ b/dataplatform/multiple_stacks/deploy-swarm.sh @@ -33,7 +33,8 @@ do # Perform substitution using both substitution file and .env file if [ -f "$substitution_file" ]; then # Use substitution from the provided file if present - envsubst < "${stack}.yaml" | envsubst "$(cat "$substitution_file")" > "runtime/${stack}-subs.yaml" + env_vars=$(cat "$substitution_file" | sed 's/[^a-zA-Z0-9_]/\\&/g' | sed -e '/^$/d' | xargs) + envsubst "$env_vars" < "${stack}.yaml" > "runtime/${stack}-subs.yaml" else # Use only .env file for substitution envsubst < "${stack}.yaml" > "runtime/${stack}-subs.yaml" From 3640b5238378bd4ee601081ccf178cb16b61a43f Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 10 Apr 2024 11:28:02 +0200 Subject: [PATCH 053/103] fix variable substitution --- dataplatform/multiple_stacks/deploy-swarm.sh | 22 ++++++-------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/dataplatform/multiple_stacks/deploy-swarm.sh b/dataplatform/multiple_stacks/deploy-swarm.sh index d61fd95..1e60aab 100755 --- a/dataplatform/multiple_stacks/deploy-swarm.sh +++ b/dataplatform/multiple_stacks/deploy-swarm.sh @@ -3,22 +3,19 @@ set -ex set -o allexport source ./../.env -set +o allexport mkdir -p runtime # Check if the substitution file path is provided as a command-line argument if [ $# -eq 2 ]; then - env_file="$1" substitution_file="$2" -else - env_file="./../.env" - substitution_file="" + echo "Using external substitution $substitution_file" + source $substitution_file 
fi +set +o allexport + files_matching_criteria="$1" -# Second argument for the substitution file -substitution_file="$2" if [ -z "$files_matching_criteria" ]; then # If $files_matching_criteria is empty, assign it all files matching the criteria files_matching_criteria=$(find . -type f -name "*.yaml") @@ -30,15 +27,8 @@ for stack in $files_matching_criteria do stack="${stack%.yaml}" stack="${stack#./}" - # Perform substitution using both substitution file and .env file - if [ -f "$substitution_file" ]; then - # Use substitution from the provided file if present - env_vars=$(cat "$substitution_file" | sed 's/[^a-zA-Z0-9_]/\\&/g' | sed -e '/^$/d' | xargs) - envsubst "$env_vars" < "${stack}.yaml" > "runtime/${stack}-subs.yaml" - else - # Use only .env file for substitution - envsubst < "${stack}.yaml" > "runtime/${stack}-subs.yaml" - fi + # Altrimenti, effettua la sostituzione delle variabili di ambiente + envsubst < "${stack}.yaml" > "runtime/${stack}-subs.yaml" # Deploya lo stack docker stack deploy -c "runtime/${stack}-subs.yaml" "${ENVIRONMENTNAME}-${stack}" done From d535c030085a4cdd57f172fddb4315861f09a11f Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 10 Apr 2024 11:31:53 +0200 Subject: [PATCH 054/103] add mail configuration for airflow --- dataplatform/multiple_stacks/airflow.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index 169f016..86615e6 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -49,6 +49,12 @@ x-airflow-common: # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks # for other purpose (development, test and especially production usage) build/extend Airflow image. _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} #apache-airflow-providers-docker apache-airflow-providers-apache-spark + #mail parameters + AIRFLOW__SMTP__SMTP_HOST: $AIRFLOW_SMTP_SMTP_HOST + AIRFLOW__SMTP__SMTP_PORT: $AIRFLOW_SMTP_SMTP_PORT + AIRFLOW__SMTP__SMTP_USER: $AIRFLOW_SMTP_SMTP_USER + AIRFLOW__SMTP__SMTP_PASSWORD: $AIRFLOW_SMTP_SMTP_PASSWORD + AIRFLOW__SMTP__SMTP_MAIL_FROM: $AIRFLOW_SMTP_SMTP_MAIL_FROM volumes: - airflow_data:/opt/airflow/ user: "${AIRFLOW_UID:-50000}:0" From d093239fdb571022f33470c3b0fc7bae7885e377 Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 10 Apr 2024 11:43:20 +0200 Subject: [PATCH 055/103] add mail configuration for airflow --- dataplatform/multiple_stacks/airflow.yaml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index 86615e6..21b2fc8 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -49,12 +49,6 @@ x-airflow-common: # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks # for other purpose (development, test and especially production usage) build/extend Airflow image. 
_PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} #apache-airflow-providers-docker apache-airflow-providers-apache-spark - #mail parameters - AIRFLOW__SMTP__SMTP_HOST: $AIRFLOW_SMTP_SMTP_HOST - AIRFLOW__SMTP__SMTP_PORT: $AIRFLOW_SMTP_SMTP_PORT - AIRFLOW__SMTP__SMTP_USER: $AIRFLOW_SMTP_SMTP_USER - AIRFLOW__SMTP__SMTP_PASSWORD: $AIRFLOW_SMTP_SMTP_PASSWORD - AIRFLOW__SMTP__SMTP_MAIL_FROM: $AIRFLOW_SMTP_SMTP_MAIL_FROM volumes: - airflow_data:/opt/airflow/ user: "${AIRFLOW_UID:-50000}:0" @@ -124,6 +118,15 @@ services: airflow-webserver: <<: *airflow-common command: webserver + environment: + #mail parameters + - AIRFLOW__SMTP__SMTP_HOST=$AIRFLOW_SMTP_SMTP_HOST + - AIRFLOW__SMTP__SMTP_PORT=$AIRFLOW_SMTP_SMTP_PORT + - AIRFLOW__SMTP__SMTP_USER=$AIRFLOW_SMTP_SMTP_USER + - AIRFLOW__SMTP__SMTP_PASSWORD=$AIRFLOW_SMTP_SMTP_PASSWORD + - AIRFLOW__SMTP__SMTP_MAIL_FROM=$AIRFLOW_SMTP_SMTP_MAIL_FROM + - AIRFLOW__SMTP__SMTP_STARTTLS=True + - AIRFLOW__SMTP__SMTP_SSL=False ports: - ${AIRFLOW_WEB_SERVER_PORT}:8080 # healthcheck: From 5fe29d4ea994f233a912c6b35fa437adfe1d0da7 Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 10 Apr 2024 11:48:32 +0200 Subject: [PATCH 056/103] add mail configuration for airflow --- dataplatform/multiple_stacks/airflow.yaml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index 21b2fc8..0e6bc3f 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -49,6 +49,14 @@ x-airflow-common: # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks # for other purpose (development, test and especially production usage) build/extend Airflow image. 
_PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} #apache-airflow-providers-docker apache-airflow-providers-apache-spark + #mail parameters + AIRFLOW__SMTP__SMTP_HOST: $AIRFLOW_SMTP_SMTP_HOST + AIRFLOW__SMTP__SMTP_PORT: $AIRFLOW_SMTP_SMTP_PORT + AIRFLOW__SMTP__SMTP_USER: $AIRFLOW_SMTP_SMTP_USER + AIRFLOW__SMTP__SMTP_PASSWORD: $AIRFLOW_SMTP_SMTP_PASSWORD + AIRFLOW__SMTP__SMTP_MAIL_FROM: $AIRFLOW_SMTP_SMTP_MAIL_FROM + AIRFLOW__SMTP__SMTP_STARTTLS: True + AIRFLOW__SMTP__SMTP_SSL: False volumes: - airflow_data:/opt/airflow/ user: "${AIRFLOW_UID:-50000}:0" @@ -118,15 +126,6 @@ services: airflow-webserver: <<: *airflow-common command: webserver - environment: - #mail parameters - - AIRFLOW__SMTP__SMTP_HOST=$AIRFLOW_SMTP_SMTP_HOST - - AIRFLOW__SMTP__SMTP_PORT=$AIRFLOW_SMTP_SMTP_PORT - - AIRFLOW__SMTP__SMTP_USER=$AIRFLOW_SMTP_SMTP_USER - - AIRFLOW__SMTP__SMTP_PASSWORD=$AIRFLOW_SMTP_SMTP_PASSWORD - - AIRFLOW__SMTP__SMTP_MAIL_FROM=$AIRFLOW_SMTP_SMTP_MAIL_FROM - - AIRFLOW__SMTP__SMTP_STARTTLS=True - - AIRFLOW__SMTP__SMTP_SSL=False ports: - ${AIRFLOW_WEB_SERVER_PORT}:8080 # healthcheck: From 47d34bdbb52914e580036e032fcb581e8cd92d8f Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 10 Apr 2024 11:49:28 +0200 Subject: [PATCH 057/103] add mail configuration for airflow --- dataplatform/multiple_stacks/airflow.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index 0e6bc3f..013e7da 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -55,8 +55,8 @@ x-airflow-common: AIRFLOW__SMTP__SMTP_USER: $AIRFLOW_SMTP_SMTP_USER AIRFLOW__SMTP__SMTP_PASSWORD: $AIRFLOW_SMTP_SMTP_PASSWORD AIRFLOW__SMTP__SMTP_MAIL_FROM: $AIRFLOW_SMTP_SMTP_MAIL_FROM - AIRFLOW__SMTP__SMTP_STARTTLS: True - AIRFLOW__SMTP__SMTP_SSL: False + AIRFLOW__SMTP__SMTP_STARTTLS: "true" + AIRFLOW__SMTP__SMTP_SSL: "false" volumes: - airflow_data:/opt/airflow/ user: "${AIRFLOW_UID:-50000}:0" From bc164eb5a9bd2af090e86c4675df083fab5d2673 Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 10 Apr 2024 13:03:24 +0200 Subject: [PATCH 058/103] try to fix problems --- dataplatform/multiple_stacks/airflow.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index 013e7da..ed334ec 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -55,8 +55,9 @@ x-airflow-common: AIRFLOW__SMTP__SMTP_USER: $AIRFLOW_SMTP_SMTP_USER AIRFLOW__SMTP__SMTP_PASSWORD: $AIRFLOW_SMTP_SMTP_PASSWORD AIRFLOW__SMTP__SMTP_MAIL_FROM: $AIRFLOW_SMTP_SMTP_MAIL_FROM - AIRFLOW__SMTP__SMTP_STARTTLS: "true" - AIRFLOW__SMTP__SMTP_SSL: "false" + AIRFLOW__SMTP__SMTP_STARTTLS: "True" + AIRFLOW__SMTP__SMTP_SSL: "False" + AIRFLOW_CONFIG: "/opt/airflow/airflow.cfg" volumes: - airflow_data:/opt/airflow/ user: "${AIRFLOW_UID:-50000}:0" From d7422650c21ba38541d344c09ca7510602169e69 Mon Sep 17 00:00:00 2001 From: Chiara Forresi Date: Wed, 10 Apr 2024 13:08:51 +0200 Subject: [PATCH 059/103] try to fix problems --- dataplatform/multiple_stacks/airflow.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dataplatform/multiple_stacks/airflow.yaml b/dataplatform/multiple_stacks/airflow.yaml index ed334ec..99ec1c5 100644 --- a/dataplatform/multiple_stacks/airflow.yaml +++ b/dataplatform/multiple_stacks/airflow.yaml @@ -50,13 +50,14 @@ x-airflow-common: 
# for other purpose (development, test and especially production usage) build/extend Airflow image. _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} #apache-airflow-providers-docker apache-airflow-providers-apache-spark #mail parameters + AIRFLOW__SMTP__EMAIL_BACKEND: 'airflow.utils.email.send_email_smtp' AIRFLOW__SMTP__SMTP_HOST: $AIRFLOW_SMTP_SMTP_HOST AIRFLOW__SMTP__SMTP_PORT: $AIRFLOW_SMTP_SMTP_PORT AIRFLOW__SMTP__SMTP_USER: $AIRFLOW_SMTP_SMTP_USER AIRFLOW__SMTP__SMTP_PASSWORD: $AIRFLOW_SMTP_SMTP_PASSWORD AIRFLOW__SMTP__SMTP_MAIL_FROM: $AIRFLOW_SMTP_SMTP_MAIL_FROM - AIRFLOW__SMTP__SMTP_STARTTLS: "True" - AIRFLOW__SMTP__SMTP_SSL: "False" + AIRFLOW__SMTP__SMTP_STARTTLS: "true" + AIRFLOW__SMTP__SMTP_SSL: "false" AIRFLOW_CONFIG: "/opt/airflow/airflow.cfg" volumes: - airflow_data:/opt/airflow/ From ddd11bb8017ae4de017f248e714508f77090c5fa Mon Sep 17 00:00:00 2001 From: Manuele Pasini Date: Tue, 7 May 2024 14:59:40 +0200 Subject: [PATCH 060/103] Added sftp server --- dataplatform/.env | 4 ++++ dataplatform/multiple_stacks/SFTP.yaml | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 dataplatform/multiple_stacks/SFTP.yaml diff --git a/dataplatform/.env b/dataplatform/.env index 4049660..1b4065e 100644 --- a/dataplatform/.env +++ b/dataplatform/.env @@ -5,6 +5,7 @@ NFSADDRESS=192.168.30.249 NFSPATH=/nfsshare SSHSERVERPORTEXT=40022 SSHSERVERPORT=20 + #Docker image version HADOOPVERSION=3.3.6 SPARKVERSION=latest @@ -81,3 +82,6 @@ MOSQUITTO_PORT_EXT=48081 MOSQUITTO_PORT_EXT_TLS=48883 MOSQUITTO_USER=foo MOSQUITTO_PWD=bar + +#SFTP +SFTPPORT=49096 \ No newline at end of file diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml new file mode 100644 index 0000000..b734973 --- /dev/null +++ b/dataplatform/multiple_stacks/SFTP.yaml @@ -0,0 +1,26 @@ +version: "3.9" +services: + sftp: + image: atmoz/sftp + volumes: + - pnrr_datalake:/home/ + ports: + - ${SFTPPORT}:22" + deploy: + placement: + constraints: + - node.hostname != CB-Mass-Node1 + networks: + - BIG-dataplatform-network + +volumes: + pnrr_datalake: + driver_opts: + type: nfs + o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard + device: ":${NFSPATH}/pnrr_datalake/" + +networks: + BIG-dataplatform-network: + external: true + name: BIG-dataplatform-network From 318f49a1317d139a1f53466be0162b686ac644c0 Mon Sep 17 00:00:00 2001 From: Manuele Pasini Date: Tue, 7 May 2024 15:01:32 +0200 Subject: [PATCH 061/103] Fix: port error in SFTP server --- dataplatform/multiple_stacks/SFTP.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml index b734973..70b6f08 100644 --- a/dataplatform/multiple_stacks/SFTP.yaml +++ b/dataplatform/multiple_stacks/SFTP.yaml @@ -5,7 +5,7 @@ services: volumes: - pnrr_datalake:/home/ ports: - - ${SFTPPORT}:22" + - ${SFTPPORT}:22 deploy: placement: constraints: From d6f31f3d94a86b7e3127fc4beae32824e33a7439 Mon Sep 17 00:00:00 2001 From: Manuele Pasini Date: Tue, 7 May 2024 15:08:15 +0200 Subject: [PATCH 062/103] Dev: Updating SFTP --- dataplatform/multiple_stacks/SFTP.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml index 70b6f08..25867e5 100644 --- a/dataplatform/multiple_stacks/SFTP.yaml +++ b/dataplatform/multiple_stacks/SFTP.yaml @@ -3,9 +3,10 @@ services: sftp: image: atmoz/sftp volumes: - - pnrr_datalake:/home/ + - pnrr_datalake:/home/ 
ports: - - ${SFTPPORT}:22 + - ${SFTPPORT}:22 + command: mpasini:pass:1001 deploy: placement: constraints: From 00c482987ca164d56efee5d9a2e143ef839875c7 Mon Sep 17 00:00:00 2001 From: Manuele Pasini Date: Tue, 7 May 2024 15:33:44 +0200 Subject: [PATCH 063/103] Fix: Updating hostkeys --- dataplatform/multiple_stacks/SFTP.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml index 25867e5..f090c8f 100644 --- a/dataplatform/multiple_stacks/SFTP.yaml +++ b/dataplatform/multiple_stacks/SFTP.yaml @@ -4,6 +4,7 @@ services: image: atmoz/sftp volumes: - pnrr_datalake:/home/ + - host_keys:/etc/ssh/ ports: - ${SFTPPORT}:22 command: mpasini:pass:1001 @@ -20,6 +21,12 @@ volumes: type: nfs o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard device: ":${NFSPATH}/pnrr_datalake/" + + host_keys: + driver_opts: + type: nfs + o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard + device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/" networks: BIG-dataplatform-network: From 69da1d187f6a09b37d2ca7b85f17a22bc103261c Mon Sep 17 00:00:00 2001 From: Manuele Pasini Date: Tue, 7 May 2024 15:43:37 +0200 Subject: [PATCH 064/103] Fix: updating keys --- dataplatform/multiple_stacks/SFTP.yaml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml index f090c8f..e08d6de 100644 --- a/dataplatform/multiple_stacks/SFTP.yaml +++ b/dataplatform/multiple_stacks/SFTP.yaml @@ -4,10 +4,11 @@ services: image: atmoz/sftp volumes: - pnrr_datalake:/home/ - - host_keys:/etc/ssh/ + - host_key_ed25519:/etc/ssh/ssh_host_ed25519_key + - host_key_rsa:/etc/ssh/ssh_host_rsa_key ports: - ${SFTPPORT}:22 - command: mpasini:pass:1001 + command: admin::1001 deploy: placement: constraints: @@ -21,12 +22,18 @@ volumes: type: nfs o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard device: ":${NFSPATH}/pnrr_datalake/" - - host_keys: + + host_key_ed25519: + driver_opts: + type: nfs + o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard + device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/ssh_host_ed25519_key" + + host_key_rsa: driver_opts: type: nfs o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard - device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/" + device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/ssh_host_rsa_key" networks: BIG-dataplatform-network: From dc4809f543da7203a0f51b5b8215df92f236d690 Mon Sep 17 00:00:00 2001 From: Manuele Pasini Date: Tue, 7 May 2024 15:49:11 +0200 Subject: [PATCH 065/103] Fix --- dataplatform/multiple_stacks/SFTP.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml index e08d6de..03fb4ce 100644 --- a/dataplatform/multiple_stacks/SFTP.yaml +++ b/dataplatform/multiple_stacks/SFTP.yaml @@ -27,13 +27,13 @@ volumes: driver_opts: type: nfs o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard - device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/ssh_host_ed25519_key" + device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/ssh_host_ed25519_key.pub" host_key_rsa: driver_opts: type: nfs o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard - device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/ssh_host_rsa_key" + device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/ssh_host_rsa_key.pub" networks: BIG-dataplatform-network: From b53b655621a76b35cb3ec6192a1d9e9767f92a2e Mon Sep 17 00:00:00 2001 From: Manuele Pasini Date: Tue, 7 May 2024 15:52:13 +0200 
Subject: [PATCH 066/103] Fix --- dataplatform/multiple_stacks/SFTP.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml index 03fb4ce..4de5272 100644 --- a/dataplatform/multiple_stacks/SFTP.yaml +++ b/dataplatform/multiple_stacks/SFTP.yaml @@ -4,8 +4,9 @@ services: image: atmoz/sftp volumes: - pnrr_datalake:/home/ - - host_key_ed25519:/etc/ssh/ssh_host_ed25519_key - - host_key_rsa:/etc/ssh/ssh_host_rsa_key + # - host_key_ed25519:/etc/ssh/ssh_host_ed25519_key + # - host_key_rsa:/etc/ssh/ssh_host_rsa_key + - host_keys: /etc/ssh/ ports: - ${SFTPPORT}:22 command: admin::1001 @@ -34,6 +35,12 @@ volumes: type: nfs o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/ssh_host_rsa_key.pub" + + host_keys: + driver_opts: + type: nfs + o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard + device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/" networks: BIG-dataplatform-network: From 2c30bda3d7cfa70094bcb0c288aca41c7f5262ef Mon Sep 17 00:00:00 2001 From: Manuele Pasini Date: Wed, 8 May 2024 09:57:23 +0200 Subject: [PATCH 067/103] DEV: implementing SFTP --- dataplatform/multiple_stacks/SFTP.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml index 4de5272..8debf0d 100644 --- a/dataplatform/multiple_stacks/SFTP.yaml +++ b/dataplatform/multiple_stacks/SFTP.yaml @@ -3,10 +3,11 @@ services: sftp: image: atmoz/sftp volumes: - - pnrr_datalake:/home/ + - pnrr_datalake:/home # - host_key_ed25519:/etc/ssh/ssh_host_ed25519_key # - host_key_rsa:/etc/ssh/ssh_host_rsa_key - - host_keys: /etc/ssh/ + - host_keys:/etc/ssh + - users:/etc/sftp/users.conf:ro ports: - ${SFTPPORT}:22 command: admin::1001 @@ -36,6 +37,12 @@ volumes: o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/ssh_host_rsa_key.pub" + users: + driver_opts: + type: nfs + o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard + device: ":${NFSPATH}/pnrr_users/" + host_keys: driver_opts: type: nfs From a4c2d2b1407d26700e63abb24d08967f944929d8 Mon Sep 17 00:00:00 2001 From: manuelepasini Date: Wed, 8 May 2024 12:21:20 +0000 Subject: [PATCH 068/103] Working SFTP server --- dataplatform/multiple_stacks/SFTP.yaml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml index 8debf0d..5921dfe 100644 --- a/dataplatform/multiple_stacks/SFTP.yaml +++ b/dataplatform/multiple_stacks/SFTP.yaml @@ -6,8 +6,8 @@ services: - pnrr_datalake:/home # - host_key_ed25519:/etc/ssh/ssh_host_ed25519_key # - host_key_rsa:/etc/ssh/ssh_host_rsa_key - - host_keys:/etc/ssh - - users:/etc/sftp/users.conf:ro + - host_keys:/etc/ssh/ + - users:/etc/sftp ports: - ${SFTPPORT}:22 command: admin::1001 @@ -25,18 +25,6 @@ volumes: o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard device: ":${NFSPATH}/pnrr_datalake/" - host_key_ed25519: - driver_opts: - type: nfs - o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard - device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/ssh_host_ed25519_key.pub" - - host_key_rsa: - driver_opts: - type: nfs - o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard - device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/ssh_host_rsa_key.pub" - users: driver_opts: type: nfs From d53d8ac1c2df493fcff9c11fbce2897c608a2c5d Mon Sep 17 00:00:00 2001 From: Manuele Pasini 
From d53d8ac1c2df493fcff9c11fbce2897c608a2c5d Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Wed, 8 May 2024 16:27:47 +0200
Subject: [PATCH 069/103] Refactor: updated NFS folders

---
 dataplatform/multiple_stacks/SFTP.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml
index 5921dfe..6f0bef2 100644
--- a/dataplatform/multiple_stacks/SFTP.yaml
+++ b/dataplatform/multiple_stacks/SFTP.yaml
@@ -23,19 +23,19 @@ volumes:
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
-      device: ":${NFSPATH}/pnrr_datalake/"
+      device: ":${NFSPATH}/pnrr_dataplatform/datalake/"
 
   users:
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
-      device: ":${NFSPATH}/pnrr_users/"
+      device: ":${NFSPATH}/pnrr_dataplatform/users/"
 
   host_keys:
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
-      device: ":${NFSPATH}/dataplatform_config/sftp_hostkeys/"
+      device: ":${NFSPATH}/pnrr_dataplatform/sftp_hostkeys/"
 
 networks:
   BIG-dataplatform-network:

From e65b603a692c09f5cd2d4304f7d7b66a2b1a6b70 Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 9 May 2024 10:33:40 +0200
Subject: [PATCH 070/103] Dev: SFTP install ACL

---
 dataplatform/multiple_stacks/SFTP.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml
index 6f0bef2..d90844b 100644
--- a/dataplatform/multiple_stacks/SFTP.yaml
+++ b/dataplatform/multiple_stacks/SFTP.yaml
@@ -11,6 +11,12 @@ services:
     ports:
       - ${SFTPPORT}:22
     command: admin::1001
+    command:
+      - /bin/bash
+      - -c
+      - |
+        apt-get update
+        apt-get install acl
     deploy:
       placement:
         constraints:

From 2bcdd0ed0417929d4a6db79804c1796ad3b92589 Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 9 May 2024 10:39:31 +0200
Subject: [PATCH 071/103] Fix: keeping service alive

---
 dataplatform/multiple_stacks/SFTP.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml
index d90844b..85d3993 100644
--- a/dataplatform/multiple_stacks/SFTP.yaml
+++ b/dataplatform/multiple_stacks/SFTP.yaml
@@ -10,13 +10,14 @@ services:
     ports:
       - ${SFTPPORT}:22
-    command: admin::1001
+    # command: admin::1001
     command:
       - /bin/bash
       - -c
       - |
         apt-get update
         apt-get install acl
+        tail -f /dev/null
     deploy:
       placement:
         constraints:

From 4946c6944c5b3fc94d7d39a24e19cb41063fa03b Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 9 May 2024 10:50:22 +0200
Subject: [PATCH 072/103] Fix: acl implementations

---
 dataplatform/multiple_stacks/SFTP.yaml | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml
index 85d3993..c5bb6eb 100644
--- a/dataplatform/multiple_stacks/SFTP.yaml
+++ b/dataplatform/multiple_stacks/SFTP.yaml
@@ -11,13 +11,7 @@ services:
     ports:
       - ${SFTPPORT}:22
     # command: admin::1001
-    command:
-      - /bin/bash
-      - -c
-      - |
-        apt-get update
-        apt-get install acl
-        tail -f /dev/null
+    command: ["apt-get update", "apt-get install acl"]
     deploy:
       placement:
         constraints:
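[Editor's note] The exec-form command in PATCH 072 hands "apt-get update" and "apt-get install acl" to the image's entrypoint as positional arguments, so neither runs as a command, and the SFTP daemon is never started with a user. A sketch of the usual pattern, installing the package and then handing control back to the image's startup script (assuming atmoz/sftp's entrypoint lives at /entrypoint, with the user spec from the earlier patches):

    command:
      - /bin/bash
      - -c
      - |
        apt-get update && apt-get install -y acl
        exec /entrypoint admin::1001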
From 094cbb285bb0b5c5a7921e8115d528de20e2db63 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Thu, 16 May 2024 14:41:39 +0200
Subject: [PATCH 073/103] remove .env from root and update spark version
 instead of using latest

---
 .env | 27 ---------------------------
 dataplatform/.env | 2 +-
 2 files changed, 1 insertion(+), 28 deletions(-)
 delete mode 100644 .env

diff --git a/.env b/.env
deleted file mode 100644
index 2b3727e..0000000
--- a/.env
+++ /dev/null
@@ -1,27 +0,0 @@
-#Docker image version
-HADOOPVERSION=3.3.6
-SPARKVERSION=spark:3.5.1-scala2.12-java17-python3-r-ubuntu
-
-#Utils variables
-HADOOPCONFDIR=/opt/hadoop/etc/hadoop
-SPARKCONFDIR=/opt/spark/conf
-CLUSTERNAME=TEST
-
-#Name node volume dir
-NAMEDIR=/hadoop/dfs/name
-#Data node volume dir
-DATADIR=/hadoop/dfs/data
-#Journal node volume dir
-JOURNALDIR=/data/journalnode
-
-# Ports setup
-SPARKMASTERPORT=7077
-SPARKMASTERHOST=spark-master
-SPARKHISTSERVERPORT=18080
-
-NAMENODEPORT=9870
-NAMENODERPCPORT=9000
-NAMENODE2PORT=9871
-
-RESOURCEMANAGERPORT=8088
-YARNHISTSERVERPORT=19888
diff --git a/dataplatform/.env b/dataplatform/.env
index 929668a..c5dc655 100644
--- a/dataplatform/.env
+++ b/dataplatform/.env
@@ -8,7 +8,7 @@ SSHSERVERPORT=20
 
 #Docker image version
 HADOOPVERSION=3.3.6
-SPARKVERSION=latest
+SPARKVERSION=spark:3.5.1-scala2.12-java17-python3-r-ubuntu
 ZOOKEEPERVERSION=latest
 
 #KAFKA

From 4d81bbe9538a9469fad9794717eeae690fb1c345 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Thu, 16 May 2024 14:43:29 +0200
Subject: [PATCH 074/103] update spark version

---
 dataplatform/.env | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataplatform/.env b/dataplatform/.env
index c5dc655..e708f35 100644
--- a/dataplatform/.env
+++ b/dataplatform/.env
@@ -8,7 +8,7 @@ SSHSERVERPORT=20
 
 #Docker image version
 HADOOPVERSION=3.3.6
-SPARKVERSION=spark:3.5.1-scala2.12-java17-python3-r-ubuntu
+SPARKVERSION=3.5.1-scala2.12-java17-python3-r-ubuntu
 ZOOKEEPERVERSION=latest
 
 #KAFKA

From c2e32dc001fd27e52824b23018d0ebcfb035b2fb Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Thu, 16 May 2024 15:17:19 +0200
Subject: [PATCH 075/103] refactor and move legacy stack

---
 .../geoserver-hdfs/geoserver-hdfs-client-script.sh | 0
 .../geoserver-hdfs/geoserver-hdfs.yaml | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename dataplatform/{multiple_stacks => legacy_stack}/geoserver-hdfs/geoserver-hdfs-client-script.sh (100%)
 rename dataplatform/{multiple_stacks => legacy_stack}/geoserver-hdfs/geoserver-hdfs.yaml (100%)

diff --git a/dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs-client-script.sh b/dataplatform/legacy_stack/geoserver-hdfs/geoserver-hdfs-client-script.sh
similarity index 100%
rename from dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs-client-script.sh
rename to dataplatform/legacy_stack/geoserver-hdfs/geoserver-hdfs-client-script.sh
diff --git a/dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml b/dataplatform/legacy_stack/geoserver-hdfs/geoserver-hdfs.yaml
similarity index 100%
rename from dataplatform/multiple_stacks/geoserver-hdfs/geoserver-hdfs.yaml
rename to dataplatform/legacy_stack/geoserver-hdfs/geoserver-hdfs.yaml
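[Editor's note] The two-step fix in PATCHES 073/074 comes down to how SPARKVERSION is spliced into an image reference: if the Spark stack files use a line like the sketch below (an assumption, those stack files are not shown in this series), the variable must hold only the tag, never repo:tag.

    # hypothetical fragment of a Spark stack file
    image: apache/spark:${SPARKVERSION}   # expands to apache/spark:3.5.1-scala2.12-java17-python3-r-ubuntu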
From 98362acff9bfcfe65ee6234d866c72be78569b78 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 26 Jun 2024 15:53:33 +0200
Subject: [PATCH 076/103] utils add docker cleaner

---
 dataplatform/images/docker-cleaner/Dockerfile | 4 +++
 .../images/docker-cleaner/build_container.sh | 6 ++++
 .../docker-cleaner/dcoker-cleaner-dag.py | 33 +++++++++++++++++++
 3 files changed, 43 insertions(+)
 create mode 100644 dataplatform/images/docker-cleaner/Dockerfile
 create mode 100644 dataplatform/images/docker-cleaner/build_container.sh
 create mode 100644 dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py

diff --git a/dataplatform/images/docker-cleaner/Dockerfile b/dataplatform/images/docker-cleaner/Dockerfile
new file mode 100644
index 0000000..97f8be7
--- /dev/null
+++ b/dataplatform/images/docker-cleaner/Dockerfile
@@ -0,0 +1,4 @@
+FROM centos:centos7
+
+# Run the script
+CMD ["/bin/sh", "docker container prune -f", "docker image prune -f"]
\ No newline at end of file
diff --git a/dataplatform/images/docker-cleaner/build_container.sh b/dataplatform/images/docker-cleaner/build_container.sh
new file mode 100644
index 0000000..001a17f
--- /dev/null
+++ b/dataplatform/images/docker-cleaner/build_container.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+VERSION=v0.0.1
+IMAGE_NAME=127.0.0.0:5000/docker_cleaner:$VERSION
+docker image build --tag $IMAGE_NAME -f ./Dockerfile .
+docker push $IMAGE_NAME
\ No newline at end of file
diff --git a/dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py b/dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py
new file mode 100644
index 0000000..2e51188
--- /dev/null
+++ b/dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py
@@ -0,0 +1,33 @@
+from airflow import DAG
+from airflow.providers.docker.operators.docker_swarm import DockerSwarmOperator
+from docker.types import Placement, ServiceMode
+from datetime import datetime
+import sys
+
+default_args = {
+    'owner': 'airflow',
+    'depends_on_past': False,
+    'start_date': datetime(2024, 3, 21)
+}
+
+dag = DAG('utils_docker_cleaner', default_args=default_args,
+          schedule_interval='0 7 * * 1', # This cron expression schedules the DAG to run every Monday at 7:00 AM
+          catchup=False, # Set to False to skip any historical runs
+          tags=['utils']
+)
+version = 'v0.0.1'
+img = f'127.0.0.0:5000/docker_cleaner:{version}'
+docker_task = DockerSwarmOperator(
+    task_id='utils_docker_cleaner_task',
+    auto_remove=True,
+    image=img,
+    container_name='utils_docker_cleaner_task', #name of container TODO not uniquer
+    dag=dag,
+    docker_url='tcp://docker-proxy:2375', # The connection to the Docker daemon, the socket should exist in the container
+    networks=['BIG-dataplatform-network'],
+    network_mode='BIG-dataplatform-network',
+    mount_tmp_dir=False,
+    mode=ServiceMode('global'),
+    placement=Placement(constraints=['node.hostname != CB-Mass-Node1']),
+    xcom_all=True, # Enable XCom push for this task
+)
\ No newline at end of file

From 0bab270e195423850f5b8e01b27d3cfb54fb3c74 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 26 Jun 2024 15:55:47 +0200
Subject: [PATCH 077/103] update cleaner

---
 dataplatform/images/docker-cleaner/Dockerfile | 2 +-
 dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataplatform/images/docker-cleaner/Dockerfile b/dataplatform/images/docker-cleaner/Dockerfile
index 97f8be7..0fec062 100644
--- a/dataplatform/images/docker-cleaner/Dockerfile
+++ b/dataplatform/images/docker-cleaner/Dockerfile
@@ -1,4 +1,4 @@
 FROM centos:centos7
 
 # Run the script
-CMD ["/bin/sh", "docker container prune -f", "docker image prune -f"]
\ No newline at end of file
+CMD ["/bin/sh", "docker container prune -f", "docker image prune -f", "docker builder prune -f"]
\ No newline at end of file
diff --git a/dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py b/dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py
index 2e51188..2eb0bef 100644
--- a/dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py
+++ b/dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py
@@ -7,7 +7,7 @@
 default_args = {
     'owner': 'airflow',
     'depends_on_past': False,
-    'start_date': datetime(2024, 3, 21)
+    'start_date': datetime(2024, 6, 24)
 }
 
 dag = DAG('utils_docker_cleaner', default_args=default_args,
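[Editor's note] Both CMD variants above pass the prune commands to /bin/sh as extra positional arguments, so they are never executed. In exec form the shell needs -c and a single command string; a sketch of the equivalent one-liner:

    CMD ["/bin/sh", "-c", "docker container prune -f && docker image prune -f && docker builder prune -f"]

PATCH 080 below takes the other route and moves the commands into a script.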
From e0bc7b78bfeca1514ee037502b4d822b7d8865bc Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 26 Jun 2024 15:56:18 +0200
Subject: [PATCH 078/103] update cleaner

---
 .../{dcoker-cleaner-dag.py => docker-cleaner-dag.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename dataplatform/images/docker-cleaner/{dcoker-cleaner-dag.py => docker-cleaner-dag.py} (100%)

diff --git a/dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py b/dataplatform/images/docker-cleaner/docker-cleaner-dag.py
similarity index 100%
rename from dataplatform/images/docker-cleaner/dcoker-cleaner-dag.py
rename to dataplatform/images/docker-cleaner/docker-cleaner-dag.py

From 721982f6cdb2adca5c4fd68145a0ec843c38c7d2 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 26 Jun 2024 16:09:15 +0200
Subject: [PATCH 079/103] install docker

---
 dataplatform/images/docker-cleaner/Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dataplatform/images/docker-cleaner/Dockerfile b/dataplatform/images/docker-cleaner/Dockerfile
index 0fec062..6054c26 100644
--- a/dataplatform/images/docker-cleaner/Dockerfile
+++ b/dataplatform/images/docker-cleaner/Dockerfile
@@ -1,4 +1,6 @@
 FROM centos:centos7
 
+# Install Docker
+RUN yum install -y docker
 # Run the script
 CMD ["/bin/sh", "docker container prune -f", "docker image prune -f", "docker builder prune -f"]
\ No newline at end of file
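[Editor's note] Installing the docker CLI inside the image is only half the job: the container still needs the host's daemon, which PATCH 083 later supplies by mounting the socket. For a manual smoke test of the image outside Airflow, something along these lines (image name as produced by build_container.sh):

    docker run --rm -v /var/run/docker.sock:/var/run/docker.sock \
        127.0.0.0:5000/docker_cleaner:v0.0.1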
From ef7ab2a1e1cb1e8e282c81c88a18e8e08a844921 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 26 Jun 2024 16:36:04 +0200
Subject: [PATCH 080/103] add commands script

---
 dataplatform/images/docker-cleaner/Dockerfile | 6 +++++-
 dataplatform/images/docker-cleaner/commands.sh | 12 ++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 dataplatform/images/docker-cleaner/commands.sh

diff --git a/dataplatform/images/docker-cleaner/Dockerfile b/dataplatform/images/docker-cleaner/Dockerfile
index 6054c26..eab33eb 100644
--- a/dataplatform/images/docker-cleaner/Dockerfile
+++ b/dataplatform/images/docker-cleaner/Dockerfile
@@ -3,4 +3,8 @@ FROM centos:centos7
 # Install Docker
 RUN yum install -y docker
 # Run the script
-CMD ["/bin/sh", "docker container prune -f", "docker image prune -f", "docker builder prune -f"]
\ No newline at end of file
+ADD commands.sh /usr/local/bin/shell.sh
+
+RUN chmod 777 /usr/local/bin/shell.sh
+
+CMD /usr/local/bin/shell.sh
\ No newline at end of file
diff --git a/dataplatform/images/docker-cleaner/commands.sh b/dataplatform/images/docker-cleaner/commands.sh
new file mode 100644
index 0000000..3a69919
--- /dev/null
+++ b/dataplatform/images/docker-cleaner/commands.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+# Remove all stopped containers
+echo "Removing all stopped containers..."
+docker container prune -f+
+# Remove all dangling images
+echo "Removing all dangling images..."
+docker image prune -f
+# Remove all dangling volumes
+echo "Removing all dangling volumes..."
+docker builder prune -f
+echo "Docker cleanup complete."
\ No newline at end of file

From f77c1303c5b720df99f6ba9d49e96aedecc4a77d Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 26 Jun 2024 16:39:53 +0200
Subject: [PATCH 081/103] fix commands script

---
 dataplatform/images/docker-cleaner/commands.sh | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/dataplatform/images/docker-cleaner/commands.sh b/dataplatform/images/docker-cleaner/commands.sh
index 3a69919..4896091 100644
--- a/dataplatform/images/docker-cleaner/commands.sh
+++ b/dataplatform/images/docker-cleaner/commands.sh
@@ -1,12 +1,11 @@
 #!/bin/sh
 
-# Remove all stopped containers
 echo "Removing all stopped containers..."
-docker container prune -f+
-# Remove all dangling images
+docker container prune -f
+
 echo "Removing all dangling images..."
 docker image prune -f
-# Remove all dangling volumes
-echo "Removing all dangling volumes..."
+
+echo "Removing cache..."
 docker builder prune -f
 echo "Docker cleanup complete."
\ No newline at end of file

From fd479138c2218e8d4ebc816b0577e4f3be987808 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 26 Jun 2024 16:42:59 +0200
Subject: [PATCH 082/103] change script

---
 dataplatform/images/docker-cleaner/docker-cleaner-dag.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dataplatform/images/docker-cleaner/docker-cleaner-dag.py b/dataplatform/images/docker-cleaner/docker-cleaner-dag.py
index 2eb0bef..a0efd6c 100644
--- a/dataplatform/images/docker-cleaner/docker-cleaner-dag.py
+++ b/dataplatform/images/docker-cleaner/docker-cleaner-dag.py
@@ -21,11 +21,11 @@
     task_id='utils_docker_cleaner_task',
     auto_remove=True,
     image=img,
-    container_name='utils_docker_cleaner_task', #name of container TODO not uniquer
+    container_name='utils_docker_cleaner_task',
     dag=dag,
     docker_url='tcp://docker-proxy:2375', # The connection to the Docker daemon, the socket should exist in the container
-    networks=['BIG-dataplatform-network'],
-    network_mode='BIG-dataplatform-network',
+    network_mode='host', # BIG-dataplatform-network
+    # network_mode='BIG-dataplatform-network',
     mount_tmp_dir=False,
     mode=ServiceMode('global'),
     placement=Placement(constraints=['node.hostname != CB-Mass-Node1']),

From 142890213ca33c4e57478243e75bc1a9f4f18751 Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 27 Jun 2024 08:49:39 +0200
Subject: [PATCH 083/103] feat: trying things

---
 dataplatform/images/docker-cleaner/docker-cleaner-dag.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dataplatform/images/docker-cleaner/docker-cleaner-dag.py b/dataplatform/images/docker-cleaner/docker-cleaner-dag.py
index a0efd6c..0d5db69 100644
--- a/dataplatform/images/docker-cleaner/docker-cleaner-dag.py
+++ b/dataplatform/images/docker-cleaner/docker-cleaner-dag.py
@@ -15,8 +15,10 @@
           catchup=False, # Set to False to skip any historical runs
           tags=['utils']
 )
+
 version = 'v0.0.1'
 img = f'127.0.0.0:5000/docker_cleaner:{version}'
+
 docker_task = DockerSwarmOperator(
     task_id='utils_docker_cleaner_task',
     auto_remove=True,
@@ -26,6 +28,7 @@
     docker_url='tcp://docker-proxy:2375', # The connection to the Docker daemon, the socket should exist in the container
     network_mode='host', # BIG-dataplatform-network
     # network_mode='BIG-dataplatform-network',
+    volumes=['/var/run/docker.sock:/var/run/docker.sock'],
     mount_tmp_dir=False,
     mode=ServiceMode('global'),
     placement=Placement(constraints=['node.hostname != CB-Mass-Node1']),
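[Editor's note] With the socket mounted, the cleaner DAG can be exercised without waiting for the Monday schedule; Airflow's CLI runs a one-off instance (DAG id as defined above, date purely illustrative):

    airflow dags test utils_docker_cleaner 2024-06-24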
From 8ab36605b42a44170453e9333c2a93d74cb4ccaf Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 27 Jun 2024 17:59:51 +0200
Subject: [PATCH 084/103] updating FIWARE stack

---
 dataplatform/multiple_stacks/FIWARE.yaml | 69 +++++++++++-------------
 dataplatform/multiple_stacks/kafka.yaml | 1 +
 2 files changed, 31 insertions(+), 39 deletions(-)

diff --git a/dataplatform/multiple_stacks/FIWARE.yaml b/dataplatform/multiple_stacks/FIWARE.yaml
index 87332e8..84712ff 100644
--- a/dataplatform/multiple_stacks/FIWARE.yaml
+++ b/dataplatform/multiple_stacks/FIWARE.yaml
@@ -22,13 +22,20 @@ services:
       - "${MONGODB_PORT}:27017"
     networks:
       - BIG-dataplatform-network
-    # volumes:
-    #   - mongodb_volume:/data/db
     deploy:
       placement:
         constraints:
          - node.hostname != CB-Mass-Node1
 
+  fileserver:
+    init: true
+    image: jetbrainsinfra/nginx-file-listing:0.2
+    ports:
+      - "${FILESERVER_PORT}:80"
+    volumes:
+      - dataset-volume:/mnt/data
+    networks:
+      - BIG-dataplatform-network
   # mosquitto:
   #   init: true
  #   image: eclipse-mosquitto:2.0.11
@@ -69,36 +76,26 @@ services:
 #     depends_on:
 #       - mongodb
 
-  fileserver:
-    init: true
-    image: jetbrainsinfra/nginx-file-listing:0.2
-    ports:
-      - "${FILESERVER_PORT}:80"
-    volumes:
-      - dataset-volume:/mnt/data
-    networks:
-      - BIG-dataplatform-network
-
-  ftp:
-    init: true
-    image: stilliard/pure-ftpd
-    environment:
-      FTP_USER_NAME: "${FTP_USER}"
-      FTP_USER_PASS: "${FTP_PWD}"
-      FTP_USER_HOME: "/data"
-      FTP_PASSIVE_PORTS: "${FTP_PORT_MIN}:${FTP_PORT_MAX}"
-      FTP_MAX_CLIENTS: 50
-      FTP_MAX_CONNECTIONS: 20
-      ADDED_FLAGS: "-d -d"
-      PUBLICHOST: "${FTP_IP}"
-    volumes:
-      - ftp-volume:/data
-    ports:
-      - "${FTP_PORT_FTP20_EXT}:20"
-      - "${FTP_PORT_FTP21_EXT}:21"
-      - "${FTP_PORT_MIN}-${FTP_PORT_MAX}:${FTP_PORT_MIN}-${FTP_PORT_MAX}/tcp"
-    networks:
-      - BIG-dataplatform-network
+  # ftp:
+  #   init: true
+  #   image: stilliard/pure-ftpd
+  #   environment:
+  #     FTP_USER_NAME: "${FTP_USER}"
+  #     FTP_USER_PASS: "${FTP_PWD}"
+  #     FTP_USER_HOME: "/data"
+  #     FTP_PASSIVE_PORTS: "${FTP_PORT_MIN}:${FTP_PORT_MAX}"
+  #     FTP_MAX_CLIENTS: 50
+  #     FTP_MAX_CONNECTIONS: 20
+  #     ADDED_FLAGS: "-d -d"
+  #     PUBLICHOST: "${FTP_IP}"
+  #   volumes:
+  #     - ftp-volume:/data
+  #   ports:
+  #     - "${FTP_PORT_FTP20_EXT}:20"
+  #     - "${FTP_PORT_FTP21_EXT}:21"
+  #     - "${FTP_PORT_MIN}-${FTP_PORT_MAX}:${FTP_PORT_MIN}-${FTP_PORT_MAX}/tcp"
+  #   networks:
+  #     - BIG-dataplatform-network
 
 
 volumes:
@@ -118,13 +115,7 @@ volumes:
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4
-      device: ":${NFSPATH}/datasets/"
-
-  mongodb_volume:
-    driver_opts:
-      type: nfs
-      o: addr=${NFSADDRESS},rw,nfsvers=4
-      device: ":${NFSPATH}/mongodb-data/"
+      device: ":${NFSPATH}/pnrr_dataplatform/datalake/"
 
 networks:
   BIG-dataplatform-network:
diff --git a/dataplatform/multiple_stacks/kafka.yaml b/dataplatform/multiple_stacks/kafka.yaml
index 7027fac..f1b714c 100644
--- a/dataplatform/multiple_stacks/kafka.yaml
+++ b/dataplatform/multiple_stacks/kafka.yaml
@@ -17,6 +17,7 @@ services:
       placement:
         constraints:
           - node.role == manager
+          - node.hostname != CB-Mass-Node1
     networks:
       - BIG-dataplatform-network
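[Editor's note] A quick way to confirm the relocated fileserver is serving the dataset volume after a redeploy (any reachable swarm node works as host; the placeholder is not from the repository):

    source ../.env
    curl http://<swarm-node>:${FILESERVER_PORT}/   # prints the nginx directory listing of /mnt/data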
From 797669ffe98c39a671611e67b5883bcdb7b4d028 Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 27 Jun 2024 18:01:12 +0200
Subject: [PATCH 085/103] updating kafka stack

---
 dataplatform/multiple_stacks/kafka.yaml | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/dataplatform/multiple_stacks/kafka.yaml b/dataplatform/multiple_stacks/kafka.yaml
index f1b714c..6ed7ba0 100644
--- a/dataplatform/multiple_stacks/kafka.yaml
+++ b/dataplatform/multiple_stacks/kafka.yaml
@@ -22,19 +22,6 @@ services:
       - BIG-dataplatform-network
 
 volumes:
-  spark_config:
-    driver_opts:
-      type: nfs
-      o: addr=${NFSADDRESS},rw,nfsvers=4
-      device: ":${NFSPATH}/dataplatform_config/spark_conf/"
-
-  hadoop_config:
-    driver: local
-    driver_opts:
-      type: nfs
-      o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
-      device: ":${NFSPATH}/dataplatform_config/hadoop_conf/"
-
   kafka_data:
     driver_opts:
       type: nfs

From 1f0422ecac7b631a76e2bfa3da8e40458bce6b41 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 24 Jul 2024 15:24:09 +0200
Subject: [PATCH 086/103] WIP geoserver csrf

---
 dataplatform/multiple_stacks/geoserver.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dataplatform/multiple_stacks/geoserver.yaml b/dataplatform/multiple_stacks/geoserver.yaml
index 0923121..7af653c 100644
--- a/dataplatform/multiple_stacks/geoserver.yaml
+++ b/dataplatform/multiple_stacks/geoserver.yaml
@@ -5,6 +5,8 @@ services:
     image: docker.osgeo.org/geoserver:2.25.x
     environment:
       SKIP_DEMO_DATA: "true"
+      GEOSERVER_CSRF_DISABLE: "false"
+      GEOSERVER_CSRF_WHITELIST: 'big.csr.unibo.it'
     networks:
       - BIG-dataplatform-network
     ports:

From b9f89dea7037a44b674129aa7976e4931704ac07 Mon Sep 17 00:00:00 2001
From: Chiara Forresi
Date: Wed, 24 Jul 2024 15:33:38 +0200
Subject: [PATCH 087/103] fix geoserver csrf

---
 dataplatform/legacy_stack/geoserver-hdfs/geoserver-hdfs.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dataplatform/legacy_stack/geoserver-hdfs/geoserver-hdfs.yaml b/dataplatform/legacy_stack/geoserver-hdfs/geoserver-hdfs.yaml
index 5e5c44e..d8d13a9 100644
--- a/dataplatform/legacy_stack/geoserver-hdfs/geoserver-hdfs.yaml
+++ b/dataplatform/legacy_stack/geoserver-hdfs/geoserver-hdfs.yaml
@@ -23,6 +23,8 @@ services:
     image: docker.osgeo.org/geoserver:2.25.x
     environment:
       SKIP_DEMO_DATA: "true"
+      GEOSERVER_CSRF_DISABLE: "false"
+      GEOSERVER_CSRF_WHITELIST: 'big.csr.unibo.it'
     networks:
      - BIG-dataplatform-network
     ports:
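[Editor's note] GEOSERVER_CSRF_DISABLE stays "false" in both stacks, so the CSRF filter remains active and GEOSERVER_CSRF_WHITELIST has to carry every domain allowed to POST to the web admin behind the proxy. The variable takes a comma-separated list; a sketch (the second entry only illustrates the list form):

    GEOSERVER_CSRF_WHITELIST: 'big.csr.unibo.it,example.org'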
From 1f6021e84fa9126f2f8e0aed875addf886a30091 Mon Sep 17 00:00:00 2001
From: Manuele
Date: Thu, 25 Jul 2024 11:02:32 +0000
Subject: [PATCH 088/103] lost some updates

---
 dataplatform/multiple_stacks/FIWARE.yaml | 9 +++++++--
 dataplatform/multiple_stacks/SFTP.yaml | 1 -
 dataplatform/multiple_stacks/kafka.yaml | 4 ++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/dataplatform/multiple_stacks/FIWARE.yaml b/dataplatform/multiple_stacks/FIWARE.yaml
index 84712ff..97e04a9 100644
--- a/dataplatform/multiple_stacks/FIWARE.yaml
+++ b/dataplatform/multiple_stacks/FIWARE.yaml
@@ -33,9 +33,14 @@ services:
     ports:
       - "${FILESERVER_PORT}:80"
     volumes:
-      - dataset-volume:/mnt/data
+      - pnrr-datalake-landing:/mnt/data
     networks:
       - BIG-dataplatform-network
+    deploy:
+      placement:
+        constraints:
+          - node.hostname != CB-Mass-Node1
+
   # mosquitto:
   #   init: true
   #   image: eclipse-mosquitto:2.0.11
@@ -111,7 +116,7 @@ volumes:
       o: addr=${NFSADDRESS},rw,nfsvers=4
       device: ":${NFSPATH}/ftp/"
 
-  dataset-volume:
+  pnrr-datalake-landing:
    driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4
diff --git a/dataplatform/multiple_stacks/SFTP.yaml b/dataplatform/multiple_stacks/SFTP.yaml
index c5bb6eb..dcc782d 100644
--- a/dataplatform/multiple_stacks/SFTP.yaml
+++ b/dataplatform/multiple_stacks/SFTP.yaml
@@ -11,7 +11,6 @@ services:
     ports:
       - ${SFTPPORT}:22
     # command: admin::1001
-    command: ["apt-get update", "apt-get install acl"]
     deploy:
       placement:
         constraints:
diff --git a/dataplatform/multiple_stacks/kafka.yaml b/dataplatform/multiple_stacks/kafka.yaml
index 6ed7ba0..32a5eb4 100644
--- a/dataplatform/multiple_stacks/kafka.yaml
+++ b/dataplatform/multiple_stacks/kafka.yaml
@@ -11,8 +11,8 @@ services:
       - KAFKA_INTER_BROKER_LISTENER_NAME=INTERNAL
       - KAFKA_CFG_ZOOKEEPER_CONNECT=zoo1:2181,zoo2:2181,zoo3:2181
       - KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true
-    volumes:
-      - kafka_data:/bitnami/
+    #volumes:
+    # - kafka_data:/bitnami/
     deploy:
       placement:
         constraints:

From 66b736d1df8efb488f7e1e81425f13467fd29663 Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Tue, 30 Jul 2024 10:07:40 +0200
Subject: [PATCH 089/103] added hue

---
 dataplatform/.env | 3 +++
 dataplatform/multiple_stacks/hue.yaml | 36 +++++++++++++++++++++
 2 files changed, 39 insertions(+)
 create mode 100644 dataplatform/multiple_stacks/hue.yaml

diff --git a/dataplatform/.env b/dataplatform/.env
index e708f35..13381a3 100644
--- a/dataplatform/.env
+++ b/dataplatform/.env
@@ -60,6 +60,9 @@ ZOO1PORT=2181
 ZOO2PORT=2182
 ZOO3PORT=2183
 
+### HUE ###
+HUECONFDIR=/desktop/conf/
+HUEPORT=8888
 # FIWARE
 FIWARE_PORT=48082
 
diff --git a/dataplatform/multiple_stacks/hue.yaml b/dataplatform/multiple_stacks/hue.yaml
new file mode 100644
index 0000000..e233deb
--- /dev/null
+++ b/dataplatform/multiple_stacks/hue.yaml
@@ -0,0 +1,36 @@
+version: '3.9'
+
+services:
+  hue:
+    image: gethue/hue:latest
+    ports:
+      - ${HUEPORT}:${HUEPORT}
+    volumes:
+      - hadoop_config:${HADOOPCONFDIR}
+      - hue_config:${HUECONFDIR}
+  deploy:
+    placement:
+      constraints:
+        - node.hostname != CB-Mass-Node1
+  networks:
+    - BIG-dataplatform-network
+
+volumes:
+  hadoop_config:
+    driver: local
+    driver_opts:
+      type: nfs
+      o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
+      device: ":${NFSPATH}/dataplatform_config/hadoop_conf/"
+
+  hue_config:
+    driver: local
+    driver_opts:
+      type: nfs
+      o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
+      device: ":${NFSPATH}/dataplatform_config/hue_conf/"
+
+networks:
+  BIG-dataplatform-network:
+    external: true
+    name: BIG-dataplatform-network
\ No newline at end of file

From 26281513886edaa4f2e35a70b243abf75efadc94 Mon Sep 17 00:00:00 2001
From: Manuele
Date: Tue, 30 Jul 2024 08:08:23 +0000
Subject: [PATCH 090/103] updated sftp port

---
 dataplatform/.env | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dataplatform/.env b/dataplatform/.env
index 13381a3..9afbb9f 100644
--- a/dataplatform/.env
+++ b/dataplatform/.env
@@ -63,6 +63,7 @@ ZOO3PORT=2183
 ### HUE ###
 HUECONFDIR=/desktop/conf/
 HUEPORT=8888
+
 # FIWARE
 FIWARE_PORT=48082
 
@@ -87,7 +88,7 @@ MOSQUITTO_USER=foo
 MOSQUITTO_PWD=bar
 
 #SFTP
-SFTPPORT=49096
+SFTPPORT=40021
 
 #DOCKER REGISTRY
 DOCKERREGISTRYPORT=5000
@@ -96,4 +97,4 @@ DOCKERREGISTRYPORT=5000
 GEOSERVER_PORT=40111
 
 #AIRFLOW
-AIRFLOW_WEB_SERVER_PORT=48091
\ No newline at end of file
+AIRFLOW_WEB_SERVER_PORT=48091

From 1d36d2e8b3391ea81d4bffcb2193b5de7b915ee6 Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Tue, 30 Jul 2024 10:17:05 +0200
Subject: [PATCH 091/103] fix: updating hue stack

---
 dataplatform/multiple_stacks/hue.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dataplatform/multiple_stacks/hue.yaml b/dataplatform/multiple_stacks/hue.yaml
index e233deb..70efc87 100644
--- a/dataplatform/multiple_stacks/hue.yaml
+++ b/dataplatform/multiple_stacks/hue.yaml
@@ -8,12 +8,12 @@ services:
     volumes:
       - hadoop_config:${HADOOPCONFDIR}
       - hue_config:${HUECONFDIR}
-  deploy:
-    placement:
-      constraints:
-        - node.hostname != CB-Mass-Node1
-  networks:
-    - BIG-dataplatform-network
+    deploy:
+      placement:
+        constraints:
+          - node.hostname != CB-Mass-Node1
+    networks:
+      - BIG-dataplatform-network

From 8781a527c777b81fb66e58bd5156271abc3721c7 Mon Sep 17 00:00:00 2001
From: Manuele
Date: Wed, 31 Jul 2024 07:09:35 +0000
Subject: [PATCH 092/103] modified hue stack

---
 dataplatform/multiple_stacks/hue.yaml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dataplatform/multiple_stacks/hue.yaml b/dataplatform/multiple_stacks/hue.yaml
index 70efc87..91eb563 100644
--- a/dataplatform/multiple_stacks/hue.yaml
+++ b/dataplatform/multiple_stacks/hue.yaml
@@ -4,10 +4,12 @@ services:
   hue:
     image: gethue/hue:latest
     ports:
-      - ${HUEPORT}:${HUEPORT}
+      - ${HUEPORT}:8888
     volumes:
       - hadoop_config:${HADOOPCONFDIR}
       - hue_config:${HUECONFDIR}
+    environment:
+      HADOOP_CONF_DIR: /opt/hadoop/etc/hadoop/
     deploy:
       placement:
         constraints:
@@ -33,4 +35,4 @@ volumes:
 networks:
   BIG-dataplatform-network:
     external: true
-    name: BIG-dataplatform-network
\ No newline at end of file
+    name: BIG-dataplatform-network

From 0a534ed0fd54231007b44ae828f70e5b7182e734 Mon Sep 17 00:00:00 2001
From: manuelepasini
Date: Wed, 11 Sep 2024 08:31:59 +0000
Subject: [PATCH 093/103] feat: Swarm cleaner now also cleans up dangling
 services

---
 dataplatform/images/docker-cleaner/Dockerfile | 11 +++++++----
 dataplatform/images/docker-cleaner/commands.sh | 14 +++++++++++++-
 .../images/docker-cleaner/docker-cleaner-dag.py | 4 ++--
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/dataplatform/images/docker-cleaner/Dockerfile b/dataplatform/images/docker-cleaner/Dockerfile
index eab33eb..13809c1 100644
--- a/dataplatform/images/docker-cleaner/Dockerfile
+++ b/dataplatform/images/docker-cleaner/Dockerfile
@@ -1,10 +1,13 @@
-FROM centos:centos7
+FROM docker:stable
+
+# Install bash
+RUN apk add --no-cache bash
 
-# Install Docker
-RUN yum install -y docker
 # Run the script
 ADD commands.sh /usr/local/bin/shell.sh
 
 RUN chmod 777 /usr/local/bin/shell.sh
 
-CMD /usr/local/bin/shell.sh
\ No newline at end of file
+CMD ["bash", "/usr/local/bin/shell.sh"]
+
+
diff --git a/dataplatform/images/docker-cleaner/commands.sh b/dataplatform/images/docker-cleaner/commands.sh
index 4896091..5853773 100644
--- a/dataplatform/images/docker-cleaner/commands.sh
+++ b/dataplatform/images/docker-cleaner/commands.sh
@@ -8,4 +8,16 @@ docker image prune -f
 
 echo "Removing cache..."
 docker builder prune -f
-echo "Docker cleanup complete."
\ No newline at end of file
+echo "Docker cleanup complete."
+
+
+echo "Cleaning up old services left around the Swarm..."
+
+docker service ls --format "{{.ID}} {{.Replicas}}" | while read service_id replicas; do
+    # Check if the replication factor starts with 0 (i.e., 0/x)
+    if [[ "$replicas" =~ ^0/ ]]; then
+        echo "Removing service: $service_id"
+        docker service rm $service_id
+    fi
+done
+echo "Successfully cleaned Swarm services"
diff --git a/dataplatform/images/docker-cleaner/docker-cleaner-dag.py b/dataplatform/images/docker-cleaner/docker-cleaner-dag.py
index 0d5db69..c75b57b 100644
--- a/dataplatform/images/docker-cleaner/docker-cleaner-dag.py
+++ b/dataplatform/images/docker-cleaner/docker-cleaner-dag.py
@@ -31,6 +31,6 @@
     volumes=['/var/run/docker.sock:/var/run/docker.sock'],
     mount_tmp_dir=False,
     mode=ServiceMode('global'),
-    placement=Placement(constraints=['node.hostname != CB-Mass-Node1']),
+    placement=Placement(constraints=['node.hostname != CB-Mass-Node1', 'node.role == manager']),
     xcom_all=True, # Enable XCom push for this task
-)
\ No newline at end of file
+)

From b047f4991c08b9ab835a15ced52d3465df098c25 Mon Sep 17 00:00:00 2001
From: manuelepasini
Date: Thu, 12 Sep 2024 06:31:35 +0000
Subject: [PATCH 094/103] chore: minor refactoring

---
 dataplatform/.env | 2 +-
 dataplatform/multiple_stacks/HDFS-client.yaml | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/dataplatform/.env b/dataplatform/.env
index 9afbb9f..c96a04f 100644
--- a/dataplatform/.env
+++ b/dataplatform/.env
@@ -62,7 +62,7 @@ ZOO3PORT=2183
 
 ### HUE ###
 HUECONFDIR=/desktop/conf/
-HUEPORT=8888
+HUEPORT=48888
 
 # FIWARE
 FIWARE_PORT=48082
diff --git a/dataplatform/multiple_stacks/HDFS-client.yaml b/dataplatform/multiple_stacks/HDFS-client.yaml
index 4d1ecc4..3faeca9 100644
--- a/dataplatform/multiple_stacks/HDFS-client.yaml
+++ b/dataplatform/multiple_stacks/HDFS-client.yaml
@@ -4,6 +4,7 @@ services:
     image: apache/hadoop:${HADOOPVERSION}
     volumes:
       - hadoop_config:${HADOOPCONFDIR}
+      - data_to_load:/home/data
     command: ["tail", "-f", "/dev/null"]
     stdin_open: true
     tty: true
@@ -22,8 +23,14 @@ volumes:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
       device: ":${NFSPATH}/dataplatform_config/hadoop_conf/"
-
+  data_to_load:
+    driver: local
+    driver_opts:
+      type: nfs
+      o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
+      device: ":${NFSPATH}/pnrr_dataplatform/dump/"
+
 networks:
   BIG-dataplatform-network:
     external: true
-    name: BIG-dataplatform-network
\ No newline at end of file
+    name: BIG-dataplatform-network
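[Editor's note] With the dump directory mounted at /home/data, loading files into HDFS from inside the client container is a one-liner; a sketch (container name and target path are assumptions, not repository values):

    docker exec -it <hdfs-client-container> hdfs dfs -put /home/data/<file> /datalake/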
From 6a5739030dac5a62afa3a56cd2151e2d8a9c57ea Mon Sep 17 00:00:00 2001
From: Manuele
Date: Thu, 12 Sep 2024 10:38:36 +0000
Subject: [PATCH 095/103] removed .env

---
 .gitignore | 4 +-
 dataplatform/.env | 100 ----------------------------------------------
 2 files changed, 3 insertions(+), 101 deletions(-)
 delete mode 100644 dataplatform/.env

diff --git a/.gitignore b/.gitignore
index a819446..533b5ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+.env
+dataplatform/.env
 /dataplatform/tests/HDFD_VENV_3.7
 /dataplatform/tests/HDFD_VENV_3.7
 /dataplatform/multiple_stacks/runtime/
@@ -640,4 +642,4 @@ fabric.propertyDataList
 /python-utils/src/main/python/gmail/token.pickle
 *.png
-*.airflow.env
\ No newline at end of file
+*.airflow.env
diff --git a/dataplatform/.env b/dataplatform/.env
deleted file mode 100644
index c96a04f..0000000
--- a/dataplatform/.env
+++ /dev/null
@@ -1,100 +0,0 @@
-#Cluster env variables
-CLUSTERNAME=BIG-DataPlat
-ENVIRONMENTNAME=DataPlatform
-NFSADDRESS=192.168.30.249
-NFSPATH=/nfsshare
-SSHSERVERPORTEXT=40022
-SSHSERVERPORT=20
-
-#Docker image version
-HADOOPVERSION=3.3.6
-SPARKVERSION=3.5.1-scala2.12-java17-python3-r-ubuntu
-ZOOKEEPERVERSION=latest
-
-#KAFKA
-KAFKA_IP=192.168.30.102
-KAFKA_PORT_EXT=49092
-
-#########################
-#### Hadoop env variables
-HADOOPHOME=/opt/hadoop/bin/
-HADOOPCONFDIR=/opt/hadoop/etc/hadoop/
-
-#Namenode ports
-NAMENODE1PORT=9870
-NAMENODE2PORT=9871
-#Name node volume dir
-NAMEDIR=/hadoop/dfs/name
-#Disks on cluster machines
-DATANODEDISK1=/mnt/disk1/datanode_dir/
-DATANODEDISK2=/mnt/disk2/datanode_dir/
-DATANODEDISK3=/mnt/disk3/datanode_dir/
-#Data node volume dir
-DATANODEDIR1=/hadoop/dfs/disk1/
-DATANODEDIR2=/hadoop/dfs/disk2/
-DATANODEDIR3=/hadoop/dfs/disk3/
-#Journal node volume dir
-JOURNALDIR=/data/journalnode
-
-#########################
-##### Spark env variables
-SPARKBIN=/opt/spark/sbin/
-SPARKCONFDIR=/opt/spark/conf/
-SPARKMASTERHOST=spark-master
-SPARKMASTERPORT=7077
-SPARKHISTSERVERPORT=48080
-
-########################
-##### YARN env variables
-RESOURCEMANAGERPORT=8088
-YARNHISTSERVERPORT=19888
-YARNHISTSERVERDATADIR=/hadoop/yarn/timeline/
-
-########################
-##### ZM env variables
-ZOOCONFDIR=/conf
-ZOOPORTEXT=2181
-ZOOELECTIONPORT=2888
-ZOODATAPORT=3888
-ZOO1PORT=2181
-ZOO2PORT=2182
-ZOO3PORT=2183
-
-### HUE ###
-HUECONFDIR=/desktop/conf/
-HUEPORT=48888
-
-# FIWARE
-FIWARE_PORT=48082
-
-# MongoDB
-MONGODB_PORT=47017
-
-# File server
-FILESERVER_PORT=41080
-
-# FTP server
-FTP_PORT_FTP20_EXT=49020
-FTP_PORT_FTP21_EXT=49021
-FTP_PORT_MIN=49100
-FTP_PORT_MAX=49199
-FTP_USER=foo
-FTP_PWD=bar
-
-# Mosquitto variables
-MOSQUITTO_PORT_EXT=48081
-MOSQUITTO_PORT_EXT_TLS=48883
-MOSQUITTO_USER=foo
-MOSQUITTO_PWD=bar
-
-#SFTP
-SFTPPORT=40021
-
-#DOCKER REGISTRY
-DOCKERREGISTRYPORT=5000
-
-#GEOSERVER
-GEOSERVER_PORT=40111
-
-#AIRFLOW
-AIRFLOW_WEB_SERVER_PORT=48091
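[Editor's note] From here on only a template is tracked (the next patch adds it), so a fresh checkout needs a local, untracked copy before any stack can be deployed:

    cp dataplatform/.env.example dataplatform/.env
    # then fill in addresses, ports and credentials for the target cluster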
From 026dafae5f1e3db61058f068741121caf81bca19 Mon Sep 17 00:00:00 2001
From: Manuele
Date: Thu, 12 Sep 2024 10:54:07 +0000
Subject: [PATCH 096/103] added .env example

---
 dataplatform/.env.example | 94 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 dataplatform/.env.example

diff --git a/dataplatform/.env.example b/dataplatform/.env.example
new file mode 100644
index 0000000..edab4fd
--- /dev/null
+++ b/dataplatform/.env.example
@@ -0,0 +1,94 @@
+#Cluster env variables
+CLUSTERNAME=
+ENVIRONMENTNAME=
+NFSADDRESS=
+NFSPATH=
+SSHSERVERPORTEXT=
+SSHSERVERPORT=
+
+#Docker image version
+HADOOPVERSION=
+SPARKVERSION=
+ZOOKEEPERVERSION=
+
+#KAFKA
+KAFKA_IP=
+KAFKA_PORT_EXT=
+
+#########################
+#### Hadoop env variables
+HADOOPHOME=
+HADOOPCONFDIR=
+
+#Namenode ports
+NAMENODE1PORT=
+NAMENODE2PORT=
+#Name node volume dir
+NAMEDIR=
+#Disks on cluster machines
+DATANODEDISK1=
+DATANODEDISK2=
+DATANODEDISK3=
+#Data node volume dir
+DATANODEDIR1=
+DATANODEDIR2=
+DATANODEDIR3=
+#Journal node volume dir
+JOURNALDIR=
+
+#########################
+##### Spark env variables
+SPARKBIN=
+SPARKCONFDIR=
+SPARKMASTERHOST=
+SPARKMASTERPORT=
+SPARKHISTSERVERPORT=
+
+########################
+##### YARN env variables
+RESOURCEMANAGERPORT=
+YARNHISTSERVERPORT=
+YARNHISTSERVERDATADIR=
+
+########################
+##### ZM env variables
+ZOOCONFDIR=
+ZOOPORTEXT=
+ZOOELECTIONPORT=
+ZOODATAPORT=
+ZOO1PORT=
+ZOO2PORT=
+ZOO3PORT=
+
+# FIWARE
+FIWARE_PORT=
+
+# MongoDB
+MONGODB_PORT=
+
+# File server
+FILESERVER_PORT=
+
+# FTP server
+FTP_PORT_FTP20_EXT=
+FTP_PORT_FTP21_EXT=
+FTP_PORT_MIN=
+FTP_PORT_MAX=
+FTP_USER=
+FTP_PWD=
+
+# Mosquitto variables
+MOSQUITTO_PORT_EXT=
+MOSQUITTO_PORT_EXT_TLS=
+MOSQUITTO_USER=
+MOSQUITTO_PWD=
+
+# Airflow configuration
+AIRFLOW_PASSWORD=
+AIRFLOW_USER=
+AIRFLOW_DB=
+AIRFLOW_SMTP_SMTP_HOST=
+AIRFLOW_SMTP_SMTP_PORT=
+AIRFLOW_SMTP_SMTP_USER=
+AIRFLOW_SMTP_SMTP_PASSWORD=
+AIRFLOW_SMTP_SMTP_MAIL_FROM=

From 4a084d1ad12255e218a95c507a7805b4bef00ef1 Mon Sep 17 00:00:00 2001
From: Manuele
Date: Thu, 12 Sep 2024 12:10:01 +0000
Subject: [PATCH 097/103] chore: removing .airflow.example after merging it
 with .env

---
 dataplatform/.airflow.env.example | 8 --------
 1 file changed, 8 deletions(-)
 delete mode 100644 dataplatform/.airflow.env.example

diff --git a/dataplatform/.airflow.env.example b/dataplatform/.airflow.env.example
deleted file mode 100644
index eb56356..0000000
--- a/dataplatform/.airflow.env.example
+++ /dev/null
@@ -1,8 +0,0 @@
-AIRFLOW_PASSWORD=airflow_psw
-AIRFLOW_USER=airflow
-AIRFLOW_DB=airflowDB:port/databaseName
-AIRFLOW_SMTP_SMTP_HOST=smtp_mail
-AIRFLOW_SMTP_SMTP_PORT=smtp_port
-AIRFLOW_SMTP_SMTP_USER=mail_user
-AIRFLOW_SMTP_SMTP_PASSWORD=mail_psw
-AIRFLOW_SMTP_SMTP_MAIL_FROM=mail
\ No newline at end of file

From 3103141acbfe6d08afda6327d53bcf5ff452cc6d Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 12 Sep 2024 14:24:53 +0200
Subject: [PATCH 098/103] fix: modifying deploy swarm script for c.i.

---
 dataplatform/multiple_stacks/deploy-swarm.sh | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/dataplatform/multiple_stacks/deploy-swarm.sh b/dataplatform/multiple_stacks/deploy-swarm.sh
index 1e60aab..19ef03c 100755
--- a/dataplatform/multiple_stacks/deploy-swarm.sh
+++ b/dataplatform/multiple_stacks/deploy-swarm.sh
@@ -1,10 +1,15 @@
 #!/bin/bash
 set -ex
 
-set -o allexport
-source ./../.env
-
-mkdir -p runtime
+if [ -z "./../.env"]; then
+    set -o allexport
+    source ./../.env
+    mkdir -p runtime
+else
+    set -o allexport
+    source ./../.env.example
+    mkdir -p runtime
+fi
 
 # Check if the substitution file path is provided as a command-line argument
 if [ $# -eq 2 ]; then
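[Editor's note] The guard added in PATCH 098 never takes the else branch: [ -z "./../.env" ] tests a non-empty literal string, so it is always false, and the missing space before ] is itself a syntax error. A sketch of what the check presumably intends, using -f to test for the file's existence:

    #!/bin/bash
    set -ex
    set -o allexport
    if [ -f ./../.env ]; then
        source ./../.env            # real deployment values
    else
        source ./../.env.example    # CI fallback
    fi
    mkdir -p runtime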
From de74b5743b882991291cade3d152835e47f661d7 Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 12 Sep 2024 14:30:24 +0200
Subject: [PATCH 099/103] fix: modifying .env for c.i.

---
 dataplatform/.env.example | 112 +++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 61 deletions(-)

diff --git a/dataplatform/.env.example b/dataplatform/.env.example
index edab4fd..8458318 100644
--- a/dataplatform/.env.example
+++ b/dataplatform/.env.example
@@ -1,94 +1,84 @@
 #Cluster env variables
-CLUSTERNAME=
-ENVIRONMENTNAME=
-NFSADDRESS=
-NFSPATH=
-SSHSERVERPORTEXT=
-SSHSERVERPORT=
+CLUSTERNAME=BIG-DataPlat
+ENVIRONMENTNAME=DataPlatform
+NFSADDRESS=192.168.30.249
+NFSPATH=/nfsshare
+SSHSERVERPORTEXT=40022
+SSHSERVERPORT=20
 
 #Docker image version
-HADOOPVERSION=
-SPARKVERSION=
-ZOOKEEPERVERSION=
+HADOOPVERSION=3.3.6
+SPARKVERSION=latest
+ZOOKEEPERVERSION=latest
 
 #KAFKA
-KAFKA_IP=
-KAFKA_PORT_EXT=
+KAFKA_IP=192.168.30.102
+KAFKA_PORT_EXT=49092
 
 #########################
 #### Hadoop env variables
-HADOOPHOME=
-HADOOPCONFDIR=
+HADOOPHOME=/opt/hadoop/bin/
+HADOOPCONFDIR=/opt/hadoop/etc/hadoop/
 
 #Namenode ports
-NAMENODE1PORT=
-NAMENODE2PORT=
+NAMENODE1PORT=9870
+NAMENODE2PORT=9871
 #Name node volume dir
-NAMEDIR=
+NAMEDIR=/hadoop/dfs/name
 #Disks on cluster machines
-DATANODEDISK1=
-DATANODEDISK2=
-DATANODEDISK3=
+DATANODEDISK1=/mnt/disk1/datanode_dir/
+DATANODEDISK2=/mnt/disk2/datanode_dir/
+DATANODEDISK3=/mnt/disk3/datanode_dir/
 #Data node volume dir
-DATANODEDIR1=
-DATANODEDIR2=
-DATANODEDIR3=
+DATANODEDIR1=/hadoop/dfs/disk1/
+DATANODEDIR2=/hadoop/dfs/disk2/
+DATANODEDIR3=/hadoop/dfs/disk3/
 #Journal node volume dir
-JOURNALDIR=
+JOURNALDIR=/data/journalnode
 
 #########################
 ##### Spark env variables
-SPARKBIN=
-SPARKCONFDIR=
-SPARKMASTERHOST=
-SPARKMASTERPORT=
-SPARKHISTSERVERPORT=
+SPARKBIN=/opt/spark/sbin/
+SPARKCONFDIR=/opt/spark/conf/
+SPARKMASTERHOST=spark-master
+SPARKMASTERPORT=7077
+SPARKHISTSERVERPORT=18080
 
 ########################
 ##### YARN env variables
-RESOURCEMANAGERPORT=
-YARNHISTSERVERPORT=
-YARNHISTSERVERDATADIR=
+RESOURCEMANAGERPORT=8088
+YARNHISTSERVERPORT=19888
+YARNHISTSERVERDATADIR=/hadoop/yarn/timeline/
 
 ########################
 ##### ZM env variables
-ZOOCONFDIR=
-ZOOPORTEXT=
-ZOOELECTIONPORT=
-ZOODATAPORT=
-ZOO1PORT=
-ZOO2PORT=
-ZOO3PORT=
+ZOOCONFDIR=/conf
+ZOOPORTEXT=2181
+ZOOELECTIONPORT=2888
+ZOODATAPORT=3888
+ZOO1PORT=2181
+ZOO2PORT=2182
+ZOO3PORT=2183
 
 # FIWARE
-FIWARE_PORT=
+FIWARE_PORT=48082
 
 # MongoDB
-MONGODB_PORT=
+MONGODB_PORT=47017
 
 # File server
-FILESERVER_PORT=
+FILESERVER_PORT=41080
 
 # FTP server
-FTP_PORT_FTP20_EXT=
-FTP_PORT_FTP21_EXT=
-FTP_PORT_MIN=
-FTP_PORT_MAX=
-FTP_USER=
-FTP_PWD=
+FTP_PORT_FTP20_EXT=49020
+FTP_PORT_FTP21_EXT=49021
+FTP_PORT_MIN=49100
+FTP_PORT_MAX=49199
+FTP_USER=foo
+FTP_PWD=bar
 
 # Mosquitto variables
-MOSQUITTO_PORT_EXT=
-MOSQUITTO_PORT_EXT_TLS=
-MOSQUITTO_USER=
-MOSQUITTO_PWD=
-
-# Airflow configuration
-AIRFLOW_PASSWORD=
-AIRFLOW_USER=
-AIRFLOW_DB=
-AIRFLOW_SMTP_SMTP_HOST=
-AIRFLOW_SMTP_SMTP_PORT=
-AIRFLOW_SMTP_SMTP_USER=
-AIRFLOW_SMTP_SMTP_PASSWORD=
-AIRFLOW_SMTP_SMTP_MAIL_FROM=
+MOSQUITTO_PORT_EXT=48081
+MOSQUITTO_PORT_EXT_TLS=48883
+MOSQUITTO_USER=foo
+MOSQUITTO_PWD=bar
\ No newline at end of file
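[Editor's note] docker stack deploy does not read a .env file on its own; the values above only take effect once the deploy script has exported them. The same pre-processing step also catches compose mistakes like the indentation fixed in the next two patches; a quick check, run from multiple_stacks/:

    set -o allexport; source ../.env.example
    docker-compose -f hue.yaml config > /dev/null && echo "hue.yaml OK"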
From 8c4f6a007287552bfcf5602e441b60830b146349 Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 12 Sep 2024 14:39:23 +0200
Subject: [PATCH 100/103] fix: modifying .env for c.i.

---
 dataplatform/multiple_stacks/hue.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dataplatform/multiple_stacks/hue.yaml b/dataplatform/multiple_stacks/hue.yaml
index 91eb563..014eccd 100644
--- a/dataplatform/multiple_stacks/hue.yaml
+++ b/dataplatform/multiple_stacks/hue.yaml
@@ -19,14 +19,12 @@ services:
 
 volumes:
   hadoop_config:
-    driver: local
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
      device: ":${NFSPATH}/dataplatform_config/hadoop_conf/"
 
   hue_config:
-    driver: local
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard

From 331520c5400baa29426250b6dd51c258c32dce7a Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 12 Sep 2024 14:48:54 +0200
Subject: [PATCH 101/103] fix: modifying .env for c.i.

---
 dataplatform/multiple_stacks/hue.yaml | 32 ++++++++++++----
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/dataplatform/multiple_stacks/hue.yaml b/dataplatform/multiple_stacks/hue.yaml
index 014eccd..af20767 100644
--- a/dataplatform/multiple_stacks/hue.yaml
+++ b/dataplatform/multiple_stacks/hue.yaml
@@ -1,30 +1,32 @@
-version: '3.9'
+version: "3.9"
 
 services:
   hue:
-    image: gethue/hue:latest
-    ports:
-      - ${HUEPORT}:8888
-    volumes:
-      - hadoop_config:${HADOOPCONFDIR}
-      - hue_config:${HUECONFDIR}
-    environment:
-      HADOOP_CONF_DIR: /opt/hadoop/etc/hadoop/
-    deploy:
-      placement:
-        constraints:
-          - node.hostname != CB-Mass-Node1
-    networks:
-      - BIG-dataplatform-network
+      image: gethue/hue:latest
+      ports:
+        - ${HUEPORT}:8888
+      volumes:
+        - hadoop_config:${HADOOPCONFDIR}
+        - hue_config:${HUECONFDIR}
+      environment:
+        HADOOP_CONF_DIR: /opt/hadoop/etc/hadoop/
+      deploy:
+        placement:
+          constraints:
+            - node.hostname != CB-Mass-Node1
+      networks:
+        - BIG-dataplatform-network
 
 volumes:
   hadoop_config:
+    driver: local
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
       device: ":${NFSPATH}/dataplatform_config/hadoop_conf/"
 
   hue_config:
+    driver: local
     driver_opts:
       type: nfs
       o: addr=${NFSADDRESS},rw,nfsvers=4,nolock,hard
       device: ":${NFSPATH}/dataplatform_config/hue_conf/"

From 11b8a5bcefdfdfd980cc190674ea170eb39b688c Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 12 Sep 2024 14:59:22 +0200
Subject: [PATCH 102/103] fix: modifying .env for c.i.

---
 dataplatform/.env.example | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/dataplatform/.env.example b/dataplatform/.env.example
index 8458318..8258979 100644
--- a/dataplatform/.env.example
+++ b/dataplatform/.env.example
@@ -81,4 +81,8 @@ FTP_PWD=bar
 MOSQUITTO_PORT_EXT=48081
 MOSQUITTO_PORT_EXT_TLS=48883
 MOSQUITTO_USER=foo
-MOSQUITTO_PWD=bar
\ No newline at end of file
+MOSQUITTO_PWD=bar
+
+#Hue conf
+HUECONFDIR=/usr/share/hue/desktop/conf/
+HUEPORT=48888
\ No newline at end of file

From cbf74f42a99e6c95e47cee12c30210dff0ce7701 Mon Sep 17 00:00:00 2001
From: Manuele Pasini
Date: Thu, 12 Sep 2024 15:08:02 +0200
Subject: [PATCH 103/103] fix: modifying .env for c.i.

---
 dataplatform/.env.example | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dataplatform/.env.example b/dataplatform/.env.example
index 8258979..35d15f4 100644
--- a/dataplatform/.env.example
+++ b/dataplatform/.env.example
@@ -85,4 +85,7 @@ MOSQUITTO_PWD=bar
 
 #Hue conf
 HUECONFDIR=/usr/share/hue/desktop/conf/
-HUEPORT=48888
\ No newline at end of file
+HUEPORT=48888
+
+# Docker registry port
+DOCKERREGISTRYPORT=5000
\ No newline at end of file
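[Editor's note] For reference, the hue_config volume mounted at ${HUECONFDIR} would typically hold a hue.ini override that points Hue at the HDFS cluster whose HADOOP_CONF_DIR it already mounts. A minimal sketch; the service name and RPC port are assumptions based on the Hadoop variables in this series, not values taken from the repository:

    [hadoop]
      [[hdfs_clusters]]
        [[[default]]]
          # assumed namenode service name; the real hosts are defined in the HDFS stack
          fs_defaultfs=hdfs://namenode1:9000
          webhdfs_url=http://namenode1:9870/webhdfs/v1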