diff --git a/Dockerfile b/Dockerfile
index cb25d96..153d936 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,8 +14,8 @@ USER airflow
 COPY requirements-uninstall.txt .
 RUN pip uninstall -y -r requirements-uninstall.txt && \
     pip install --no-cache-dir \
-    apache-airflow-providers-microsoft-mssql==3.8.0 \
-    apache-airflow-providers-common-sql==1.15.0
+    apache-airflow-providers-microsoft-mssql==3.9.0 \
+    apache-airflow-providers-common-sql==1.16.0
 
 # Copy and install requirements.txt
 COPY tests-requirements.txt .
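The Makefile changes that follow chain several new setup targets into `run`, each one driving Airflow's stable REST API with `curl` inside the webserver container and probing first so re-runs are harmless. For readability, here is a minimal Python sketch of the same idempotent "create only if the existence probe fails" pattern, assuming the `airflow:airflow` basic-auth credentials and `localhost:8080` endpoint used throughout this setup; the sketch is illustrative and not part of the patch:

import requests

BASE = "http://localhost:8080/api/v1"
AUTH = ("airflow", "airflow")

def create_variable_if_missing(key: str, value: str) -> None:
    # Probe first, mirroring the `curl -f -s -LI` existence check in the
    # Makefile targets; POST only when the variable is not there yet.
    if requests.get(f"{BASE}/variables/{key}", auth=AUTH).status_code == 404:
        resp = requests.post(
            f"{BASE}/variables",
            auth=AUTH,
            json={"key": key, "value": value},
        )
        resp.raise_for_status()

if __name__ == "__main__":
    create_variable_if_missing("path_tmp", "/tmp")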
diff --git a/Makefile b/Makefile
index d548329..2812d9c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,16 @@
 .PHONY: run
-run: setup-containers create-example-variable
+run: \
+create-logs-dir \
+setup-containers \
+create-example-variable \
+create-path-tmp-variable \
+create-inlabs-db \
+create-inlabs-db-connection \
+create-inlabs-portal-connection \
+activate-inlabs-load-dag
+
+create-logs-dir:
+	mkdir -p ./mnt/airflow-logs -m a=rwx
 
 setup-containers:
 	docker compose up -d --force-recreate --remove-orphans
@@ -22,6 +33,79 @@ create-example-variable:
 	}' > /dev/null; \
 	fi"
 
+create-path-tmp-variable:
+	@echo "Creating 'path_tmp' Airflow variable"
+	@docker exec airflow-webserver sh -c \
+	"if ! curl -f -s -LI 'http://localhost:8080/api/v1/variables/path_tmp' --user \"airflow:airflow\" > /dev/null; \
+	then \
+	curl -s -X 'POST' \
+	'http://localhost:8080/api/v1/variables' \
+	-H 'accept: application/json' \
+	-H 'Content-Type: application/json' \
+	--user \"airflow:airflow\" \
+	-d '{ \
+	\"key\": \"path_tmp\", \
+	\"value\": \"/tmp\" \
+	}' > /dev/null; \
+	fi"
+
+create-inlabs-db:
+	@echo "Creating 'inlabs' database"
+	@docker exec -e PGPASSWORD=airflow ro-dou-postgres-1 sh -c "psql -q -U airflow -f /sql/init-db.sql > /dev/null"
+
+create-inlabs-db-connection:
+	@echo "Creating 'inlabs_db' Airflow connection"
+	@docker exec airflow-webserver sh -c \
+	"if ! curl -f -s -LI 'http://localhost:8080/api/v1/connections/inlabs_db' --user \"airflow:airflow\" > /dev/null; \
+	then \
+	curl -s -X 'POST' \
+	'http://localhost:8080/api/v1/connections' \
+	-H 'accept: application/json' \
+	-H 'Content-Type: application/json' \
+	--user \"airflow:airflow\" \
+	-d '{ \
+	\"connection_id\": \"inlabs_db\", \
+	\"conn_type\": \"postgres\", \
+	\"schema\": \"inlabs\", \
+	\"host\": \"ro-dou-postgres-1\", \
+	\"login\": \"airflow\", \
+	\"password\": \"airflow\", \
+	\"port\": 5432 \
+	}' > /dev/null; \
+	fi"
+
+create-inlabs-portal-connection:
+	@echo "Creating 'inlabs_portal' Airflow connection"
+	@docker exec airflow-webserver sh -c \
+	"if ! curl -f -s -LI 'http://localhost:8080/api/v1/connections/inlabs_portal' --user \"airflow:airflow\" > /dev/null; \
+	then \
+	curl -s -X 'POST' \
+	'http://localhost:8080/api/v1/connections' \
+	-H 'accept: application/json' \
+	-H 'Content-Type: application/json' \
+	--user \"airflow:airflow\" \
+	-d '{ \
+	\"connection_id\": \"inlabs_portal\", \
+	\"conn_type\": \"http\", \
+	\"description\": \"Credencial para acesso no Portal do INLabs\", \
+	\"host\": \"https://inlabs.in.gov.br/\", \
+	\"login\": \"user@email.com\", \
+	\"password\": \"password\" \
+	}' > /dev/null; \
+	fi"
+
+activate-inlabs-load-dag:
+	@echo "Activating 'ro-dou_inlabs_load_pg' Airflow DAG"
+	@docker exec airflow-webserver sh -c \
+	"curl -s -X 'PATCH' \
+	'http://localhost:8080/api/v1/dags/ro-dou_inlabs_load_pg' \
+	-H 'accept: application/json' \
+	-H 'Content-Type: application/json' \
+	--user \"airflow:airflow\" \
+	-d '{ \
+	\"is_paused\": false \
+	}' > /dev/null;"
+
 .PHONY: down
 down:
 	docker compose down
diff --git a/dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py b/dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py
index 8314408..81ed6f8 100644
--- a/dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py
+++ b/dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py
@@ -20,11 +20,8 @@
 
 # Constants
 DEST_DIR = "download_inlabs"
-#XXX update here
 DEST_CONN_ID = "inlabs_db"
-#XXX connection to https://inlabs.in.gov.br/
 INLABS_CONN_ID = "inlabs_portal"
-#XXX remember to create schema `dou_inlabs` on db
 STG_TABLE = "dou_inlabs.article_raw"
 
 
@@ -143,6 +140,7 @@ def _unzip_files():
 
         files_exists = _download_files()
         _unzip_files()
+        return files_exists
 
 
 @task
@@ -225,7 +223,7 @@ def check_if_first_run_of_day():
         if execution_date.day == prev_execution_date.day:
             logging.info ("Não é a primeira execução do dia")
             logging.info ("Triggering dataset edicao_extra")
-            return "trigger_dataset_edicao_extra"
+            return "trigger_dataset_inlabs_edicao_extra"
         else:
             logging.info ("Primeira execução do dia")
             logging.info ("Triggering dataset e DAGs do INLABS")
@@ -247,29 +245,13 @@ def remove_directory():
         subprocess.run(f"rm -rf {dest_path}", shell=True, check=True)
         logging.info("Directory %s removed.", dest_path)
 
-    # @task_group(group_id='datasets')
-    # def trigger_datasets():
-    #     @task.run_if(lambda context: context["task_instance"].execution_date.hour == 15)
-    #     @task(outlets=[Dataset("inlabs")])
-    #     def trigger_dataset_edicao_normal():
-    #         logging.info("Disparando DAGs do INLABS")
-
-    #     @task.run_if(lambda context: context["task_instance"].execution_date.hour > 15)
-    #     @task(outlets=[Dataset("inlabs_edicao_extra")])
-    #     def trigger_dataset_edicao_extra(**kwargs):
-    #         logging.info(context["task_instance"])
-    #         logging.info("Atualizando o Dataset de Edição Extra")
-
-    #     trigger_dataset_edicao_normal(), trigger_dataset_edicao_extra()
-
-
-    ## Orchestration
+    ## Orchestration
     trigger_date = get_date()
     download_n_unzip_files(trigger_date) >> \
         load_data(trigger_date) >> check_loaded_data >> \
+        remove_directory() >> \
         check_if_first_run_of_day() >> \
-        [trigger_dataset_inlabs_edicao_extra(),trigger_dataset_inlabs()] >> \
-        remove_directory()
+        [trigger_dataset_inlabs_edicao_extra(),trigger_dataset_inlabs()]
 
 load_inlabs()
diff --git a/dag_load_inlabs/sql/init-db.sql b/dag_load_inlabs/sql/init-db.sql
new file mode 100644
index 0000000..4dcc807
--- /dev/null
+++ b/dag_load_inlabs/sql/init-db.sql
@@ -0,0 +1,5 @@
+CREATE DATABASE inlabs;
+
+\c inlabs
+
+CREATE SCHEMA IF NOT EXISTS dou_inlabs;
\ No newline at end of file
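The init-db.sql file above is mounted into the Postgres container at /sql (see the docker-compose change below) and executed by the create-inlabs-db target. A quick way to verify it worked is to connect to the new `inlabs` database and confirm the `dou_inlabs` schema that the DAG's STG_TABLE ("dou_inlabs.article_raw") relies on exists. A minimal sketch with psycopg2, assuming the compose defaults (`airflow:airflow`) and that the Postgres port is published on localhost:5432; not part of the patch:

import psycopg2

conn = psycopg2.connect(
    host="localhost",
    port=5432,
    dbname="inlabs",
    user="airflow",
    password="airflow",
)
with conn, conn.cursor() as cur:
    # The schema must exist before the DAG can load into dou_inlabs.article_raw.
    cur.execute(
        "SELECT 1 FROM information_schema.schemata WHERE schema_name = %s",
        ("dou_inlabs",),
    )
    print("schema dou_inlabs exists:", cur.fetchone() is not None)
conn.close()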
diff --git a/docker-compose.yml b/docker-compose.yml
index 5eb41ef..94e7424 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -73,7 +73,9 @@
     volumes:
       - ./src:/opt/airflow/dags/ro_dou_src # for development purpose
+      - ./mnt/airflow-logs:/opt/airflow/logs
       - ./dag_confs:/opt/airflow/dags/ro_dou/dag_confs
+      - ./dag_load_inlabs:/opt/airflow/dags/dag_load_inlabs
       - ./tests:/opt/airflow/tests # for test purpose
       - ./schemas:/opt/airflow/schemas # for test purpose
     depends_on:
@@ -89,6 +91,7 @@
       POSTGRES_DB: airflow
     volumes:
       - ./mnt/pgdata:/var/lib/postgresql/data
+      - ./dag_load_inlabs/sql/:/sql
     healthcheck:
       test: ["CMD", "pg_isready", "-U", "airflow"]
       interval: 5s
diff --git a/docs/docs/como_utilizar/instalacao.md b/docs/docs/como_utilizar/instalacao.md
index 436bbb2..e1cd2ac 100644
--- a/docs/docs/como_utilizar/instalacao.md
+++ b/docs/docs/como_utilizar/instalacao.md
@@ -33,7 +33,8 @@ Agora, faremos um segundo teste: o clipping **terms_from_variable**, seguindo os
 
 Leia a seção **Configurando em Produção** para instalar o Ro-dou utilizando um provedor SMTP real que enviará os e-mails para os destinatários verdadeiros.
 
-**Observação:** Para utilizar o `source: - INLABS`, é necessário criar a conexão `inlabs_db` no Apache Airflow, apontando para o banco `Postgres` que está carregado com os dados do inlabs. Você poderá encontrar aqui um exemplo de como carregar um banco com os dados do inlabs: [`ro-dou_inlabs_load_pg_dag.py`](https://github.com/gestaogovbr/Ro-dou/blob/main/dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py).
+**Observação:** Para utilizar o `source: - INLABS`, é necessário alterar a conexão `inlabs_portal` no Apache Airflow, informando o usuário e a senha de autenticação do portal. Um novo usuário pode ser cadastrado pelo portal [INLABS](https://inlabs.in.gov.br/acessar.php). A DAG
+que realiza o download dos arquivos do INLABS é a **ro-dou_inlabs_load_pg**.
 
 Quando tiver terminado de utilizar o ambiente de teste do Ro-DOU, desligue-o por meio do seguinte comando:
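The docs note above tells users to point the pre-created `inlabs_portal` connection at real INLABS portal credentials (an account can be registered at https://inlabs.in.gov.br/acessar.php). A hedged sketch of that step via the same REST API the Makefile uses; the `update_mask` query parameter is an assumption worth checking against your Airflow version's API reference, and the login and password below are placeholders carried over from the Makefile target, not real values:

import requests

BASE = "http://localhost:8080/api/v1"
AUTH = ("airflow", "airflow")

resp = requests.patch(
    f"{BASE}/connections/inlabs_portal",
    auth=AUTH,
    # update_mask limits the PATCH to the listed fields (assumption, see above).
    params={"update_mask": "login,password"},
    json={
        "connection_id": "inlabs_portal",
        "conn_type": "http",
        "login": "user@email.com",  # placeholder, as in the Makefile target
        "password": "password",     # placeholder, use your portal credentials
    },
)
resp.raise_for_status()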