From 8e387fb2f077ec60c4bc3d5fa6d5a69da7149204 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Thu, 2 Nov 2023 15:08:44 -0700 Subject: [PATCH 01/61] Initial commit --- backend/Apple_M1_Dockerfile | 11 +- backend/census_to_gsafac/README.md | 70 +++ backend/census_to_gsafac/__init__.py | 0 backend/census_to_gsafac/admin.py | 3 + backend/census_to_gsafac/apps.py | 6 + .../management/commands/load_raw.py | 62 +++ .../management/commands/raw_to_pg.py | 127 +++++ .../census_to_gsafac/migrations/__init__.py | 0 backend/census_to_gsafac/models.py | 445 ++++++++++++++++++ backend/census_to_gsafac/routers.py | 17 + backend/census_to_gsafac/test_models.py | 14 + backend/census_to_gsafac/tests.py | 3 + backend/census_to_gsafac/views.py | 3 + 13 files changed, 757 insertions(+), 4 deletions(-) create mode 100644 backend/census_to_gsafac/README.md create mode 100644 backend/census_to_gsafac/__init__.py create mode 100644 backend/census_to_gsafac/admin.py create mode 100644 backend/census_to_gsafac/apps.py create mode 100644 backend/census_to_gsafac/management/commands/load_raw.py create mode 100644 backend/census_to_gsafac/management/commands/raw_to_pg.py create mode 100644 backend/census_to_gsafac/migrations/__init__.py create mode 100644 backend/census_to_gsafac/models.py create mode 100644 backend/census_to_gsafac/routers.py create mode 100644 backend/census_to_gsafac/test_models.py create mode 100644 backend/census_to_gsafac/tests.py create mode 100644 backend/census_to_gsafac/views.py diff --git a/backend/Apple_M1_Dockerfile b/backend/Apple_M1_Dockerfile index 70546c3b5f..dbad68a99e 100644 --- a/backend/Apple_M1_Dockerfile +++ b/backend/Apple_M1_Dockerfile @@ -21,10 +21,13 @@ RUN \ RUN \ apt-get update -yq && \ - apt install curl -y && \ - apt-get install -y gcc && \ - curl -fsSL https://deb.nodesource.com/setup_16.x | bash - && \ - apt-get install -y nodejs && \ + apt install build-essential curl -y && \ + apt-get install -y gcc ca-certificates gnupg && \ + mkdir -p 
/etc/apt/keyrings && \ + curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \ + NODE_MAJOR=18 && \ + echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \ + apt-get install nodejs -y && \ apt-get install -y npm && \ npm i -g npm@^8 diff --git a/backend/census_to_gsafac/README.md b/backend/census_to_gsafac/README.md new file mode 100644 index 0000000000..49f1c955c7 --- /dev/null +++ b/backend/census_to_gsafac/README.md @@ -0,0 +1,70 @@ +# Census to FAC data migration + +## Overview + +This is implemented as a django app to leverage existing management commands and settings. It has python and shell scripts to + +* load raw census data as csv files into an S3 bucket +* create postgres tables from these csv files +* perform any data clean up required to create a table from a csv file +* perform any other validations or cleansing, such as verifying the integrity of df files, of data coming into FAC from Census + +## Infrastructure changes + +* Create a new S3 bucket in cloud.gov spaces as well as in the local environment +** Affected files: TBD +* Create a new Postgres instance both in CG and locally +** Affected files: + +## Utilities + +* fac_s3 - is a management command in the `support` app. It can be used to upload folders or files to an s3 bucket. + +```bash +manage.py fac_s3 fac-c2g-s3 --upload --src c2g/data +``` + +* load_raw.py - Read zip files provided by Census, and upload them to the S3 bucket. The basename of the zip file is used to create a folder in S3. The individual unzipped files are stored in the folder. There is an assumption that there are no sub-folders. +* raw_to_pg.py - Inserts data into PG tables using the contents of the csv files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). 
The name of the table is determined by examining the name of the file. The sample source files do not have delimiters for empty fields at the end of a line - so we assume these are nulls. + +```bash +manage.py raw_to_pg --folder data +manage.py raw_to_pg --clean True +``` + +* models.py These ought to correspond to the incoming csv files +* routers.py This tells django to use a different postgres instance. + +* data A folder that contains sample data that we can use for development. + +* wb_generator.py This module loads a single submission from the history tables to the GSA FAC tables + +* loader.py This module will eventually load all of the historic data by invoking wb_generator for each submission + +* c2g/workbooklib is a clone of dissemination/workbooklib + +### Testing + +We need to write more tests. But we have one basic test. This can be invoked as follows + +```bash +manage.py test c2g +``` + +In addition there is a small hack in place to test with the data that was created from the Census csv files. After loading the data into minio and populating postgres as described above, we can now try to create submissions with the following command + +```bash +manage.py raw_to_pg --load True +``` + +Currently, the above command will stop at the first submission that fails. Note also that this program currently deletes everything in SingleAuditChecklist before it starts loading. These are things that we will address once we have most of the code working. + +### Work in progress + +* c2g/workbooklib has only been modified to handle general_information and federal_awards. The rest of the workbooks need to be worked on. +* Need to write more tests. Have been doing mainly manual testing so far. +* Nothing has been done yet to handle pdf files. 
+ +## Pre-requisites for + +* A django app that reads the tables created here as unmanaged models and populates SF-SAC tables by creating workbooks, etc to simulate a real submission diff --git a/backend/census_to_gsafac/__init__.py b/backend/census_to_gsafac/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/backend/census_to_gsafac/admin.py b/backend/census_to_gsafac/admin.py new file mode 100644 index 0000000000..8c38f3f3da --- /dev/null +++ b/backend/census_to_gsafac/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. diff --git a/backend/census_to_gsafac/apps.py b/backend/census_to_gsafac/apps.py new file mode 100644 index 0000000000..74305c65f9 --- /dev/null +++ b/backend/census_to_gsafac/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class CensusToGsafacConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'census_to_gsafac' diff --git a/backend/census_to_gsafac/management/commands/load_raw.py b/backend/census_to_gsafac/management/commands/load_raw.py new file mode 100644 index 0000000000..f5c9101704 --- /dev/null +++ b/backend/census_to_gsafac/management/commands/load_raw.py @@ -0,0 +1,62 @@ +import logging + +# import requests +import zipfile + +# import io +import os +import boto3 + +from django.core.management.base import BaseCommand +from django.conf import settings + +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + +s3_client = boto3.client( + "s3", + aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY, + endpoint_url=settings.AWS_S3_ENDPOINT_URL, +) +c2f_bucket_name = settings.AWS_C2F_BUCKET_NAME + + +class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument("--zip_url", help="Remote file name") + parser.add_argument("--zip_src", help="local file name.") + + def handle(self, *args, **options): + url = options["zip_url"] + src = 
options["zip_src"] + if not url and not src: + logger.error("Remote or local zip file must be specified") + return + if url: + print("Not yet implemented") + return + + folder, zip_file = self.get_folder_and_file(url, src) + for file_name in zip_file.namelist(): + tgt_path = f"{folder}/{file_name}" + with zip_file.open(file_name, "r") as zip_object: + s3_client.upload_fileobj(zip_object, c2f_bucket_name, tgt_path) + print(f"Uploaded : {tgt_path} ") + + def get_folder_and_file(self, url, src): + if url: + print("Not yet implemented") + + # response = requests.get(url) + # if response.status_code != 200: + # logger.error(f"Unable to read from {url}. Response = {response}") + # return + # folder = url.split("/")[-1] + # zip_file = zipfile.ZipFile(io.BytesIO(response.content)) + + if src: + folder = os.path.basename(src).split(".")[0] + zip_file = zipfile.ZipFile(src) + + return folder, zip_file diff --git a/backend/census_to_gsafac/management/commands/raw_to_pg.py b/backend/census_to_gsafac/management/commands/raw_to_pg.py new file mode 100644 index 0000000000..57a908e6d6 --- /dev/null +++ b/backend/census_to_gsafac/management/commands/raw_to_pg.py @@ -0,0 +1,127 @@ +import logging +import boto3 +import csv + + +from django.core.management.base import BaseCommand +from django.conf import settings +from django.apps import apps + +from c2g.loader import load_data + +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + +c2g_models = list(apps.get_app_config("c2g").get_models()) +c2g_model_names = [m._meta.model_name for m in c2g_models] +s3_client = boto3.client( + "s3", + aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY, + endpoint_url=settings.AWS_S3_ENDPOINT_URL, +) +c2g_bucket_name = settings.AWS_C2G_BUCKET_NAME +DELIMITER = "," + + +class Command(BaseCommand): + help = """ + Populate PG database from csv files + Usage: + manage.py raw_to_pg --folder --clean + """ + + def 
add_arguments(self, parser): + parser.add_argument("--folder", help="S3 folder name") + parser.add_argument("--clean") + parser.add_argument("--sample") + parser.add_argument("--load") + + def handle(self, *args, **options): + if options.get("clean") == "True": + self.delete_data() + return + if options.get("sample") == "True": + self.sample_data() + return + + if options.get("load") == "True": + load_data() + return + + folder = options.get("folder") + if not folder: + print("Please specify a folder name") + return + + items = s3_client.list_objects( + Bucket=c2g_bucket_name, + Prefix=folder, + )["Contents"] + for item in items: + if item["Key"].endswith("/"): + continue + model_name = self.get_model_name(item["Key"]) + if model_name: + model_obj = c2g_models[c2g_model_names.index(model_name)] + response = s3_client.get_object(Bucket=c2g_bucket_name, Key=item["Key"]) + # rows = io.BytesIO(response["Body"].read().replace(b"\r", b"")) + # rows = response["Body"].readlines() + # rows = [] + # for line in response["Body"].read().splitlines(keepends=True): + # rows.append(line.replace(b'\r', b'')) + print("Obtained response from S3") + lines = response["Body"].read().decode("utf-8").splitlines(True) + print("Loaded Body into 'lines'") + # print(lines) + # Use following only for ELECAUDITS + # rows = [row for row in csv.DictReader(lines[11550:12000])] + rows = [row for row in csv.DictReader(lines)] + print("Completed processing 'lines'") + # for row in rows: + # print(row) + # break + self.load_table(model_obj, rows) + + for mdl in c2g_models: + row_count = mdl.objects.all().count() + print(f"{row_count} in ", mdl) + + def delete_data(self): + for mdl in c2g_models: + print("Deleting ", mdl) + mdl.objects.all().delete() + + def sample_data(self): + for mdl in c2g_models: + print("Sampling ", mdl) + rows = mdl.objects.all()[:1] + for row in rows: + for col in mdl._meta.fields: + print(f"{col.name}: {getattr(row, col.name)}") + + def get_model_name(self, name): + 
print("Processing ", name) + file_name = name.split("/")[-1].split(".")[0] + for model_name in c2g_model_names: + if file_name.lower().startswith(model_name): + print("model_name = ", model_name) + return model_name + print("Could not find a matching model for ", name) + return None + + def load_table(self, model_obj, rows): + print("Loading data for model_obj ", model_obj) + for i in range(0, len(rows)): + # if i > 2: + # break + model_instance = model_obj() + + for column_name, value in rows[i].items(): + if column_name == "id": + continue + setattr(model_instance, column_name, value) + model_instance.save() + if i % 1000 == 0: + print(f"Loaded {i} of {len(rows)} rows to ", model_obj) + print(f"Loaded {len(rows)} rows to ", model_obj) diff --git a/backend/census_to_gsafac/migrations/__init__.py b/backend/census_to_gsafac/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/backend/census_to_gsafac/models.py b/backend/census_to_gsafac/models.py new file mode 100644 index 0000000000..503a9e027f --- /dev/null +++ b/backend/census_to_gsafac/models.py @@ -0,0 +1,445 @@ +from django.db import models + + +class ELECAUDITHEADER(models.Model): + ELECAUDITHEADERID = models.TextField(blank=True, null=True) + + ID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + FYENDDATE = models.TextField(blank=True, null=True) + + AUDITTYPE = models.TextField(blank=True, null=True) + + PERIODCOVERED = models.TextField(blank=True, null=True) + + NUMBERMONTHS = models.TextField(blank=True, null=True) + + MULTIPLEEINS = models.TextField(blank=True, null=True) + + EIN = models.TextField(blank=True, null=True) + + EINSUBCODE = models.TextField(blank=True, null=True) + + MULTIPLEDUNS = models.TextField(blank=True, null=True) + + DUNS = models.TextField(blank=True, null=True) + + AUDITEENAME = models.TextField(blank=True, null=True) + + STREET1 = 
models.TextField(blank=True, null=True) + + STREET2 = models.TextField(blank=True, null=True) + + CITY = models.TextField(blank=True, null=True) + + STATE = models.TextField(blank=True, null=True) + + ZIPCODE = models.TextField(blank=True, null=True) + + AUDITEECONTACT = models.TextField(blank=True, null=True) + + AUDITEETITLE = models.TextField(blank=True, null=True) + + AUDITEEPHONE = models.TextField(blank=True, null=True) + + AUDITEEFAX = models.TextField(blank=True, null=True) + + AUDITEEEMAIL = models.TextField(blank=True, null=True) + + AUDITEEDATESIGNED = models.TextField(blank=True, null=True) + + AUDITEENAMETITLE = models.TextField(blank=True, null=True) + + CPAFIRMNAME = models.TextField(blank=True, null=True) + + CPASTREET1 = models.TextField(blank=True, null=True) + + CPASTREET2 = models.TextField(blank=True, null=True) + + CPACITY = models.TextField(blank=True, null=True) + + CPASTATE = models.TextField(blank=True, null=True) + + CPAZIPCODE = models.TextField(blank=True, null=True) + + CPACONTACT = models.TextField(blank=True, null=True) + + CPATITLE = models.TextField(blank=True, null=True) + + CPAPHONE = models.TextField(blank=True, null=True) + + CPAFAX = models.TextField(blank=True, null=True) + + CPAEMAIL = models.TextField(blank=True, null=True) + + CPADATESIGNED = models.TextField(blank=True, null=True) + + CPANAMETITLE = models.TextField(blank=True, null=True) + + COG_OVER = models.TextField(blank=True, null=True) + + COGAGENCY = models.TextField(blank=True, null=True) + + TYPEREPORT_FS = models.TextField(blank=True, null=True) + + REPORTABLECONDITION = models.TextField(blank=True, null=True) + + MATERIALWEAKNESS = models.TextField(blank=True, null=True) + + MATERIALNONCOMPLIANCE = models.TextField(blank=True, null=True) + + GOINGCONCERN = models.TextField(blank=True, null=True) + + TYPEREPORT_MP = models.TextField(blank=True, null=True) + + DOLLARTHRESHOLD = models.TextField(blank=True, null=True) + + LOWRISK = models.TextField(blank=True, 
null=True) + + REPORTREQUIRED = models.TextField(blank=True, null=True) + + TOTFEDEXPEND = models.TextField(blank=True, null=True) + + COPIES = models.TextField(blank=True, null=True) + + REPORTABLECONDITION_MP = models.TextField(blank=True, null=True) + + MATERIALWEAKNESS_MP = models.TextField(blank=True, null=True) + + QCOSTS = models.TextField(blank=True, null=True) + + CYFINDINGS = models.TextField(blank=True, null=True) + + PYSCHEDULE = models.TextField(blank=True, null=True) + + DUP_REPORTS = models.TextField(blank=True, null=True) + + COG_AGENCY = models.TextField(blank=True, null=True) + + OVERSIGHTAGENCY = models.TextField(blank=True, null=True) + + DATERECEIVED = models.TextField(blank=True, null=True) + + DATEFIREWALL = models.TextField(blank=True, null=True) + + PREVIOUSDATEFIREWALL = models.TextField(blank=True, null=True) + + FINDINGREFNUM = models.TextField(blank=True, null=True) + + TYPEOFENTITY = models.TextField(blank=True, null=True) + + IMAGE = models.TextField(blank=True, null=True) + + AGENCYCFDA = models.TextField(blank=True, null=True) + + INITIALDATE = models.TextField(blank=True, null=True) + + DATERECEIVEDOTHER = models.TextField(blank=True, null=True) + + MULTIPLE_CPAS = models.TextField(blank=True, null=True) + + AUDITEECERTIFYNAME = models.TextField(blank=True, null=True) + + AUDITEECERTIFYTITLE = models.TextField(blank=True, null=True) + + FACACCEPTEDDATE = models.TextField(blank=True, null=True) + + AUDITOR_EIN = models.TextField(blank=True, null=True) + + SD_MATERIALWEAKNESS = models.TextField(blank=True, null=True) + + SD_MATERIALWEAKNESS_MP = models.TextField(blank=True, null=True) + + SIGNIFICANTDEFICIENCY = models.TextField(blank=True, null=True) + + SIGNIFICANTDEFICIENCY_MP = models.TextField(blank=True, null=True) + + SP_FRAMEWORK = models.TextField(blank=True, null=True) + + SP_FRAMEWORK_REQUIRED = models.TextField(blank=True, null=True) + + TYPEREPORT_SP_FRAMEWORK = models.TextField(blank=True, null=True) + + 
SUPPRESSION_CODE = models.TextField(blank=True, null=True) + + ENTITY_TYPE = models.TextField(blank=True, null=True) + + TYPEAUDIT_CODE = models.TextField(blank=True, null=True) + + OPEID = models.TextField(blank=True, null=True) + + DATETOED = models.TextField(blank=True, null=True) + + DATEFINISHED = models.TextField(blank=True, null=True) + + TYPEFINDING = models.TextField(blank=True, null=True) + + TYPEFUNDING = models.TextField(blank=True, null=True) + + FYSTARTDATE = models.TextField(blank=True, null=True) + + CPAFOREIGN = models.TextField(blank=True, null=True) + + UEI = models.TextField(blank=True, null=True) + + MULTIPLEUEIS = models.TextField(blank=True, null=True) + + CPACOUNTRY = models.TextField(blank=True, null=True) + + +class ELECEINS(models.Model): + ID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + EIN = models.TextField(blank=True, null=True) + + EINSEQNUM = models.TextField(blank=True, null=True) + + DUNS = models.TextField(blank=True, null=True) + + DUNSEQNUM = models.TextField(blank=True, null=True) + + +class ELECAUDITFINDINGS(models.Model): + ELECAUDITFINDINGSID = models.TextField(blank=True, null=True) + + ELECAUDITSID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + REPORTID = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + QCOSTS = models.TextField(blank=True, null=True) + + OTHERFINDINGS = models.TextField(blank=True, null=True) + + SIGNIFICANTDEFICIENCY = models.TextField(blank=True, null=True) + + MATERIALWEAKNESS = models.TextField(blank=True, null=True) + + OTHERNONCOMPLIANCE = models.TextField(blank=True, null=True) + + TYPEREQUIREMENT = models.TextField(blank=True, null=True) + + FINDINGREFNUMS = models.TextField(blank=True, null=True) + + MODIFIEDOPINION = 
models.TextField(blank=True, null=True) + + REPEATFINDING = models.TextField(blank=True, null=True) + + PRIORFINDINGREFNUMS = models.TextField(blank=True, null=True) + + +class ELECNOTES(models.Model): + ID = models.TextField(blank=True, null=True) + + REPORTID = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + SEQ_NUMBER = models.TextField(blank=True, null=True) + + TYPE_ID = models.TextField(blank=True, null=True) + + NOTE_INDEX = models.TextField(blank=True, null=True) + + TITLE = models.TextField(blank=True, null=True) + + CONTENT = models.TextField(blank=True, null=True) + + UEI = models.TextField(blank=True, null=True) + + MULTIPLEUEIS = models.TextField(blank=True, null=True) + + +class ELECFINDINGSTEXT(models.Model): + SEQ_NUMBER = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + FINDINGREFNUMS = models.TextField(blank=True, null=True) + + TEXT = models.TextField(blank=True, null=True) + + CHARTSTABLES = models.TextField(blank=True, null=True) + + REPORTID = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + UEI = models.TextField(blank=True, null=True) + + MULTIPLEUEIS = models.TextField(blank=True, null=True) + + +class ELECCPAS(models.Model): + ID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + SEQNUM = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + CPAFIRMNAME = models.TextField(blank=True, null=True) + + CPASTREET1 = models.TextField(blank=True, null=True) + + CPACITY = models.TextField(blank=True, null=True) + + CPASTATE = models.TextField(blank=True, null=True) + + CPAZIPCODE = 
models.TextField(blank=True, null=True) + + CPACONTACT = models.TextField(blank=True, null=True) + + CPATITLE = models.TextField(blank=True, null=True) + + CPAPHONE = models.TextField(blank=True, null=True) + + CPAFAX = models.TextField(blank=True, null=True) + + CPAEMAIL = models.TextField(blank=True, null=True) + + CPAEIN = models.TextField(blank=True, null=True) + + +class ELECAUDITS(models.Model): + ELECAUDITSID = models.TextField(blank=True, null=True) + + ID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + CFDASEQNUM = models.TextField(blank=True, null=True) + + CFDA = models.TextField(blank=True, null=True) + + FEDERALPROGRAMNAME = models.TextField(blank=True, null=True) + + AMOUNT = models.TextField(blank=True, null=True) + + MAJORPROGRAM = models.TextField(blank=True, null=True) + + TYPEREQUIREMENT = models.TextField(blank=True, null=True) + + QCOSTS2 = models.TextField(blank=True, null=True) + + FINDINGS = models.TextField(blank=True, null=True) + + FINDINGREFNUMS = models.TextField(blank=True, null=True) + + RD = models.TextField(blank=True, null=True) + + DIRECT = models.TextField(blank=True, null=True) + + CFDA_PREFIX = models.TextField(blank=True, null=True) + + CFDA_EXT = models.TextField(blank=True, null=True) + + EIN = models.TextField(blank=True, null=True) + + CFDA2 = models.TextField(blank=True, null=True) + + TYPEREPORT_MP = models.TextField(blank=True, null=True) + + TYPEREPORT_MP_OVERRIDE = models.TextField(blank=True, null=True) + + ARRA = models.TextField(blank=True, null=True) + + LOANS = models.TextField(blank=True, null=True) + + FINDINGSCOUNT = models.TextField(blank=True, null=True) + + LOANBALANCE = models.TextField(blank=True, null=True) + + PASSTHROUGHAMOUNT = models.TextField(blank=True, null=True) + + AWARDIDENTIFICATION = models.TextField(blank=True, null=True) + + CLUSTERNAME = models.TextField(blank=True, null=True) + + 
PASSTHROUGHAWARD = models.TextField(blank=True, null=True) + + STATECLUSTERNAME = models.TextField(blank=True, null=True) + + PROGRAMTOTAL = models.TextField(blank=True, null=True) + + CLUSTERTOTAL = models.TextField(blank=True, null=True) + + OTHERCLUSTERNAME = models.TextField(blank=True, null=True) + + CFDAPROGRAMNAME = models.TextField(blank=True, null=True) + + +class ELECPASSTHROUGH(models.Model): + ID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + ELECAUDITSID = models.TextField(blank=True, null=True) + + PASSTHROUGHNAME = models.TextField(blank=True, null=True) + + PASSTHROUGHID = models.TextField(blank=True, null=True) + + +class ELECUEIS(models.Model): + UEISID = models.TextField(blank=True, null=True) + + REPORTID = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + UEI = models.TextField(blank=True, null=True) + + SEQNUM = models.TextField(blank=True, null=True) + + +class ELECCAPTEXT(models.Model): + SEQ_NUMBER = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + FINDINGREFNUMS = models.TextField(blank=True, null=True) + + TEXT = models.TextField(blank=True, null=True) + + CHARTSTABLES = models.TextField(blank=True, null=True) + + REPORTID = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + UEI = models.TextField(blank=True, null=True) + + MULTIPLEUEIS = models.TextField(blank=True, null=True) diff --git a/backend/census_to_gsafac/routers.py b/backend/census_to_gsafac/routers.py new file mode 100644 index 0000000000..c980fadbd7 --- /dev/null +++ b/backend/census_to_gsafac/routers.py @@ -0,0 +1,17 @@ +app_name = "c2g" +db_name = "c2g-db" + + +class 
DBRouter: + def db_for_read(self, model, **hints): + if model._meta.app_label == app_name: + return db_name + return None + + def db_for_write(self, model, **hints): + return self.db_for_read(model, hints) + + def allow_migrate(self, db, app_label, model_name=None, **hints): + if app_label == app_name: + return db == db_name + return False diff --git a/backend/census_to_gsafac/test_models.py b/backend/census_to_gsafac/test_models.py new file mode 100644 index 0000000000..30808bad25 --- /dev/null +++ b/backend/census_to_gsafac/test_models.py @@ -0,0 +1,14 @@ +from django.test import TestCase + +from model_bakery import baker + +from .models import ELECAUDITHEADER + + +class C2FModelsTestCase(TestCase): + def test_can_load_model(self): + gen = ELECAUDITHEADER.objects.all() + self.assertIsNotNone(gen) + baker.make(ELECAUDITHEADER).save() + gen = ELECAUDITHEADER.objects.all() + self.assertEquals(len(gen), 1) diff --git a/backend/census_to_gsafac/tests.py b/backend/census_to_gsafac/tests.py new file mode 100644 index 0000000000..7ce503c2dd --- /dev/null +++ b/backend/census_to_gsafac/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/backend/census_to_gsafac/views.py b/backend/census_to_gsafac/views.py new file mode 100644 index 0000000000..91ea44a218 --- /dev/null +++ b/backend/census_to_gsafac/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. 
From cd7dc1e72f69c49c3b7413d0f731ca1ff7680f95 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Thu, 2 Nov 2023 16:05:55 -0700 Subject: [PATCH 02/61] Initial commit --- .../management/commands/load_raw.py | 62 ------------------- 1 file changed, 62 deletions(-) delete mode 100644 backend/census_to_gsafac/management/commands/load_raw.py diff --git a/backend/census_to_gsafac/management/commands/load_raw.py b/backend/census_to_gsafac/management/commands/load_raw.py deleted file mode 100644 index f5c9101704..0000000000 --- a/backend/census_to_gsafac/management/commands/load_raw.py +++ /dev/null @@ -1,62 +0,0 @@ -import logging - -# import requests -import zipfile - -# import io -import os -import boto3 - -from django.core.management.base import BaseCommand -from django.conf import settings - -logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) - -s3_client = boto3.client( - "s3", - aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY, - endpoint_url=settings.AWS_S3_ENDPOINT_URL, -) -c2f_bucket_name = settings.AWS_C2F_BUCKET_NAME - - -class Command(BaseCommand): - def add_arguments(self, parser): - parser.add_argument("--zip_url", help="Remote file name") - parser.add_argument("--zip_src", help="local file name.") - - def handle(self, *args, **options): - url = options["zip_url"] - src = options["zip_src"] - if not url and not src: - logger.error("Remote or local zip file must be specified") - return - if url: - print("Not yet implemented") - return - - folder, zip_file = self.get_folder_and_file(url, src) - for file_name in zip_file.namelist(): - tgt_path = f"{folder}/{file_name}" - with zip_file.open(file_name, "r") as zip_object: - s3_client.upload_fileobj(zip_object, c2f_bucket_name, tgt_path) - print(f"Uploaded : {tgt_path} ") - - def get_folder_and_file(self, url, src): - if url: - print("Not yet implemented") - - # response = requests.get(url) - # if response.status_code != 200: - # 
logger.error(f"Unable to read from {url}. Response = {response}") - # return - # folder = url.split("/")[-1] - # zip_file = zipfile.ZipFile(io.BytesIO(response.content)) - - if src: - folder = os.path.basename(src).split(".")[0] - zip_file = zipfile.ZipFile(src) - - return folder, zip_file From cd4b87b62d9bca581bb8b32996ea9689e15f29b3 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Thu, 2 Nov 2023 16:11:38 -0700 Subject: [PATCH 03/61] Initial commit --- backend/census_to_gsafac/routers.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 backend/census_to_gsafac/routers.py diff --git a/backend/census_to_gsafac/routers.py b/backend/census_to_gsafac/routers.py deleted file mode 100644 index c980fadbd7..0000000000 --- a/backend/census_to_gsafac/routers.py +++ /dev/null @@ -1,17 +0,0 @@ -app_name = "c2g" -db_name = "c2g-db" - - -class DBRouter: - def db_for_read(self, model, **hints): - if model._meta.app_label == app_name: - return db_name - return None - - def db_for_write(self, model, **hints): - return self.db_for_read(model, hints) - - def allow_migrate(self, db, app_label, model_name=None, **hints): - if app_label == app_name: - return db == db_name - return False From 1adc362181b1ad4ed2ad72933b593e3a6a073d6f Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Thu, 2 Nov 2023 16:13:51 -0700 Subject: [PATCH 04/61] Initial commit --- backend/census_to_gsafac/test_models.py | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 backend/census_to_gsafac/test_models.py diff --git a/backend/census_to_gsafac/test_models.py b/backend/census_to_gsafac/test_models.py deleted file mode 100644 index 30808bad25..0000000000 --- a/backend/census_to_gsafac/test_models.py +++ /dev/null @@ -1,14 +0,0 @@ -from django.test import TestCase - -from model_bakery import baker - -from .models import ELECAUDITHEADER - - -class C2FModelsTestCase(TestCase): - def test_can_load_model(self): - gen = ELECAUDITHEADER.objects.all() - 
self.assertIsNotNone(gen) - baker.make(ELECAUDITHEADER).save() - gen = ELECAUDITHEADER.objects.all() - self.assertEquals(len(gen), 1) From 54ba1df22d1bed2dd7a0f42c1cdd97e130476d09 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 10:19:34 -0700 Subject: [PATCH 05/61] Initial commit --- backend/census_to_gsafac/loader.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 backend/census_to_gsafac/loader.py diff --git a/backend/census_to_gsafac/loader.py b/backend/census_to_gsafac/loader.py new file mode 100644 index 0000000000..a37eeebb35 --- /dev/null +++ b/backend/census_to_gsafac/loader.py @@ -0,0 +1,28 @@ +import datetime + + +from audit.models import SingleAuditChecklist +from .models import ELECAUDITHEADER as Gen +from .wb_generator import load_historic_data + + +def load_data(): + SingleAuditChecklist.objects.all().delete + + gens = Gen.objects.all() + total_count = error_count = 0 + for gen in gens: + audit_year = gen.AUDITYEAR + dbkey = gen.DBKEY + result = load_historic_data(audit_year, dbkey) + print(result) + total_count += 1 + if len(result["errors"]) > 0: + error_count += 1 + break + if total_count % 25 == 0: + now = datetime.datetime.now() + print(now.strftime("%H:%M")) + print(f"{error_count} errors out of {total_count}") + + print(f"{error_count} errors out of {total_count}") From ff307ecf3443e76fef21eeeaaeaf44b120628076 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 10:20:09 -0700 Subject: [PATCH 06/61] Initial commit --- .../management/commands/fac_s3.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 backend/census_to_gsafac/management/commands/fac_s3.py diff --git a/backend/census_to_gsafac/management/commands/fac_s3.py b/backend/census_to_gsafac/management/commands/fac_s3.py new file mode 100644 index 0000000000..9884d5b5f6 --- /dev/null +++ b/backend/census_to_gsafac/management/commands/fac_s3.py @@ -0,0 +1,81 @@ +from os import path +import os + 
+import boto3 + +from django.core.management.base import BaseCommand + +from django.conf import settings + + +class Command(BaseCommand): + help = """ + Alternative to aws s3 as the cli is not available in production. + Usage: + manage.py fac_s3 --upload --src SRC [--tgt TGT] + manage.py fac_s3 --download --src SRC [--tgt TGT] + manage.py fac_s3 --rm --tgt TGT] + manage.py fac_s3 --ls [--tgt TGT] + """ + + def add_arguments(self, parser): + parser.add_argument("bucket_name", type=str, help="The S3 bucket name.") + parser.add_argument("--src", help="local file name.") + parser.add_argument("--tgt", help="s3 file name.") + parser.add_argument("--ls", action="store_true", help="List all files.") + parser.add_argument( + "--upload", action="store_true", help="Copy local src to S3 tgt." + ) + parser.add_argument( + "--download", action="store_true", help="Copy S3 tgt to local src." + ) + parser.add_argument("--rm", action="store_true", help="Delete tgt.") + + def handle(self, *args, **options): + bucket_name = options["bucket_name"] + src_path = options["src"] + tgt_path = options["tgt"] + + s3_client = boto3.client( + "s3", + aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY, + endpoint_url=settings.AWS_S3_ENDPOINT_URL, + ) + + if options["ls"]: + items = s3_client.list_objects( + Bucket=bucket_name, + Prefix=tgt_path or "", + ).get("Contents") + if not items: + print("Target is empty") + return + for item in items: + print(item["Key"], item["Size"], item["LastModified"]) + return + + if options["upload"]: + file_path = path.join(settings.BASE_DIR, src_path) + tgt_name = tgt_path or os.path.basename(file_path) + tgt_name_offset = len(str(file_path)) + for subdir, dir, files in os.walk(file_path): + object_name = tgt_name + str(subdir)[tgt_name_offset:] + "/" + print(subdir, dir, object_name, files) + for file in files: + full_path = os.path.join(subdir, file) + s3_client.upload_file(full_path, 
bucket_name, object_name + file) + print(f"Copied {full_path} to {bucket_name} {object_name+file}.") + return + + if options["download"]: + file_path = path.join(settings.BASE_DIR, src_path) + object_name = tgt_path + s3_client.download_file(bucket_name, object_name, file_path) + return + + if options["rm"]: + s3_client.delete_object( + Bucket=bucket_name, + Key=tgt_path, + ) From b8498e5eed678273f91f06884d9599e96b411a66 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 10:20:36 -0700 Subject: [PATCH 07/61] Initial commit --- .../management/commands/load_raw.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 backend/census_to_gsafac/management/commands/load_raw.py diff --git a/backend/census_to_gsafac/management/commands/load_raw.py b/backend/census_to_gsafac/management/commands/load_raw.py new file mode 100644 index 0000000000..f5c9101704 --- /dev/null +++ b/backend/census_to_gsafac/management/commands/load_raw.py @@ -0,0 +1,62 @@ +import logging + +# import requests +import zipfile + +# import io +import os +import boto3 + +from django.core.management.base import BaseCommand +from django.conf import settings + +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + +s3_client = boto3.client( + "s3", + aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY, + endpoint_url=settings.AWS_S3_ENDPOINT_URL, +) +c2f_bucket_name = settings.AWS_C2F_BUCKET_NAME + + +class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument("--zip_url", help="Remote file name") + parser.add_argument("--zip_src", help="local file name.") + + def handle(self, *args, **options): + url = options["zip_url"] + src = options["zip_src"] + if not url and not src: + logger.error("Remote or local zip file must be specified") + return + if url: + print("Not yet implemented") + return + + folder, zip_file = self.get_folder_and_file(url, src) + for 
file_name in zip_file.namelist(): + tgt_path = f"{folder}/{file_name}" + with zip_file.open(file_name, "r") as zip_object: + s3_client.upload_fileobj(zip_object, c2f_bucket_name, tgt_path) + print(f"Uploaded : {tgt_path} ") + + def get_folder_and_file(self, url, src): + if url: + print("Not yet implemented") + + # response = requests.get(url) + # if response.status_code != 200: + # logger.error(f"Unable to read from {url}. Response = {response}") + # return + # folder = url.split("/")[-1] + # zip_file = zipfile.ZipFile(io.BytesIO(response.content)) + + if src: + folder = os.path.basename(src).split(".")[0] + zip_file = zipfile.ZipFile(src) + + return folder, zip_file From 1138a7ccfe45eba00f01b8118cad4ec243d9e01b Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 10:21:00 -0700 Subject: [PATCH 08/61] Initial commit --- backend/census_to_gsafac/routers.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 backend/census_to_gsafac/routers.py diff --git a/backend/census_to_gsafac/routers.py b/backend/census_to_gsafac/routers.py new file mode 100644 index 0000000000..3270adf67b --- /dev/null +++ b/backend/census_to_gsafac/routers.py @@ -0,0 +1,17 @@ +app_name = "census_to_gsafac" +db_name = "c2g-db" + + +class DBRouter: + def db_for_read(self, model, **hints): + if model._meta.app_label == app_name: + return db_name + return None + + def db_for_write(self, model, **hints): + return self.db_for_read(model, hints) + + def allow_migrate(self, db, app_label, model_name=None, **hints): + if app_label == app_name: + return db == db_name + return False From 3c77f099dec033caca95f45b5f88caae497f763c Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 10:21:22 -0700 Subject: [PATCH 09/61] Removed comments --- .../management/commands/raw_to_pg.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/backend/census_to_gsafac/management/commands/raw_to_pg.py b/backend/census_to_gsafac/management/commands/raw_to_pg.py index 
57a908e6d6..5ea22548bb 100644 --- a/backend/census_to_gsafac/management/commands/raw_to_pg.py +++ b/backend/census_to_gsafac/management/commands/raw_to_pg.py @@ -65,22 +65,11 @@ def handle(self, *args, **options): if model_name: model_obj = c2g_models[c2g_model_names.index(model_name)] response = s3_client.get_object(Bucket=c2g_bucket_name, Key=item["Key"]) - # rows = io.BytesIO(response["Body"].read().replace(b"\r", b"")) - # rows = response["Body"].readlines() - # rows = [] - # for line in response["Body"].read().splitlines(keepends=True): - # rows.append(line.replace(b'\r', b'')) print("Obtained response from S3") lines = response["Body"].read().decode("utf-8").splitlines(True) print("Loaded Body into 'lines'") - # print(lines) - # Use following only for ELECAUDITS - # rows = [row for row in csv.DictReader(lines[11550:12000])] rows = [row for row in csv.DictReader(lines)] print("Completed processing 'lines'") - # for row in rows: - # print(row) - # break self.load_table(model_obj, rows) for mdl in c2g_models: @@ -113,8 +102,6 @@ def get_model_name(self, name): def load_table(self, model_obj, rows): print("Loading data for model_obj ", model_obj) for i in range(0, len(rows)): - # if i > 2: - # break model_instance = model_obj() for column_name, value in rows[i].items(): From 2547ef479ab697954a3a3261c6f888e4cd7751de Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 10:51:55 -0700 Subject: [PATCH 10/61] Added census_to_gsafac --- backend/config/settings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/config/settings.py b/backend/config/settings.py index 16826a310d..5159683142 100644 --- a/backend/config/settings.py +++ b/backend/config/settings.py @@ -123,6 +123,7 @@ # "data_distro", "dissemination", "support", + "census_to_gsafac" ] MIDDLEWARE = [ From 9173ba6d79554c94efae84d5a07527c1d2cde620 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 10:52:55 -0700 Subject: [PATCH 11/61] Initial commit --- 
.../migrations/0001_initial.py | 364 ++++++++++++++++++ 1 file changed, 364 insertions(+) create mode 100644 backend/census_to_gsafac/migrations/0001_initial.py diff --git a/backend/census_to_gsafac/migrations/0001_initial.py b/backend/census_to_gsafac/migrations/0001_initial.py new file mode 100644 index 0000000000..c7393cb835 --- /dev/null +++ b/backend/census_to_gsafac/migrations/0001_initial.py @@ -0,0 +1,364 @@ +# Generated by Django 4.2.6 on 2023-11-03 17:38 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="ELECAUDITFINDINGS", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ELECAUDITFINDINGSID", models.TextField(blank=True, null=True)), + ("ELECAUDITSID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("REPORTID", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("QCOSTS", models.TextField(blank=True, null=True)), + ("OTHERFINDINGS", models.TextField(blank=True, null=True)), + ("SIGNIFICANTDEFICIENCY", models.TextField(blank=True, null=True)), + ("MATERIALWEAKNESS", models.TextField(blank=True, null=True)), + ("OTHERNONCOMPLIANCE", models.TextField(blank=True, null=True)), + ("TYPEREQUIREMENT", models.TextField(blank=True, null=True)), + ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), + ("MODIFIEDOPINION", models.TextField(blank=True, null=True)), + ("REPEATFINDING", models.TextField(blank=True, null=True)), + ("PRIORFINDINGREFNUMS", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECAUDITHEADER", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), 
+ ("ELECAUDITHEADERID", models.TextField(blank=True, null=True)), + ("ID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("FYENDDATE", models.TextField(blank=True, null=True)), + ("AUDITTYPE", models.TextField(blank=True, null=True)), + ("PERIODCOVERED", models.TextField(blank=True, null=True)), + ("NUMBERMONTHS", models.TextField(blank=True, null=True)), + ("MULTIPLEEINS", models.TextField(blank=True, null=True)), + ("EIN", models.TextField(blank=True, null=True)), + ("EINSUBCODE", models.TextField(blank=True, null=True)), + ("MULTIPLEDUNS", models.TextField(blank=True, null=True)), + ("DUNS", models.TextField(blank=True, null=True)), + ("AUDITEENAME", models.TextField(blank=True, null=True)), + ("STREET1", models.TextField(blank=True, null=True)), + ("STREET2", models.TextField(blank=True, null=True)), + ("CITY", models.TextField(blank=True, null=True)), + ("STATE", models.TextField(blank=True, null=True)), + ("ZIPCODE", models.TextField(blank=True, null=True)), + ("AUDITEECONTACT", models.TextField(blank=True, null=True)), + ("AUDITEETITLE", models.TextField(blank=True, null=True)), + ("AUDITEEPHONE", models.TextField(blank=True, null=True)), + ("AUDITEEFAX", models.TextField(blank=True, null=True)), + ("AUDITEEEMAIL", models.TextField(blank=True, null=True)), + ("AUDITEEDATESIGNED", models.TextField(blank=True, null=True)), + ("AUDITEENAMETITLE", models.TextField(blank=True, null=True)), + ("CPAFIRMNAME", models.TextField(blank=True, null=True)), + ("CPASTREET1", models.TextField(blank=True, null=True)), + ("CPASTREET2", models.TextField(blank=True, null=True)), + ("CPACITY", models.TextField(blank=True, null=True)), + ("CPASTATE", models.TextField(blank=True, null=True)), + ("CPAZIPCODE", models.TextField(blank=True, null=True)), + ("CPACONTACT", models.TextField(blank=True, null=True)), + ("CPATITLE", models.TextField(blank=True, null=True)), + 
("CPAPHONE", models.TextField(blank=True, null=True)), + ("CPAFAX", models.TextField(blank=True, null=True)), + ("CPAEMAIL", models.TextField(blank=True, null=True)), + ("CPADATESIGNED", models.TextField(blank=True, null=True)), + ("CPANAMETITLE", models.TextField(blank=True, null=True)), + ("COG_OVER", models.TextField(blank=True, null=True)), + ("COGAGENCY", models.TextField(blank=True, null=True)), + ("TYPEREPORT_FS", models.TextField(blank=True, null=True)), + ("REPORTABLECONDITION", models.TextField(blank=True, null=True)), + ("MATERIALWEAKNESS", models.TextField(blank=True, null=True)), + ("MATERIALNONCOMPLIANCE", models.TextField(blank=True, null=True)), + ("GOINGCONCERN", models.TextField(blank=True, null=True)), + ("TYPEREPORT_MP", models.TextField(blank=True, null=True)), + ("DOLLARTHRESHOLD", models.TextField(blank=True, null=True)), + ("LOWRISK", models.TextField(blank=True, null=True)), + ("REPORTREQUIRED", models.TextField(blank=True, null=True)), + ("TOTFEDEXPEND", models.TextField(blank=True, null=True)), + ("COPIES", models.TextField(blank=True, null=True)), + ("REPORTABLECONDITION_MP", models.TextField(blank=True, null=True)), + ("MATERIALWEAKNESS_MP", models.TextField(blank=True, null=True)), + ("QCOSTS", models.TextField(blank=True, null=True)), + ("CYFINDINGS", models.TextField(blank=True, null=True)), + ("PYSCHEDULE", models.TextField(blank=True, null=True)), + ("DUP_REPORTS", models.TextField(blank=True, null=True)), + ("COG_AGENCY", models.TextField(blank=True, null=True)), + ("OVERSIGHTAGENCY", models.TextField(blank=True, null=True)), + ("DATERECEIVED", models.TextField(blank=True, null=True)), + ("DATEFIREWALL", models.TextField(blank=True, null=True)), + ("PREVIOUSDATEFIREWALL", models.TextField(blank=True, null=True)), + ("FINDINGREFNUM", models.TextField(blank=True, null=True)), + ("TYPEOFENTITY", models.TextField(blank=True, null=True)), + ("IMAGE", models.TextField(blank=True, null=True)), + ("AGENCYCFDA", 
models.TextField(blank=True, null=True)), + ("INITIALDATE", models.TextField(blank=True, null=True)), + ("DATERECEIVEDOTHER", models.TextField(blank=True, null=True)), + ("MULTIPLE_CPAS", models.TextField(blank=True, null=True)), + ("AUDITEECERTIFYNAME", models.TextField(blank=True, null=True)), + ("AUDITEECERTIFYTITLE", models.TextField(blank=True, null=True)), + ("FACACCEPTEDDATE", models.TextField(blank=True, null=True)), + ("AUDITOR_EIN", models.TextField(blank=True, null=True)), + ("SD_MATERIALWEAKNESS", models.TextField(blank=True, null=True)), + ("SD_MATERIALWEAKNESS_MP", models.TextField(blank=True, null=True)), + ("SIGNIFICANTDEFICIENCY", models.TextField(blank=True, null=True)), + ("SIGNIFICANTDEFICIENCY_MP", models.TextField(blank=True, null=True)), + ("SP_FRAMEWORK", models.TextField(blank=True, null=True)), + ("SP_FRAMEWORK_REQUIRED", models.TextField(blank=True, null=True)), + ("TYPEREPORT_SP_FRAMEWORK", models.TextField(blank=True, null=True)), + ("SUPPRESSION_CODE", models.TextField(blank=True, null=True)), + ("ENTITY_TYPE", models.TextField(blank=True, null=True)), + ("TYPEAUDIT_CODE", models.TextField(blank=True, null=True)), + ("OPEID", models.TextField(blank=True, null=True)), + ("DATETOED", models.TextField(blank=True, null=True)), + ("DATEFINISHED", models.TextField(blank=True, null=True)), + ("TYPEFINDING", models.TextField(blank=True, null=True)), + ("TYPEFUNDING", models.TextField(blank=True, null=True)), + ("FYSTARTDATE", models.TextField(blank=True, null=True)), + ("CPAFOREIGN", models.TextField(blank=True, null=True)), + ("UEI", models.TextField(blank=True, null=True)), + ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), + ("CPACOUNTRY", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECAUDITS", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ELECAUDITSID", models.TextField(blank=True, null=True)), 
+ ("ID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("CFDASEQNUM", models.TextField(blank=True, null=True)), + ("CFDA", models.TextField(blank=True, null=True)), + ("FEDERALPROGRAMNAME", models.TextField(blank=True, null=True)), + ("AMOUNT", models.TextField(blank=True, null=True)), + ("MAJORPROGRAM", models.TextField(blank=True, null=True)), + ("TYPEREQUIREMENT", models.TextField(blank=True, null=True)), + ("QCOSTS2", models.TextField(blank=True, null=True)), + ("FINDINGS", models.TextField(blank=True, null=True)), + ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), + ("RD", models.TextField(blank=True, null=True)), + ("DIRECT", models.TextField(blank=True, null=True)), + ("CFDA_PREFIX", models.TextField(blank=True, null=True)), + ("CFDA_EXT", models.TextField(blank=True, null=True)), + ("EIN", models.TextField(blank=True, null=True)), + ("CFDA2", models.TextField(blank=True, null=True)), + ("TYPEREPORT_MP", models.TextField(blank=True, null=True)), + ("TYPEREPORT_MP_OVERRIDE", models.TextField(blank=True, null=True)), + ("ARRA", models.TextField(blank=True, null=True)), + ("LOANS", models.TextField(blank=True, null=True)), + ("FINDINGSCOUNT", models.TextField(blank=True, null=True)), + ("LOANBALANCE", models.TextField(blank=True, null=True)), + ("PASSTHROUGHAMOUNT", models.TextField(blank=True, null=True)), + ("AWARDIDENTIFICATION", models.TextField(blank=True, null=True)), + ("CLUSTERNAME", models.TextField(blank=True, null=True)), + ("PASSTHROUGHAWARD", models.TextField(blank=True, null=True)), + ("STATECLUSTERNAME", models.TextField(blank=True, null=True)), + ("PROGRAMTOTAL", models.TextField(blank=True, null=True)), + ("CLUSTERTOTAL", models.TextField(blank=True, null=True)), + ("OTHERCLUSTERNAME", models.TextField(blank=True, null=True)), + ("CFDAPROGRAMNAME", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( 
+ name="ELECCAPTEXT", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("SEQ_NUMBER", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), + ("TEXT", models.TextField(blank=True, null=True)), + ("CHARTSTABLES", models.TextField(blank=True, null=True)), + ("REPORTID", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("UEI", models.TextField(blank=True, null=True)), + ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECCPAS", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("SEQNUM", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("CPAFIRMNAME", models.TextField(blank=True, null=True)), + ("CPASTREET1", models.TextField(blank=True, null=True)), + ("CPACITY", models.TextField(blank=True, null=True)), + ("CPASTATE", models.TextField(blank=True, null=True)), + ("CPAZIPCODE", models.TextField(blank=True, null=True)), + ("CPACONTACT", models.TextField(blank=True, null=True)), + ("CPATITLE", models.TextField(blank=True, null=True)), + ("CPAPHONE", models.TextField(blank=True, null=True)), + ("CPAFAX", models.TextField(blank=True, null=True)), + ("CPAEMAIL", models.TextField(blank=True, null=True)), + ("CPAEIN", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECEINS", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + 
verbose_name="ID", + ), + ), + ("ID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("EIN", models.TextField(blank=True, null=True)), + ("EINSEQNUM", models.TextField(blank=True, null=True)), + ("DUNS", models.TextField(blank=True, null=True)), + ("DUNSEQNUM", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECFINDINGSTEXT", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("SEQ_NUMBER", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), + ("TEXT", models.TextField(blank=True, null=True)), + ("CHARTSTABLES", models.TextField(blank=True, null=True)), + ("REPORTID", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("UEI", models.TextField(blank=True, null=True)), + ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECNOTES", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ID", models.TextField(blank=True, null=True)), + ("REPORTID", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("SEQ_NUMBER", models.TextField(blank=True, null=True)), + ("TYPE_ID", models.TextField(blank=True, null=True)), + ("NOTE_INDEX", models.TextField(blank=True, null=True)), + ("TITLE", models.TextField(blank=True, null=True)), + ("CONTENT", models.TextField(blank=True, null=True)), + ("UEI", models.TextField(blank=True, null=True)), + 
("MULTIPLEUEIS", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECPASSTHROUGH", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("ELECAUDITSID", models.TextField(blank=True, null=True)), + ("PASSTHROUGHNAME", models.TextField(blank=True, null=True)), + ("PASSTHROUGHID", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECUEIS", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("UEISID", models.TextField(blank=True, null=True)), + ("REPORTID", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("UEI", models.TextField(blank=True, null=True)), + ("SEQNUM", models.TextField(blank=True, null=True)), + ], + ), + ] From 352456e2c1918b2fb2ddda0b47f10418eb77b448 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 11:09:05 -0700 Subject: [PATCH 12/61] Initial commit --- backend/config/settings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/config/settings.py b/backend/config/settings.py index 5159683142..d09ab72e72 100644 --- a/backend/config/settings.py +++ b/backend/config/settings.py @@ -244,6 +244,9 @@ # Private bucket AWS_PRIVATE_STORAGE_BUCKET_NAME = "gsa-fac-private-s3" + # Private C2g bucket + AWS_C2G_BUCKET_NAME = "fac-c2g-s3" + AWS_S3_PRIVATE_REGION_NAME = os.environ.get( "AWS_S3_PRIVATE_REGION_NAME", "us-east-1" ) From 988a11176577bfe348f7182d06e66eb6e594ade2 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 11:09:28 -0700 Subject: [PATCH 13/61] 
Initial commit --- backend/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/run.sh b/backend/run.sh index a22b47ef35..ddf16b769a 100755 --- a/backend/run.sh +++ b/backend/run.sh @@ -14,6 +14,7 @@ if [[ "${ENV}" == "LOCAL" || "${ENV}" == "TESTING" ]]; then export AWS_S3_PRIVATE_ENDPOINT="http://minio:9000" mc alias set myminio "${AWS_S3_PRIVATE_ENDPOINT}" minioadmin minioadmin mc mb myminio/gsa-fac-private-s3 + mc mb myminio/fac-c2g-s3 mc admin user svcacct add --access-key="${AWS_PRIVATE_ACCESS_KEY_ID}" --secret-key="${AWS_PRIVATE_SECRET_ACCESS_KEY}" myminio minioadmin fi; From 71ad2cc5cce96fed3e5e96f07843d5407fd74205 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 11:47:24 -0700 Subject: [PATCH 14/61] Initial commit --- backend/census_to_gsafac/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/census_to_gsafac/loader.py b/backend/census_to_gsafac/loader.py index a37eeebb35..873b7896b1 100644 --- a/backend/census_to_gsafac/loader.py +++ b/backend/census_to_gsafac/loader.py @@ -3,7 +3,6 @@ from audit.models import SingleAuditChecklist from .models import ELECAUDITHEADER as Gen -from .wb_generator import load_historic_data def load_data(): From d6438e6cd88b5e1a1caaa3c672c70cbea1d9ab11 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 11:47:57 -0700 Subject: [PATCH 15/61] Initial commit --- backend/census_to_gsafac/management/commands/raw_to_pg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/census_to_gsafac/management/commands/raw_to_pg.py b/backend/census_to_gsafac/management/commands/raw_to_pg.py index 5ea22548bb..9289f1f6f2 100644 --- a/backend/census_to_gsafac/management/commands/raw_to_pg.py +++ b/backend/census_to_gsafac/management/commands/raw_to_pg.py @@ -7,12 +7,12 @@ from django.conf import settings from django.apps import apps -from c2g.loader import load_data +from census_to_gsafac.loader import load_data logger = logging.getLogger(__name__) 
logger.setLevel(logging.WARNING) -c2g_models = list(apps.get_app_config("c2g").get_models()) +c2g_models = list(apps.get_app_config("census_to_gsafac").get_models()) c2g_model_names = [m._meta.model_name for m in c2g_models] s3_client = boto3.client( "s3", From ae5ebc14a3133cd53092ba2bed88d7fb2a16310c Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 13:42:01 -0700 Subject: [PATCH 16/61] Added procedure to load test Census data to postgres --- backend/census_to_gsafac/README.md | 41 +++++++++--------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/backend/census_to_gsafac/README.md b/backend/census_to_gsafac/README.md index 49f1c955c7..5a04b50cd2 100644 --- a/backend/census_to_gsafac/README.md +++ b/backend/census_to_gsafac/README.md @@ -11,20 +11,17 @@ This is implemented as a django app to leverage existing management commands and ## Infrastructure changes -* Create a new S3 bucket in cloud.gov spaces as well as in the ;ocal environment -** Affected files: TBD +* Create a new S3 bucket in cloud.gov spaces as well as in the local environment * Create a new Postgres instance both in CG and locally -** Affected files: ## Utilities -* fac_s3 - is a management command in the `support` app. It can be used to upload folders or files to an s3 nucket. +* fac_s3.py - Uploads folders or files to an s3 nucket. ```bash -manage.py fac_s3 fac-c2g-s3 --upload --src c2g/data +manage.py fac_s3 fac-c2g-s3 --upload --src census_to_gsafac/data ``` -* load_raw.py - Read zip files providd by Census, and upload them to the S3 bucket. The basename of the zip file is used to create a folder in S3. The individual unzipped files are stored in the folder. There is an assumption that there are no sub-folders. * raw_to_pg.py - Inserts data into PG tables using the contents of the csv files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). 
The name of the table is determined by examining the name of the file. The sample source files do not have delimters for empty fields at the end of a line - so we assume these are nulls. ```bash @@ -37,34 +34,20 @@ manage.py raw_to_pg --clean True * data A folder that contains sample data that we can use for development. -* wb_generator.py This module loads a single submission from the history tables to the GSA FAC tables - -* loader.py This module will eventually loadd all of the historic data by invoking wb_generator for each submission +## Pre-requisites for -* c2g/workbooklib is a clone of dissemination/workbooklib +* A django app that reads the tables created here as unmanaged models and populates SF-SAC tables by creating workbooks, etc to simulate a real submission -### Testing +## How to load test Census data into postgres? -We need to write more tests. But we have one basic test. This can be invoked as follows +1. Download test Census data from https://drive.google.com/drive/folders/1TY-7yWsMd8DsVEXvwrEe_oWW1iR2sGoy into census_to_gsafac/data folder. +2. In the FAC/backend folder, run the following to load csv files from census_to_gsafac/data folder into fac-c2g-s3 bucket. ```bash -manage.py test c2g +python manage.py fac_s3 fac-c2g-s3 --upload --src census_to_gsafac/data ``` -In addition there is a small hack in place to test with the data that was created from the Census csv files. After loading the data into minio and populating postgres as described above, we can now try to create submissions with the following command - +3. In the FAC/backend folder, run the following to read the csv files from fac-c2g-s3 bucket and load into postgres. ```bash -manage.py raw_to_pg --load True -``` - -Currently, the above command will stop at the first submission that fails. Note also that this program cyrrently deletes everything in SingleAuditChecklist before it starts loading. These are things that we will address once we have most of the code working. 
- -### Work in progress - -* c2g/workbooklib has only been modified to handle general_information and federal_awards. The rest of the worknooks need to be workd on. -* Meed to write more tests. Have been doing mainly manual testing so far. -* Nothing has been done yet to handle pdf files. - -## Pre-requisites for - -* A django app that reads the tables created here as unmanaged models and populates SF-SAC tables by creating workbooks, etc to simulate a real submission +manage.py raw_to_pg --folder data +``` \ No newline at end of file From 356a3d39de1181a75e687df984acba2cb7e96a5b Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 13:44:50 -0700 Subject: [PATCH 17/61] Excluding workbook loader --- backend/census_to_gsafac/management/commands/raw_to_pg.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/backend/census_to_gsafac/management/commands/raw_to_pg.py b/backend/census_to_gsafac/management/commands/raw_to_pg.py index 9289f1f6f2..03547b413e 100644 --- a/backend/census_to_gsafac/management/commands/raw_to_pg.py +++ b/backend/census_to_gsafac/management/commands/raw_to_pg.py @@ -7,7 +7,6 @@ from django.conf import settings from django.apps import apps -from census_to_gsafac.loader import load_data logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) @@ -45,10 +44,6 @@ def handle(self, *args, **options): self.sample_data() return - if options.get("load") == "True": - load_data() - return - folder = options.get("folder") if not folder: print("Please specify a folder name") From 47d034f4115a9289df7ecd9a39f8d2c3b440322a Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 13:45:26 -0700 Subject: [PATCH 18/61] Excluding workbook loader --- backend/census_to_gsafac/loader.py | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 backend/census_to_gsafac/loader.py diff --git a/backend/census_to_gsafac/loader.py b/backend/census_to_gsafac/loader.py deleted file mode 100644 index 
873b7896b1..0000000000 --- a/backend/census_to_gsafac/loader.py +++ /dev/null @@ -1,27 +0,0 @@ -import datetime - - -from audit.models import SingleAuditChecklist -from .models import ELECAUDITHEADER as Gen - - -def load_data(): - SingleAuditChecklist.objects.all().delete - - gens = Gen.objects.all() - total_count = error_count = 0 - for gen in gens: - audit_year = gen.AUDITYEAR - dbkey = gen.DBKEY - result = load_historic_data(audit_year, dbkey) - print(result) - total_count += 1 - if len(result["errors"]) > 0: - error_count += 1 - break - if total_count % 25 == 0: - now = datetime.datetime.now() - print(now.strftime("%H:%M")) - print(f"{error_count} errors out of {total_count}") - - print(f"{error_count} errors out of {total_count}") From 4f4f90c6fce90bfc862650ce3e307a5dc4a5912e Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 13:49:41 -0700 Subject: [PATCH 19/61] Excluding load_raw --- .../management/commands/load_raw.py | 62 ------------------- 1 file changed, 62 deletions(-) delete mode 100644 backend/census_to_gsafac/management/commands/load_raw.py diff --git a/backend/census_to_gsafac/management/commands/load_raw.py b/backend/census_to_gsafac/management/commands/load_raw.py deleted file mode 100644 index f5c9101704..0000000000 --- a/backend/census_to_gsafac/management/commands/load_raw.py +++ /dev/null @@ -1,62 +0,0 @@ -import logging - -# import requests -import zipfile - -# import io -import os -import boto3 - -from django.core.management.base import BaseCommand -from django.conf import settings - -logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) - -s3_client = boto3.client( - "s3", - aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY, - endpoint_url=settings.AWS_S3_ENDPOINT_URL, -) -c2f_bucket_name = settings.AWS_C2F_BUCKET_NAME - - -class Command(BaseCommand): - def add_arguments(self, parser): - parser.add_argument("--zip_url", help="Remote file 
name") - parser.add_argument("--zip_src", help="local file name.") - - def handle(self, *args, **options): - url = options["zip_url"] - src = options["zip_src"] - if not url and not src: - logger.error("Remote or local zip file must be specified") - return - if url: - print("Not yet implemented") - return - - folder, zip_file = self.get_folder_and_file(url, src) - for file_name in zip_file.namelist(): - tgt_path = f"{folder}/{file_name}" - with zip_file.open(file_name, "r") as zip_object: - s3_client.upload_fileobj(zip_object, c2f_bucket_name, tgt_path) - print(f"Uploaded : {tgt_path} ") - - def get_folder_and_file(self, url, src): - if url: - print("Not yet implemented") - - # response = requests.get(url) - # if response.status_code != 200: - # logger.error(f"Unable to read from {url}. Response = {response}") - # return - # folder = url.split("/")[-1] - # zip_file = zipfile.ZipFile(io.BytesIO(response.content)) - - if src: - folder = os.path.basename(src).split(".")[0] - zip_file = zipfile.ZipFile(src) - - return folder, zip_file From 8db3fe2a903fd95d2e39fb44323e54ea7a89b391 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 14:38:21 -0700 Subject: [PATCH 20/61] Updates --- backend/census_to_gsafac/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/census_to_gsafac/README.md b/backend/census_to_gsafac/README.md index 5a04b50cd2..6e53ab1bc4 100644 --- a/backend/census_to_gsafac/README.md +++ b/backend/census_to_gsafac/README.md @@ -7,7 +7,6 @@ This is implemented as a django app to leverage existing management commands and * load raw census data as csv files into an S3 bucket * create postgres tables from these csv files * perform any data clean up required to create a table from a csv file -* perforn any ither validations or cleansing, such as verifying the integrity of df files, of data coming into FAC from Census ## Infrastructure changes @@ -29,7 +28,7 @@ manage.py raw_to_pg --folder data manage.py raw_to_pg 
--clean True ``` -* models.py These ought to correspons to the incoming csv files +* models.py These ought to correspond to the incoming csv files * routers.py This tells django to use a different postgres instance. * data A folder that contains sample data that we can use for development. @@ -40,7 +39,8 @@ manage.py raw_to_pg --clean True ## How to load test Census data into postgres? -1. Download test Census data from https://drive.google.com/drive/folders/1TY-7yWsMd8DsVEXvwrEe_oWW1iR2sGoy into census_to_gsafac/data folder. +1. Download test Census data from https://drive.google.com/drive/folders/1TY-7yWsMd8DsVEXvwrEe_oWW1iR2sGoy into census_to_gsafac/data folder. +NOTE: Never checkin the census_to_gsafac/data folder into github. 2. In the FAC/backend folder, run the following to load csv files from census_to_gsafac/data folder into fac-c2g-s3 bucket. ```bash From 2efada791c1403d335b0fc778c64904ee0bb3f17 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 15:29:17 -0700 Subject: [PATCH 21/61] Added c2g-db --- backend/config/settings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/config/settings.py b/backend/config/settings.py index d09ab72e72..b28eb01f21 100644 --- a/backend/config/settings.py +++ b/backend/config/settings.py @@ -170,6 +170,9 @@ DATABASES = { "default": env.dj_db_url( "DATABASE_URL", default="postgres://postgres:password@0.0.0.0/backend" + ), + "c2g-db": env.dj_db_url( + "DATABASE_URL_C2G_DB", default="postgres://postgres:password@0.0.0.0/c2g-db" ), } From 9532f1e416c3b214ca2f93ea6645727791017f4c Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Fri, 3 Nov 2023 15:29:45 -0700 Subject: [PATCH 22/61] Added c2g-db --- backend/docker-compose.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml index 2255372bc1..50fc87cebd 100644 --- a/backend/docker-compose.yml +++ b/backend/docker-compose.yml @@ -18,6 +18,20 @@ services: timeout: 5s retries: 10 + 
c2g-db: + image: "postgres:12" + environment: + POSTGRES_HOST_AUTH_METHOD: trust + volumes: + - c2gdb-data:/var/lib/postgresql/data/ + ports: + - "5433:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -d postgres -U postgres -p 5433"] + interval: 10s + timeout: 5s + retries: 10 + #--------------------------------------------- # Historic data #--------------------------------------------- @@ -116,4 +130,5 @@ services: condition: service_healthy volumes: postgres-data: + c2gdb-data: minio-vol: From 984bff50475769da03cef89989f90ba6f385371d Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 10:50:41 -0800 Subject: [PATCH 23/61] Replaced c2g with census_to_gsafac, renamed raw_to_pg.py as csv_to_postgres.py --- .../management/commands/csv_to_postgres.py | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 backend/census_to_gsafac/management/commands/csv_to_postgres.py diff --git a/backend/census_to_gsafac/management/commands/csv_to_postgres.py b/backend/census_to_gsafac/management/commands/csv_to_postgres.py new file mode 100644 index 0000000000..3bdda41334 --- /dev/null +++ b/backend/census_to_gsafac/management/commands/csv_to_postgres.py @@ -0,0 +1,109 @@ +import logging +import boto3 +import csv + + +from django.core.management.base import BaseCommand +from django.conf import settings +from django.apps import apps + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + +census_to_gsafac_models = list(apps.get_app_config("census_to_gsafac").get_models()) +census_to_gsafac_model_names = [m._meta.model_name for m in census_to_gsafac_models] +s3_client = boto3.client( + "s3", + aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY, + endpoint_url=settings.AWS_S3_ENDPOINT_URL, +) +census_to_gsafac_bucket_name = settings.AWS_CENSUS_TO_GSAFAC_BUCKET_NAME +DELIMITER = "," + + +class Command(BaseCommand): + help = """ + Populate PG database from csv 
files + Usage: + manage.py raw_to_pg --folder --clean + """ + + def add_arguments(self, parser): + parser.add_argument("--folder", help="S3 folder name") + parser.add_argument("--clean") + parser.add_argument("--sample") + parser.add_argument("--load") + + def handle(self, *args, **options): + if options.get("clean") == "True": + self.delete_data() + return + if options.get("sample") == "True": + self.sample_data() + return + + folder = options.get("folder") + if not folder: + print("Please specify a folder name") + return + + items = s3_client.list_objects( + Bucket=census_to_gsafac_bucket_name, + Prefix=folder, + )["Contents"] + for item in items: + if item["Key"].endswith("/"): + continue + model_name = self.get_model_name(item["Key"]) + if model_name: + model_obj = census_to_gsafac_models[census_to_gsafac_model_names.index(model_name)] + response = s3_client.get_object(Bucket=census_to_gsafac_bucket_name, Key=item["Key"]) + print("Obtained response from S3") + lines = response["Body"].read().decode("utf-8").splitlines(True) + print("Loaded Body into 'lines'") + rows = [row for row in csv.DictReader(lines)] + print("Completed processing 'lines'") + self.load_table(model_obj, rows) + + for mdl in census_to_gsafac_models: + row_count = mdl.objects.all().count() + print(f"{row_count} in ", mdl) + + def delete_data(self): + for mdl in census_to_gsafac_models: + print("Deleting ", mdl) + mdl.objects.all().delete() + + def sample_data(self): + for mdl in census_to_gsafac_models: + print("Sampling ", mdl) + rows = mdl.objects.all()[:1] + for row in rows: + for col in mdl._meta.fields: + print(f"{col.name}: {getattr(row, col.name)}") + + def get_model_name(self, name): + print("Processing ", name) + file_name = name.split("/")[-1].split(".")[0] + for model_name in census_to_gsafac_model_names: + if file_name.lower().startswith(model_name): + print("model_name = ", model_name) + return model_name + print("Could not find a matching model for ", name) + return None + + def 
load_table(self, model_obj, rows): + print("Loading data for model_obj ", model_obj) + for i in range(0, len(rows)): + model_instance = model_obj() + + for column_name, value in rows[i].items(): + if column_name == "id": + continue + setattr(model_instance, column_name, value) + model_instance.save() + if i % 1000 == 0: + print(f"Loaded {i} of {len(rows)} rows to ", model_obj) + print(f"Loaded {len(rows)} rows to ", model_obj) From 5d45686f88dd8f4168ac81b3e33ce4732e50fb78 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 10:51:18 -0800 Subject: [PATCH 24/61] Replaced c2g with census_to_gsafac, renamed raw_to_pg.py as csv_to_postgres.py --- .../management/commands/raw_to_pg.py | 109 ------------------ 1 file changed, 109 deletions(-) delete mode 100644 backend/census_to_gsafac/management/commands/raw_to_pg.py diff --git a/backend/census_to_gsafac/management/commands/raw_to_pg.py b/backend/census_to_gsafac/management/commands/raw_to_pg.py deleted file mode 100644 index 03547b413e..0000000000 --- a/backend/census_to_gsafac/management/commands/raw_to_pg.py +++ /dev/null @@ -1,109 +0,0 @@ -import logging -import boto3 -import csv - - -from django.core.management.base import BaseCommand -from django.conf import settings -from django.apps import apps - - -logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) - -c2g_models = list(apps.get_app_config("census_to_gsafac").get_models()) -c2g_model_names = [m._meta.model_name for m in c2g_models] -s3_client = boto3.client( - "s3", - aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY, - endpoint_url=settings.AWS_S3_ENDPOINT_URL, -) -c2g_bucket_name = settings.AWS_C2G_BUCKET_NAME -DELIMITER = "," - - -class Command(BaseCommand): - help = """ - Populate PG database from csv files - Usage: - manage.py raw_to_pg --folder --clean - """ - - def add_arguments(self, parser): - parser.add_argument("--folder", help="S3 folder name") - 
parser.add_argument("--clean") - parser.add_argument("--sample") - parser.add_argument("--load") - - def handle(self, *args, **options): - if options.get("clean") == "True": - self.delete_data() - return - if options.get("sample") == "True": - self.sample_data() - return - - folder = options.get("folder") - if not folder: - print("Please specify a folder name") - return - - items = s3_client.list_objects( - Bucket=c2g_bucket_name, - Prefix=folder, - )["Contents"] - for item in items: - if item["Key"].endswith("/"): - continue - model_name = self.get_model_name(item["Key"]) - if model_name: - model_obj = c2g_models[c2g_model_names.index(model_name)] - response = s3_client.get_object(Bucket=c2g_bucket_name, Key=item["Key"]) - print("Obtained response from S3") - lines = response["Body"].read().decode("utf-8").splitlines(True) - print("Loaded Body into 'lines'") - rows = [row for row in csv.DictReader(lines)] - print("Completed processing 'lines'") - self.load_table(model_obj, rows) - - for mdl in c2g_models: - row_count = mdl.objects.all().count() - print(f"{row_count} in ", mdl) - - def delete_data(self): - for mdl in c2g_models: - print("Deleting ", mdl) - mdl.objects.all().delete() - - def sample_data(self): - for mdl in c2g_models: - print("Sampling ", mdl) - rows = mdl.objects.all()[:1] - for row in rows: - for col in mdl._meta.fields: - print(f"{col.name}: {getattr(row, col.name)}") - - def get_model_name(self, name): - print("Processing ", name) - file_name = name.split("/")[-1].split(".")[0] - for model_name in c2g_model_names: - if file_name.lower().startswith(model_name): - print("model_name = ", model_name) - return model_name - print("Could not find a matching model for ", name) - return None - - def load_table(self, model_obj, rows): - print("Loading data for model_obj ", model_obj) - for i in range(0, len(rows)): - model_instance = model_obj() - - for column_name, value in rows[i].items(): - if column_name == "id": - continue - setattr(model_instance, 
column_name, value) - model_instance.save() - if i % 1000 == 0: - print(f"Loaded {i} of {len(rows)} rows to ", model_obj) - print(f"Loaded {len(rows)} rows to ", model_obj) From f132d6463bb4b341568e88d289e47164a31287a8 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 10:51:39 -0800 Subject: [PATCH 25/61] Replaced c2g with census_to_gsafac, renamed raw_to_pg.py as csv_to_postgres.py --- backend/census_to_gsafac/routers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/census_to_gsafac/routers.py b/backend/census_to_gsafac/routers.py index 3270adf67b..4500e28dcf 100644 --- a/backend/census_to_gsafac/routers.py +++ b/backend/census_to_gsafac/routers.py @@ -1,5 +1,5 @@ app_name = "census_to_gsafac" -db_name = "c2g-db" +db_name = "census-to-gsafac-db" class DBRouter: From cf9435ba41215a91e8dfea0d0afdfa39e9d820d3 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 10:52:00 -0800 Subject: [PATCH 26/61] Replaced c2g with census_to_gsafac, renamed raw_to_pg.py as csv_to_postgres.py --- backend/config/settings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/config/settings.py b/backend/config/settings.py index 126f74cc93..ca5bedcd4f 100644 --- a/backend/config/settings.py +++ b/backend/config/settings.py @@ -172,8 +172,8 @@ "default": env.dj_db_url( "DATABASE_URL", default="postgres://postgres:password@0.0.0.0/backend" ), - "c2g-db": env.dj_db_url( - "DATABASE_URL_C2G_DB", default="postgres://postgres:password@0.0.0.0/c2g-db" + "census-to-gsafac-db": env.dj_db_url( + "DATABASE_URL_CENSUS_TO_GSAFAC_DB", default="postgres://postgres:password@0.0.0.0/census-to-gsafac-db" ), } @@ -248,8 +248,8 @@ # Private bucket AWS_PRIVATE_STORAGE_BUCKET_NAME = "gsa-fac-private-s3" - # Private C2g bucket - AWS_C2G_BUCKET_NAME = "fac-c2g-s3" + # Private CENSUS_TO_GSAFAC bucket + AWS_CENSUS_TO_GSAFAC_BUCKET_NAME = "fac-census-to-gsafac-s3" AWS_S3_PRIVATE_REGION_NAME = os.environ.get( 
"AWS_S3_PRIVATE_REGION_NAME", "us-east-1" From 33431242c3774cb44ee4480573eea471b2b41045 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 10:52:18 -0800 Subject: [PATCH 27/61] Replaced c2g with census_to_gsafac, renamed raw_to_pg.py as csv_to_postgres.py --- backend/docker-compose.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml index 20fa04eb2b..b66375f91c 100644 --- a/backend/docker-compose.yml +++ b/backend/docker-compose.yml @@ -18,12 +18,12 @@ services: timeout: 5s retries: 10 - c2g-db: + census-to-gsafac-db: image: "postgres:12" environment: POSTGRES_HOST_AUTH_METHOD: trust volumes: - - c2gdb-data:/var/lib/postgresql/data/ + - census-to-gsafac-data:/var/lib/postgresql/data/ ports: - "5433:5432" healthcheck: @@ -130,5 +130,5 @@ services: condition: service_healthy volumes: postgres-data: - c2gdb-data: + census-to-gsafac-data: minio-vol: From 5e3e74411ac2ec30969797a012c868eae4abc527 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 10:52:30 -0800 Subject: [PATCH 28/61] Replaced c2g with census_to_gsafac, renamed raw_to_pg.py as csv_to_postgres.py --- backend/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/run.sh b/backend/run.sh index ddf16b769a..6e2b3e0911 100755 --- a/backend/run.sh +++ b/backend/run.sh @@ -14,7 +14,7 @@ if [[ "${ENV}" == "LOCAL" || "${ENV}" == "TESTING" ]]; then export AWS_S3_PRIVATE_ENDPOINT="http://minio:9000" mc alias set myminio "${AWS_S3_PRIVATE_ENDPOINT}" minioadmin minioadmin mc mb myminio/gsa-fac-private-s3 - mc mb myminio/fac-c2g-s3 + mc mb myminio/fac-census-to-gsafac-s3 mc admin user svcacct add --access-key="${AWS_PRIVATE_ACCESS_KEY_ID}" --secret-key="${AWS_PRIVATE_SECRET_ACCESS_KEY}" myminio minioadmin fi; From 2d19a4687a1eefb65e85c71be43c3578805655cc Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 11:00:50 -0800 Subject: [PATCH 29/61] Replaced c2g with 
census_to_gsafac, renamed raw_to_pg.py as csv_to_postgres.py --- .../census_to_gsafac/management/commands/csv_to_postgres.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/census_to_gsafac/management/commands/csv_to_postgres.py b/backend/census_to_gsafac/management/commands/csv_to_postgres.py index 3bdda41334..3de2bb646d 100644 --- a/backend/census_to_gsafac/management/commands/csv_to_postgres.py +++ b/backend/census_to_gsafac/management/commands/csv_to_postgres.py @@ -25,9 +25,9 @@ class Command(BaseCommand): help = """ - Populate PG database from csv files + Populate Postgres database from csv files Usage: - manage.py raw_to_pg --folder --clean + manage.py csv_to_postgres --folder --clean """ def add_arguments(self, parser): From 047a4514043d9b29bf24c2dd681c46096e802f5d Mon Sep 17 00:00:00 2001 From: Purvin Patel <146017183+purvinptl@users.noreply.github.com> Date: Mon, 6 Nov 2023 14:13:57 -0600 Subject: [PATCH 30/61] Apply suggestions from code review Co-authored-by: Hassan D. M. Sambo --- backend/census_to_gsafac/README.md | 38 ++++++++++++++---------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/backend/census_to_gsafac/README.md b/backend/census_to_gsafac/README.md index 6e53ab1bc4..819bcca67d 100644 --- a/backend/census_to_gsafac/README.md +++ b/backend/census_to_gsafac/README.md @@ -2,52 +2,50 @@ ## Overview -This is implemented as a django app to leverage existing management commands and settings. It has python and shell scripts to +This is implemented as a Django app to leverage existing management commands and settings. 
It includes Python and shell scripts to: -* load raw census data as csv files into an S3 bucket -* create postgres tables from these csv files -* perform any data clean up required to create a table from a csv file +* Load raw census data as CSV files into an S3 bucket +* Create Postgres tables from these CSV files +* Perform any data clean up required to create a table from a CSV file ## Infrastructure changes -* Create a new S3 bucket in cloud.gov spaces as well as in the local environment +* Create a new S3 bucket in Cloud.gov spaces as well as in the local environment * Create a new Postgres instance both in CG and locally ## Utilities -* fac_s3.py - Uploads folders or files to an s3 nucket. +* fac_s3.py - Uploads folders or files to an S3 bucket. ```bash -manage.py fac_s3 fac-c2g-s3 --upload --src census_to_gsafac/data -``` +python manage.py fac_s3 fac-c2g-s3 --upload --src census_to_gsafac/data -* raw_to_pg.py - Inserts data into PG tables using the contents of the csv files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). The name of the table is determined by examining the name of the file. The sample source files do not have delimters for empty fields at the end of a line - so we assume these are nulls. +* raw_to_pg.py - Inserts data into Postgres tables using the contents of the CSV files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). The name of the table is determined by examining the name of the file. The sample source files do not have delimters for empty fields at the end of a line - so we assume these are nulls. 
```bash -manage.py raw_to_pg --folder data -manage.py raw_to_pg --clean True -``` +python manage.py raw_to_pg --folder data +python manage.py raw_to_pg --clean True -* models.py These ought to correspond to the incoming csv files +* models.py These correspond to the incoming CSV files * routers.py This tells django to use a different postgres instance. * data A folder that contains sample data that we can use for development. -## Pre-requisites for +## Prerequisites -* A django app that reads the tables created here as unmanaged models and populates SF-SAC tables by creating workbooks, etc to simulate a real submission +* A Django app that reads the tables created here as unmanaged models and populates SF-SAC tables by creating workbooks, etc. to simulate a real submission -## How to load test Census data into postgres? +## How to load test Census data into Postgres 1. Download test Census data from https://drive.google.com/drive/folders/1TY-7yWsMd8DsVEXvwrEe_oWW1iR2sGoy into census_to_gsafac/data folder. -NOTE: Never checkin the census_to_gsafac/data folder into github. +NOTE: Never check in the census_to_gsafac/data folder into GitHub. -2. In the FAC/backend folder, run the following to load csv files from census_to_gsafac/data folder into fac-c2g-s3 bucket. +2. In the FAC/backend folder, run the following to load CSV files from census_to_gsafac/data folder into fac-c2g-s3 bucket. ```bash python manage.py fac_s3 fac-c2g-s3 --upload --src census_to_gsafac/data ``` -3. In the FAC/backend folder, run the following to read the csv files from fac-c2g-s3 bucket and load into postgres. +3. In the FAC/backend folder, run the following to read the CSV files from fac-c2g-s3 bucket and load into Postgres. 
```bash -manage.py raw_to_pg --folder data +python manage.py raw_to_pg --folder data ``` \ No newline at end of file From 19f633ff5c6172c9491f5a8cd999dd87342fb9e3 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 12:48:37 -0800 Subject: [PATCH 31/61] Added census-to-gsafac database --- backend/docker-compose-web.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/backend/docker-compose-web.yml b/backend/docker-compose-web.yml index 757281f19b..d8e89ffd35 100644 --- a/backend/docker-compose-web.yml +++ b/backend/docker-compose-web.yml @@ -16,6 +16,20 @@ services: timeout: 5s retries: 10 + census-to-gsafac-db: + image: "postgres:12" + environment: + POSTGRES_HOST_AUTH_METHOD: "trust" + volumes: + - census-to-gsafac-data:/var/lib/postgresql/data/ + ports: + - "5433:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -d postgres -U postgres -p 5433"] + interval: 10s + timeout: 5s + retries: 10 + web: image: ghcr.io/gsa-tts/fac/web-container:latest command: /src/run.sh @@ -83,4 +97,5 @@ services: volumes: postgres-data: + census-to-gsafac-data: minio-vol: From dce6318d9c1f2955bccc861a229ccbc9407d55c5 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 12:57:02 -0800 Subject: [PATCH 32/61] Replaced c2g with census-to-gsafac --- backend/census_to_gsafac/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/census_to_gsafac/README.md b/backend/census_to_gsafac/README.md index 819bcca67d..d4474894b1 100644 --- a/backend/census_to_gsafac/README.md +++ b/backend/census_to_gsafac/README.md @@ -18,13 +18,13 @@ This is implemented as a Django app to leverage existing management commands and * fac_s3.py - Uploads folders or files to an S3 bucket. 
```bash -python manage.py fac_s3 fac-c2g-s3 --upload --src census_to_gsafac/data +python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_to_gsafac/data * raw_to_pg.py - Inserts data into Postgres tables using the contents of the CSV files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). The name of the table is determined by examining the name of the file. The sample source files do not have delimters for empty fields at the end of a line - so we assume these are nulls. ```bash -python manage.py raw_to_pg --folder data -python manage.py raw_to_pg --clean True +python manage.py csv_to_postgres --folder data +python manage.py csv_to_postgres --clean True * models.py These correspond to the incoming CSV files * routers.py This tells django to use a different postgres instance. @@ -42,10 +42,10 @@ NOTE: Never check in the census_to_gsafac/data folder into GitHub. 2. In the FAC/backend folder, run the following to load CSV files from census_to_gsafac/data folder into fac-c2g-s3 bucket. ```bash -python manage.py fac_s3 fac-c2g-s3 --upload --src census_to_gsafac/data +python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_to_gsafac/data ``` 3. In the FAC/backend folder, run the following to read the CSV files from fac-c2g-s3 bucket and load into Postgres. 
```bash -python manage.py raw_to_pg --folder data +python manage.py csv_to_postgres --folder data ``` \ No newline at end of file From 294e5c5ddd176a6d5495b902fbadb5a6f5cd7494 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 13:30:14 -0800 Subject: [PATCH 33/61] Fix linting --- backend/config/settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/config/settings.py b/backend/config/settings.py index ca5bedcd4f..b970b93205 100644 --- a/backend/config/settings.py +++ b/backend/config/settings.py @@ -124,7 +124,7 @@ "dissemination", "census_historical_migration", "support", - "census_to_gsafac" + "census_to_gsafac", ] MIDDLEWARE = [ @@ -250,7 +250,7 @@ AWS_PRIVATE_STORAGE_BUCKET_NAME = "gsa-fac-private-s3" # Private CENSUS_TO_GSAFAC bucket AWS_CENSUS_TO_GSAFAC_BUCKET_NAME = "fac-census-to-gsafac-s3" - + AWS_S3_PRIVATE_REGION_NAME = os.environ.get( "AWS_S3_PRIVATE_REGION_NAME", "us-east-1" ) From cb940dc68d30a68672c429ef580e587020acda51 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 13:49:43 -0800 Subject: [PATCH 34/61] Fix linting --- backend/config/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/config/settings.py b/backend/config/settings.py index b970b93205..bb04d04534 100644 --- a/backend/config/settings.py +++ b/backend/config/settings.py @@ -171,7 +171,7 @@ DATABASES = { "default": env.dj_db_url( "DATABASE_URL", default="postgres://postgres:password@0.0.0.0/backend" - ), + ), "census-to-gsafac-db": env.dj_db_url( "DATABASE_URL_CENSUS_TO_GSAFAC_DB", default="postgres://postgres:password@0.0.0.0/census-to-gsafac-db" ), From 1db2f1e1518324fd2e3f54d2cf425e2bebb1e704 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 14:02:04 -0800 Subject: [PATCH 35/61] Fix linting --- backend/census_to_gsafac/admin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/census_to_gsafac/admin.py b/backend/census_to_gsafac/admin.py index 
8c38f3f3da..4185d360e9 100644 --- a/backend/census_to_gsafac/admin.py +++ b/backend/census_to_gsafac/admin.py @@ -1,3 +1,3 @@ -from django.contrib import admin +# from django.contrib import admin # Register your models here. From 5a47524c5aedc01e5709f3e3a7dc431fc83f4ede Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 14:02:26 -0800 Subject: [PATCH 36/61] Fix linting --- backend/census_to_gsafac/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/census_to_gsafac/tests.py b/backend/census_to_gsafac/tests.py index 7ce503c2dd..a79ca8be56 100644 --- a/backend/census_to_gsafac/tests.py +++ b/backend/census_to_gsafac/tests.py @@ -1,3 +1,3 @@ -from django.test import TestCase +# from django.test import TestCase # Create your tests here. From 6648f0a38b89276c197ae8d2505763af4cc10ca5 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 14:02:42 -0800 Subject: [PATCH 37/61] Fix linting --- backend/census_to_gsafac/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/census_to_gsafac/views.py b/backend/census_to_gsafac/views.py index 91ea44a218..fd0e044955 100644 --- a/backend/census_to_gsafac/views.py +++ b/backend/census_to_gsafac/views.py @@ -1,3 +1,3 @@ -from django.shortcuts import render +# from django.shortcuts import render # Create your views here. 
From e681affd169948b20411823a9cf5c571d87b8632 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 14:22:05 -0800 Subject: [PATCH 38/61] Reformatted with black --- .../management/commands/csv_to_postgres.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/backend/census_to_gsafac/management/commands/csv_to_postgres.py b/backend/census_to_gsafac/management/commands/csv_to_postgres.py index 3de2bb646d..5bd2daabf0 100644 --- a/backend/census_to_gsafac/management/commands/csv_to_postgres.py +++ b/backend/census_to_gsafac/management/commands/csv_to_postgres.py @@ -58,8 +58,12 @@ def handle(self, *args, **options): continue model_name = self.get_model_name(item["Key"]) if model_name: - model_obj = census_to_gsafac_models[census_to_gsafac_model_names.index(model_name)] - response = s3_client.get_object(Bucket=census_to_gsafac_bucket_name, Key=item["Key"]) + model_obj = census_to_gsafac_models[ + census_to_gsafac_model_names.index(model_name) + ] + response = s3_client.get_object( + Bucket=census_to_gsafac_bucket_name, Key=item["Key"] + ) print("Obtained response from S3") lines = response["Body"].read().decode("utf-8").splitlines(True) print("Loaded Body into 'lines'") From 67bb9a87f4ed1f67ea1e9a467125fcdc43d8c1ba Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 14:22:28 -0800 Subject: [PATCH 39/61] Reformatted with black --- backend/census_to_gsafac/apps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/census_to_gsafac/apps.py b/backend/census_to_gsafac/apps.py index 74305c65f9..47e93a1897 100644 --- a/backend/census_to_gsafac/apps.py +++ b/backend/census_to_gsafac/apps.py @@ -2,5 +2,5 @@ class CensusToGsafacConfig(AppConfig): - default_auto_field = 'django.db.models.BigAutoField' - name = 'census_to_gsafac' + default_auto_field = "django.db.models.BigAutoField" + name = "census_to_gsafac" From 89624c4bd57b8c5148379af49c84451d64f41731 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 
6 Nov 2023 14:22:49 -0800 Subject: [PATCH 40/61] Reformatted with black --- backend/config/settings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/config/settings.py b/backend/config/settings.py index bb04d04534..6f823fbaee 100644 --- a/backend/config/settings.py +++ b/backend/config/settings.py @@ -173,7 +173,8 @@ "DATABASE_URL", default="postgres://postgres:password@0.0.0.0/backend" ), "census-to-gsafac-db": env.dj_db_url( - "DATABASE_URL_CENSUS_TO_GSAFAC_DB", default="postgres://postgres:password@0.0.0.0/census-to-gsafac-db" + "DATABASE_URL_CENSUS_TO_GSAFAC_DB", + default="postgres://postgres:password@0.0.0.0/census-to-gsafac-db", ), } From e8b91e05da09969afabf47b87bd76fc56d9c264e Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 14:35:50 -0800 Subject: [PATCH 41/61] Updated S3 bucket name and filename --- backend/census_to_gsafac/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/census_to_gsafac/README.md b/backend/census_to_gsafac/README.md index d4474894b1..150273f4ee 100644 --- a/backend/census_to_gsafac/README.md +++ b/backend/census_to_gsafac/README.md @@ -20,7 +20,7 @@ This is implemented as a Django app to leverage existing management commands and ```bash python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_to_gsafac/data -* raw_to_pg.py - Inserts data into Postgres tables using the contents of the CSV files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). The name of the table is determined by examining the name of the file. The sample source files do not have delimters for empty fields at the end of a line - so we assume these are nulls. +* csv_to_postgres.py - Inserts data into Postgres tables using the contents of the CSV files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). 
The name of the table is determined by examining the name of the file. The sample source files do not have delimters for empty fields at the end of a line - so we assume these are nulls. ```bash python manage.py csv_to_postgres --folder data @@ -40,12 +40,12 @@ python manage.py csv_to_postgres --clean True 1. Download test Census data from https://drive.google.com/drive/folders/1TY-7yWsMd8DsVEXvwrEe_oWW1iR2sGoy into census_to_gsafac/data folder. NOTE: Never check in the census_to_gsafac/data folder into GitHub. -2. In the FAC/backend folder, run the following to load CSV files from census_to_gsafac/data folder into fac-c2g-s3 bucket. +2. In the FAC/backend folder, run the following to load CSV files from census_to_gsafac/data folder into fac-census-to-gsafac-s3 bucket. ```bash python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_to_gsafac/data ``` -3. In the FAC/backend folder, run the following to read the CSV files from fac-c2g-s3 bucket and load into Postgres. +3. In the FAC/backend folder, run the following to read the CSV files from fac-census-to-gsafac-s3 bucket and load into Postgres. ```bash python manage.py csv_to_postgres --folder data ``` \ No newline at end of file From d96004f472198df6324917f0c612bc8c714d53e2 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 14:38:28 -0800 Subject: [PATCH 42/61] Updated S3 bucket name and filename --- backend/census_to_gsafac/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/census_to_gsafac/README.md b/backend/census_to_gsafac/README.md index 150273f4ee..dd79631e58 100644 --- a/backend/census_to_gsafac/README.md +++ b/backend/census_to_gsafac/README.md @@ -19,12 +19,14 @@ This is implemented as a Django app to leverage existing management commands and ```bash python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_to_gsafac/data +``` * csv_to_postgres.py - Inserts data into Postgres tables using the contents of the CSV files in the S3 bucket. 
The first row of each file is assumed to have the column names (we convert to lowercase). The name of the table is determined by examining the name of the file. The sample source files do not have delimters for empty fields at the end of a line - so we assume these are nulls. ```bash python manage.py csv_to_postgres --folder data python manage.py csv_to_postgres --clean True +``` * models.py These correspond to the incoming CSV files * routers.py This tells django to use a different postgres instance. From 2e28f1405ae3577306b94fe507ba9d2ca2081826 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Mon, 6 Nov 2023 16:28:27 -0800 Subject: [PATCH 43/61] Updates --- backend/census_to_gsafac/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/census_to_gsafac/README.md b/backend/census_to_gsafac/README.md index dd79631e58..fea16fc8a8 100644 --- a/backend/census_to_gsafac/README.md +++ b/backend/census_to_gsafac/README.md @@ -44,10 +44,10 @@ NOTE: Never check in the census_to_gsafac/data folder into GitHub. 2. In the FAC/backend folder, run the following to load CSV files from census_to_gsafac/data folder into fac-census-to-gsafac-s3 bucket. ```bash -python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_to_gsafac/data +docker compose run web python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_to_gsafac/data ``` 3. In the FAC/backend folder, run the following to read the CSV files from fac-census-to-gsafac-s3 bucket and load into Postgres. ```bash -python manage.py csv_to_postgres --folder data +docker compose run web python manage.py csv_to_postgres --folder data ``` \ No newline at end of file From 9b0beb446d57450a004fd205a79b0e2933d2660c Mon Sep 17 00:00:00 2001 From: "Hassan D. M. 
Sambo" Date: Tue, 7 Nov 2023 13:56:47 -0500 Subject: [PATCH 44/61] Consolidated census_to_gsafac and census_historical_migration apps --- backend/census_historical_migration/README.md | 58 ++- .../management/commands/csv_to_postgres.py | 2 +- .../management/commands/fac_s3.py | 0 backend/census_historical_migration/models.py | 448 +++++++++++++++++- .../routers.py | 0 backend/census_to_gsafac/README.md | 53 --- backend/census_to_gsafac/__init__.py | 0 backend/census_to_gsafac/admin.py | 3 - backend/census_to_gsafac/apps.py | 6 - .../migrations/0001_initial.py | 364 -------------- .../census_to_gsafac/migrations/__init__.py | 0 backend/census_to_gsafac/models.py | 445 ----------------- backend/census_to_gsafac/tests.py | 3 - backend/census_to_gsafac/views.py | 3 - 14 files changed, 502 insertions(+), 883 deletions(-) rename backend/{census_to_gsafac => census_historical_migration}/management/commands/csv_to_postgres.py (97%) rename backend/{census_to_gsafac => census_historical_migration}/management/commands/fac_s3.py (100%) rename backend/{census_to_gsafac => census_historical_migration}/routers.py (100%) delete mode 100644 backend/census_to_gsafac/README.md delete mode 100644 backend/census_to_gsafac/__init__.py delete mode 100644 backend/census_to_gsafac/admin.py delete mode 100644 backend/census_to_gsafac/apps.py delete mode 100644 backend/census_to_gsafac/migrations/0001_initial.py delete mode 100644 backend/census_to_gsafac/migrations/__init__.py delete mode 100644 backend/census_to_gsafac/models.py delete mode 100644 backend/census_to_gsafac/tests.py delete mode 100644 backend/census_to_gsafac/views.py diff --git a/backend/census_historical_migration/README.md b/backend/census_historical_migration/README.md index 209400fa09..d9fbd0f5ee 100644 --- a/backend/census_historical_migration/README.md +++ b/backend/census_historical_migration/README.md @@ -1,4 +1,58 @@ -# Census Historical Migration +# Census to FAC data migration + +## Overview + +This is 
implemented as a Django app to leverage existing management commands and settings. It includes Python and shell scripts to: + +* Load raw census data as CSV files into an S3 bucket +* Create Postgres tables from these CSV files +* Perform any data clean up required to create a table from a CSV file +* Run the historic data migrator +* Run the historic workbook generator + +## Infrastructure changes + +* Create a new S3 bucket in Cloud.gov spaces as well as in the local environment +* Create a new Postgres instance both in CG and locally + +## Utilities + +* fac_s3.py - Uploads folders or files to an S3 bucket. + +```bash +python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_historical_migration/data +``` + +* csv_to_postgres.py - Inserts data into Postgres tables using the contents of the CSV files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). The name of the table is determined by examining the name of the file. The sample source files do not have delimters for empty fields at the end of a line - so we assume these are nulls. + +```bash +python manage.py csv_to_postgres --folder data +python manage.py csv_to_postgres --clean True +``` + +* models.py These correspond to the incoming CSV files +* routers.py This tells django to use a different postgres instance. + +* data A folder that contains sample data that we can use for development. + +## Prerequisites + +* A Django app that reads the tables created here as unmanaged models and populates SF-SAC tables by creating workbooks, etc. to simulate a real submission + +## How to load test Census data into Postgres + +1. Download test Census data from https://drive.google.com/drive/folders/1TY-7yWsMd8DsVEXvwrEe_oWW1iR2sGoy into census_historical_migration/data folder. +NOTE: Never check in the census_historical_migration/data folder into GitHub. + +2. 
In the FAC/backend folder, run the following to load CSV files from census_historical_migration/data folder into fac-census-to-gsafac-s3 bucket. +```bash +docker compose run web python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_historical_migration/data +``` + +3. In the FAC/backend folder, run the following to read the CSV files from fac-census-to-gsafac-s3 bucket and load into Postgres. +```bash +docker compose run web python manage.py csv_to_postgres --folder data +``` ### How to run the historic data migrator: ``` @@ -17,4 +71,4 @@ docker compose run web python manage.py historic_workbook_generator --dbkey 100010 ``` - `year` is optional and defaults to `22`. -- The `output` directory will be created if it doesn't already exist. +- The `output` directory will be created if it doesn't already exist. \ No newline at end of file diff --git a/backend/census_to_gsafac/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py similarity index 97% rename from backend/census_to_gsafac/management/commands/csv_to_postgres.py rename to backend/census_historical_migration/management/commands/csv_to_postgres.py index 5bd2daabf0..a9d8fb40fe 100644 --- a/backend/census_to_gsafac/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) -census_to_gsafac_models = list(apps.get_app_config("census_to_gsafac").get_models()) +census_to_gsafac_models = list(apps.get_app_config("census_historical_migration").get_models()) census_to_gsafac_model_names = [m._meta.model_name for m in census_to_gsafac_models] s3_client = boto3.client( "s3", diff --git a/backend/census_to_gsafac/management/commands/fac_s3.py b/backend/census_historical_migration/management/commands/fac_s3.py similarity index 100% rename from backend/census_to_gsafac/management/commands/fac_s3.py 
rename to backend/census_historical_migration/management/commands/fac_s3.py diff --git a/backend/census_historical_migration/models.py b/backend/census_historical_migration/models.py index af3844168d..503a9e027f 100644 --- a/backend/census_historical_migration/models.py +++ b/backend/census_historical_migration/models.py @@ -1,3 +1,445 @@ -from django.db import models # noqa: F401 - -# Create your models here. +from django.db import models + + +class ELECAUDITHEADER(models.Model): + ELECAUDITHEADERID = models.TextField(blank=True, null=True) + + ID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + FYENDDATE = models.TextField(blank=True, null=True) + + AUDITTYPE = models.TextField(blank=True, null=True) + + PERIODCOVERED = models.TextField(blank=True, null=True) + + NUMBERMONTHS = models.TextField(blank=True, null=True) + + MULTIPLEEINS = models.TextField(blank=True, null=True) + + EIN = models.TextField(blank=True, null=True) + + EINSUBCODE = models.TextField(blank=True, null=True) + + MULTIPLEDUNS = models.TextField(blank=True, null=True) + + DUNS = models.TextField(blank=True, null=True) + + AUDITEENAME = models.TextField(blank=True, null=True) + + STREET1 = models.TextField(blank=True, null=True) + + STREET2 = models.TextField(blank=True, null=True) + + CITY = models.TextField(blank=True, null=True) + + STATE = models.TextField(blank=True, null=True) + + ZIPCODE = models.TextField(blank=True, null=True) + + AUDITEECONTACT = models.TextField(blank=True, null=True) + + AUDITEETITLE = models.TextField(blank=True, null=True) + + AUDITEEPHONE = models.TextField(blank=True, null=True) + + AUDITEEFAX = models.TextField(blank=True, null=True) + + AUDITEEEMAIL = models.TextField(blank=True, null=True) + + AUDITEEDATESIGNED = models.TextField(blank=True, null=True) + + AUDITEENAMETITLE = models.TextField(blank=True, null=True) + + CPAFIRMNAME = 
models.TextField(blank=True, null=True) + + CPASTREET1 = models.TextField(blank=True, null=True) + + CPASTREET2 = models.TextField(blank=True, null=True) + + CPACITY = models.TextField(blank=True, null=True) + + CPASTATE = models.TextField(blank=True, null=True) + + CPAZIPCODE = models.TextField(blank=True, null=True) + + CPACONTACT = models.TextField(blank=True, null=True) + + CPATITLE = models.TextField(blank=True, null=True) + + CPAPHONE = models.TextField(blank=True, null=True) + + CPAFAX = models.TextField(blank=True, null=True) + + CPAEMAIL = models.TextField(blank=True, null=True) + + CPADATESIGNED = models.TextField(blank=True, null=True) + + CPANAMETITLE = models.TextField(blank=True, null=True) + + COG_OVER = models.TextField(blank=True, null=True) + + COGAGENCY = models.TextField(blank=True, null=True) + + TYPEREPORT_FS = models.TextField(blank=True, null=True) + + REPORTABLECONDITION = models.TextField(blank=True, null=True) + + MATERIALWEAKNESS = models.TextField(blank=True, null=True) + + MATERIALNONCOMPLIANCE = models.TextField(blank=True, null=True) + + GOINGCONCERN = models.TextField(blank=True, null=True) + + TYPEREPORT_MP = models.TextField(blank=True, null=True) + + DOLLARTHRESHOLD = models.TextField(blank=True, null=True) + + LOWRISK = models.TextField(blank=True, null=True) + + REPORTREQUIRED = models.TextField(blank=True, null=True) + + TOTFEDEXPEND = models.TextField(blank=True, null=True) + + COPIES = models.TextField(blank=True, null=True) + + REPORTABLECONDITION_MP = models.TextField(blank=True, null=True) + + MATERIALWEAKNESS_MP = models.TextField(blank=True, null=True) + + QCOSTS = models.TextField(blank=True, null=True) + + CYFINDINGS = models.TextField(blank=True, null=True) + + PYSCHEDULE = models.TextField(blank=True, null=True) + + DUP_REPORTS = models.TextField(blank=True, null=True) + + COG_AGENCY = models.TextField(blank=True, null=True) + + OVERSIGHTAGENCY = models.TextField(blank=True, null=True) + + DATERECEIVED = 
models.TextField(blank=True, null=True) + + DATEFIREWALL = models.TextField(blank=True, null=True) + + PREVIOUSDATEFIREWALL = models.TextField(blank=True, null=True) + + FINDINGREFNUM = models.TextField(blank=True, null=True) + + TYPEOFENTITY = models.TextField(blank=True, null=True) + + IMAGE = models.TextField(blank=True, null=True) + + AGENCYCFDA = models.TextField(blank=True, null=True) + + INITIALDATE = models.TextField(blank=True, null=True) + + DATERECEIVEDOTHER = models.TextField(blank=True, null=True) + + MULTIPLE_CPAS = models.TextField(blank=True, null=True) + + AUDITEECERTIFYNAME = models.TextField(blank=True, null=True) + + AUDITEECERTIFYTITLE = models.TextField(blank=True, null=True) + + FACACCEPTEDDATE = models.TextField(blank=True, null=True) + + AUDITOR_EIN = models.TextField(blank=True, null=True) + + SD_MATERIALWEAKNESS = models.TextField(blank=True, null=True) + + SD_MATERIALWEAKNESS_MP = models.TextField(blank=True, null=True) + + SIGNIFICANTDEFICIENCY = models.TextField(blank=True, null=True) + + SIGNIFICANTDEFICIENCY_MP = models.TextField(blank=True, null=True) + + SP_FRAMEWORK = models.TextField(blank=True, null=True) + + SP_FRAMEWORK_REQUIRED = models.TextField(blank=True, null=True) + + TYPEREPORT_SP_FRAMEWORK = models.TextField(blank=True, null=True) + + SUPPRESSION_CODE = models.TextField(blank=True, null=True) + + ENTITY_TYPE = models.TextField(blank=True, null=True) + + TYPEAUDIT_CODE = models.TextField(blank=True, null=True) + + OPEID = models.TextField(blank=True, null=True) + + DATETOED = models.TextField(blank=True, null=True) + + DATEFINISHED = models.TextField(blank=True, null=True) + + TYPEFINDING = models.TextField(blank=True, null=True) + + TYPEFUNDING = models.TextField(blank=True, null=True) + + FYSTARTDATE = models.TextField(blank=True, null=True) + + CPAFOREIGN = models.TextField(blank=True, null=True) + + UEI = models.TextField(blank=True, null=True) + + MULTIPLEUEIS = models.TextField(blank=True, null=True) + + 
CPACOUNTRY = models.TextField(blank=True, null=True) + + +class ELECEINS(models.Model): + ID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + EIN = models.TextField(blank=True, null=True) + + EINSEQNUM = models.TextField(blank=True, null=True) + + DUNS = models.TextField(blank=True, null=True) + + DUNSEQNUM = models.TextField(blank=True, null=True) + + +class ELECAUDITFINDINGS(models.Model): + ELECAUDITFINDINGSID = models.TextField(blank=True, null=True) + + ELECAUDITSID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + REPORTID = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + QCOSTS = models.TextField(blank=True, null=True) + + OTHERFINDINGS = models.TextField(blank=True, null=True) + + SIGNIFICANTDEFICIENCY = models.TextField(blank=True, null=True) + + MATERIALWEAKNESS = models.TextField(blank=True, null=True) + + OTHERNONCOMPLIANCE = models.TextField(blank=True, null=True) + + TYPEREQUIREMENT = models.TextField(blank=True, null=True) + + FINDINGREFNUMS = models.TextField(blank=True, null=True) + + MODIFIEDOPINION = models.TextField(blank=True, null=True) + + REPEATFINDING = models.TextField(blank=True, null=True) + + PRIORFINDINGREFNUMS = models.TextField(blank=True, null=True) + + +class ELECNOTES(models.Model): + ID = models.TextField(blank=True, null=True) + + REPORTID = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + SEQ_NUMBER = models.TextField(blank=True, null=True) + + TYPE_ID = models.TextField(blank=True, null=True) + + NOTE_INDEX = models.TextField(blank=True, null=True) + + TITLE = models.TextField(blank=True, null=True) + + CONTENT = 
models.TextField(blank=True, null=True) + + UEI = models.TextField(blank=True, null=True) + + MULTIPLEUEIS = models.TextField(blank=True, null=True) + + +class ELECFINDINGSTEXT(models.Model): + SEQ_NUMBER = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + FINDINGREFNUMS = models.TextField(blank=True, null=True) + + TEXT = models.TextField(blank=True, null=True) + + CHARTSTABLES = models.TextField(blank=True, null=True) + + REPORTID = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + UEI = models.TextField(blank=True, null=True) + + MULTIPLEUEIS = models.TextField(blank=True, null=True) + + +class ELECCPAS(models.Model): + ID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + SEQNUM = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + CPAFIRMNAME = models.TextField(blank=True, null=True) + + CPASTREET1 = models.TextField(blank=True, null=True) + + CPACITY = models.TextField(blank=True, null=True) + + CPASTATE = models.TextField(blank=True, null=True) + + CPAZIPCODE = models.TextField(blank=True, null=True) + + CPACONTACT = models.TextField(blank=True, null=True) + + CPATITLE = models.TextField(blank=True, null=True) + + CPAPHONE = models.TextField(blank=True, null=True) + + CPAFAX = models.TextField(blank=True, null=True) + + CPAEMAIL = models.TextField(blank=True, null=True) + + CPAEIN = models.TextField(blank=True, null=True) + + +class ELECAUDITS(models.Model): + ELECAUDITSID = models.TextField(blank=True, null=True) + + ID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + CFDASEQNUM = models.TextField(blank=True, null=True) + + CFDA = models.TextField(blank=True, 
null=True) + + FEDERALPROGRAMNAME = models.TextField(blank=True, null=True) + + AMOUNT = models.TextField(blank=True, null=True) + + MAJORPROGRAM = models.TextField(blank=True, null=True) + + TYPEREQUIREMENT = models.TextField(blank=True, null=True) + + QCOSTS2 = models.TextField(blank=True, null=True) + + FINDINGS = models.TextField(blank=True, null=True) + + FINDINGREFNUMS = models.TextField(blank=True, null=True) + + RD = models.TextField(blank=True, null=True) + + DIRECT = models.TextField(blank=True, null=True) + + CFDA_PREFIX = models.TextField(blank=True, null=True) + + CFDA_EXT = models.TextField(blank=True, null=True) + + EIN = models.TextField(blank=True, null=True) + + CFDA2 = models.TextField(blank=True, null=True) + + TYPEREPORT_MP = models.TextField(blank=True, null=True) + + TYPEREPORT_MP_OVERRIDE = models.TextField(blank=True, null=True) + + ARRA = models.TextField(blank=True, null=True) + + LOANS = models.TextField(blank=True, null=True) + + FINDINGSCOUNT = models.TextField(blank=True, null=True) + + LOANBALANCE = models.TextField(blank=True, null=True) + + PASSTHROUGHAMOUNT = models.TextField(blank=True, null=True) + + AWARDIDENTIFICATION = models.TextField(blank=True, null=True) + + CLUSTERNAME = models.TextField(blank=True, null=True) + + PASSTHROUGHAWARD = models.TextField(blank=True, null=True) + + STATECLUSTERNAME = models.TextField(blank=True, null=True) + + PROGRAMTOTAL = models.TextField(blank=True, null=True) + + CLUSTERTOTAL = models.TextField(blank=True, null=True) + + OTHERCLUSTERNAME = models.TextField(blank=True, null=True) + + CFDAPROGRAMNAME = models.TextField(blank=True, null=True) + + +class ELECPASSTHROUGH(models.Model): + ID = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + ELECAUDITSID = models.TextField(blank=True, null=True) + + PASSTHROUGHNAME = models.TextField(blank=True, null=True) + + PASSTHROUGHID = 
models.TextField(blank=True, null=True) + + +class ELECUEIS(models.Model): + UEISID = models.TextField(blank=True, null=True) + + REPORTID = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + UEI = models.TextField(blank=True, null=True) + + SEQNUM = models.TextField(blank=True, null=True) + + +class ELECCAPTEXT(models.Model): + SEQ_NUMBER = models.TextField(blank=True, null=True) + + DBKEY = models.TextField(blank=True, null=True) + + AUDITYEAR = models.TextField(blank=True, null=True) + + FINDINGREFNUMS = models.TextField(blank=True, null=True) + + TEXT = models.TextField(blank=True, null=True) + + CHARTSTABLES = models.TextField(blank=True, null=True) + + REPORTID = models.TextField(blank=True, null=True) + + VERSION = models.TextField(blank=True, null=True) + + UEI = models.TextField(blank=True, null=True) + + MULTIPLEUEIS = models.TextField(blank=True, null=True) diff --git a/backend/census_to_gsafac/routers.py b/backend/census_historical_migration/routers.py similarity index 100% rename from backend/census_to_gsafac/routers.py rename to backend/census_historical_migration/routers.py diff --git a/backend/census_to_gsafac/README.md b/backend/census_to_gsafac/README.md deleted file mode 100644 index fea16fc8a8..0000000000 --- a/backend/census_to_gsafac/README.md +++ /dev/null @@ -1,53 +0,0 @@ -# Census to FAC data migration - -## Overview - -This is implemented as a Django app to leverage existing management commands and settings. 
It includes Python and shell scripts to: - -* Load raw census data as CSV files into an S3 bucket -* Create Postgres tables from these CSV files -* Perform any data clean up required to create a table from a CSV file - -## Infrastructure changes - -* Create a new S3 bucket in Cloud.gov spaces as well as in the local environment -* Create a new Postgres instance both in CG and locally - -## Utilities - -* fac_s3.py - Uploads folders or files to an S3 bucket. - -```bash -python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_to_gsafac/data -``` - -* csv_to_postgres.py - Inserts data into Postgres tables using the contents of the CSV files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). The name of the table is determined by examining the name of the file. The sample source files do not have delimters for empty fields at the end of a line - so we assume these are nulls. - -```bash -python manage.py csv_to_postgres --folder data -python manage.py csv_to_postgres --clean True -``` - -* models.py These correspond to the incoming CSV files -* routers.py This tells django to use a different postgres instance. - -* data A folder that contains sample data that we can use for development. - -## Prerequisites - -* A Django app that reads the tables created here as unmanaged models and populates SF-SAC tables by creating workbooks, etc. to simulate a real submission - -## How to load test Census data into Postgres - -1. Download test Census data from https://drive.google.com/drive/folders/1TY-7yWsMd8DsVEXvwrEe_oWW1iR2sGoy into census_to_gsafac/data folder. -NOTE: Never check in the census_to_gsafac/data folder into GitHub. - -2. In the FAC/backend folder, run the following to load CSV files from census_to_gsafac/data folder into fac-census-to-gsafac-s3 bucket. -```bash -docker compose run web python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_to_gsafac/data -``` - -3. 
In the FAC/backend folder, run the following to read the CSV files from fac-census-to-gsafac-s3 bucket and load into Postgres. -```bash -docker compose run web python manage.py csv_to_postgres --folder data -``` \ No newline at end of file diff --git a/backend/census_to_gsafac/__init__.py b/backend/census_to_gsafac/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/backend/census_to_gsafac/admin.py b/backend/census_to_gsafac/admin.py deleted file mode 100644 index 4185d360e9..0000000000 --- a/backend/census_to_gsafac/admin.py +++ /dev/null @@ -1,3 +0,0 @@ -# from django.contrib import admin - -# Register your models here. diff --git a/backend/census_to_gsafac/apps.py b/backend/census_to_gsafac/apps.py deleted file mode 100644 index 47e93a1897..0000000000 --- a/backend/census_to_gsafac/apps.py +++ /dev/null @@ -1,6 +0,0 @@ -from django.apps import AppConfig - - -class CensusToGsafacConfig(AppConfig): - default_auto_field = "django.db.models.BigAutoField" - name = "census_to_gsafac" diff --git a/backend/census_to_gsafac/migrations/0001_initial.py b/backend/census_to_gsafac/migrations/0001_initial.py deleted file mode 100644 index c7393cb835..0000000000 --- a/backend/census_to_gsafac/migrations/0001_initial.py +++ /dev/null @@ -1,364 +0,0 @@ -# Generated by Django 4.2.6 on 2023-11-03 17:38 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - initial = True - - dependencies = [] - - operations = [ - migrations.CreateModel( - name="ELECAUDITFINDINGS", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("ELECAUDITFINDINGSID", models.TextField(blank=True, null=True)), - ("ELECAUDITSID", models.TextField(blank=True, null=True)), - ("AUDITYEAR", models.TextField(blank=True, null=True)), - ("DBKEY", models.TextField(blank=True, null=True)), - ("REPORTID", models.TextField(blank=True, null=True)), - ("VERSION", 
models.TextField(blank=True, null=True)), - ("QCOSTS", models.TextField(blank=True, null=True)), - ("OTHERFINDINGS", models.TextField(blank=True, null=True)), - ("SIGNIFICANTDEFICIENCY", models.TextField(blank=True, null=True)), - ("MATERIALWEAKNESS", models.TextField(blank=True, null=True)), - ("OTHERNONCOMPLIANCE", models.TextField(blank=True, null=True)), - ("TYPEREQUIREMENT", models.TextField(blank=True, null=True)), - ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), - ("MODIFIEDOPINION", models.TextField(blank=True, null=True)), - ("REPEATFINDING", models.TextField(blank=True, null=True)), - ("PRIORFINDINGREFNUMS", models.TextField(blank=True, null=True)), - ], - ), - migrations.CreateModel( - name="ELECAUDITHEADER", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("ELECAUDITHEADERID", models.TextField(blank=True, null=True)), - ("ID", models.TextField(blank=True, null=True)), - ("AUDITYEAR", models.TextField(blank=True, null=True)), - ("DBKEY", models.TextField(blank=True, null=True)), - ("FYENDDATE", models.TextField(blank=True, null=True)), - ("AUDITTYPE", models.TextField(blank=True, null=True)), - ("PERIODCOVERED", models.TextField(blank=True, null=True)), - ("NUMBERMONTHS", models.TextField(blank=True, null=True)), - ("MULTIPLEEINS", models.TextField(blank=True, null=True)), - ("EIN", models.TextField(blank=True, null=True)), - ("EINSUBCODE", models.TextField(blank=True, null=True)), - ("MULTIPLEDUNS", models.TextField(blank=True, null=True)), - ("DUNS", models.TextField(blank=True, null=True)), - ("AUDITEENAME", models.TextField(blank=True, null=True)), - ("STREET1", models.TextField(blank=True, null=True)), - ("STREET2", models.TextField(blank=True, null=True)), - ("CITY", models.TextField(blank=True, null=True)), - ("STATE", models.TextField(blank=True, null=True)), - ("ZIPCODE", models.TextField(blank=True, null=True)), - ("AUDITEECONTACT", 
models.TextField(blank=True, null=True)), - ("AUDITEETITLE", models.TextField(blank=True, null=True)), - ("AUDITEEPHONE", models.TextField(blank=True, null=True)), - ("AUDITEEFAX", models.TextField(blank=True, null=True)), - ("AUDITEEEMAIL", models.TextField(blank=True, null=True)), - ("AUDITEEDATESIGNED", models.TextField(blank=True, null=True)), - ("AUDITEENAMETITLE", models.TextField(blank=True, null=True)), - ("CPAFIRMNAME", models.TextField(blank=True, null=True)), - ("CPASTREET1", models.TextField(blank=True, null=True)), - ("CPASTREET2", models.TextField(blank=True, null=True)), - ("CPACITY", models.TextField(blank=True, null=True)), - ("CPASTATE", models.TextField(blank=True, null=True)), - ("CPAZIPCODE", models.TextField(blank=True, null=True)), - ("CPACONTACT", models.TextField(blank=True, null=True)), - ("CPATITLE", models.TextField(blank=True, null=True)), - ("CPAPHONE", models.TextField(blank=True, null=True)), - ("CPAFAX", models.TextField(blank=True, null=True)), - ("CPAEMAIL", models.TextField(blank=True, null=True)), - ("CPADATESIGNED", models.TextField(blank=True, null=True)), - ("CPANAMETITLE", models.TextField(blank=True, null=True)), - ("COG_OVER", models.TextField(blank=True, null=True)), - ("COGAGENCY", models.TextField(blank=True, null=True)), - ("TYPEREPORT_FS", models.TextField(blank=True, null=True)), - ("REPORTABLECONDITION", models.TextField(blank=True, null=True)), - ("MATERIALWEAKNESS", models.TextField(blank=True, null=True)), - ("MATERIALNONCOMPLIANCE", models.TextField(blank=True, null=True)), - ("GOINGCONCERN", models.TextField(blank=True, null=True)), - ("TYPEREPORT_MP", models.TextField(blank=True, null=True)), - ("DOLLARTHRESHOLD", models.TextField(blank=True, null=True)), - ("LOWRISK", models.TextField(blank=True, null=True)), - ("REPORTREQUIRED", models.TextField(blank=True, null=True)), - ("TOTFEDEXPEND", models.TextField(blank=True, null=True)), - ("COPIES", models.TextField(blank=True, null=True)), - 
("REPORTABLECONDITION_MP", models.TextField(blank=True, null=True)), - ("MATERIALWEAKNESS_MP", models.TextField(blank=True, null=True)), - ("QCOSTS", models.TextField(blank=True, null=True)), - ("CYFINDINGS", models.TextField(blank=True, null=True)), - ("PYSCHEDULE", models.TextField(blank=True, null=True)), - ("DUP_REPORTS", models.TextField(blank=True, null=True)), - ("COG_AGENCY", models.TextField(blank=True, null=True)), - ("OVERSIGHTAGENCY", models.TextField(blank=True, null=True)), - ("DATERECEIVED", models.TextField(blank=True, null=True)), - ("DATEFIREWALL", models.TextField(blank=True, null=True)), - ("PREVIOUSDATEFIREWALL", models.TextField(blank=True, null=True)), - ("FINDINGREFNUM", models.TextField(blank=True, null=True)), - ("TYPEOFENTITY", models.TextField(blank=True, null=True)), - ("IMAGE", models.TextField(blank=True, null=True)), - ("AGENCYCFDA", models.TextField(blank=True, null=True)), - ("INITIALDATE", models.TextField(blank=True, null=True)), - ("DATERECEIVEDOTHER", models.TextField(blank=True, null=True)), - ("MULTIPLE_CPAS", models.TextField(blank=True, null=True)), - ("AUDITEECERTIFYNAME", models.TextField(blank=True, null=True)), - ("AUDITEECERTIFYTITLE", models.TextField(blank=True, null=True)), - ("FACACCEPTEDDATE", models.TextField(blank=True, null=True)), - ("AUDITOR_EIN", models.TextField(blank=True, null=True)), - ("SD_MATERIALWEAKNESS", models.TextField(blank=True, null=True)), - ("SD_MATERIALWEAKNESS_MP", models.TextField(blank=True, null=True)), - ("SIGNIFICANTDEFICIENCY", models.TextField(blank=True, null=True)), - ("SIGNIFICANTDEFICIENCY_MP", models.TextField(blank=True, null=True)), - ("SP_FRAMEWORK", models.TextField(blank=True, null=True)), - ("SP_FRAMEWORK_REQUIRED", models.TextField(blank=True, null=True)), - ("TYPEREPORT_SP_FRAMEWORK", models.TextField(blank=True, null=True)), - ("SUPPRESSION_CODE", models.TextField(blank=True, null=True)), - ("ENTITY_TYPE", models.TextField(blank=True, null=True)), - ("TYPEAUDIT_CODE", 
models.TextField(blank=True, null=True)), - ("OPEID", models.TextField(blank=True, null=True)), - ("DATETOED", models.TextField(blank=True, null=True)), - ("DATEFINISHED", models.TextField(blank=True, null=True)), - ("TYPEFINDING", models.TextField(blank=True, null=True)), - ("TYPEFUNDING", models.TextField(blank=True, null=True)), - ("FYSTARTDATE", models.TextField(blank=True, null=True)), - ("CPAFOREIGN", models.TextField(blank=True, null=True)), - ("UEI", models.TextField(blank=True, null=True)), - ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), - ("CPACOUNTRY", models.TextField(blank=True, null=True)), - ], - ), - migrations.CreateModel( - name="ELECAUDITS", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("ELECAUDITSID", models.TextField(blank=True, null=True)), - ("ID", models.TextField(blank=True, null=True)), - ("AUDITYEAR", models.TextField(blank=True, null=True)), - ("DBKEY", models.TextField(blank=True, null=True)), - ("CFDASEQNUM", models.TextField(blank=True, null=True)), - ("CFDA", models.TextField(blank=True, null=True)), - ("FEDERALPROGRAMNAME", models.TextField(blank=True, null=True)), - ("AMOUNT", models.TextField(blank=True, null=True)), - ("MAJORPROGRAM", models.TextField(blank=True, null=True)), - ("TYPEREQUIREMENT", models.TextField(blank=True, null=True)), - ("QCOSTS2", models.TextField(blank=True, null=True)), - ("FINDINGS", models.TextField(blank=True, null=True)), - ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), - ("RD", models.TextField(blank=True, null=True)), - ("DIRECT", models.TextField(blank=True, null=True)), - ("CFDA_PREFIX", models.TextField(blank=True, null=True)), - ("CFDA_EXT", models.TextField(blank=True, null=True)), - ("EIN", models.TextField(blank=True, null=True)), - ("CFDA2", models.TextField(blank=True, null=True)), - ("TYPEREPORT_MP", models.TextField(blank=True, null=True)), - ("TYPEREPORT_MP_OVERRIDE", 
models.TextField(blank=True, null=True)), - ("ARRA", models.TextField(blank=True, null=True)), - ("LOANS", models.TextField(blank=True, null=True)), - ("FINDINGSCOUNT", models.TextField(blank=True, null=True)), - ("LOANBALANCE", models.TextField(blank=True, null=True)), - ("PASSTHROUGHAMOUNT", models.TextField(blank=True, null=True)), - ("AWARDIDENTIFICATION", models.TextField(blank=True, null=True)), - ("CLUSTERNAME", models.TextField(blank=True, null=True)), - ("PASSTHROUGHAWARD", models.TextField(blank=True, null=True)), - ("STATECLUSTERNAME", models.TextField(blank=True, null=True)), - ("PROGRAMTOTAL", models.TextField(blank=True, null=True)), - ("CLUSTERTOTAL", models.TextField(blank=True, null=True)), - ("OTHERCLUSTERNAME", models.TextField(blank=True, null=True)), - ("CFDAPROGRAMNAME", models.TextField(blank=True, null=True)), - ], - ), - migrations.CreateModel( - name="ELECCAPTEXT", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("SEQ_NUMBER", models.TextField(blank=True, null=True)), - ("DBKEY", models.TextField(blank=True, null=True)), - ("AUDITYEAR", models.TextField(blank=True, null=True)), - ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), - ("TEXT", models.TextField(blank=True, null=True)), - ("CHARTSTABLES", models.TextField(blank=True, null=True)), - ("REPORTID", models.TextField(blank=True, null=True)), - ("VERSION", models.TextField(blank=True, null=True)), - ("UEI", models.TextField(blank=True, null=True)), - ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), - ], - ), - migrations.CreateModel( - name="ELECCPAS", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("ID", models.TextField(blank=True, null=True)), - ("AUDITYEAR", models.TextField(blank=True, null=True)), - ("DBKEY", models.TextField(blank=True, null=True)), - ("SEQNUM", 
models.TextField(blank=True, null=True)), - ("VERSION", models.TextField(blank=True, null=True)), - ("CPAFIRMNAME", models.TextField(blank=True, null=True)), - ("CPASTREET1", models.TextField(blank=True, null=True)), - ("CPACITY", models.TextField(blank=True, null=True)), - ("CPASTATE", models.TextField(blank=True, null=True)), - ("CPAZIPCODE", models.TextField(blank=True, null=True)), - ("CPACONTACT", models.TextField(blank=True, null=True)), - ("CPATITLE", models.TextField(blank=True, null=True)), - ("CPAPHONE", models.TextField(blank=True, null=True)), - ("CPAFAX", models.TextField(blank=True, null=True)), - ("CPAEMAIL", models.TextField(blank=True, null=True)), - ("CPAEIN", models.TextField(blank=True, null=True)), - ], - ), - migrations.CreateModel( - name="ELECEINS", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("ID", models.TextField(blank=True, null=True)), - ("AUDITYEAR", models.TextField(blank=True, null=True)), - ("DBKEY", models.TextField(blank=True, null=True)), - ("EIN", models.TextField(blank=True, null=True)), - ("EINSEQNUM", models.TextField(blank=True, null=True)), - ("DUNS", models.TextField(blank=True, null=True)), - ("DUNSEQNUM", models.TextField(blank=True, null=True)), - ], - ), - migrations.CreateModel( - name="ELECFINDINGSTEXT", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("SEQ_NUMBER", models.TextField(blank=True, null=True)), - ("DBKEY", models.TextField(blank=True, null=True)), - ("AUDITYEAR", models.TextField(blank=True, null=True)), - ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), - ("TEXT", models.TextField(blank=True, null=True)), - ("CHARTSTABLES", models.TextField(blank=True, null=True)), - ("REPORTID", models.TextField(blank=True, null=True)), - ("VERSION", models.TextField(blank=True, null=True)), - ("UEI", 
models.TextField(blank=True, null=True)), - ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), - ], - ), - migrations.CreateModel( - name="ELECNOTES", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("ID", models.TextField(blank=True, null=True)), - ("REPORTID", models.TextField(blank=True, null=True)), - ("VERSION", models.TextField(blank=True, null=True)), - ("DBKEY", models.TextField(blank=True, null=True)), - ("AUDITYEAR", models.TextField(blank=True, null=True)), - ("SEQ_NUMBER", models.TextField(blank=True, null=True)), - ("TYPE_ID", models.TextField(blank=True, null=True)), - ("NOTE_INDEX", models.TextField(blank=True, null=True)), - ("TITLE", models.TextField(blank=True, null=True)), - ("CONTENT", models.TextField(blank=True, null=True)), - ("UEI", models.TextField(blank=True, null=True)), - ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), - ], - ), - migrations.CreateModel( - name="ELECPASSTHROUGH", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("ID", models.TextField(blank=True, null=True)), - ("AUDITYEAR", models.TextField(blank=True, null=True)), - ("DBKEY", models.TextField(blank=True, null=True)), - ("ELECAUDITSID", models.TextField(blank=True, null=True)), - ("PASSTHROUGHNAME", models.TextField(blank=True, null=True)), - ("PASSTHROUGHID", models.TextField(blank=True, null=True)), - ], - ), - migrations.CreateModel( - name="ELECUEIS", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("UEISID", models.TextField(blank=True, null=True)), - ("REPORTID", models.TextField(blank=True, null=True)), - ("VERSION", models.TextField(blank=True, null=True)), - ("DBKEY", models.TextField(blank=True, null=True)), - ("AUDITYEAR", models.TextField(blank=True, null=True)), - ("UEI", 
models.TextField(blank=True, null=True)), - ("SEQNUM", models.TextField(blank=True, null=True)), - ], - ), - ] diff --git a/backend/census_to_gsafac/migrations/__init__.py b/backend/census_to_gsafac/migrations/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/backend/census_to_gsafac/models.py b/backend/census_to_gsafac/models.py deleted file mode 100644 index 503a9e027f..0000000000 --- a/backend/census_to_gsafac/models.py +++ /dev/null @@ -1,445 +0,0 @@ -from django.db import models - - -class ELECAUDITHEADER(models.Model): - ELECAUDITHEADERID = models.TextField(blank=True, null=True) - - ID = models.TextField(blank=True, null=True) - - AUDITYEAR = models.TextField(blank=True, null=True) - - DBKEY = models.TextField(blank=True, null=True) - - FYENDDATE = models.TextField(blank=True, null=True) - - AUDITTYPE = models.TextField(blank=True, null=True) - - PERIODCOVERED = models.TextField(blank=True, null=True) - - NUMBERMONTHS = models.TextField(blank=True, null=True) - - MULTIPLEEINS = models.TextField(blank=True, null=True) - - EIN = models.TextField(blank=True, null=True) - - EINSUBCODE = models.TextField(blank=True, null=True) - - MULTIPLEDUNS = models.TextField(blank=True, null=True) - - DUNS = models.TextField(blank=True, null=True) - - AUDITEENAME = models.TextField(blank=True, null=True) - - STREET1 = models.TextField(blank=True, null=True) - - STREET2 = models.TextField(blank=True, null=True) - - CITY = models.TextField(blank=True, null=True) - - STATE = models.TextField(blank=True, null=True) - - ZIPCODE = models.TextField(blank=True, null=True) - - AUDITEECONTACT = models.TextField(blank=True, null=True) - - AUDITEETITLE = models.TextField(blank=True, null=True) - - AUDITEEPHONE = models.TextField(blank=True, null=True) - - AUDITEEFAX = models.TextField(blank=True, null=True) - - AUDITEEEMAIL = models.TextField(blank=True, null=True) - - AUDITEEDATESIGNED = models.TextField(blank=True, null=True) - - AUDITEENAMETITLE = 
models.TextField(blank=True, null=True) - - CPAFIRMNAME = models.TextField(blank=True, null=True) - - CPASTREET1 = models.TextField(blank=True, null=True) - - CPASTREET2 = models.TextField(blank=True, null=True) - - CPACITY = models.TextField(blank=True, null=True) - - CPASTATE = models.TextField(blank=True, null=True) - - CPAZIPCODE = models.TextField(blank=True, null=True) - - CPACONTACT = models.TextField(blank=True, null=True) - - CPATITLE = models.TextField(blank=True, null=True) - - CPAPHONE = models.TextField(blank=True, null=True) - - CPAFAX = models.TextField(blank=True, null=True) - - CPAEMAIL = models.TextField(blank=True, null=True) - - CPADATESIGNED = models.TextField(blank=True, null=True) - - CPANAMETITLE = models.TextField(blank=True, null=True) - - COG_OVER = models.TextField(blank=True, null=True) - - COGAGENCY = models.TextField(blank=True, null=True) - - TYPEREPORT_FS = models.TextField(blank=True, null=True) - - REPORTABLECONDITION = models.TextField(blank=True, null=True) - - MATERIALWEAKNESS = models.TextField(blank=True, null=True) - - MATERIALNONCOMPLIANCE = models.TextField(blank=True, null=True) - - GOINGCONCERN = models.TextField(blank=True, null=True) - - TYPEREPORT_MP = models.TextField(blank=True, null=True) - - DOLLARTHRESHOLD = models.TextField(blank=True, null=True) - - LOWRISK = models.TextField(blank=True, null=True) - - REPORTREQUIRED = models.TextField(blank=True, null=True) - - TOTFEDEXPEND = models.TextField(blank=True, null=True) - - COPIES = models.TextField(blank=True, null=True) - - REPORTABLECONDITION_MP = models.TextField(blank=True, null=True) - - MATERIALWEAKNESS_MP = models.TextField(blank=True, null=True) - - QCOSTS = models.TextField(blank=True, null=True) - - CYFINDINGS = models.TextField(blank=True, null=True) - - PYSCHEDULE = models.TextField(blank=True, null=True) - - DUP_REPORTS = models.TextField(blank=True, null=True) - - COG_AGENCY = models.TextField(blank=True, null=True) - - OVERSIGHTAGENCY = 
models.TextField(blank=True, null=True) - - DATERECEIVED = models.TextField(blank=True, null=True) - - DATEFIREWALL = models.TextField(blank=True, null=True) - - PREVIOUSDATEFIREWALL = models.TextField(blank=True, null=True) - - FINDINGREFNUM = models.TextField(blank=True, null=True) - - TYPEOFENTITY = models.TextField(blank=True, null=True) - - IMAGE = models.TextField(blank=True, null=True) - - AGENCYCFDA = models.TextField(blank=True, null=True) - - INITIALDATE = models.TextField(blank=True, null=True) - - DATERECEIVEDOTHER = models.TextField(blank=True, null=True) - - MULTIPLE_CPAS = models.TextField(blank=True, null=True) - - AUDITEECERTIFYNAME = models.TextField(blank=True, null=True) - - AUDITEECERTIFYTITLE = models.TextField(blank=True, null=True) - - FACACCEPTEDDATE = models.TextField(blank=True, null=True) - - AUDITOR_EIN = models.TextField(blank=True, null=True) - - SD_MATERIALWEAKNESS = models.TextField(blank=True, null=True) - - SD_MATERIALWEAKNESS_MP = models.TextField(blank=True, null=True) - - SIGNIFICANTDEFICIENCY = models.TextField(blank=True, null=True) - - SIGNIFICANTDEFICIENCY_MP = models.TextField(blank=True, null=True) - - SP_FRAMEWORK = models.TextField(blank=True, null=True) - - SP_FRAMEWORK_REQUIRED = models.TextField(blank=True, null=True) - - TYPEREPORT_SP_FRAMEWORK = models.TextField(blank=True, null=True) - - SUPPRESSION_CODE = models.TextField(blank=True, null=True) - - ENTITY_TYPE = models.TextField(blank=True, null=True) - - TYPEAUDIT_CODE = models.TextField(blank=True, null=True) - - OPEID = models.TextField(blank=True, null=True) - - DATETOED = models.TextField(blank=True, null=True) - - DATEFINISHED = models.TextField(blank=True, null=True) - - TYPEFINDING = models.TextField(blank=True, null=True) - - TYPEFUNDING = models.TextField(blank=True, null=True) - - FYSTARTDATE = models.TextField(blank=True, null=True) - - CPAFOREIGN = models.TextField(blank=True, null=True) - - UEI = models.TextField(blank=True, null=True) - - 
MULTIPLEUEIS = models.TextField(blank=True, null=True) - - CPACOUNTRY = models.TextField(blank=True, null=True) - - -class ELECEINS(models.Model): - ID = models.TextField(blank=True, null=True) - - AUDITYEAR = models.TextField(blank=True, null=True) - - DBKEY = models.TextField(blank=True, null=True) - - EIN = models.TextField(blank=True, null=True) - - EINSEQNUM = models.TextField(blank=True, null=True) - - DUNS = models.TextField(blank=True, null=True) - - DUNSEQNUM = models.TextField(blank=True, null=True) - - -class ELECAUDITFINDINGS(models.Model): - ELECAUDITFINDINGSID = models.TextField(blank=True, null=True) - - ELECAUDITSID = models.TextField(blank=True, null=True) - - AUDITYEAR = models.TextField(blank=True, null=True) - - DBKEY = models.TextField(blank=True, null=True) - - REPORTID = models.TextField(blank=True, null=True) - - VERSION = models.TextField(blank=True, null=True) - - QCOSTS = models.TextField(blank=True, null=True) - - OTHERFINDINGS = models.TextField(blank=True, null=True) - - SIGNIFICANTDEFICIENCY = models.TextField(blank=True, null=True) - - MATERIALWEAKNESS = models.TextField(blank=True, null=True) - - OTHERNONCOMPLIANCE = models.TextField(blank=True, null=True) - - TYPEREQUIREMENT = models.TextField(blank=True, null=True) - - FINDINGREFNUMS = models.TextField(blank=True, null=True) - - MODIFIEDOPINION = models.TextField(blank=True, null=True) - - REPEATFINDING = models.TextField(blank=True, null=True) - - PRIORFINDINGREFNUMS = models.TextField(blank=True, null=True) - - -class ELECNOTES(models.Model): - ID = models.TextField(blank=True, null=True) - - REPORTID = models.TextField(blank=True, null=True) - - VERSION = models.TextField(blank=True, null=True) - - DBKEY = models.TextField(blank=True, null=True) - - AUDITYEAR = models.TextField(blank=True, null=True) - - SEQ_NUMBER = models.TextField(blank=True, null=True) - - TYPE_ID = models.TextField(blank=True, null=True) - - NOTE_INDEX = models.TextField(blank=True, null=True) - - TITLE = 
models.TextField(blank=True, null=True) - - CONTENT = models.TextField(blank=True, null=True) - - UEI = models.TextField(blank=True, null=True) - - MULTIPLEUEIS = models.TextField(blank=True, null=True) - - -class ELECFINDINGSTEXT(models.Model): - SEQ_NUMBER = models.TextField(blank=True, null=True) - - DBKEY = models.TextField(blank=True, null=True) - - AUDITYEAR = models.TextField(blank=True, null=True) - - FINDINGREFNUMS = models.TextField(blank=True, null=True) - - TEXT = models.TextField(blank=True, null=True) - - CHARTSTABLES = models.TextField(blank=True, null=True) - - REPORTID = models.TextField(blank=True, null=True) - - VERSION = models.TextField(blank=True, null=True) - - UEI = models.TextField(blank=True, null=True) - - MULTIPLEUEIS = models.TextField(blank=True, null=True) - - -class ELECCPAS(models.Model): - ID = models.TextField(blank=True, null=True) - - AUDITYEAR = models.TextField(blank=True, null=True) - - DBKEY = models.TextField(blank=True, null=True) - - SEQNUM = models.TextField(blank=True, null=True) - - VERSION = models.TextField(blank=True, null=True) - - CPAFIRMNAME = models.TextField(blank=True, null=True) - - CPASTREET1 = models.TextField(blank=True, null=True) - - CPACITY = models.TextField(blank=True, null=True) - - CPASTATE = models.TextField(blank=True, null=True) - - CPAZIPCODE = models.TextField(blank=True, null=True) - - CPACONTACT = models.TextField(blank=True, null=True) - - CPATITLE = models.TextField(blank=True, null=True) - - CPAPHONE = models.TextField(blank=True, null=True) - - CPAFAX = models.TextField(blank=True, null=True) - - CPAEMAIL = models.TextField(blank=True, null=True) - - CPAEIN = models.TextField(blank=True, null=True) - - -class ELECAUDITS(models.Model): - ELECAUDITSID = models.TextField(blank=True, null=True) - - ID = models.TextField(blank=True, null=True) - - AUDITYEAR = models.TextField(blank=True, null=True) - - DBKEY = models.TextField(blank=True, null=True) - - CFDASEQNUM = 
models.TextField(blank=True, null=True) - - CFDA = models.TextField(blank=True, null=True) - - FEDERALPROGRAMNAME = models.TextField(blank=True, null=True) - - AMOUNT = models.TextField(blank=True, null=True) - - MAJORPROGRAM = models.TextField(blank=True, null=True) - - TYPEREQUIREMENT = models.TextField(blank=True, null=True) - - QCOSTS2 = models.TextField(blank=True, null=True) - - FINDINGS = models.TextField(blank=True, null=True) - - FINDINGREFNUMS = models.TextField(blank=True, null=True) - - RD = models.TextField(blank=True, null=True) - - DIRECT = models.TextField(blank=True, null=True) - - CFDA_PREFIX = models.TextField(blank=True, null=True) - - CFDA_EXT = models.TextField(blank=True, null=True) - - EIN = models.TextField(blank=True, null=True) - - CFDA2 = models.TextField(blank=True, null=True) - - TYPEREPORT_MP = models.TextField(blank=True, null=True) - - TYPEREPORT_MP_OVERRIDE = models.TextField(blank=True, null=True) - - ARRA = models.TextField(blank=True, null=True) - - LOANS = models.TextField(blank=True, null=True) - - FINDINGSCOUNT = models.TextField(blank=True, null=True) - - LOANBALANCE = models.TextField(blank=True, null=True) - - PASSTHROUGHAMOUNT = models.TextField(blank=True, null=True) - - AWARDIDENTIFICATION = models.TextField(blank=True, null=True) - - CLUSTERNAME = models.TextField(blank=True, null=True) - - PASSTHROUGHAWARD = models.TextField(blank=True, null=True) - - STATECLUSTERNAME = models.TextField(blank=True, null=True) - - PROGRAMTOTAL = models.TextField(blank=True, null=True) - - CLUSTERTOTAL = models.TextField(blank=True, null=True) - - OTHERCLUSTERNAME = models.TextField(blank=True, null=True) - - CFDAPROGRAMNAME = models.TextField(blank=True, null=True) - - -class ELECPASSTHROUGH(models.Model): - ID = models.TextField(blank=True, null=True) - - AUDITYEAR = models.TextField(blank=True, null=True) - - DBKEY = models.TextField(blank=True, null=True) - - ELECAUDITSID = models.TextField(blank=True, null=True) - - PASSTHROUGHNAME 
= models.TextField(blank=True, null=True) - - PASSTHROUGHID = models.TextField(blank=True, null=True) - - -class ELECUEIS(models.Model): - UEISID = models.TextField(blank=True, null=True) - - REPORTID = models.TextField(blank=True, null=True) - - VERSION = models.TextField(blank=True, null=True) - - DBKEY = models.TextField(blank=True, null=True) - - AUDITYEAR = models.TextField(blank=True, null=True) - - UEI = models.TextField(blank=True, null=True) - - SEQNUM = models.TextField(blank=True, null=True) - - -class ELECCAPTEXT(models.Model): - SEQ_NUMBER = models.TextField(blank=True, null=True) - - DBKEY = models.TextField(blank=True, null=True) - - AUDITYEAR = models.TextField(blank=True, null=True) - - FINDINGREFNUMS = models.TextField(blank=True, null=True) - - TEXT = models.TextField(blank=True, null=True) - - CHARTSTABLES = models.TextField(blank=True, null=True) - - REPORTID = models.TextField(blank=True, null=True) - - VERSION = models.TextField(blank=True, null=True) - - UEI = models.TextField(blank=True, null=True) - - MULTIPLEUEIS = models.TextField(blank=True, null=True) diff --git a/backend/census_to_gsafac/tests.py b/backend/census_to_gsafac/tests.py deleted file mode 100644 index a79ca8be56..0000000000 --- a/backend/census_to_gsafac/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -# from django.test import TestCase - -# Create your tests here. diff --git a/backend/census_to_gsafac/views.py b/backend/census_to_gsafac/views.py deleted file mode 100644 index fd0e044955..0000000000 --- a/backend/census_to_gsafac/views.py +++ /dev/null @@ -1,3 +0,0 @@ -# from django.shortcuts import render - -# Create your views here. From 927a80d42d10ba04b8f9b13e189b5d652f076f10 Mon Sep 17 00:00:00 2001 From: "Hassan D. M. 
Sambo" Date: Tue, 7 Nov 2023 13:57:40 -0500 Subject: [PATCH 45/61] Django migration --- .../migrations/0001_initial.py | 364 ++++++++++++++++++ backend/config/settings.py | 1 - 2 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 backend/census_historical_migration/migrations/0001_initial.py diff --git a/backend/census_historical_migration/migrations/0001_initial.py b/backend/census_historical_migration/migrations/0001_initial.py new file mode 100644 index 0000000000..a2b2587762 --- /dev/null +++ b/backend/census_historical_migration/migrations/0001_initial.py @@ -0,0 +1,364 @@ +# Generated by Django 4.2.6 on 2023-11-07 18:46 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="ELECAUDITFINDINGS", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ELECAUDITFINDINGSID", models.TextField(blank=True, null=True)), + ("ELECAUDITSID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("REPORTID", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("QCOSTS", models.TextField(blank=True, null=True)), + ("OTHERFINDINGS", models.TextField(blank=True, null=True)), + ("SIGNIFICANTDEFICIENCY", models.TextField(blank=True, null=True)), + ("MATERIALWEAKNESS", models.TextField(blank=True, null=True)), + ("OTHERNONCOMPLIANCE", models.TextField(blank=True, null=True)), + ("TYPEREQUIREMENT", models.TextField(blank=True, null=True)), + ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), + ("MODIFIEDOPINION", models.TextField(blank=True, null=True)), + ("REPEATFINDING", models.TextField(blank=True, null=True)), + ("PRIORFINDINGREFNUMS", models.TextField(blank=True, null=True)), + ], + ), + 
migrations.CreateModel( + name="ELECAUDITHEADER", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ELECAUDITHEADERID", models.TextField(blank=True, null=True)), + ("ID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("FYENDDATE", models.TextField(blank=True, null=True)), + ("AUDITTYPE", models.TextField(blank=True, null=True)), + ("PERIODCOVERED", models.TextField(blank=True, null=True)), + ("NUMBERMONTHS", models.TextField(blank=True, null=True)), + ("MULTIPLEEINS", models.TextField(blank=True, null=True)), + ("EIN", models.TextField(blank=True, null=True)), + ("EINSUBCODE", models.TextField(blank=True, null=True)), + ("MULTIPLEDUNS", models.TextField(blank=True, null=True)), + ("DUNS", models.TextField(blank=True, null=True)), + ("AUDITEENAME", models.TextField(blank=True, null=True)), + ("STREET1", models.TextField(blank=True, null=True)), + ("STREET2", models.TextField(blank=True, null=True)), + ("CITY", models.TextField(blank=True, null=True)), + ("STATE", models.TextField(blank=True, null=True)), + ("ZIPCODE", models.TextField(blank=True, null=True)), + ("AUDITEECONTACT", models.TextField(blank=True, null=True)), + ("AUDITEETITLE", models.TextField(blank=True, null=True)), + ("AUDITEEPHONE", models.TextField(blank=True, null=True)), + ("AUDITEEFAX", models.TextField(blank=True, null=True)), + ("AUDITEEEMAIL", models.TextField(blank=True, null=True)), + ("AUDITEEDATESIGNED", models.TextField(blank=True, null=True)), + ("AUDITEENAMETITLE", models.TextField(blank=True, null=True)), + ("CPAFIRMNAME", models.TextField(blank=True, null=True)), + ("CPASTREET1", models.TextField(blank=True, null=True)), + ("CPASTREET2", models.TextField(blank=True, null=True)), + ("CPACITY", models.TextField(blank=True, null=True)), + ("CPASTATE", models.TextField(blank=True, 
null=True)), + ("CPAZIPCODE", models.TextField(blank=True, null=True)), + ("CPACONTACT", models.TextField(blank=True, null=True)), + ("CPATITLE", models.TextField(blank=True, null=True)), + ("CPAPHONE", models.TextField(blank=True, null=True)), + ("CPAFAX", models.TextField(blank=True, null=True)), + ("CPAEMAIL", models.TextField(blank=True, null=True)), + ("CPADATESIGNED", models.TextField(blank=True, null=True)), + ("CPANAMETITLE", models.TextField(blank=True, null=True)), + ("COG_OVER", models.TextField(blank=True, null=True)), + ("COGAGENCY", models.TextField(blank=True, null=True)), + ("TYPEREPORT_FS", models.TextField(blank=True, null=True)), + ("REPORTABLECONDITION", models.TextField(blank=True, null=True)), + ("MATERIALWEAKNESS", models.TextField(blank=True, null=True)), + ("MATERIALNONCOMPLIANCE", models.TextField(blank=True, null=True)), + ("GOINGCONCERN", models.TextField(blank=True, null=True)), + ("TYPEREPORT_MP", models.TextField(blank=True, null=True)), + ("DOLLARTHRESHOLD", models.TextField(blank=True, null=True)), + ("LOWRISK", models.TextField(blank=True, null=True)), + ("REPORTREQUIRED", models.TextField(blank=True, null=True)), + ("TOTFEDEXPEND", models.TextField(blank=True, null=True)), + ("COPIES", models.TextField(blank=True, null=True)), + ("REPORTABLECONDITION_MP", models.TextField(blank=True, null=True)), + ("MATERIALWEAKNESS_MP", models.TextField(blank=True, null=True)), + ("QCOSTS", models.TextField(blank=True, null=True)), + ("CYFINDINGS", models.TextField(blank=True, null=True)), + ("PYSCHEDULE", models.TextField(blank=True, null=True)), + ("DUP_REPORTS", models.TextField(blank=True, null=True)), + ("COG_AGENCY", models.TextField(blank=True, null=True)), + ("OVERSIGHTAGENCY", models.TextField(blank=True, null=True)), + ("DATERECEIVED", models.TextField(blank=True, null=True)), + ("DATEFIREWALL", models.TextField(blank=True, null=True)), + ("PREVIOUSDATEFIREWALL", models.TextField(blank=True, null=True)), + ("FINDINGREFNUM", 
models.TextField(blank=True, null=True)), + ("TYPEOFENTITY", models.TextField(blank=True, null=True)), + ("IMAGE", models.TextField(blank=True, null=True)), + ("AGENCYCFDA", models.TextField(blank=True, null=True)), + ("INITIALDATE", models.TextField(blank=True, null=True)), + ("DATERECEIVEDOTHER", models.TextField(blank=True, null=True)), + ("MULTIPLE_CPAS", models.TextField(blank=True, null=True)), + ("AUDITEECERTIFYNAME", models.TextField(blank=True, null=True)), + ("AUDITEECERTIFYTITLE", models.TextField(blank=True, null=True)), + ("FACACCEPTEDDATE", models.TextField(blank=True, null=True)), + ("AUDITOR_EIN", models.TextField(blank=True, null=True)), + ("SD_MATERIALWEAKNESS", models.TextField(blank=True, null=True)), + ("SD_MATERIALWEAKNESS_MP", models.TextField(blank=True, null=True)), + ("SIGNIFICANTDEFICIENCY", models.TextField(blank=True, null=True)), + ("SIGNIFICANTDEFICIENCY_MP", models.TextField(blank=True, null=True)), + ("SP_FRAMEWORK", models.TextField(blank=True, null=True)), + ("SP_FRAMEWORK_REQUIRED", models.TextField(blank=True, null=True)), + ("TYPEREPORT_SP_FRAMEWORK", models.TextField(blank=True, null=True)), + ("SUPPRESSION_CODE", models.TextField(blank=True, null=True)), + ("ENTITY_TYPE", models.TextField(blank=True, null=True)), + ("TYPEAUDIT_CODE", models.TextField(blank=True, null=True)), + ("OPEID", models.TextField(blank=True, null=True)), + ("DATETOED", models.TextField(blank=True, null=True)), + ("DATEFINISHED", models.TextField(blank=True, null=True)), + ("TYPEFINDING", models.TextField(blank=True, null=True)), + ("TYPEFUNDING", models.TextField(blank=True, null=True)), + ("FYSTARTDATE", models.TextField(blank=True, null=True)), + ("CPAFOREIGN", models.TextField(blank=True, null=True)), + ("UEI", models.TextField(blank=True, null=True)), + ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), + ("CPACOUNTRY", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECAUDITS", + fields=[ + ( + "id", 
+ models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ELECAUDITSID", models.TextField(blank=True, null=True)), + ("ID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("CFDASEQNUM", models.TextField(blank=True, null=True)), + ("CFDA", models.TextField(blank=True, null=True)), + ("FEDERALPROGRAMNAME", models.TextField(blank=True, null=True)), + ("AMOUNT", models.TextField(blank=True, null=True)), + ("MAJORPROGRAM", models.TextField(blank=True, null=True)), + ("TYPEREQUIREMENT", models.TextField(blank=True, null=True)), + ("QCOSTS2", models.TextField(blank=True, null=True)), + ("FINDINGS", models.TextField(blank=True, null=True)), + ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), + ("RD", models.TextField(blank=True, null=True)), + ("DIRECT", models.TextField(blank=True, null=True)), + ("CFDA_PREFIX", models.TextField(blank=True, null=True)), + ("CFDA_EXT", models.TextField(blank=True, null=True)), + ("EIN", models.TextField(blank=True, null=True)), + ("CFDA2", models.TextField(blank=True, null=True)), + ("TYPEREPORT_MP", models.TextField(blank=True, null=True)), + ("TYPEREPORT_MP_OVERRIDE", models.TextField(blank=True, null=True)), + ("ARRA", models.TextField(blank=True, null=True)), + ("LOANS", models.TextField(blank=True, null=True)), + ("FINDINGSCOUNT", models.TextField(blank=True, null=True)), + ("LOANBALANCE", models.TextField(blank=True, null=True)), + ("PASSTHROUGHAMOUNT", models.TextField(blank=True, null=True)), + ("AWARDIDENTIFICATION", models.TextField(blank=True, null=True)), + ("CLUSTERNAME", models.TextField(blank=True, null=True)), + ("PASSTHROUGHAWARD", models.TextField(blank=True, null=True)), + ("STATECLUSTERNAME", models.TextField(blank=True, null=True)), + ("PROGRAMTOTAL", models.TextField(blank=True, null=True)), + ("CLUSTERTOTAL", models.TextField(blank=True, 
null=True)), + ("OTHERCLUSTERNAME", models.TextField(blank=True, null=True)), + ("CFDAPROGRAMNAME", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECCAPTEXT", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("SEQ_NUMBER", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), + ("TEXT", models.TextField(blank=True, null=True)), + ("CHARTSTABLES", models.TextField(blank=True, null=True)), + ("REPORTID", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("UEI", models.TextField(blank=True, null=True)), + ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECCPAS", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("SEQNUM", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("CPAFIRMNAME", models.TextField(blank=True, null=True)), + ("CPASTREET1", models.TextField(blank=True, null=True)), + ("CPACITY", models.TextField(blank=True, null=True)), + ("CPASTATE", models.TextField(blank=True, null=True)), + ("CPAZIPCODE", models.TextField(blank=True, null=True)), + ("CPACONTACT", models.TextField(blank=True, null=True)), + ("CPATITLE", models.TextField(blank=True, null=True)), + ("CPAPHONE", models.TextField(blank=True, null=True)), + ("CPAFAX", models.TextField(blank=True, null=True)), + ("CPAEMAIL", models.TextField(blank=True, null=True)), + ("CPAEIN", models.TextField(blank=True, 
null=True)), + ], + ), + migrations.CreateModel( + name="ELECEINS", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("EIN", models.TextField(blank=True, null=True)), + ("EINSEQNUM", models.TextField(blank=True, null=True)), + ("DUNS", models.TextField(blank=True, null=True)), + ("DUNSEQNUM", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECFINDINGSTEXT", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("SEQ_NUMBER", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("FINDINGREFNUMS", models.TextField(blank=True, null=True)), + ("TEXT", models.TextField(blank=True, null=True)), + ("CHARTSTABLES", models.TextField(blank=True, null=True)), + ("REPORTID", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("UEI", models.TextField(blank=True, null=True)), + ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECNOTES", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ID", models.TextField(blank=True, null=True)), + ("REPORTID", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("SEQ_NUMBER", models.TextField(blank=True, null=True)), + ("TYPE_ID", models.TextField(blank=True, null=True)), + ("NOTE_INDEX", models.TextField(blank=True, 
null=True)), + ("TITLE", models.TextField(blank=True, null=True)), + ("CONTENT", models.TextField(blank=True, null=True)), + ("UEI", models.TextField(blank=True, null=True)), + ("MULTIPLEUEIS", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECPASSTHROUGH", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("ID", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("ELECAUDITSID", models.TextField(blank=True, null=True)), + ("PASSTHROUGHNAME", models.TextField(blank=True, null=True)), + ("PASSTHROUGHID", models.TextField(blank=True, null=True)), + ], + ), + migrations.CreateModel( + name="ELECUEIS", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("UEISID", models.TextField(blank=True, null=True)), + ("REPORTID", models.TextField(blank=True, null=True)), + ("VERSION", models.TextField(blank=True, null=True)), + ("DBKEY", models.TextField(blank=True, null=True)), + ("AUDITYEAR", models.TextField(blank=True, null=True)), + ("UEI", models.TextField(blank=True, null=True)), + ("SEQNUM", models.TextField(blank=True, null=True)), + ], + ), + ] diff --git a/backend/config/settings.py b/backend/config/settings.py index 6f823fbaee..b2bc4f55ac 100644 --- a/backend/config/settings.py +++ b/backend/config/settings.py @@ -124,7 +124,6 @@ "dissemination", "census_historical_migration", "support", - "census_to_gsafac", ] MIDDLEWARE = [ From a2aaffedcd07d1f4bd5ef5f0e6e1dc7ceea24132 Mon Sep 17 00:00:00 2001 From: "Hassan D. M. 
Sambo" Date: Tue, 7 Nov 2023 13:58:11 -0500 Subject: [PATCH 46/61] Telling mypy to ignore django migration files --- backend/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 672c354c11..72e4c856a4 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -7,7 +7,7 @@ exclude = "node_modules/" [tool.mypy] ignore_missing_imports = true explicit_package_bases = true -exclude = ".venv/|audit/migrations/|dissemination/migrations" +exclude = ".venv/|audit/migrations/|dissemination/migrations|census_historical_migration/migrations" [tool.pylint."MESSAGES CONTROL"] # Tadhg 2022-05-03: I added the below because I've found them unhelpful. From a2a58d34b6c9fed12ba3e1854f995052bbb47268 Mon Sep 17 00:00:00 2001 From: "Hassan D. M. Sambo" Date: Tue, 7 Nov 2023 14:02:23 -0500 Subject: [PATCH 47/61] Linting --- .../management/commands/csv_to_postgres.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index a9d8fb40fe..b9b4ab3d07 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -11,7 +11,9 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) -census_to_gsafac_models = list(apps.get_app_config("census_historical_migration").get_models()) +census_to_gsafac_models = list( + apps.get_app_config("census_historical_migration").get_models() +) census_to_gsafac_model_names = [m._meta.model_name for m in census_to_gsafac_models] s3_client = boto3.client( "s3", From b11107cfcbb7407138c609dd6747e15ff1285608 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Tue, 7 Nov 2023 13:29:58 -0800 Subject: [PATCH 48/61] Incorporated chunking capabilities from Alternative suggestion for loading data 
from S3 #2660 --- .../management/commands/csv_to_postgres.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index b9b4ab3d07..b4633492c3 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -1,12 +1,17 @@ import logging +import botocore import boto3 -import csv +import pandas as pd +from io import BytesIO +from botocore.exceptions import ClientError from django.core.management.base import BaseCommand from django.conf import settings from django.apps import apps +CHUNK_SIZE = 10_000 + logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) @@ -63,15 +68,24 @@ def handle(self, *args, **options): model_obj = census_to_gsafac_models[ census_to_gsafac_model_names.index(model_name) ] - response = s3_client.get_object( - Bucket=census_to_gsafac_bucket_name, Key=item["Key"] - ) - print("Obtained response from S3") - lines = response["Body"].read().decode("utf-8").splitlines(True) - print("Loaded Body into 'lines'") - rows = [row for row in csv.DictReader(lines)] - print("Completed processing 'lines'") - self.load_table(model_obj, rows) + file = BytesIO() + try: + s3_client.download_fileobj( + Bucket=census_to_gsafac_bucket_name, + Key=item["Key"], + Fileobj=file, + ) + except ClientError: + logger.error("Could not download {}".format(model_obj)) + return + file.seek(0) + for df in pd.read_csv(file, iterator=True, chunksize=CHUNK_SIZE): + # Each row is a dictionary. The columns are the + # correct names for our model. So, this should be a + # clean way to load the model from a row. 
+ for _, row in df.iterrows(): + obj = model_obj(**row) + obj.save() for mdl in census_to_gsafac_models: row_count = mdl.objects.all().count() @@ -99,17 +113,3 @@ def get_model_name(self, name): return model_name print("Could not find a matching model for ", name) return None - - def load_table(self, model_obj, rows): - print("Loading data for model_obj ", model_obj) - for i in range(0, len(rows)): - model_instance = model_obj() - - for column_name, value in rows[i].items(): - if column_name == "id": - continue - setattr(model_instance, column_name, value) - model_instance.save() - if i % 1000 == 0: - print(f"Loaded {i} of {len(rows)} rows to ", model_obj) - print(f"Loaded {len(rows)} rows to ", model_obj) From 414cbc04a5a51a54a076352786f64698c62fdbde Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Tue, 7 Nov 2023 13:54:05 -0800 Subject: [PATCH 49/61] Incorporated chunking capabilities from Alternative suggestion for loading data from S3 #2660 --- .../management/commands/csv_to_postgres.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index b4633492c3..f217a02953 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -79,6 +79,7 @@ def handle(self, *args, **options): logger.error("Could not download {}".format(model_obj)) return file.seek(0) + rows_loaded = 0 for df in pd.read_csv(file, iterator=True, chunksize=CHUNK_SIZE): # Each row is a dictionary. The columns are the # correct names for our model. 
So, this should be a @@ -86,6 +87,8 @@ def handle(self, *args, **options): for _, row in df.iterrows(): obj = model_obj(**row) obj.save() + rows_loaded += df.shape[0] + print(f"Loaded {rows_loaded} rows in ", model_obj) for mdl in census_to_gsafac_models: row_count = mdl.objects.all().count() From a89982ee2090ab0876af17bce500e4a014164ce5 Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Tue, 7 Nov 2023 14:53:06 -0800 Subject: [PATCH 50/61] Moving fac_s3.py to support/management/commands/ --- .../management/commands/fac_s3.py | 81 ------------------- 1 file changed, 81 deletions(-) delete mode 100644 backend/census_historical_migration/management/commands/fac_s3.py diff --git a/backend/census_historical_migration/management/commands/fac_s3.py b/backend/census_historical_migration/management/commands/fac_s3.py deleted file mode 100644 index 9884d5b5f6..0000000000 --- a/backend/census_historical_migration/management/commands/fac_s3.py +++ /dev/null @@ -1,81 +0,0 @@ -from os import path -import os - -import boto3 - -from django.core.management.base import BaseCommand - -from django.conf import settings - - -class Command(BaseCommand): - help = """ - Alternative to aws s3 as the cli is not available in production. - Usage: - manage.py fac_s3 --upload --src SRC [--tgt TGT] - manage.py fac_s3 --download --src SRC [--tgt TGT] - manage.py fac_s3 --rm --tgt TGT] - manage.py fac_s3 --ls [--tgt TGT] - """ - - def add_arguments(self, parser): - parser.add_argument("bucket_name", type=str, help="The S3 bucket name.") - parser.add_argument("--src", help="local file name.") - parser.add_argument("--tgt", help="s3 file name.") - parser.add_argument("--ls", action="store_true", help="List all files.") - parser.add_argument( - "--upload", action="store_true", help="Copy local src to S3 tgt." - ) - parser.add_argument( - "--download", action="store_true", help="Copy S3 tgt to local src." 
- ) - parser.add_argument("--rm", action="store_true", help="Delete tgt.") - - def handle(self, *args, **options): - bucket_name = options["bucket_name"] - src_path = options["src"] - tgt_path = options["tgt"] - - s3_client = boto3.client( - "s3", - aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY, - endpoint_url=settings.AWS_S3_ENDPOINT_URL, - ) - - if options["ls"]: - items = s3_client.list_objects( - Bucket=bucket_name, - Prefix=tgt_path or "", - ).get("Contents") - if not items: - print("Target is empty") - return - for item in items: - print(item["Key"], item["Size"], item["LastModified"]) - return - - if options["upload"]: - file_path = path.join(settings.BASE_DIR, src_path) - tgt_name = tgt_path or os.path.basename(file_path) - tgt_name_offset = len(str(file_path)) - for subdir, dir, files in os.walk(file_path): - object_name = tgt_name + str(subdir)[tgt_name_offset:] + "/" - print(subdir, dir, object_name, files) - for file in files: - full_path = os.path.join(subdir, file) - s3_client.upload_file(full_path, bucket_name, object_name + file) - print(f"Copied {full_path} to {bucket_name} {object_name+file}.") - return - - if options["download"]: - file_path = path.join(settings.BASE_DIR, src_path) - object_name = tgt_path - s3_client.download_file(bucket_name, object_name, file_path) - return - - if options["rm"]: - s3_client.delete_object( - Bucket=bucket_name, - Key=tgt_path, - ) From a21e55bafa810a332abca5a0bd4ef3d26012089e Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Tue, 7 Nov 2023 14:54:05 -0800 Subject: [PATCH 51/61] Moving fac_s3.py to support/management/commands/ --- backend/support/management/commands/fac_s3.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 backend/support/management/commands/fac_s3.py diff --git a/backend/support/management/commands/fac_s3.py b/backend/support/management/commands/fac_s3.py new file mode 100644 index 
0000000000..9884d5b5f6 --- /dev/null +++ b/backend/support/management/commands/fac_s3.py @@ -0,0 +1,81 @@ +from os import path +import os + +import boto3 + +from django.core.management.base import BaseCommand + +from django.conf import settings + + +class Command(BaseCommand): + help = """ + Alternative to aws s3 as the cli is not available in production. + Usage: + manage.py fac_s3 --upload --src SRC [--tgt TGT] + manage.py fac_s3 --download --src SRC [--tgt TGT] + manage.py fac_s3 --rm --tgt TGT] + manage.py fac_s3 --ls [--tgt TGT] + """ + + def add_arguments(self, parser): + parser.add_argument("bucket_name", type=str, help="The S3 bucket name.") + parser.add_argument("--src", help="local file name.") + parser.add_argument("--tgt", help="s3 file name.") + parser.add_argument("--ls", action="store_true", help="List all files.") + parser.add_argument( + "--upload", action="store_true", help="Copy local src to S3 tgt." + ) + parser.add_argument( + "--download", action="store_true", help="Copy S3 tgt to local src." 
+ ) + parser.add_argument("--rm", action="store_true", help="Delete tgt.") + + def handle(self, *args, **options): + bucket_name = options["bucket_name"] + src_path = options["src"] + tgt_path = options["tgt"] + + s3_client = boto3.client( + "s3", + aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY, + endpoint_url=settings.AWS_S3_ENDPOINT_URL, + ) + + if options["ls"]: + items = s3_client.list_objects( + Bucket=bucket_name, + Prefix=tgt_path or "", + ).get("Contents") + if not items: + print("Target is empty") + return + for item in items: + print(item["Key"], item["Size"], item["LastModified"]) + return + + if options["upload"]: + file_path = path.join(settings.BASE_DIR, src_path) + tgt_name = tgt_path or os.path.basename(file_path) + tgt_name_offset = len(str(file_path)) + for subdir, dir, files in os.walk(file_path): + object_name = tgt_name + str(subdir)[tgt_name_offset:] + "/" + print(subdir, dir, object_name, files) + for file in files: + full_path = os.path.join(subdir, file) + s3_client.upload_file(full_path, bucket_name, object_name + file) + print(f"Copied {full_path} to {bucket_name} {object_name+file}.") + return + + if options["download"]: + file_path = path.join(settings.BASE_DIR, src_path) + object_name = tgt_path + s3_client.download_file(bucket_name, object_name, file_path) + return + + if options["rm"]: + s3_client.delete_object( + Bucket=bucket_name, + Key=tgt_path, + ) From b09bd22f087087fb5e2eedf66aa3ecbe68eedcfe Mon Sep 17 00:00:00 2001 From: Sudha Kumar Date: Tue, 7 Nov 2023 17:02:17 -0800 Subject: [PATCH 52/61] Added load_data function --- .../management/commands/csv_to_postgres.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index f217a02953..93ea5a7908 100644 --- 
a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -78,17 +78,8 @@ def handle(self, *args, **options): except ClientError: logger.error("Could not download {}".format(model_obj)) return - file.seek(0) - rows_loaded = 0 - for df in pd.read_csv(file, iterator=True, chunksize=CHUNK_SIZE): - # Each row is a dictionary. The columns are the - # correct names for our model. So, this should be a - # clean way to load the model from a row. - for _, row in df.iterrows(): - obj = model_obj(**row) - obj.save() - rows_loaded += df.shape[0] - print(f"Loaded {rows_loaded} rows in ", model_obj) + print("Obtained {model_obj} from S3") + self.load_data(file, model_obj) for mdl in census_to_gsafac_models: row_count = mdl.objects.all().count() @@ -116,3 +107,18 @@ def get_model_name(self, name): return model_name print("Could not find a matching model for ", name) return None + + def load_data(file, model_obj): + print("Starting load data to postgres") + file.seek(0) + rows_loaded = 0 + for df in pd.read_csv(file, iterator=True, chunksize=CHUNK_SIZE): + # Each row is a dictionary. The columns are the + # correct names for our model. So, this should be a + # clean way to load the model from a row. 
+ for _, row in df.iterrows(): + obj = model_obj(**row) + obj.save() + rows_loaded += df.shape[0] + print(f"Loaded {rows_loaded} rows in ", model_obj) + return None From e38b3955a4e6f5c2435cedde4672e74d00e7334e Mon Sep 17 00:00:00 2001 From: SudhaUKumar Date: Tue, 7 Nov 2023 17:10:01 -0800 Subject: [PATCH 53/61] Tested load_data --- .../management/commands/csv_to_postgres.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index 93ea5a7908..003682eed2 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -78,7 +78,7 @@ def handle(self, *args, **options): except ClientError: logger.error("Could not download {}".format(model_obj)) return - print("Obtained {model_obj} from S3") + print(f"Obtained {model_obj} from S3") self.load_data(file, model_obj) for mdl in census_to_gsafac_models: @@ -108,7 +108,7 @@ def get_model_name(self, name): print("Could not find a matching model for ", name) return None - def load_data(file, model_obj): + def load_data(self, file, model_obj): print("Starting load data to postgres") file.seek(0) rows_loaded = 0 From 6003868c3e59a4b43d72b594687e4d237a00866a Mon Sep 17 00:00:00 2001 From: Sudha Kumar Date: Wed, 8 Nov 2023 09:18:19 -0800 Subject: [PATCH 54/61] Removed import botocore --- .../management/commands/csv_to_postgres.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index 003682eed2..2fe8e9bcc8 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -1,5 +1,4 @@ import logging 
-import botocore import boto3 import pandas as pd From d8ab3cbbc5c2ab6d6dfb1f8075eaaee776251df5 Mon Sep 17 00:00:00 2001 From: Sudha Kumar Date: Wed, 8 Nov 2023 09:24:26 -0800 Subject: [PATCH 55/61] Removed import botocore --- .../management/commands/csv_to_postgres.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index 2fe8e9bcc8..b3a4b235a9 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -120,4 +120,4 @@ def load_data(self, file, model_obj): obj.save() rows_loaded += df.shape[0] print(f"Loaded {rows_loaded} rows in ", model_obj) - return None + return None From f951ea268a6991f72532cf6c2daed98281ce38cc Mon Sep 17 00:00:00 2001 From: Edward Zapata Date: Wed, 8 Nov 2023 16:06:25 -0500 Subject: [PATCH 56/61] refactored csv_to_postgres.py Co-authored-by: Purvin Patel --- .../management/commands/csv_to_postgres.py | 104 +++++++++--------- 1 file changed, 49 insertions(+), 55 deletions(-) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index b3a4b235a9..d56ce765d3 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -10,11 +10,8 @@ from django.apps import apps CHUNK_SIZE = 10_000 - - logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) - census_to_gsafac_models = list( apps.get_app_config("census_historical_migration").get_models() ) @@ -27,97 +24,94 @@ ) census_to_gsafac_bucket_name = settings.AWS_CENSUS_TO_GSAFAC_BUCKET_NAME DELIMITER = "," - - class Command(BaseCommand): help = """ Populate Postgres database from csv files 
Usage: manage.py csv_to_postgres --folder --clean """ - def add_arguments(self, parser): - parser.add_argument("--folder", help="S3 folder name") - parser.add_argument("--clean") - parser.add_argument("--sample") + parser.add_argument("--folder", help="S3 folder name (required)", type=str) + parser.add_argument("--clean", help="Clean the data (default: False)", type=bool, default=False) + parser.add_argument("--sample", help="Sample the data (default: False)", type=bool, default=False) parser.add_argument("--load") - + parser.add_argument("--chunk-size", help="Chunk size for processing data (default: 10000)", type=int, default=10000) + def handle(self, *args, **options): - if options.get("clean") == "True": + folder = options.get("folder") + if not folder: + self.stderr.write(self.style.ERROR("Please specify a folder name")) + return + if options.get("clean"): self.delete_data() return - if options.get("sample") == "True": + if options.get("sample"): self.sample_data() return - - folder = options.get("folder") - if not folder: - print("Please specify a folder name") - return - - items = s3_client.list_objects( - Bucket=census_to_gsafac_bucket_name, - Prefix=folder, - )["Contents"] + self.process_csv_files(folder) + + def process_csv_files(self, folder): + items = self.list_s3_objects(census_to_gsafac_bucket_name, folder) for item in items: if item["Key"].endswith("/"): continue model_name = self.get_model_name(item["Key"]) if model_name: - model_obj = census_to_gsafac_models[ - census_to_gsafac_model_names.index(model_name) - ] - file = BytesIO() - try: - s3_client.download_fileobj( - Bucket=census_to_gsafac_bucket_name, - Key=item["Key"], - Fileobj=file, - ) - except ClientError: - logger.error("Could not download {}".format(model_obj)) - return - print(f"Obtained {model_obj} from S3") - self.load_data(file, model_obj) - + model_index = census_to_gsafac_model_names.index(model_name) + model_obj = census_to_gsafac_models[model_index] + file = 
self.get_s3_object(census_to_gsafac_bucket_name, item["Key"], model_obj) + if file: + self.process_and_load_data(file, model_obj) for mdl in census_to_gsafac_models: row_count = mdl.objects.all().count() - print(f"{row_count} in ", mdl) - + self.stdout.write(f"{row_count} in {mdl}") + def delete_data(self): for mdl in census_to_gsafac_models: - print("Deleting ", mdl) + self.stdout.write(f"Deleting {mdl}") mdl.objects.all().delete() - + def sample_data(self): for mdl in census_to_gsafac_models: - print("Sampling ", mdl) + self.stdout.write(f"Sampling {mdl}") rows = mdl.objects.all()[:1] for row in rows: for col in mdl._meta.fields: - print(f"{col.name}: {getattr(row, col.name)}") - + self.stdout.write(f"{col.name}: {getattr(row, col.name)}") + + def list_s3_objects(self, bucket_name, folder): + return s3_client.list_objects(Bucket=bucket_name, Prefix=folder)["Contents"] + + def process_and_load_data(self, file, model_obj): + self.stdout.write(f"Obtained {model_obj} from S3") + self.load_data(file, model_obj) + + def get_s3_object(self, bucket_name, key, model_obj): + file = BytesIO() + try: + s3_client.download_fileobj(Bucket=bucket_name, Key=key, Fileobj=file) + except ClientError: + self.stderr.write(self.style.ERROR(f"Could not download {model_obj}")) + return None + return file + def get_model_name(self, name): - print("Processing ", name) + self.stdout.write(f"Processing {name}") file_name = name.split("/")[-1].split(".")[0] for model_name in census_to_gsafac_model_names: if file_name.lower().startswith(model_name): - print("model_name = ", model_name) + self.stdout.write(f"model_name = {model_name}") return model_name - print("Could not find a matching model for ", name) + self.stdout.write(f"Could not find a matching model for {name}") return None - + def load_data(self, file, model_obj): - print("Starting load data to postgres") + self.stdout.write(f"Starting load data to postgres") file.seek(0) rows_loaded = 0 for df in pd.read_csv(file, iterator=True, 
chunksize=CHUNK_SIZE): - # Each row is a dictionary. The columns are the - # correct names for our model. So, this should be a - # clean way to load the model from a row. for _, row in df.iterrows(): obj = model_obj(**row) obj.save() rows_loaded += df.shape[0] - print(f"Loaded {rows_loaded} rows in ", model_obj) - return None + self.stdout.write(f"Loaded {rows_loaded} rows in {model_obj}") From 4f03d7a953d5c90f5c198b3f7dec4c787c7a7b41 Mon Sep 17 00:00:00 2001 From: Edward Zapata Date: Wed, 8 Nov 2023 16:58:35 -0500 Subject: [PATCH 57/61] added chunk-size arguments --- .../management/commands/csv_to_postgres.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index d56ce765d3..2f62b3c006 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -9,7 +9,6 @@ from django.conf import settings from django.apps import apps -CHUNK_SIZE = 10_000 logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) census_to_gsafac_models = list( @@ -48,9 +47,10 @@ def handle(self, *args, **options): if options.get("sample"): self.sample_data() return - self.process_csv_files(folder) + chunk_size = options.get("chunk-size") + self.process_csv_files(folder, chunk_size) - def process_csv_files(self, folder): + def process_csv_files(self, folder, chunk_size): items = self.list_s3_objects(census_to_gsafac_bucket_name, folder) for item in items: if item["Key"].endswith("/"): @@ -61,7 +61,7 @@ def process_csv_files(self, folder): model_obj = census_to_gsafac_models[model_index] file = self.get_s3_object(census_to_gsafac_bucket_name, item["Key"], model_obj) if file: - self.process_and_load_data(file, model_obj) + self.process_and_load_data(file, model_obj, chunk_size) for mdl in 
census_to_gsafac_models: row_count = mdl.objects.all().count() self.stdout.write(f"{row_count} in {mdl}") @@ -82,9 +82,9 @@ def sample_data(self): def list_s3_objects(self, bucket_name, folder): return s3_client.list_objects(Bucket=bucket_name, Prefix=folder)["Contents"] - def process_and_load_data(self, file, model_obj): + def process_and_load_data(self, file, model_obj, chunk_size): self.stdout.write(f"Obtained {model_obj} from S3") - self.load_data(file, model_obj) + self.load_data(file, model_obj, chunk_size) def get_s3_object(self, bucket_name, key, model_obj): file = BytesIO() @@ -105,13 +105,13 @@ def get_model_name(self, name): self.stdout.write(f"Could not find a matching model for {name}") return None - def load_data(self, file, model_obj): + def load_data(self, file, model_obj, chunk_size): self.stdout.write(f"Starting load data to postgres") file.seek(0) rows_loaded = 0 - for df in pd.read_csv(file, iterator=True, chunksize=CHUNK_SIZE): + for df in pd.read_csv(file, iterator=True, chunksize=chunk_size): for _, row in df.iterrows(): obj = model_obj(**row) obj.save() rows_loaded += df.shape[0] - self.stdout.write(f"Loaded {rows_loaded} rows in {model_obj}") + self.stdout.write(f"Loaded {rows_loaded} rows in {model_obj})") From 13736ad2146c73444eb3f1c8b6f01db6bd620490 Mon Sep 17 00:00:00 2001 From: Edward Zapata Date: Wed, 8 Nov 2023 16:59:32 -0500 Subject: [PATCH 58/61] added help comments for load_data --- .../management/commands/csv_to_postgres.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index 2f62b3c006..f1c1df1c6a 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -110,6 +110,9 @@ def load_data(self, file, model_obj, chunk_size): file.seek(0) rows_loaded = 0 for df in 
pd.read_csv(file, iterator=True, chunksize=chunk_size): + # Each row is a dictionary. The columns are the + # correct names for our model. So, this should be a + # clean way to load the model from a row. for _, row in df.iterrows(): obj = model_obj(**row) obj.save() From e3f90a176761d284d49cf30fbc12be0a635ca92c Mon Sep 17 00:00:00 2001 From: "Hassan D. M. Sambo" Date: Wed, 8 Nov 2023 17:46:54 -0500 Subject: [PATCH 59/61] Code cleaning --- .../management/commands/csv_to_postgres.py | 76 ++++++++++++------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index f1c1df1c6a..0e05f8cbd3 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -23,23 +23,38 @@ ) census_to_gsafac_bucket_name = settings.AWS_CENSUS_TO_GSAFAC_BUCKET_NAME DELIMITER = "," + + class Command(BaseCommand): help = """ Populate Postgres database from csv files Usage: manage.py csv_to_postgres --folder --clean """ + def add_arguments(self, parser): parser.add_argument("--folder", help="S3 folder name (required)", type=str) - parser.add_argument("--clean", help="Clean the data (default: False)", type=bool, default=False) - parser.add_argument("--sample", help="Sample the data (default: False)", type=bool, default=False) + parser.add_argument( + "--clean", help="Clean the data (default: False)", type=bool, default=False + ) + parser.add_argument( + "--sample", + help="Sample the data (default: False)", + type=bool, + default=False, + ) parser.add_argument("--load") - parser.add_argument("--chunk-size", help="Chunk size for processing data (default: 10000)", type=int, default=10000) - + parser.add_argument( + "--chunk-size", + help="Chunk size for processing data (default: 10000)", + type=int, + default=10000, + ) + def 
handle(self, *args, **options): folder = options.get("folder") if not folder: - self.stderr.write(self.style.ERROR("Please specify a folder name")) + print("Please specify a folder name") return if options.get("clean"): self.delete_data() @@ -49,7 +64,7 @@ def handle(self, *args, **options): return chunk_size = options.get("chunk-size") self.process_csv_files(folder, chunk_size) - + def process_csv_files(self, folder, chunk_size): items = self.list_s3_objects(census_to_gsafac_bucket_name, folder) for item in items: @@ -59,54 +74,57 @@ def process_csv_files(self, folder, chunk_size): if model_name: model_index = census_to_gsafac_model_names.index(model_name) model_obj = census_to_gsafac_models[model_index] - file = self.get_s3_object(census_to_gsafac_bucket_name, item["Key"], model_obj) + file = self.get_s3_object( + census_to_gsafac_bucket_name, item["Key"], model_obj + ) if file: - self.process_and_load_data(file, model_obj, chunk_size) - for mdl in census_to_gsafac_models: + self.load_data(file, model_obj, chunk_size) + + self.display_row_counts(census_to_gsafac_models) + + def display_row_counts(self, models): + for mdl in models: row_count = mdl.objects.all().count() - self.stdout.write(f"{row_count} in {mdl}") - + print(f"{row_count} in ", mdl) + def delete_data(self): for mdl in census_to_gsafac_models: - self.stdout.write(f"Deleting {mdl}") + print("Deleting ", mdl) mdl.objects.all().delete() - + def sample_data(self): for mdl in census_to_gsafac_models: - self.stdout.write(f"Sampling {mdl}") + print("Sampling ", mdl) rows = mdl.objects.all()[:1] for row in rows: for col in mdl._meta.fields: - self.stdout.write(f"{col.name}: {getattr(row, col.name)}") - + print(f"{col.name}: {getattr(row, col.name)}") + def list_s3_objects(self, bucket_name, folder): return s3_client.list_objects(Bucket=bucket_name, Prefix=folder)["Contents"] - - def process_and_load_data(self, file, model_obj, chunk_size): - self.stdout.write(f"Obtained {model_obj} from S3") - 
self.load_data(file, model_obj, chunk_size) - + def get_s3_object(self, bucket_name, key, model_obj): file = BytesIO() try: s3_client.download_fileobj(Bucket=bucket_name, Key=key, Fileobj=file) except ClientError: - self.stderr.write(self.style.ERROR(f"Could not download {model_obj}")) + logger.error("Could not download {}".format(model_obj)) return None + print(f"Obtained {model_obj} from S3") return file - + def get_model_name(self, name): - self.stdout.write(f"Processing {name}") + print("Processing ", name) file_name = name.split("/")[-1].split(".")[0] for model_name in census_to_gsafac_model_names: if file_name.lower().startswith(model_name): - self.stdout.write(f"model_name = {model_name}") + print("model_name = ", model_name) return model_name - self.stdout.write(f"Could not find a matching model for {name}") + print("Could not find a matching model for ", name) return None - + def load_data(self, file, model_obj, chunk_size): - self.stdout.write(f"Starting load data to postgres") + print("Starting load data to postgres") file.seek(0) rows_loaded = 0 for df in pd.read_csv(file, iterator=True, chunksize=chunk_size): @@ -117,4 +135,4 @@ def load_data(self, file, model_obj, chunk_size): obj = model_obj(**row) obj.save() rows_loaded += df.shape[0] - self.stdout.write(f"Loaded {rows_loaded} rows in {model_obj})") + print(f"Loaded {rows_loaded} rows in ", model_obj) From 653ec97956d6b633f6c2c09af775edf601bf8b54 Mon Sep 17 00:00:00 2001 From: Sudha Kumar Date: Wed, 8 Nov 2023 16:15:06 -0800 Subject: [PATCH 60/61] Renamed chunk-size to chunksize --- .../management/commands/csv_to_postgres.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backend/census_historical_migration/management/commands/csv_to_postgres.py b/backend/census_historical_migration/management/commands/csv_to_postgres.py index 0e05f8cbd3..f9882f5c42 100644 --- a/backend/census_historical_migration/management/commands/csv_to_postgres.py +++ 
b/backend/census_historical_migration/management/commands/csv_to_postgres.py @@ -45,10 +45,10 @@ def add_arguments(self, parser): ) parser.add_argument("--load") parser.add_argument( - "--chunk-size", - help="Chunk size for processing data (default: 10000)", + "--chunksize", + help="Chunk size for processing data (default: 10_000)", type=int, - default=10000, + default=10_000, ) def handle(self, *args, **options): @@ -62,7 +62,7 @@ def handle(self, *args, **options): if options.get("sample"): self.sample_data() return - chunk_size = options.get("chunk-size") + chunk_size = options.get("chunksize") self.process_csv_files(folder, chunk_size) def process_csv_files(self, folder, chunk_size): @@ -136,3 +136,4 @@ def load_data(self, file, model_obj, chunk_size): obj.save() rows_loaded += df.shape[0] print(f"Loaded {rows_loaded} rows in ", model_obj) + return None From 7245368aab9335476fc1fb48e7ae406b2d4c6695 Mon Sep 17 00:00:00 2001 From: Sudha Kumar Date: Wed, 8 Nov 2023 16:20:55 -0800 Subject: [PATCH 61/61] Added chunksize argument --- backend/census_historical_migration/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/census_historical_migration/README.md b/backend/census_historical_migration/README.md index a1b74a78d0..f23a4c79a2 100644 --- a/backend/census_historical_migration/README.md +++ b/backend/census_historical_migration/README.md @@ -26,7 +26,7 @@ python manage.py fac_s3 fac-census-to-gsafac-s3 --upload --src census_historical * csv_to_postgres.py - Inserts data into Postgres tables using the contents of the CSV files in the S3 bucket. The first row of each file is assumed to have the column names (we convert to lowercase). The name of the table is determined by examining the name of the file. The sample source files do not have delimters for empty fields at the end of a line - so we assume these are nulls. 
```bash -python manage.py csv_to_postgres --folder data +python manage.py csv_to_postgres --folder data --chunksize 10000 python manage.py csv_to_postgres --clean True ``` @@ -51,7 +51,7 @@ docker compose run web python manage.py fac_s3 fac-census-to-gsafac-s3 --upload 3. In the FAC/backend folder, run the following to read the CSV files from fac-census-to-gsafac-s3 bucket and load into Postgres. ```bash -docker compose run web python manage.py csv_to_postgres --folder data +docker compose run web python manage.py csv_to_postgres --folder data --chunksize 10000 ``` ### How to run the historic data migrator: