add python scripts for generation of HEROP_ID in CSVs and SHPs #68

GeoDaCenter · Sep 1, 2023 · 1979591 · 1979591
1 parent 9a6b56e
commit 1979591
Show file tree

Hide file tree

Showing 3 changed files with 180 additions and 0 deletions.
diff --git a/code/SpatialIds/add_csv_herop_ids.py b/code/SpatialIds/add_csv_herop_ids.py
@@ -0,0 +1,89 @@
+import csv
+from pathlib import Path
+
+## This is a one-off script used to add a new HEROP_ID to all of the CSVs.
+## The HEROP_ID is similar to the GEOIDs that can come from data.census.gov,
+## as described here: https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html
+## but is simplified to three parts:
+## 3-digit summary level code, the letters "US", and the relevant GEOID.
+## For example, the summary level code for State is 040, and the GEOIDs for states
+## are FP codes, so 040US48 would be the HEROP_ID for Texas.
+
+## Note that in the future, the source and destination paths in the script may
+## no longer exist, so it functions more as a record of transformation than as a
+## reusable snippet.
+
+# Keyed by the first letter of a CSV file name: "code" is the 3-digit summary
+# level code that starts the HEROP_ID, "id_length" is the width the source id
+# is left-padded with zeros to.
+sl_lookup = {
+    "S": {"code": "040", "id_length": 2},
+    "C": {"code": "050", "id_length": 5},
+    "T": {"code": "140", "id_length": 11},
+    "Z": {"code": "860", "id_length": 5},
+}
+
+# CSV file name -> column holding the id that the HEROP_ID is built from.
+# Every CSV picked up from csv_dir below must have an entry here.
+f_lookup = {
+    "C_1980.csv": "GEOID",
+    "C_1990.csv": "GEOID",
+    "C_2000.csv": "GEOID",
+    "C_2010.csv": "GEOID",
+    "C_Latest.csv": "GEOID",
+    "S_1980.csv": "STATEFP",
+    "S_1990.csv": "STATEFP",
+    "S_2000.csv": "STATEFP",
+    "S_2010.csv": "GEOID",
+    "S_Latest.csv": "GEOID",
+    "T_1980.csv": "GEOID",
+    "T_1990.csv": "GEOID",
+    "T_2000.csv": "GEOID",
+    "T_2010.csv": "GEOID",
+    "T_Latest.csv": "GEOID",
+    "Z_1980.csv": "ZCTA",
+    "Z_1990.csv": "ZCTA",
+    "Z_2000.csv": "ZCTA",
+    "Z_2010.csv": "GEOID",
+    "Z_Latest.csv": "GEOID",
+}
+
+# NOTE: not referenced anywhere else in this script; kept as-is.
+drop_fields = ["STATEFP", "G_STATEFP", "STUSPS", "TRACTCE", "ZIP", "COUNTYFP", "ZCTA", "GEOID"]
+
+new_suffix = "bq"
+new_geoid_field = "HEROP_ID"
+
+csv_dir = Path(__file__).resolve().parent.parent.parent / 'data_final' / 'v2.0' / 'tables'
+paths = [i for i in csv_dir.glob("*.csv") if not str(i).endswith(f"{new_suffix}.csv")]
+
+for path in paths:
+    print(path.name)
+    # Temp name comes from the file name only (no str.replace over the full path).
+    out_path = path.with_name(f"{path.stem}_{new_suffix}.csv")
+    print(out_path)
+
+    # Look up settings first: an unlisted CSV raises KeyError before it is opened.
+    sl = sl_lookup[path.name[0]]
+    id_field = f_lookup[path.name]
+
+    # newline="" is required by the csv module for the files it reads and writes.
+    with open(path, "r", newline="") as src:
+        reader = csv.DictReader(src)
+        # Leave out an existing HEROP_ID column so a re-run does not duplicate it.
+        fieldnames = [new_geoid_field] + [
+            name for name in reader.fieldnames if name != new_geoid_field
+        ]
+        rows = []
+        for row in reader:
+            str_geoid = str(row[id_field]).zfill(sl["id_length"])
+            row[new_geoid_field] = f"{sl['code']}US{str_geoid}"
+            rows.append(row)
+
+    with open(out_path, "w", newline="") as dst:
+        writer = csv.DictWriter(dst, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows)
+
+    # Swap the rewritten file into the place of the original.
+    out_path.replace(path)
diff --git a/code/SpatialIds/add_shp_herop_ids.py b/code/SpatialIds/add_shp_herop_ids.py
@@ -0,0 +1,90 @@
+import fiona
+from fiona import Feature, Properties
+from pathlib import Path
+
+## This is a one-off script used to add a new HEROP_ID to all of the shapefiles,
+## while also moving them to a new final location.
+## The HEROP_ID is similar to the GEOIDs that can come from data.census.gov,
+## as described here: https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html
+## but is simplified to three parts:
+## 3-digit summary level code, the letters "US", and the relevant GEOID.
+## For example, the summary level code for State is 040, and the GEOIDs for states
+## are FP codes, so 040US48 would be the HEROP_ID for Texas.
+
+## Note that in the future, the source and destination paths in the script may
+## no longer exist, so it functions more as a record of transformation than as a
+## reusable snippet.
+
+# Summary level code and zero-padded id width per geography type. The loop
+# below selects "S" for state, "C" for county, "T" for tract and "Z" for zcta
+# paths.
+sl_lookup = {
+    "S": {"code": "040", "id_length": 2},
+    "C": {"code": "050", "id_length": 5},
+    "T": {"code": "140", "id_length": 11},
+    "Z": {"code": "860", "id_length": 5},
+}
+
+# relative paths from the data_final/geometryFiles directory, field names with GEOID base
+f_lookup = {
+    "state/states2018.shp": "STATEFP",
+    "county/counties2018.shp": "GEOID",
+    "tract/tracts2018.shp": "GEOID",
+    "zcta/zctas2018.shp": "GEOID10",
+    "tl_2010_state/states2010.shp": "STATEFP",
+    "tl_2010_county/counties2010.shp": "GEOID",
+    "tl_2010_tract/tracts2010.shp": "GEOID",
+    "tl_2010_zcta/tl_2010_us_zcta510-wgs84-generalized100ft.shp": "GEOID10",
+}
+
+# NOTE: not referenced anywhere else in this script; kept as-is.
+new_suffix = "bq"
+
+in_dir = Path(__file__).resolve().parent.parent.parent / 'data_final' / 'geometryFiles'
+out_dir = Path(__file__).resolve().parent.parent.parent / 'data_final' / 'v2.0' / 'spatial_data'
+# parents=True so a missing data_final/v2.0 directory is created as well.
+out_dir.mkdir(parents=True, exist_ok=True)
+
+# Write every listed shapefile into out_dir, adding a HEROP_ID attribute to each
+# feature along the way.
+for path, id_field in f_lookup.items():
+
+    shp_path = Path(in_dir, path)
+    print("\n"+shp_path.name)
+
+    # The geography type is inferred from a substring of the relative path.
+    if "state" in path:
+        sl = sl_lookup["S"]
+    elif "counties" in path:
+        sl = sl_lookup["C"]
+    elif "tract" in path:
+        sl = sl_lookup["T"]
+    elif "zcta" in path:
+        sl = sl_lookup["Z"]
+    else:
+        raise ValueError("unexpected input shapefile")
+
+    # Written under the same file name, directly in out_dir.
+    out_path = out_dir / Path(path).name
+    with fiona.open(shp_path) as src:
+        # Extend copies of the schema so src.schema itself is left unmodified.
+        dst_schema = dict(src.schema)
+        dst_schema["properties"] = dict(src.schema["properties"])
+        dst_schema["properties"]["HEROP_ID"] = "str"
+
+        with fiona.open(
+            out_path,
+            mode="w",
+            crs=src.crs,
+            driver="ESRI Shapefile",
+            schema=dst_schema,
+        ) as dst:
+            for feat in src:
+                # e.g. "040US48": summary level code + "US" + zero-padded source id
+                geo_id = str(feat.properties[id_field]).zfill(sl["id_length"])
+                herop_id = f"{sl['code']}US{geo_id}"
+                props = Properties.from_dict(
+                    **feat.properties,
+                    HEROP_ID=herop_id,
+                )
+                dst.write(Feature(geometry=feat.geometry, properties=props))
diff --git a/code/SpatialIds/requirements.txt b/code/SpatialIds/requirements.txt
@@ -0,0 +1 @@
+fiona