Merge pull request #50 from bucketteOfIvy/49_update_historical_data

49 update historical data
GeoDaCenter · Jul 17, 2023 · d9f6ac8 · d9f6ac8
2 parents 62a292f + 51b68ab
commit d9f6ac8
Show file tree

Hide file tree

Showing 73 changed files with 1,724,127 additions and 73,128 deletions.
diff --git a/code/InterpolateDemographics_NHGIS_County_1980.R b/code/InterpolateDemographics_NHGIS_County_1980.R
@@ -0,0 +1,128 @@
+# Author: Ashlynn Wimer
+# Date: July 11th, 2023
+# About: This R Script takes in 1980 county level census data and runs a population
+# weighted interpolation using tidycensus::interpolate_pw to transform them to
+# 2010 county shapes. The population weights used are at the 1980 county
+# subdivision level. 
+
+# Libraries
+library(dplyr)
+library(stringr)
+library(tidycensus)
+library(sf)
+
+# If running in RStudio, uncomment this
+# setwd(getSrcDirectory(function(){})[1])
+
+
+# Import Data
+print("Loading data!")
+cnty_full_data <- read.csv('../data_raw/nhgis/1980/nhgis0022_ds104_1980_county.csv')
+cnty_sample_data <- read.csv('../data_raw/nhgis/1980/nhgis0022_ds107_1980_county.csv')
+cntySbdv_pop_data <- read.csv('../data_raw/nhgis/1980/nhgis0027_ds104_1980_cty_sub.csv')
+cnty1980 <- st_read('../data_raw/nhgis/geometryFiles/1980/US_county_1980_conflated.shp')
+cnty2010 <- st_read('../data_raw/nhgis/geometryFiles/2010/US_county_2010.shp')
+cntySbdv1980 <- st_read('../data_raw/nhgis/geometryFiles/1980/US_mcd_1980.shp')
+
+
+# Transform Data
+print("Transforming Data!")
+cnty_data <- cnty_sample_data |>
+  select(GISJOIN, DHM001, DHM002, DHM003, DHM004, DHM005) |>
+  merge(cnty_full_data, by="GISJOIN")
+names(cnty_full_data)
+cnty_data <- cnty_data |>
+  rename(
+    ageUnd1     = C67001, age1_2         = C67002,
+    age3_4      = C67003, age5           = C67004,
+    age6        = C67005, age7_9         = C67006,
+    age10_13    = C67007, age14          = C67008,
+    age15       = C67009, age16          = C67010,
+    age17       = C67011, age18          = C67012,
+    age19       = C67013, age20          = C67014,
+    age21       = C67015, age22_24       = C67016, 
+    age25_29    = C67017, age30_34       = C67018, 
+    age35_44    = C67019, age45_54       = C67020,
+    age55_59    = C67021, age60_61       = C67022, 
+    age62_64    = C67023, age65_74       = C67024, 
+    age75_84    = C67025, ageOv85        = C67026,
+    notHispPop  = C9E001, mexicanPop     = C9E002, 
+    prPop       = C9E003, cubanPop       = C9E004,
+    otherHisp   = C9E005, whitePop       = C9D001, 
+    blackPop    = C9D002, contigAmIndPop = C9D003, 
+    inuitPop    = C9D004, unanganPop     = C9D005, 
+    japanesePop = C9D006, chinesePop     = C9D007, 
+    filipinoPop = C9D008, koreanPop      = C9D009, 
+    indianPop   = C9D010, vietnamesePop  = C9D011, 
+    hawaiinPop  = C9D012, guamanianPop   = C9D013, 
+    samoanPop   = C9D014, otherPop       = C9D015,
+    totPop      = C7L001, elementary     = DHM001,
+    hghschl1_3  = DHM002, hghschl4       = DHM003,
+    college1_3  = DHM004, college4orMore = DHM005)
+
+cnty_data <- cnty_data |>
+  mutate(
+    age0_4   = ageUnd1  + age1_2   + age3_4,
+    age5_14  = age5     + age6     + age7_9    + age10_13 + age14,
+    age18_64 = age18    + age19    + age20     + age21    + age22_24 + age25_29 + age30_34 + age35_44 + age45_54 + age55_59 + age60_61 + age62_64,
+    age15_19 = age15    + age16    + age17     + age18    + age19,
+    age20_24 = age20    + age21    + age22_24,
+    age15_44 = age15    + age16    + age17     + age18    + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_44,
+    age55_59 = age55_59,
+    age60_64 = age60_61 + age62_64,
+    ageOv65  = age65_74 + age75_84 + ageOv85,
+    hispPop  = mexicanPop + prPop + cubanPop + otherHisp,
+    amIndPop = contigAmIndPop + inuitPop + unanganPop,
+    asianPop = japanesePop + chinesePop + filipinoPop + koreanPop + indianPop + vietnamesePop,
+    pacIsPop = hawaiinPop + guamanianPop + samoanPop,
+    NoHSPop  = elementary + hghschl1_3,
+    edSampl = elementary + hghschl1_3 + hghschl4 + college1_3 + college4orMore
+  )
+
+cnty_data <- cnty_data |>
+  select(GISJOIN, STATEA, COUNTYA, 
+         age18_64, age0_4, age5_14, age15_19, age20_24, 
+         age15_44, age55_59, age60_64, ageOv65, hispPop, 
+         amIndPop, asianPop, pacIsPop, totPop, whitePop, 
+         blackPop, NoHSPop, edSampl) 
+
+cnty1980 <- cnty1980 |>
+  merge(cnty_data, by="GISJOIN") |>
+  st_transform(st_crs(cnty2010))
+
+print("Final preparation before interpolating..")
+cnty2010$GEOID <- paste(
+  str_pad(cnty2010$STATEFP10, width=2, side='left', pad='0'),
+  str_pad(cnty2010$COUNTYFP10, width=3, side='left', pad='0'),
+  sep='0'
+  )
+
+cnty2010 <- cnty2010 |>
+  select(GISJOIN, GEOID, NAME10)
+
+# Create population weights
+pop_weights <- cntySbdv1980 |>
+  merge(cntySbdv_pop_data, by='GISJOIN') |>
+  st_transform(st_crs(cnty2010)) |>
+  rename(totPop = C7L001)
+
+# Run the interpolation algorithm
+print("Interpolating!")
+cnty1980_on_2010 <- interpolate_pw(
+  from = cnty1980,
+  to = cnty2010,
+  to_id = 'GEOID',
+  extensive = T,
+  weights = pop_weights,
+  weight_column = 'totPop',
+  weight_placement = 'surface'
+)
+
+# Filter out PR data
+# Alaska NAs are left in for posterity.
+cnty1980_on_2010 <- cnty1980_on_2010 |> 
+  filter(substr(GEOID, start=1, stop=2) != "72") |>
+  st_drop_geometry()
+
+# Save
+write.csv(cnty1980_on_2010, "../data_raw/nhgis/1980InterpolatedDataCounty.csv")
diff --git a/code/InterpolateDemographics_NHGIS_County_1990.R b/code/InterpolateDemographics_NHGIS_County_1990.R
@@ -0,0 +1,131 @@
+# Author: Ashlynn Wimer
+# Date: July 5th, 2023
+# About: This R Script takes in 1990 county level census data and runs a population
+# weighted interpolation using tidycensus::interpolate_pw to transform them to
+# 2010 county shapes. The population weights originate from the 1990 census tracts.
+
+
+# Libraries
+library(dplyr)
+library(stringr)
+library(tidycensus)
+library(sf)
+
+# If running in RStudio, uncomment this
+# setwd(getSrcDirectory(function(){})[1])
+
+
+# Import Data
+print("Loading data!")
+cnty_full_data <- read.csv('../data_raw/nhgis/1990/nhgis0022_ds120_1990_county.csv')
+cnty_sample_data <- read.csv('../data_raw/nhgis/1990/nhgis0022_ds123_1990_county.csv')
+trct_demos <- read.csv('../data_raw/nhgis/1990/nhgis0020_ds120_1990_tract.csv')
+cnty1990 <- st_read('../data_raw/nhgis/geometryFiles/1990/US_county_1990.shp')
+cnty2010 <- st_read('../data_raw/nhgis/geometryFiles/2010/US_county_2010.shp')
+trct1990 <- st_read('../data_raw/nhgis/geometryFiles/1990/US_tract_1990.shp') 
+
+# Transform
+cnty_data <- cnty_sample_data |>
+  select(GISJOIN, E33001, E33002, E33003, E33004, E33005, E33006, E33007) |>
+  merge(cnty_full_data, by='GISJOIN')
+
+cnty_data <- cnty_data |>
+  rename(
+    whitePop          = EUZ001, blackPop            = EUZ002,
+    contigAmIndPop    = EUZ003, inuitPop            = EUZ004,
+    unanganPop        = EUZ005, chinesePop          = EUZ006,
+    filipinoPop       = EUZ007, japanesePop         = EUZ008,
+    indianPop         = EUZ009, koreanPop           = EUZ010,
+    vietnamesePop     = EUZ011, cambodianPop        = EUZ012,
+    hmongPop          = EUZ013, loatianPop          = EUZ014,
+    thaiPop           = EUZ015, otherAsianPop       = EUZ016,
+    hawaiinPop        = EUZ017, samoanPop           = EUZ018,
+    tonganPop         = EUZ019, otherPlynsnPop      = EUZ020,
+    guamianPop        = EUZ021, otherMcrnsnPop      = EUZ022,
+    melanesianPop     = EUZ023, unspecPcfcIslndrPop = EUZ024,
+    otherMcrnsnRcePop = EUZ025, totPop              = ET1001,
+    notHisp           = EU1001, mexicanHisp         = EU1002,
+    prHisp            = EU1003, cubanHisp           = EU1004,
+    otherHisp         = EU1005, ageUnd1             = ET3001,
+    age1_2            = ET3002, age3_4              = ET3003,
+    age5              = ET3004, age6                = ET3005,
+    age7_9            = ET3006, age10_11            = ET3007,
+    age12_13          = ET3008, age14               = ET3009,
+    age15             = ET3010, age16               = ET3011,
+    age17             = ET3012, age18               = ET3013,
+    age19             = ET3014, age20               = ET3015,
+    age21             = ET3016, age22_24            = ET3017,
+    age25_29          = ET3018, age30_34            = ET3019,
+    age35_39          = ET3020, age40_44            = ET3021,
+    age45_49          = ET3022, age50_54            = ET3023,
+    age55_59          = ET3024, age60_61            = ET3025,
+    age62_64          = ET3026, age65_69            = ET3027,
+    age70_74          = ET3028, age75_79            = ET3029,
+    age80_84          = ET3030, ageOver85           = ET3031, 
+    noHS              = E33001, hsNoGrad            = E33002, 
+    hsOrEquiv         = E33003, collegeNoGrad       = E33004, 
+    associates        = E33005, bachelors           = E33006, 
+    graduateDeg       = E33007
+    )
+
+cnty_data <- cnty_data |>
+  mutate(
+    noHighSchoolDeg = noHS + hsNoGrad,
+    edSampl         = noHS + hsNoGrad + hsOrEquiv + collegeNoGrad + associates + bachelors + graduateDeg,
+    age18_64 = age18 + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_39 + age40_44 + age45_49 + age50_54 + age55_59 + age60_61 + age62_64,
+    age0_4   = ageUnd1 + age1_2 + age3_4,
+    age5_14  = age5 + age6 + age7_9 + age10_11 + age12_13 + age14,
+    age15_19 = age15 + age16 + age17 + age18 + age19,
+    age20_24 = age20 + age21 + age22_24,
+    age15_44 = age15 + age16 + age17 + age18 + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_39 + age40_44,
+    age45_49 = age45_49,
+    age50_54 = age50_54,
+    age55_59 = age55_59,
+    age60_64 = age60_61 + age62_64,
+    agOver65 = age65_69 + age70_74 + age75_79 + age80_84 + ageOver85,
+    whitePop = whitePop,
+    blackPop = blackPop,
+    hispPop  = mexicanHisp + cubanHisp + prHisp + otherHisp,
+    amIndPop = contigAmIndPop + inuitPop + unanganPop,
+    asianPop = chinesePop + filipinoPop + japanesePop + indianPop + koreanPop + vietnamesePop + cambodianPop + hmongPop + loatianPop + thaiPop + otherAsianPop,
+    pacIsPop = hawaiinPop + samoanPop + tonganPop + otherPlynsnPop + guamianPop + otherMcrnsnPop + melanesianPop + unspecPcfcIslndrPop + otherMcrnsnRcePop,
+    totPop   = totPop
+  )
+
+
+cnty1990 <- cnty1990 |> 
+  merge(cnty_data, by='GISJOIN') |>
+  st_transform(st_crs(cnty2010)) |>
+  select(GISJOIN,
+         age18_64, age0_4,   age5_14,  age15_19, 
+         age20_24, age15_44, age45_49, age50_54, 
+         age55_59, age60_64, agOver65, whitePop, 
+         blackPop, hispPop,  amIndPop, asianPop, 
+         pacIsPop, totPop,   noHighSchoolDeg,
+         edSampl)
+
+# Make population weights
+pop_weights <- trct1990 |>
+  merge(trct_demos, by='GISJOIN') |>
+  select(GISJOIN, ET1001) |>
+  rename(totPop = ET1001) |>
+  st_transform(st_crs(cnty2010))
+
+# Interpolate
+print('Interpolating!')
+cnty1990_on_2010 <- interpolate_pw(
+  from = cnty1990,
+  to = cnty2010,
+  to_id = 'GEOID10',
+  extensive = TRUE,
+  weights = pop_weights,
+  weight_column = 'totPop',
+  weight_placement = 'surface') |>
+  st_drop_geometry() |>
+  filter(substr(GEOID10, start=1, stop=2) != "72") |>
+  rename(GEOID = GEOID10)
+
+print("Saving!")
+write.csv(cnty1990_on_2010, "../data_raw/nhgis/1990InterpolatedCounties.csv", row.names=FALSE)
+
+
diff --git a/code/InterpolateDemographics_NHGIS_County_2000.R b/code/InterpolateDemographics_NHGIS_County_2000.R
@@ -0,0 +1,129 @@
+# Author: Ashlynn Wimer
+# Date: July 5th, 2023
+# About: This R Script takes in 2000 county level census data and runs a population
+# weighted interpolation using tidycensus::interpolate_pw to transform them to
+# 2010 county shapes. The population weights originate from the 2000 census tracts.
+
+# Libraries
+library(dplyr)
+library(stringr)
+library(tidycensus)
+library(sf)
+
+# If running in RStudio, uncomment this
+# setwd(getSrcDirectory(function(){})[1])
+
+
+# Import Data
+print("Loading data!")
+cnty_full_data <- read.csv('../data_raw/nhgis/2000/nhgis0022_ds146_2000_county.csv')
+cnty_sample_data <- read.csv('../data_raw/nhgis/2000/nhgis0022_ds151_2000_county.csv')
+trct_demos <- read.csv('../data_raw/nhgis/2000/nhgis0020_ds146_2000_tract.csv')
+cnty2000 <- st_read('../data_raw/nhgis/geometryFiles/2000/US_county_2000.shp')
+cnty2010 <- st_read('../data_raw/nhgis/geometryFiles/2010/US_county_2010.shp')
+trct2000 <- st_read('../data_raw/nhgis/geometryFiles/2000/US_tract_2000.shp') 
+
+cnty_data <- cnty_sample_data |>
+  select(GISJOIN, GKT001, GKT003, GKT005,
+         GKT007,  GKT009, GKT011, GKT013,
+         GKT015,  GKT017, GKT019, GKT021,
+         GKT023,  GKT025, GKT027, GKT029,
+         GKT031,  GKT002, GKT004, GKT006,
+         GKT008,  GKT010, GKT012, GKT014,
+         GKT016,  GKT018, GKT020, GKT022,
+         GKT024,  GKT026, GKT028, GKT030,
+         GKT032,  GLD001) |>
+  merge(cnty_full_data, by='GISJOIN')
+
+cnty_data <- cnty_data |>
+  rename(
+    whitePop   = FMR001, blackPop   = FMR002, amIndPop   = FMR003, asianPop    = FMR004,
+    pacIsPop   = FMR005, mAgeUnd5   = FMZ001, mAgeUnder5 = FMZ001,  mAge5_9    = FMZ002,
+    mAge10_14  = FMZ003, mAge15_17  = FMZ004, mAge18_19  = FMZ005,  mAge20     = FMZ006,
+    mAge21     = FMZ007, mAge22_24  = FMZ008, mAge25_29  = FMZ009,  mAge30_34  = FMZ010,
+    mAge35_39  = FMZ011, mAge40_44  = FMZ012, mAge45_49  = FMZ013,  mAge50_54  = FMZ014,
+    mAge55_59  = FMZ015, mAge60_61  = FMZ016, mAge62_64  = FMZ017,  mAge65_66  = FMZ018,
+    mAge67_69  = FMZ019, mAge70_74  = FMZ020, mAge75_79  = FMZ021,  mAge80_84  = FMZ022,
+    mAgeOver85 = FMZ023, fAgeUnder5 = FMZ024, fAge5_9    = FMZ025,  fAge10_14  = FMZ026,
+    fAge15_17  = FMZ027, fAge18_19  = FMZ028, fAge20     = FMZ029,  fAge21     = FMZ030,
+    fAge22_24  = FMZ031, fAge25_29  = FMZ032, fAge30_34  = FMZ033,  fAge35_39  = FMZ034,
+    fAge40_44  = FMZ035, fAge45_49  = FMZ036, fAge50_54  = FMZ037,  fAge55_59  = FMZ038,
+    fAge60_61  = FMZ039, fAge62_64  = FMZ040, fAge65_66  = FMZ041,  fAge67_69  = FMZ042,
+    fAge70_74  = FMZ043, fAge75_79  = FMZ044, fAge80_84  = FMZ045,  fAgeOver85 = FMZ046,
+    mNoSchool  = GKT001, m0_4       = GKT002, m5_6       = GKT003,   m7_8      = GKT004,
+    m9         = GKT005, m10        = GKT006, m11        = GKT007,   m12       = GKT008,
+    mHS        = GKT009, mLtlClg    = GKT010, mSmClg     = GKT011,   mAss      = GKT012,
+    mBach      = GKT013, mMas       = GKT014, mProf      = GKT015,   mDoc      = GKT016,
+    fNoSchool  = GKT017, f0_4       = GKT018, f5_6       = GKT019,   f7_8      = GKT020,
+    f9         = GKT021, f10        = GKT022, f11        = GKT023,   f12       = GKT024,
+    fHS        = GKT025, fLtlClg    = GKT026, fSmClg     = GKT027,   fAss      = GKT028,
+    fBach      = GKT029, fMas       = GKT030, fProf      = GKT031,   fDoc      = GKT032,
+    disPop     = GLD001, totPop     = FL5001, hispPop    = FMC001)
+
+cnty_data <- cnty_data |>
+  mutate(
+    noHSDeg = mNoSchool + m0_4 + m5_6 + m7_8 + m9 + m10 + m11 + m12 +
+               fNoSchool + f0_4 + f5_6 + f7_8 + f9 + f10 + f11 + f12,
+    edSmpl  = (mNoSchool + fNoSchool + m0_4 + f0_4 + m5_6 + f5_6 + m7_8 + f7_8 + 
+                 m9 + f9 + m10 + f10 + m11 + f11 + m12 + f12 + mHS + fHS +
+                 mLtlClg + fLtlClg + mSmClg + fSmClg + mAss + fAss + mBach + 
+                 fBach + mMas + fMas + mProf + fProf + mDoc + fDoc),
+    age18_64 = mAge18_19 + mAge20 + mAge21 + mAge22_24 + mAge25_29 + mAge30_34 +
+                mAge35_39 + mAge40_44 + mAge45_49 + mAge50_54 + mAge55_59 + mAge60_61 +
+                mAge62_64 + fAge18_19 + fAge20 + fAge21 + fAge22_24 + fAge25_29 +
+                fAge30_34 + fAge35_39 + fAge40_44 + fAge45_49 + fAge50_54 + fAge55_59 +
+                fAge60_61 + fAge62_64, 
+    age0_4   = mAgeUnder5 + fAgeUnder5,
+    age5_14  = mAge5_9 + mAge10_14 + fAge5_9 + fAge10_14,
+    age15_19 = mAge15_17 + fAge15_17 + mAge18_19 + fAge18_19,
+    age20_24 = mAge20 + mAge21 + mAge22_24 + fAge20 + fAge21 + fAge22_24,
+    age15_44 = mAge15_17 + mAge18_19 + mAge20 + mAge21 + mAge22_24 + mAge25_29 +
+                mAge30_34 + mAge35_39 + mAge40_44 + fAge15_17 + fAge18_19 + fAge20 +
+                fAge21 + fAge22_24 + fAge25_29 + fAge30_34 + fAge35_39 + fAge40_44,
+    age45_49 = mAge45_49 + fAge45_49,
+    age50_54 = mAge50_54 + fAge50_54,
+    age55_59 = mAge55_59 + fAge55_59,
+    age60_64 = mAge60_61 + mAge62_64 + fAge60_61 + fAge62_64,
+    ageOv65  = mAge65_66 + mAge67_69 + mAge70_74 + mAge75_79 + mAge80_84 +
+                mAgeOver85 + fAge65_66 + fAge67_69 + fAge70_74 + fAge75_79 + fAge80_84 +
+                fAgeOver85          
+  ) |>
+  select(
+    GISJOIN, totPop, age18_64, age0_4, age5_14, age15_19, age20_24, age15_44, age45_49, 
+    age50_54, age55_59, age60_64, ageOv65, hispPop, whitePop, blackPop, 
+    amIndPop, asianPop, pacIsPop, noHSDeg, edSmpl, disPop
+  )
+
+cnty2000 <- cnty2000 |>
+  merge(cnty_data, by='GISJOIN') |>
+  st_transform(st_crs(cnty2010))
+
+# Make population weights.
+pop_weights <- trct2000 |>
+  merge(trct_demos, by="GISJOIN") |>
+  select(GISJOIN, FL5001) |>
+  rename(totPop = FL5001) |>
+  st_transform(st_crs(cnty2010))
+
+# Interpolate
+cnty2000_on_2010 <- interpolate_pw(
+  from = cnty2000,
+  to = cnty2010,
+  to_id = 'GEOID10',
+  extensive = TRUE,
+  weights = pop_weights,
+  weight_column = 'totPop',
+  weight_placement = 'surface'
+)
+
+# Clean result
+cnty2000_on_2010 <- cnty2000_on_2010 |>
+  rename(GEOID = GEOID10) |>
+  filter(substr(GEOID, start=1, stop=2) !="72") |>
+  st_drop_geometry()
+
+# Save
+write.csv(
+  cnty2000_on_2010, 
+  "../data_raw/nhgis/2000DataInterpolatedCounty.csv", row.names=FALSE
+  )