Skip to content

Commit

Permalink
Merge pull request #50 from bucketteOfIvy/49_update_historical_data
Browse files Browse the repository at this point in the history
49 update historical data
  • Loading branch information
mradamcox authored Jul 17, 2023
2 parents 62a292f + 51b68ab commit d9f6ac8
Show file tree
Hide file tree
Showing 73 changed files with 1,724,127 additions and 73,128 deletions.
128 changes: 128 additions & 0 deletions code/InterpolateDemographics_NHGIS_County_1980.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Author: Ashlynn Wimer
# Date: July 11th, 2023
# About: This R Script takes in 1980 county level census data and runs a population
# weighted interpolation using tidycensus::interpolate_pw to transform them to
# 2010 county shapes. The population weights used are at the 1980 county
# subdivision level.

# Libraries
library(dplyr)
library(stringr)
library(tidycensus)
library(sf)

# If running in RStudio, uncomment this
# setwd(getSrcDirectory(function(){})[1])


# Import Data
print("Loading data!")
cnty_full_data <- read.csv('../data_raw/nhgis/1980/nhgis0022_ds104_1980_county.csv')
cnty_sample_data <- read.csv('../data_raw/nhgis/1980/nhgis0022_ds107_1980_county.csv')
cntySbdv_pop_data <- read.csv('../data_raw/nhgis/1980/nhgis0027_ds104_1980_cty_sub.csv')
cnty1980 <- st_read('../data_raw/nhgis/geometryFiles/1980/US_county_1980_conflated.shp')
cnty2010 <- st_read('../data_raw/nhgis/geometryFiles/2010/US_county_2010.shp')
cntySbdv1980 <- st_read('../data_raw/nhgis/geometryFiles/1980/US_mcd_1980.shp')


# Transform Data
print("Transforming Data!")
cnty_data <- cnty_sample_data |>
select(GISJOIN, DHM001, DHM002, DHM003, DHM004, DHM005) |>
merge(cnty_full_data, by="GISJOIN")
names(cnty_full_data)
cnty_data <- cnty_data |>
rename(
ageUnd1 = C67001, age1_2 = C67002,
age3_4 = C67003, age5 = C67004,
age6 = C67005, age7_9 = C67006,
age10_13 = C67007, age14 = C67008,
age15 = C67009, age16 = C67010,
age17 = C67011, age18 = C67012,
age19 = C67013, age20 = C67014,
age21 = C67015, age22_24 = C67016,
age25_29 = C67017, age30_34 = C67018,
age35_44 = C67019, age45_54 = C67020,
age55_59 = C67021, age60_61 = C67022,
age62_64 = C67023, age65_74 = C67024,
age75_84 = C67025, ageOv85 = C67026,
notHispPop = C9E001, mexicanPop = C9E002,
prPop = C9E003, cubanPop = C9E004,
otherHisp = C9E005, whitePop = C9D001,
blackPop = C9D002, contigAmIndPop = C9D003,
inuitPop = C9D004, unanganPop = C9D005,
japanesePop = C9D006, chinesePop = C9D007,
filipinoPop = C9D008, koreanPop = C9D009,
indianPop = C9D010, vietnamesePop = C9D011,
hawaiinPop = C9D012, guamanianPop = C9D013,
samoanPop = C9D014, otherPop = C9D015,
totPop = C7L001, elementary = DHM001,
hghschl1_3 = DHM002, hghschl4 = DHM003,
college1_3 = DHM004, college4orMore = DHM005)

cnty_data <- cnty_data |>
mutate(
age0_4 = ageUnd1 + age1_2 + age3_4,
age5_14 = age5 + age6 + age7_9 + age10_13 + age14,
age18_64 = age18 + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_44 + age45_54 + age55_59 + age60_61 + age62_64,
age15_19 = age15 + age16 + age17 + age18 + age19,
age20_24 = age20 + age21 + age22_24,
age15_44 = age15 + age16 + age17 + age18 + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_44,
age55_59 = age55_59,
age60_64 = age60_61 + age62_64,
ageOv65 = age65_74 + age75_84 + ageOv85,
hispPop = mexicanPop + prPop + cubanPop + otherHisp,
amIndPop = contigAmIndPop + inuitPop + unanganPop,
asianPop = japanesePop + chinesePop + filipinoPop + koreanPop + indianPop + vietnamesePop,
pacIsPop = hawaiinPop + guamanianPop + samoanPop,
NoHSPop = elementary + hghschl1_3,
edSampl = elementary + hghschl1_3 + hghschl4 + college1_3 + college4orMore
)

cnty_data <- cnty_data |>
select(GISJOIN, STATEA, COUNTYA,
age18_64, age0_4, age5_14, age15_19, age20_24,
age15_44, age55_59, age60_64, ageOv65, hispPop,
amIndPop, asianPop, pacIsPop, totPop, whitePop,
blackPop, NoHSPop, edSampl)

cnty1980 <- cnty1980 |>
merge(cnty_data, by="GISJOIN") |>
st_transform(st_crs(cnty2010))

print("Final preparation before interpolating..")
cnty2010$GEOID <- paste(
str_pad(cnty2010$STATEFP10, width=2, side='left', pad='0'),
str_pad(cnty2010$COUNTYFP10, width=3, side='left', pad='0'),
sep='0'
)

cnty2010 <- cnty2010 |>
select(GISJOIN, GEOID, NAME10)

# Create population weights
pop_weights <- cntySbdv1980 |>
merge(cntySbdv_pop_data, by='GISJOIN') |>
st_transform(st_crs(cnty2010)) |>
rename(totPop = C7L001)

# Run the interpolation algorithm
print("Interpolating!")
cnty1980_on_2010 <- interpolate_pw(
from = cnty1980,
to = cnty2010,
to_id = 'GEOID',
extensive = T,
weights = pop_weights,
weight_column = 'totPop',
weight_placement = 'surface'
)

# Filter out PR data
# Alaska NAs are left in for posterity.
cnty1980_on_2010 <- cnty1980_on_2010 |>
filter(substr(GEOID, start=1, stop=2) != "72") |>
st_drop_geometry()

# Save
write.csv(cnty1980_on_2010, "../data_raw/nhgis/1980InterpolatedDataCounty.csv")
131 changes: 131 additions & 0 deletions code/InterpolateDemographics_NHGIS_County_1990.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Author: Ashlynn Wimer
# Date: July 5th, 2023
# About: This R Script takes in 1990 county level census data and runs a population
# weighted interpolation using tidycensus::interpolate_pw to transform them to
# 2010 county shapes. The population weights originate from the 1990 census tracts.


# Libraries
library(dplyr)
library(stringr)
library(tidycensus)
library(sf)

# If running in RStudio, uncomment this
# setwd(getSrcDirectory(function(){})[1])


# Import Data
print("Loading data!")
cnty_full_data <- read.csv('../data_raw/nhgis/1990/nhgis0022_ds120_1990_county.csv')
cnty_sample_data <- read.csv('../data_raw/nhgis/1990/nhgis0022_ds123_1990_county.csv')
trct_demos <- read.csv('../data_raw/nhgis/1990/nhgis0020_ds120_1990_tract.csv')
cnty1990 <- st_read('../data_raw/nhgis/geometryFiles/1990/US_county_1990.shp')
cnty2010 <- st_read('../data_raw/nhgis/geometryFiles/2010/US_county_2010.shp')
trct1990 <- st_read('../data_raw/nhgis/geometryFiles/1990/US_tract_1990.shp')

# Transform
cnty_data <- cnty_sample_data |>
select(GISJOIN, E33001, E33002, E33003, E33004, E33005, E33006, E33007) |>
merge(cnty_full_data, by='GISJOIN')

cnty_data <- cnty_data |>
rename(
whitePop = EUZ001, blackPop = EUZ002,
contigAmIndPop = EUZ003, inuitPop = EUZ004,
unanganPop = EUZ005, chinesePop = EUZ006,
filipinoPop = EUZ007, japanesePop = EUZ008,
indianPop = EUZ009, koreanPop = EUZ010,
vietnamesePop = EUZ011, cambodianPop = EUZ012,
hmongPop = EUZ013, loatianPop = EUZ014,
thaiPop = EUZ015, otherAsianPop = EUZ016,
hawaiinPop = EUZ017, samoanPop = EUZ018,
tonganPop = EUZ019, otherPlynsnPop = EUZ020,
guamianPop = EUZ021, otherMcrnsnPop = EUZ022,
melanesianPop = EUZ023, unspecPcfcIslndrPop = EUZ024,
otherMcrnsnRcePop = EUZ025, totPop = ET1001,
notHisp = EU1001, mexicanHisp = EU1002,
prHisp = EU1003, cubanHisp = EU1004,
otherHisp = EU1005, ageUnd1 = ET3001,
age1_2 = ET3002, age3_4 = ET3003,
age5 = ET3004, age6 = ET3005,
age7_9 = ET3006, age10_11 = ET3007,
age12_13 = ET3008, age14 = ET3009,
age15 = ET3010, age16 = ET3011,
age17 = ET3012, age18 = ET3013,
age19 = ET3014, age20 = ET3015,
age21 = ET3016, age22_24 = ET3017,
age25_29 = ET3018, age30_34 = ET3019,
age35_39 = ET3020, age40_44 = ET3021,
age45_49 = ET3022, age50_54 = ET3023,
age55_59 = ET3024, age60_61 = ET3025,
age62_64 = ET3026, age65_69 = ET3027,
age70_74 = ET3028, age75_79 = ET3029,
age80_84 = ET3030, ageOver85 = ET3031,
noHS = E33001, hsNoGrad = E33002,
hsOrEquiv = E33003, collegeNoGrad = E33004,
associates = E33005, bachelors = E33006,
graduateDeg = E33007
)

cnty_data <- cnty_data |>
mutate(
noHighSchoolDeg = noHS + hsNoGrad,
edSampl = noHS + hsNoGrad + hsOrEquiv + collegeNoGrad + associates + bachelors + graduateDeg,
age18_64 = age18 + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_39 + age40_44 + age45_49 + age50_54 + age55_59 + age60_61 + age62_64,
age0_4 = ageUnd1 + age1_2 + age3_4,
age5_14 = age5 + age6 + age7_9 + age10_11 + age12_13 + age14,
age15_19 = age15 + age16 + age17 + age18 + age19,
age20_24 = age20 + age21 + age22_24,
age15_44 = age15 + age16 + age17 + age18 + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_39 + age40_44,
age45_49 = age45_49,
age50_54 = age50_54,
age55_59 = age55_59,
age60_64 = age60_61 + age62_64,
agOver65 = age65_69 + age70_74 + age75_79 + age80_84 + ageOver85,
whitePop = whitePop,
blackPop = blackPop,
hispPop = mexicanHisp + cubanHisp + prHisp + otherHisp,
amIndPop = contigAmIndPop + inuitPop + unanganPop,
asianPop = chinesePop + filipinoPop + japanesePop + indianPop + koreanPop + vietnamesePop + cambodianPop + hmongPop + loatianPop + thaiPop + otherAsianPop,
pacIsPop = hawaiinPop + samoanPop + tonganPop + otherPlynsnPop + guamianPop + otherMcrnsnPop + melanesianPop + unspecPcfcIslndrPop + otherMcrnsnRcePop,
totPop = totPop
)


cnty1990 <- cnty1990 |>
merge(cnty_data, by='GISJOIN') |>
st_transform(st_crs(cnty2010)) |>
select(GISJOIN,
age18_64, age0_4, age5_14, age15_19,
age20_24, age15_44, age45_49, age50_54,
age55_59, age60_64, agOver65, whitePop,
blackPop, hispPop, amIndPop, asianPop,
pacIsPop, totPop, noHighSchoolDeg,
edSampl)

# Make population weights
pop_weights <- trct1990 |>
merge(trct_demos, by='GISJOIN') |>
select(GISJOIN, ET1001) |>
rename(totPop = ET1001) |>
st_transform(st_crs(cnty2010))

# Interpolate
print('Interpolating!')
cnty1990_on_2010 <- interpolate_pw(
from = cnty1990,
to = cnty2010,
to_id = 'GEOID10',
extensive = TRUE,
weights = pop_weights,
weight_column = 'totPop',
weight_placement = 'surface') |>
st_drop_geometry() |>
filter(substr(GEOID10, start=1, stop=2) != "72") |>
rename(GEOID = GEOID10)

print("Saving!")
write.csv(cnty1990_on_2010, "../data_raw/nhgis/1990InterpolatedCounties.csv", row.names=FALSE)


129 changes: 129 additions & 0 deletions code/InterpolateDemographics_NHGIS_County_2000.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Author: Ashlynn Wimer
# Date: July 5th, 2023
# About: This R Script takes in 2000 county level census data and runs a population
# weighted interpolation using tidycensus::interpolate_pw to transform them to
# 2010 county shapes. The population weights originate from the 2000 census tracts.

# Libraries
library(dplyr)
library(stringr)
library(tidycensus)
library(sf)

# If running in RStudio, uncomment this
# setwd(getSrcDirectory(function(){})[1])


# Import Data
print("Loading data!")
cnty_full_data <- read.csv('../data_raw/nhgis/2000/nhgis0022_ds146_2000_county.csv')
cnty_sample_data <- read.csv('../data_raw/nhgis/2000/nhgis0022_ds151_2000_county.csv')
trct_demos <- read.csv('../data_raw/nhgis/2000/nhgis0020_ds146_2000_tract.csv')
cnty2000 <- st_read('../data_raw/nhgis/geometryFiles/2000/US_county_2000.shp')
cnty2010 <- st_read('../data_raw/nhgis/geometryFiles/2010/US_county_2010.shp')
trct2000 <- st_read('../data_raw/nhgis/geometryFiles/2000/US_tract_2000.shp')

cnty_data <- cnty_sample_data |>
select(GISJOIN, GKT001, GKT003, GKT005,
GKT007, GKT009, GKT011, GKT013,
GKT015, GKT017, GKT019, GKT021,
GKT023, GKT025, GKT027, GKT029,
GKT031, GKT002, GKT004, GKT006,
GKT008, GKT010, GKT012, GKT014,
GKT016, GKT018, GKT020, GKT022,
GKT024, GKT026, GKT028, GKT030,
GKT032, GLD001) |>
merge(cnty_full_data, by='GISJOIN')

cnty_data <- cnty_data |>
rename(
whitePop = FMR001, blackPop = FMR002, amIndPop = FMR003, asianPop = FMR004,
pacIsPop = FMR005, mAgeUnd5 = FMZ001, mAgeUnder5 = FMZ001, mAge5_9 = FMZ002,
mAge10_14 = FMZ003, mAge15_17 = FMZ004, mAge18_19 = FMZ005, mAge20 = FMZ006,
mAge21 = FMZ007, mAge22_24 = FMZ008, mAge25_29 = FMZ009, mAge30_34 = FMZ010,
mAge35_39 = FMZ011, mAge40_44 = FMZ012, mAge45_49 = FMZ013, mAge50_54 = FMZ014,
mAge55_59 = FMZ015, mAge60_61 = FMZ016, mAge62_64 = FMZ017, mAge65_66 = FMZ018,
mAge67_69 = FMZ019, mAge70_74 = FMZ020, mAge75_79 = FMZ021, mAge80_84 = FMZ022,
mAgeOver85 = FMZ023, fAgeUnder5 = FMZ024, fAge5_9 = FMZ025, fAge10_14 = FMZ026,
fAge15_17 = FMZ027, fAge18_19 = FMZ028, fAge20 = FMZ029, fAge21 = FMZ030,
fAge22_24 = FMZ031, fAge25_29 = FMZ032, fAge30_34 = FMZ033, fAge35_39 = FMZ034,
fAge40_44 = FMZ035, fAge45_49 = FMZ036, fAge50_54 = FMZ037, fAge55_59 = FMZ038,
fAge60_61 = FMZ039, fAge62_64 = FMZ040, fAge65_66 = FMZ041, fAge67_69 = FMZ042,
fAge70_74 = FMZ043, fAge75_79 = FMZ044, fAge80_84 = FMZ045, fAgeOver85 = FMZ046,
mNoSchool = GKT001, m0_4 = GKT002, m5_6 = GKT003, m7_8 = GKT004,
m9 = GKT005, m10 = GKT006, m11 = GKT007, m12 = GKT008,
mHS = GKT009, mLtlClg = GKT010, mSmClg = GKT011, mAss = GKT012,
mBach = GKT013, mMas = GKT014, mProf = GKT015, mDoc = GKT016,
fNoSchool = GKT017, f0_4 = GKT018, f5_6 = GKT019, f7_8 = GKT020,
f9 = GKT021, f10 = GKT022, f11 = GKT023, f12 = GKT024,
fHS = GKT025, fLtlClg = GKT026, fSmClg = GKT027, fAss = GKT028,
fBach = GKT029, fMas = GKT030, fProf = GKT031, fDoc = GKT032,
disPop = GLD001, totPop = FL5001, hispPop = FMC001)

cnty_data <- cnty_data |>
mutate(
noHSDeg = mNoSchool + m0_4 + m5_6 + m7_8 + m9 + m10 + m11 + m12 +
fNoSchool + f0_4 + f5_6 + f7_8 + f9 + f10 + f11 + f12,
edSmpl = (mNoSchool + fNoSchool + m0_4 + f0_4 + m5_6 + f5_6 + m7_8 + f7_8 +
m9 + f9 + m10 + f10 + m11 + f11 + m12 + f12 + mHS + fHS +
mLtlClg + fLtlClg + mSmClg + fSmClg + mAss + fAss + mBach +
fBach + mMas + fMas + mProf + fProf + mDoc + fDoc),
age18_64 = mAge18_19 + mAge20 + mAge21 + mAge22_24 + mAge25_29 + mAge30_34 +
mAge35_39 + mAge40_44 + mAge45_49 + mAge50_54 + mAge55_59 + mAge60_61 +
mAge62_64 + fAge18_19 + fAge20 + fAge21 + fAge22_24 + fAge25_29 +
fAge30_34 + fAge35_39 + fAge40_44 + fAge45_49 + fAge50_54 + fAge55_59 +
fAge60_61 + fAge62_64,
age0_4 = mAgeUnder5 + fAgeUnder5,
age5_14 = mAge5_9 + mAge10_14 + fAge5_9 + fAge10_14,
age15_19 = mAge15_17 + fAge15_17 + mAge18_19 + fAge18_19,
age20_24 = mAge20 + mAge21 + mAge22_24 + fAge20 + fAge21 + fAge22_24,
age15_44 = mAge15_17 + mAge18_19 + mAge20 + mAge21 + mAge22_24 + mAge25_29 +
mAge30_34 + mAge35_39 + mAge40_44 + fAge15_17 + fAge18_19 + fAge20 +
fAge21 + fAge22_24 + fAge25_29 + fAge30_34 + fAge35_39 + fAge40_44,
age45_49 = mAge45_49 + fAge45_49,
age50_54 = mAge50_54 + fAge50_54,
age55_59 = mAge55_59 + fAge55_59,
age60_64 = mAge60_61 + mAge62_64 + fAge60_61 + fAge62_64,
ageOv65 = mAge65_66 + mAge67_69 + mAge70_74 + mAge75_79 + mAge80_84 +
mAgeOver85 + fAge65_66 + fAge67_69 + fAge70_74 + fAge75_79 + fAge80_84 +
fAgeOver85
) |>
select(
GISJOIN, totPop, age18_64, age0_4, age5_14, age15_19, age20_24, age15_44, age45_49,
age50_54, age55_59, age60_64, ageOv65, hispPop, whitePop, blackPop,
amIndPop, asianPop, pacIsPop, noHSDeg, edSmpl, disPop
)

cnty2000 <- cnty2000 |>
merge(cnty_data, by='GISJOIN') |>
st_transform(st_crs(cnty2010))

# Make population weights.
pop_weights <- trct2000 |>
merge(trct_demos, by="GISJOIN") |>
select(GISJOIN, FL5001) |>
rename(totPop = FL5001) |>
st_transform(st_crs(cnty2010))

# Interpolate
cnty2000_on_2010 <- interpolate_pw(
from = cnty2000,
to = cnty2010,
to_id = 'GEOID10',
extensive = TRUE,
weights = pop_weights,
weight_column = 'totPop',
weight_placement = 'surface'
)

# Clean result
cnty2000_on_2010 <- cnty2000_on_2010 |>
rename(GEOID = GEOID10) |>
filter(substr(GEOID, start=1, stop=2) !="72") |>
st_drop_geometry()

# Save
write.csv(
cnty2000_on_2010,
"../data_raw/nhgis/2000DataInterpolatedCounty.csv", row.names=FALSE
)
Loading

0 comments on commit d9f6ac8

Please sign in to comment.