-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #50 from bucketteOfIvy/49_update_historical_data
49 update historical data
- Loading branch information
Showing
73 changed files
with
1,724,127 additions
and
73,128 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# Author: Ashlynn Wimer | ||
# Date: July 11th, 2023 | ||
# About: This R Script takes in 1980 county level census data and runs a population | ||
# weighted interpolation using tidycensus::interpolate_pw to transform them to | ||
# 2010 county shapes. The population weights used are at the 1980 county | ||
# subdivision level. | ||
|
||
# Libraries | ||
library(dplyr) | ||
library(stringr) | ||
library(tidycensus) | ||
library(sf) | ||
|
||
# If running in RStudio, uncomment this | ||
# setwd(getSrcDirectory(function(){})[1]) | ||
|
||
|
||
# Import Data | ||
print("Loading data!") | ||
cnty_full_data <- read.csv('../data_raw/nhgis/1980/nhgis0022_ds104_1980_county.csv') | ||
cnty_sample_data <- read.csv('../data_raw/nhgis/1980/nhgis0022_ds107_1980_county.csv') | ||
cntySbdv_pop_data <- read.csv('../data_raw/nhgis/1980/nhgis0027_ds104_1980_cty_sub.csv') | ||
cnty1980 <- st_read('../data_raw/nhgis/geometryFiles/1980/US_county_1980_conflated.shp') | ||
cnty2010 <- st_read('../data_raw/nhgis/geometryFiles/2010/US_county_2010.shp') | ||
cntySbdv1980 <- st_read('../data_raw/nhgis/geometryFiles/1980/US_mcd_1980.shp') | ||
|
||
|
||
# Transform Data | ||
print("Transforming Data!") | ||
cnty_data <- cnty_sample_data |> | ||
select(GISJOIN, DHM001, DHM002, DHM003, DHM004, DHM005) |> | ||
merge(cnty_full_data, by="GISJOIN") | ||
names(cnty_full_data) | ||
cnty_data <- cnty_data |> | ||
rename( | ||
ageUnd1 = C67001, age1_2 = C67002, | ||
age3_4 = C67003, age5 = C67004, | ||
age6 = C67005, age7_9 = C67006, | ||
age10_13 = C67007, age14 = C67008, | ||
age15 = C67009, age16 = C67010, | ||
age17 = C67011, age18 = C67012, | ||
age19 = C67013, age20 = C67014, | ||
age21 = C67015, age22_24 = C67016, | ||
age25_29 = C67017, age30_34 = C67018, | ||
age35_44 = C67019, age45_54 = C67020, | ||
age55_59 = C67021, age60_61 = C67022, | ||
age62_64 = C67023, age65_74 = C67024, | ||
age75_84 = C67025, ageOv85 = C67026, | ||
notHispPop = C9E001, mexicanPop = C9E002, | ||
prPop = C9E003, cubanPop = C9E004, | ||
otherHisp = C9E005, whitePop = C9D001, | ||
blackPop = C9D002, contigAmIndPop = C9D003, | ||
inuitPop = C9D004, unanganPop = C9D005, | ||
japanesePop = C9D006, chinesePop = C9D007, | ||
filipinoPop = C9D008, koreanPop = C9D009, | ||
indianPop = C9D010, vietnamesePop = C9D011, | ||
hawaiinPop = C9D012, guamanianPop = C9D013, | ||
samoanPop = C9D014, otherPop = C9D015, | ||
totPop = C7L001, elementary = DHM001, | ||
hghschl1_3 = DHM002, hghschl4 = DHM003, | ||
college1_3 = DHM004, college4orMore = DHM005) | ||
|
||
cnty_data <- cnty_data |> | ||
mutate( | ||
age0_4 = ageUnd1 + age1_2 + age3_4, | ||
age5_14 = age5 + age6 + age7_9 + age10_13 + age14, | ||
age18_64 = age18 + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_44 + age45_54 + age55_59 + age60_61 + age62_64, | ||
age15_19 = age15 + age16 + age17 + age18 + age19, | ||
age20_24 = age20 + age21 + age22_24, | ||
age15_44 = age15 + age16 + age17 + age18 + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_44, | ||
age55_59 = age55_59, | ||
age60_64 = age60_61 + age62_64, | ||
ageOv65 = age65_74 + age75_84 + ageOv85, | ||
hispPop = mexicanPop + prPop + cubanPop + otherHisp, | ||
amIndPop = contigAmIndPop + inuitPop + unanganPop, | ||
asianPop = japanesePop + chinesePop + filipinoPop + koreanPop + indianPop + vietnamesePop, | ||
pacIsPop = hawaiinPop + guamanianPop + samoanPop, | ||
NoHSPop = elementary + hghschl1_3, | ||
edSampl = elementary + hghschl1_3 + hghschl4 + college1_3 + college4orMore | ||
) | ||
|
||
cnty_data <- cnty_data |> | ||
select(GISJOIN, STATEA, COUNTYA, | ||
age18_64, age0_4, age5_14, age15_19, age20_24, | ||
age15_44, age55_59, age60_64, ageOv65, hispPop, | ||
amIndPop, asianPop, pacIsPop, totPop, whitePop, | ||
blackPop, NoHSPop, edSampl) | ||
|
||
cnty1980 <- cnty1980 |> | ||
merge(cnty_data, by="GISJOIN") |> | ||
st_transform(st_crs(cnty2010)) | ||
|
||
print("Final preparation before interpolating..") | ||
cnty2010$GEOID <- paste( | ||
str_pad(cnty2010$STATEFP10, width=2, side='left', pad='0'), | ||
str_pad(cnty2010$COUNTYFP10, width=3, side='left', pad='0'), | ||
sep='0' | ||
) | ||
|
||
cnty2010 <- cnty2010 |> | ||
select(GISJOIN, GEOID, NAME10) | ||
|
||
# Create population weights | ||
pop_weights <- cntySbdv1980 |> | ||
merge(cntySbdv_pop_data, by='GISJOIN') |> | ||
st_transform(st_crs(cnty2010)) |> | ||
rename(totPop = C7L001) | ||
|
||
# Run the interpolation algorithm | ||
print("Interpolating!") | ||
cnty1980_on_2010 <- interpolate_pw( | ||
from = cnty1980, | ||
to = cnty2010, | ||
to_id = 'GEOID', | ||
extensive = T, | ||
weights = pop_weights, | ||
weight_column = 'totPop', | ||
weight_placement = 'surface' | ||
) | ||
|
||
# Filter out PR data | ||
# Alaska NAs are left in for posterity. | ||
cnty1980_on_2010 <- cnty1980_on_2010 |> | ||
filter(substr(GEOID, start=1, stop=2) != "72") |> | ||
st_drop_geometry() | ||
|
||
# Save | ||
write.csv(cnty1980_on_2010, "../data_raw/nhgis/1980InterpolatedDataCounty.csv") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
# Author: Ashlynn Wimer | ||
# Date: July 5th, 2023 | ||
# About: This R Script takes in 1990 county level census data and runs a population | ||
# weighted interpolation using tidycensus::interpolate_pw to transform them to | ||
# 2010 county shapes. The population weights originate from the 1990 census tracts. | ||
|
||
|
||
# Libraries | ||
library(dplyr) | ||
library(stringr) | ||
library(tidycensus) | ||
library(sf) | ||
|
||
# If running in RStudio, uncomment this | ||
# setwd(getSrcDirectory(function(){})[1]) | ||
|
||
|
||
# Import Data | ||
print("Loading data!") | ||
cnty_full_data <- read.csv('../data_raw/nhgis/1990/nhgis0022_ds120_1990_county.csv') | ||
cnty_sample_data <- read.csv('../data_raw/nhgis/1990/nhgis0022_ds123_1990_county.csv') | ||
trct_demos <- read.csv('../data_raw/nhgis/1990/nhgis0020_ds120_1990_tract.csv') | ||
cnty1990 <- st_read('../data_raw/nhgis/geometryFiles/1990/US_county_1990.shp') | ||
cnty2010 <- st_read('../data_raw/nhgis/geometryFiles/2010/US_county_2010.shp') | ||
trct1990 <- st_read('../data_raw/nhgis/geometryFiles/1990/US_tract_1990.shp') | ||
|
||
# Transform | ||
cnty_data <- cnty_sample_data |> | ||
select(GISJOIN, E33001, E33002, E33003, E33004, E33005, E33006, E33007) |> | ||
merge(cnty_full_data, by='GISJOIN') | ||
|
||
cnty_data <- cnty_data |> | ||
rename( | ||
whitePop = EUZ001, blackPop = EUZ002, | ||
contigAmIndPop = EUZ003, inuitPop = EUZ004, | ||
unanganPop = EUZ005, chinesePop = EUZ006, | ||
filipinoPop = EUZ007, japanesePop = EUZ008, | ||
indianPop = EUZ009, koreanPop = EUZ010, | ||
vietnamesePop = EUZ011, cambodianPop = EUZ012, | ||
hmongPop = EUZ013, loatianPop = EUZ014, | ||
thaiPop = EUZ015, otherAsianPop = EUZ016, | ||
hawaiinPop = EUZ017, samoanPop = EUZ018, | ||
tonganPop = EUZ019, otherPlynsnPop = EUZ020, | ||
guamianPop = EUZ021, otherMcrnsnPop = EUZ022, | ||
melanesianPop = EUZ023, unspecPcfcIslndrPop = EUZ024, | ||
otherMcrnsnRcePop = EUZ025, totPop = ET1001, | ||
notHisp = EU1001, mexicanHisp = EU1002, | ||
prHisp = EU1003, cubanHisp = EU1004, | ||
otherHisp = EU1005, ageUnd1 = ET3001, | ||
age1_2 = ET3002, age3_4 = ET3003, | ||
age5 = ET3004, age6 = ET3005, | ||
age7_9 = ET3006, age10_11 = ET3007, | ||
age12_13 = ET3008, age14 = ET3009, | ||
age15 = ET3010, age16 = ET3011, | ||
age17 = ET3012, age18 = ET3013, | ||
age19 = ET3014, age20 = ET3015, | ||
age21 = ET3016, age22_24 = ET3017, | ||
age25_29 = ET3018, age30_34 = ET3019, | ||
age35_39 = ET3020, age40_44 = ET3021, | ||
age45_49 = ET3022, age50_54 = ET3023, | ||
age55_59 = ET3024, age60_61 = ET3025, | ||
age62_64 = ET3026, age65_69 = ET3027, | ||
age70_74 = ET3028, age75_79 = ET3029, | ||
age80_84 = ET3030, ageOver85 = ET3031, | ||
noHS = E33001, hsNoGrad = E33002, | ||
hsOrEquiv = E33003, collegeNoGrad = E33004, | ||
associates = E33005, bachelors = E33006, | ||
graduateDeg = E33007 | ||
) | ||
|
||
cnty_data <- cnty_data |> | ||
mutate( | ||
noHighSchoolDeg = noHS + hsNoGrad, | ||
edSampl = noHS + hsNoGrad + hsOrEquiv + collegeNoGrad + associates + bachelors + graduateDeg, | ||
age18_64 = age18 + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_39 + age40_44 + age45_49 + age50_54 + age55_59 + age60_61 + age62_64, | ||
age0_4 = ageUnd1 + age1_2 + age3_4, | ||
age5_14 = age5 + age6 + age7_9 + age10_11 + age12_13 + age14, | ||
age15_19 = age15 + age16 + age17 + age18 + age19, | ||
age20_24 = age20 + age21 + age22_24, | ||
age15_44 = age15 + age16 + age17 + age18 + age19 + age20 + age21 + age22_24 + age25_29 + age30_34 + age35_39 + age40_44, | ||
age45_49 = age45_49, | ||
age50_54 = age50_54, | ||
age55_59 = age55_59, | ||
age60_64 = age60_61 + age62_64, | ||
agOver65 = age65_69 + age70_74 + age75_79 + age80_84 + ageOver85, | ||
whitePop = whitePop, | ||
blackPop = blackPop, | ||
hispPop = mexicanHisp + cubanHisp + prHisp + otherHisp, | ||
amIndPop = contigAmIndPop + inuitPop + unanganPop, | ||
asianPop = chinesePop + filipinoPop + japanesePop + indianPop + koreanPop + vietnamesePop + cambodianPop + hmongPop + loatianPop + thaiPop + otherAsianPop, | ||
pacIsPop = hawaiinPop + samoanPop + tonganPop + otherPlynsnPop + guamianPop + otherMcrnsnPop + melanesianPop + unspecPcfcIslndrPop + otherMcrnsnRcePop, | ||
totPop = totPop | ||
) | ||
|
||
|
||
cnty1990 <- cnty1990 |> | ||
merge(cnty_data, by='GISJOIN') |> | ||
st_transform(st_crs(cnty2010)) |> | ||
select(GISJOIN, | ||
age18_64, age0_4, age5_14, age15_19, | ||
age20_24, age15_44, age45_49, age50_54, | ||
age55_59, age60_64, agOver65, whitePop, | ||
blackPop, hispPop, amIndPop, asianPop, | ||
pacIsPop, totPop, noHighSchoolDeg, | ||
edSampl) | ||
|
||
# Make population weights | ||
pop_weights <- trct1990 |> | ||
merge(trct_demos, by='GISJOIN') |> | ||
select(GISJOIN, ET1001) |> | ||
rename(totPop = ET1001) |> | ||
st_transform(st_crs(cnty2010)) | ||
|
||
# Interpolate | ||
print('Interpolating!') | ||
cnty1990_on_2010 <- interpolate_pw( | ||
from = cnty1990, | ||
to = cnty2010, | ||
to_id = 'GEOID10', | ||
extensive = TRUE, | ||
weights = pop_weights, | ||
weight_column = 'totPop', | ||
weight_placement = 'surface') |> | ||
st_drop_geometry() |> | ||
filter(substr(GEOID10, start=1, stop=2) != "72") |> | ||
rename(GEOID = GEOID10) | ||
|
||
print("Saving!") | ||
write.csv(cnty1990_on_2010, "../data_raw/nhgis/1990InterpolatedCounties.csv", row.names=FALSE) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
# Author: Ashlynn Wimer | ||
# Date: July 5th, 2023 | ||
# About: This R Script takes in 2000 county level census data and runs a population | ||
# weighted interpolation using tidycensus::interpolate_pw to transform them to | ||
# 2010 county shapes. The population weights originate from the 2000 census tracts. | ||
|
||
# Libraries | ||
library(dplyr) | ||
library(stringr) | ||
library(tidycensus) | ||
library(sf) | ||
|
||
# If running in RStudio, uncomment this | ||
# setwd(getSrcDirectory(function(){})[1]) | ||
|
||
|
||
# Import Data | ||
print("Loading data!") | ||
cnty_full_data <- read.csv('../data_raw/nhgis/2000/nhgis0022_ds146_2000_county.csv') | ||
cnty_sample_data <- read.csv('../data_raw/nhgis/2000/nhgis0022_ds151_2000_county.csv') | ||
trct_demos <- read.csv('../data_raw/nhgis/2000/nhgis0020_ds146_2000_tract.csv') | ||
cnty2000 <- st_read('../data_raw/nhgis/geometryFiles/2000/US_county_2000.shp') | ||
cnty2010 <- st_read('../data_raw/nhgis/geometryFiles/2010/US_county_2010.shp') | ||
trct2000 <- st_read('../data_raw/nhgis/geometryFiles/2000/US_tract_2000.shp') | ||
|
||
cnty_data <- cnty_sample_data |> | ||
select(GISJOIN, GKT001, GKT003, GKT005, | ||
GKT007, GKT009, GKT011, GKT013, | ||
GKT015, GKT017, GKT019, GKT021, | ||
GKT023, GKT025, GKT027, GKT029, | ||
GKT031, GKT002, GKT004, GKT006, | ||
GKT008, GKT010, GKT012, GKT014, | ||
GKT016, GKT018, GKT020, GKT022, | ||
GKT024, GKT026, GKT028, GKT030, | ||
GKT032, GLD001) |> | ||
merge(cnty_full_data, by='GISJOIN') | ||
|
||
cnty_data <- cnty_data |> | ||
rename( | ||
whitePop = FMR001, blackPop = FMR002, amIndPop = FMR003, asianPop = FMR004, | ||
pacIsPop = FMR005, mAgeUnd5 = FMZ001, mAgeUnder5 = FMZ001, mAge5_9 = FMZ002, | ||
mAge10_14 = FMZ003, mAge15_17 = FMZ004, mAge18_19 = FMZ005, mAge20 = FMZ006, | ||
mAge21 = FMZ007, mAge22_24 = FMZ008, mAge25_29 = FMZ009, mAge30_34 = FMZ010, | ||
mAge35_39 = FMZ011, mAge40_44 = FMZ012, mAge45_49 = FMZ013, mAge50_54 = FMZ014, | ||
mAge55_59 = FMZ015, mAge60_61 = FMZ016, mAge62_64 = FMZ017, mAge65_66 = FMZ018, | ||
mAge67_69 = FMZ019, mAge70_74 = FMZ020, mAge75_79 = FMZ021, mAge80_84 = FMZ022, | ||
mAgeOver85 = FMZ023, fAgeUnder5 = FMZ024, fAge5_9 = FMZ025, fAge10_14 = FMZ026, | ||
fAge15_17 = FMZ027, fAge18_19 = FMZ028, fAge20 = FMZ029, fAge21 = FMZ030, | ||
fAge22_24 = FMZ031, fAge25_29 = FMZ032, fAge30_34 = FMZ033, fAge35_39 = FMZ034, | ||
fAge40_44 = FMZ035, fAge45_49 = FMZ036, fAge50_54 = FMZ037, fAge55_59 = FMZ038, | ||
fAge60_61 = FMZ039, fAge62_64 = FMZ040, fAge65_66 = FMZ041, fAge67_69 = FMZ042, | ||
fAge70_74 = FMZ043, fAge75_79 = FMZ044, fAge80_84 = FMZ045, fAgeOver85 = FMZ046, | ||
mNoSchool = GKT001, m0_4 = GKT002, m5_6 = GKT003, m7_8 = GKT004, | ||
m9 = GKT005, m10 = GKT006, m11 = GKT007, m12 = GKT008, | ||
mHS = GKT009, mLtlClg = GKT010, mSmClg = GKT011, mAss = GKT012, | ||
mBach = GKT013, mMas = GKT014, mProf = GKT015, mDoc = GKT016, | ||
fNoSchool = GKT017, f0_4 = GKT018, f5_6 = GKT019, f7_8 = GKT020, | ||
f9 = GKT021, f10 = GKT022, f11 = GKT023, f12 = GKT024, | ||
fHS = GKT025, fLtlClg = GKT026, fSmClg = GKT027, fAss = GKT028, | ||
fBach = GKT029, fMas = GKT030, fProf = GKT031, fDoc = GKT032, | ||
disPop = GLD001, totPop = FL5001, hispPop = FMC001) | ||
|
||
cnty_data <- cnty_data |> | ||
mutate( | ||
noHSDeg = mNoSchool + m0_4 + m5_6 + m7_8 + m9 + m10 + m11 + m12 + | ||
fNoSchool + f0_4 + f5_6 + f7_8 + f9 + f10 + f11 + f12, | ||
edSmpl = (mNoSchool + fNoSchool + m0_4 + f0_4 + m5_6 + f5_6 + m7_8 + f7_8 + | ||
m9 + f9 + m10 + f10 + m11 + f11 + m12 + f12 + mHS + fHS + | ||
mLtlClg + fLtlClg + mSmClg + fSmClg + mAss + fAss + mBach + | ||
fBach + mMas + fMas + mProf + fProf + mDoc + fDoc), | ||
age18_64 = mAge18_19 + mAge20 + mAge21 + mAge22_24 + mAge25_29 + mAge30_34 + | ||
mAge35_39 + mAge40_44 + mAge45_49 + mAge50_54 + mAge55_59 + mAge60_61 + | ||
mAge62_64 + fAge18_19 + fAge20 + fAge21 + fAge22_24 + fAge25_29 + | ||
fAge30_34 + fAge35_39 + fAge40_44 + fAge45_49 + fAge50_54 + fAge55_59 + | ||
fAge60_61 + fAge62_64, | ||
age0_4 = mAgeUnder5 + fAgeUnder5, | ||
age5_14 = mAge5_9 + mAge10_14 + fAge5_9 + fAge10_14, | ||
age15_19 = mAge15_17 + fAge15_17 + mAge18_19 + fAge18_19, | ||
age20_24 = mAge20 + mAge21 + mAge22_24 + fAge20 + fAge21 + fAge22_24, | ||
age15_44 = mAge15_17 + mAge18_19 + mAge20 + mAge21 + mAge22_24 + mAge25_29 + | ||
mAge30_34 + mAge35_39 + mAge40_44 + fAge15_17 + fAge18_19 + fAge20 + | ||
fAge21 + fAge22_24 + fAge25_29 + fAge30_34 + fAge35_39 + fAge40_44, | ||
age45_49 = mAge45_49 + fAge45_49, | ||
age50_54 = mAge50_54 + fAge50_54, | ||
age55_59 = mAge55_59 + fAge55_59, | ||
age60_64 = mAge60_61 + mAge62_64 + fAge60_61 + fAge62_64, | ||
ageOv65 = mAge65_66 + mAge67_69 + mAge70_74 + mAge75_79 + mAge80_84 + | ||
mAgeOver85 + fAge65_66 + fAge67_69 + fAge70_74 + fAge75_79 + fAge80_84 + | ||
fAgeOver85 | ||
) |> | ||
select( | ||
GISJOIN, totPop, age18_64, age0_4, age5_14, age15_19, age20_24, age15_44, age45_49, | ||
age50_54, age55_59, age60_64, ageOv65, hispPop, whitePop, blackPop, | ||
amIndPop, asianPop, pacIsPop, noHSDeg, edSmpl, disPop | ||
) | ||
|
||
cnty2000 <- cnty2000 |> | ||
merge(cnty_data, by='GISJOIN') |> | ||
st_transform(st_crs(cnty2010)) | ||
|
||
# Make population weights. | ||
pop_weights <- trct2000 |> | ||
merge(trct_demos, by="GISJOIN") |> | ||
select(GISJOIN, FL5001) |> | ||
rename(totPop = FL5001) |> | ||
st_transform(st_crs(cnty2010)) | ||
|
||
# Interpolate | ||
cnty2000_on_2010 <- interpolate_pw( | ||
from = cnty2000, | ||
to = cnty2010, | ||
to_id = 'GEOID10', | ||
extensive = TRUE, | ||
weights = pop_weights, | ||
weight_column = 'totPop', | ||
weight_placement = 'surface' | ||
) | ||
|
||
# Clean result | ||
cnty2000_on_2010 <- cnty2000_on_2010 |> | ||
rename(GEOID = GEOID10) |> | ||
filter(substr(GEOID, start=1, stop=2) !="72") |> | ||
st_drop_geometry() | ||
|
||
# Save | ||
write.csv( | ||
cnty2000_on_2010, | ||
"../data_raw/nhgis/2000DataInterpolatedCounty.csv", row.names=FALSE | ||
) |
Oops, something went wrong.