-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_cps_asec.qmd
159 lines (128 loc) · 5.14 KB
/
get_cps_asec.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
---
title: "CPS ASEC Data Pull and Diagnostics"
format:
html:
df-print: paged
embed-resources: TRUE
fig-width: 8
fig-height: 6
---
# This file reads in longitudinal data from CPS ASEC and appends into a single dataframe.
```{r}
library(tidyverse)
library(ipumsr)
library(purrr)
# Run this once to save key to .Renviron file:
# set_ipums_api_key('Enter your key here')
```
## Pull Extracts from IPUMS
Note: ethnicity not available pre-1971; school or college attendance (SCHLCOLL) not available pre-1986
```{r eval=FALSE}
years <- 1971:2023
get_cps_asec <- function(year, import=TRUE){
cps_extract_request <- define_extract_cps(
description = 'Longitudinal CPS ASEC Data',
sample = paste0('cps', year, '_03s'),
variables = c('SEX', 'AGE', 'RACE', 'HISPAN', 'EDUC')
)
submitted_extract <- submit_extract(cps_extract_request)
collection <- submitted_extract$collection
number <- submitted_extract$number
downloadable_extract <- wait_for_extract(submitted_extract)
path_to_asec_files <- download_extract(c(collection, number), download_dir = "data")
}
purrr::map(years, get_cps_asec)
```
## Read in and clean extracts
```{r message = FALSE, eval=FALSE}
# Get all files ending in .xml
asec_files <- fs::dir_ls('data', regexp = '\\.xml$')
asec_df_long <- purrr::map_dfr(asec_files, read_ipums_micro)
educ_labels <- ipums_val_labels(asec_df_long$EDUC)
asec_df_long2 <- asec_df_long |>
mutate(race_eth = as.factor(
case_when(
HISPAN != 0 ~ "Hispanic",
RACE == 100 ~ "White",
RACE == 200 ~ "Black",
RACE %in% 650:652 ~ "AAPI",
TRUE ~ "Other")),
educ_bucket = as.factor(
case_when(
EDUC %in% c(1, 999) ~ "Not in Universe/Missing",
EDUC < 70 ~ "Less than high school completed",
EDUC < 80 ~ "High school completed",
EDUC < 110 ~ "Some college completed", # This may include Associate's degree or other vocational training
TRUE ~ "Bachelor's or higher completed"
)),
age_bucket = as.factor(
case_when(
AGE < 16 ~ "Under 16",
AGE < 26 ~ "16-25",
AGE < 36 ~ "26-35",
AGE < 46 ~ "36-45",
AGE < 56 ~ "46-55",
TRUE ~ "56+"
)
),
SEX = as.factor(ifelse(SEX == 1, "Male", "Female"))
) |>
tidylog::left_join(educ_labels, by=c('EDUC'='val')) |>
mutate(EDUC_LABEL_FULL = as.factor(lbl)) |>
select(-lbl)
asec_df_long2$EDUC_LABEL_FULL <- reorder(asec_df_long2$EDUC_LABEL_FULL,
asec_df_long2$EDUC)
# Write out stacked R dataset (and upload to Box)
write_rds(asec_df_long2, 'data/asec_longitudinal.Rds')
```
## Time series plots
```{r}
# Remove individuals not in universe
asec_df_long2 <- read_rds('data/asec_longitudinal.Rds')
asec_df_filtered <- asec_df_long2 |>
filter(educ_bucket != "Not in Universe/Missing") |>
filter(AGE %in% 16:55) |> #Keeping only age buckets of interest
filter(race_eth != 'Other')
# Plots of educational attainment over time by (1) age, (2) sex, (3) race/ethnicity
make_facet_plot <- function(facet_var){
plot <- asec_df_filtered |>
count(YEAR, {{facet_var}}, educ_bucket, wt = ASECWT) |>
group_by(YEAR, {{facet_var}}) |>
mutate(freq = n/sum(n)) |>
ggplot(aes(x=YEAR, y=freq, color=educ_bucket)) + geom_line() +
facet_wrap(as.formula(paste0('~', deparse(substitute(facet_var))))) +
theme(legend.position = 'bottom') +
guides(color = guide_legend(nrow=2))
return(plot)
}
make_facet_plot(age_bucket)
make_facet_plot(SEX)
make_facet_plot(race_eth)
# Plots of sex by race by age for Bachelor's degree recipients
asec_df_filtered |>
count(YEAR, age_bucket, SEX, race_eth, educ_bucket == "Bachelor's or higher completed", wt = ASECWT) |>
mutate(`Relative Freq` = n/sum(n), .by = c('YEAR', 'age_bucket', 'SEX', 'race_eth')) |>
filter(`educ_bucket == "Bachelor's or higher completed"`) |>
ggplot(aes(x=YEAR, y=`Relative Freq`, color=race_eth)) + geom_line() + geom_smooth(method='lm', se=FALSE) +
facet_grid(age_bucket~SEX, scales='free') +
theme(legend.position = 'bottom') +
ylab("Share with Bachelor's degree or higher")
# Regression of relative frequency on age, SEX, race, and quadratic time trend
lm_data <- asec_df_filtered |>
count(YEAR, age_bucket, SEX, race_eth, educ_bucket == "Bachelor's or higher completed", wt = ASECWT) |>
mutate(`Relative Freq` = n/sum(n), .by = c('YEAR', 'age_bucket', 'SEX', 'race_eth')) |>
filter(`educ_bucket == "Bachelor's or higher completed"`)
# lme4::lmList(`Relative Freq` ~ YEAR | (age_bucket, SEX), .)
fit_time_trend <- function(df){
lm_fit <- lm(`Relative Freq` ~ YEAR, data=df)
coef(lm_fit)
}
models_list <- lm_data |>
group_by(SEX, age_bucket, race_eth) |>
nest()
models_list <- cbind(
models_list,
map_dfr(models_list$data, fit_time_trend)
)
models_list |> select(-data)
```