removing unused parameters from the Stan model.
fixing the way our model was calculating upper/lower bounds and p values

adding a chart on calibration
Elliott Morris committed Oct 15, 2020
1 parent 85be55a commit b89e602
Showing 31 changed files with 353 additions and 186 deletions.
169 changes: 155 additions & 14 deletions README.Rmd
@@ -41,6 +41,9 @@ library(pbapply)
library(urbnmapr)
library(ggrepel)
library(DT)
# source('scripts/model/final_2016.R');source('scripts/model/final_2012.R');source('scripts/model/final_2008.R');beepr::beep(2);
```


@@ -57,7 +60,7 @@ start_date <- as.Date("2008-03-01") # Keeping all polls after March 1, 2008
evs_df <- read_csv('data/2012.csv')
# will read in the most recent back-test for 2008
- out <- read_rds(sprintf('models/backtest_2008/stan_model_%s.rds',RUN_DATE))
+ out <- read_rds(sprintf('models/backtest_2008/stan_model_%s_normal_final.rds',RUN_DATE))
# get all the polling data
#setwd(here("data/"))
@@ -208,8 +211,8 @@ pct_obama <- pblapply(1:dim(predicted_score)[3],
temp <- predicted_score[,,x]
# put in tibble
- tibble(low = apply(temp,2,function(x){(quantile(x,0.05))}),
- high = apply(temp,2,function(x){(quantile(x,0.95))}),
+ tibble(low = apply(temp,2,function(x){(quantile(x,0.025))}),
+ high = apply(temp,2,function(x){(quantile(x,0.975))}),
mean = apply(temp,2,function(x){(mean(x))}),
prob = apply(temp,2,function(x){(mean(x>0.5))}),
state = x)
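The quantile change above (repeated for the national vote and for the 2012 and 2016 back-tests below) widens the reported uncertainty band from a 90% interval (5th–95th percentiles) to a 95% interval (2.5th–97.5th percentiles). A minimal sketch of the difference on hypothetical posterior draws:

```r
# Hypothetical draws standing in for one column of predicted_score (illustration only)
set.seed(2020)
draws <- rnorm(4000, mean = 0.52, sd = 0.03)

quantile(draws, c(0.05, 0.95))    # old behaviour: 90% credible interval
quantile(draws, c(0.025, 0.975))  # new behaviour: 95% credible interval
```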
@@ -238,8 +241,8 @@ pct_obama_natl <- pblapply(1:dim(predicted_score)[2],
pct_obama_natl <- pct_obama_natl %>%
group_by(t) %>%
- summarise(low = quantile(natl_vote,0.05),
- high = quantile(natl_vote,0.95),
+ summarise(low = quantile(natl_vote,0.025),
+ high = quantile(natl_vote,0.975),
mean = mean(natl_vote),
prob = mean(natl_vote > 0.5)) %>%
mutate(state = '--')
@@ -251,7 +254,7 @@ pct_obama <- pct_obama %>%
# look
ex_states <- c('IA','FL','OH','WI','MI','PA','AZ','NC','NH','TX','GA','MN')
- pct_obama %>% filter(t == RUN_DATE,state %in% c(ex_states,'--')) %>% mutate(se = (high - mean)/1.68) %>% dplyr::select(-t)
+ pct_obama %>% filter(t == RUN_DATE,state %in% c(ex_states,'--')) %>% mutate(se = (high - mean)/1.96) %>% dplyr::select(-t)
map.2008.gg <- urbnmapr::states %>%
left_join(pct_obama %>% filter(t == max(t)) %>%
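With the bounds now at the 2.5th/97.5th percentiles, the upper bound of an approximately normal posterior sits about 1.96 standard errors above the mean (qnorm(0.975) ≈ 1.96), so the divisor in the `se` back-calculation changes accordingly; the old 1.68 roughly matched the former 90% interval (qnorm(0.95) ≈ 1.64). A quick check on hypothetical draws:

```r
# Back out the implied standard error from a 95% interval, assuming a roughly
# normal posterior (hypothetical draws, for illustration only)
set.seed(2020)
draws <- rnorm(4000, mean = 0.52, sd = 0.03)
high  <- quantile(draws, 0.975)
(high - mean(draws)) / qnorm(0.975)  # implied se, should be close to sd(draws)
sd(draws)
```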
@@ -473,6 +476,32 @@ final_evs.2008.gg <- ggplot(final_evs,aes(x=dem_ev,
subtitle=sprintf("p(dem win) = %s | full stan model",round(mean(final_evs$dem_ev>=270),3)))
# get the p value of the actual results
p_values_2008 <- draws %>%
filter(state != '--',
t == election_day) %>%
left_join(politicaldata::pres_results %>% filter(year == 2008) %>%
mutate(actual = dem/(dem+rep)) %>%
dplyr::select(state,actual)) %>%
group_by(state) %>%
summarise(high = quantile(pct_obama, 0.975),
low = quantile(pct_obama, 0.025),
mean = mean(pct_obama),
prob = mean(pct_obama > 0.5),
actual = unique(actual),
p_value =
( 2*sum(pct_obama < actual) + 1 ) /
( 2 * n() + 2) #,
# p_value_2 = mean(ifelse(actual >= mean(pct_obama),
# pct_obama >= actual,
# pct_obama <= actual))
) %>%
arrange(p_value) %>%
mutate(year = 2008,
outside_ci = ifelse(actual > high | actual < low, TRUE,FALSE))
```
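Despite the name, `p_value` here is a smoothed one-sided tail probability: the share of posterior draws falling below the actual result, with a continuity correction so it can never be exactly 0 or 1 (the commented-out `p_value_2` is a two-sided alternative). Under a well-calibrated model these values should be roughly uniform on [0, 1], and values below 0.025 or above 0.975 correspond to results outside the 95% interval. A minimal sketch with hypothetical draws:

```r
# (2*sum(draws < actual) + 1) / (2*n + 2) is equivalent to (sum + 0.5) / (n + 1):
# the empirical CDF at the actual result, smoothed away from 0 and 1.
tail_prob <- function(draws, actual) {
  n <- length(draws)
  (2 * sum(draws < actual) + 1) / (2 * n + 2)
}

set.seed(2020)
draws <- rnorm(4000, mean = 0.52, sd = 0.03)
tail_prob(draws, actual = 0.60)  # close to 1: result lies outside the 95% interval
mean(draws < 0.60)               # unsmoothed version; can return exactly 0 or 1
```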

#### Map
@@ -547,7 +576,7 @@ start_date <- as.Date("2012-03-01") # Keeping all polls after March 1, 2012
evs_df <- read_csv('data/2012.csv')
# will read in the most recent back-test for 2012
- out <- read_rds(sprintf('models/backtest_2012/stan_model_%s.rds',RUN_DATE))
+ out <- read_rds(sprintf('models/backtest_2012/stan_model_%s_normal_final.rds',RUN_DATE))
# get all the polling data
#setwd(here("data/"))
@@ -693,8 +722,8 @@ pct_obama <- pblapply(1:dim(predicted_score)[3],
temp <- predicted_score[,,x]
# put in tibble
- tibble(low = apply(temp,2,function(x){(quantile(x,0.05))}),
- high = apply(temp,2,function(x){(quantile(x,0.95))}),
+ tibble(low = apply(temp,2,function(x){(quantile(x,0.025))}),
+ high = apply(temp,2,function(x){(quantile(x,0.975))}),
mean = apply(temp,2,function(x){(mean(x))}),
prob = apply(temp,2,function(x){(mean(x>0.5))}),
state = x)
@@ -723,8 +752,8 @@ pct_obama_natl <- pblapply(1:dim(predicted_score)[2],
pct_obama_natl <- pct_obama_natl %>%
group_by(t) %>%
- summarise(low = quantile(natl_vote,0.05),
- high = quantile(natl_vote,0.95),
+ summarise(low = quantile(natl_vote,0.025),
+ high = quantile(natl_vote,0.975),
mean = mean(natl_vote),
prob = mean(natl_vote > 0.5)) %>%
mutate(state = '--')
@@ -736,7 +765,7 @@ pct_obama <- pct_obama %>%
# look
ex_states <- c('IA','FL','OH','WI','MI','PA','AZ','NC','NH','TX','GA','MN')
- pct_obama %>% filter(t == RUN_DATE,state %in% c(ex_states,'--')) %>% mutate(se = (high - mean)/1.68) %>% dplyr::select(-t)
+ pct_obama %>% filter(t == RUN_DATE,state %in% c(ex_states,'--')) %>% mutate(se = (high - mean)/1.96) %>% dplyr::select(-t)
map.2012.gg <- urbnmapr::states %>%
left_join(pct_obama %>% filter(t == max(t)) %>%
@@ -962,6 +991,33 @@ final_evs.2012.gg <- ggplot(final_evs,aes(x=dem_ev,
subtitle=sprintf("p(dem win) = %s | full stan model",round(mean(final_evs$dem_ev>=270),3)))
# get the p value of the actual results
p_values_2012 <- draws %>%
filter(state != '--',
t == election_day) %>%
left_join(politicaldata::pres_results %>% filter(year == 2012) %>%
mutate(actual = dem/(dem+rep)) %>%
dplyr::select(state,actual)) %>%
group_by(state) %>%
summarise(high = quantile(pct_obama, 0.975),
low = quantile(pct_obama, 0.025),
mean = mean(pct_obama),
prob = mean(pct_obama > 0.5),
actual = unique(actual),
p_value =
( 2*sum(pct_obama < actual) + 1 ) /
( 2 * n() + 2) #,
# p_value_2 = mean(ifelse(actual >= mean(pct_obama),
# pct_obama >= actual,
# pct_obama <= actual))
) %>%
arrange(p_value) %>%
mutate(year = 2012,
outside_ci = ifelse(actual > high | actual < low, TRUE,FALSE))
```
@@ -1038,7 +1094,7 @@ start_date <- as.Date("2016-03-01") # Keeping all polls after March 1, 2016
evs_df <- read_csv('data/2012.csv')
# will read in the most recent model run for 2016
- out <- read_rds(sprintf('models/stan_model_%s.rds',RUN_DATE))
+ out <- read_rds(sprintf('models/stan_model_%s_normal_final.rds',RUN_DATE))
# get all the polling data
#setwd(here("data/"))
@@ -1457,6 +1513,34 @@ final_evs.2016.gg <- ggplot(final_evs,aes(x=dem_ev,
subtitle=sprintf("p(dem win) = %s | full stan model",round(mean(final_evs$dem_ev>=270),3)))
# get the p value of the actual results
p_values_2016 <- draws %>%
filter(state != '--',
t == election_day) %>%
left_join(politicaldata::pres_results %>% filter(year == 2016) %>%
mutate(actual = dem/(dem+rep)) %>%
dplyr::select(state,actual)) %>%
group_by(state) %>%
summarise(high = quantile(pct_clinton, 0.975),
low = quantile(pct_clinton, 0.025),
mean = mean(pct_clinton),
prob = mean(pct_clinton > 0.5),
actual = unique(actual),
p_value =
( 2*sum(pct_clinton < actual) + 1 ) /
( 2 * n() + 2) #,
# p_value_2 = mean(ifelse(actual >= mean(pct_clinton),
# pct_clinton >= actual,
# pct_clinton <= actual))
) %>%
arrange(p_value) %>%
mutate(year = 2016,
outside_ci = ifelse(actual > high | actual < low, TRUE,FALSE))
```
@@ -1523,7 +1607,7 @@ pct_clinton %>%
## Cumulative charts
- ### Calibration plot
+ ### Probability calibration plot
```{r echo=F,message=F,warning=F}
calibration_data <- model_v_actual.2016 %>% mutate(year = 2016) %>%
@@ -1551,6 +1635,63 @@ print(calibration.gg)
```
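The probability calibration plot compares the model's stated win probabilities with how often the favoured candidate actually won; `model_v_actual.2016` and its 2008/2012 counterparts are built earlier in the README and are not shown in this diff. A rough sketch of that kind of binning, assuming a hypothetical input with one row per state-year, a predicted Democratic win probability `prob`, and an outcome indicator `dem_won`:

```r
library(dplyr)

# Hypothetical object and column names, for illustration only (not the repo's objects)
calibration_sketch <- model_v_actual_all %>%
  mutate(bin = cut(prob, breaks = seq(0, 1, by = 0.1), include.lowest = TRUE)) %>%
  group_by(bin) %>%
  summarise(forecast = mean(prob),     # average stated win probability in the bin
            observed = mean(dem_won),  # share of states the Democrat actually won
            n        = n())
```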
### Confidence interval coverage
```{r echo=F, message=FALSE, warning=FALSE}
calibration_data <- p_values_2016 %>% mutate(year = 2016) %>%
bind_rows(p_values_2012 %>% mutate(year = 2012) ) %>%
bind_rows(p_values_2008 %>% mutate(year = 2008)) %>%
filter(state != 'DC')
# sum(calibration_data$outside_ci) / nrow(calibration_data)
p_value_hist.gg <- ggplot(calibration_data, aes(x=p_value)) +
geom_histogram(binwidth = 0.01,aes(group=paste0(state,year))) +
geom_vline(xintercept = 0.025,col='red',linetype=2) +
geom_vline(xintercept = 0.975,col='red',linetype=2) +
scale_x_continuous(breaks=seq(0,1,0.05)) +
scale_y_continuous(breaks=seq(0,100,2)) +
theme_minimal() +
theme(panel.grid.minor = element_blank())
preds_p_value_scatter.gg <- ggplot(calibration_data, aes(x=mean*100,y=actual*100)) +
geom_abline() +
# states inside the ci
geom_point(data =. %>% filter(!outside_ci),col='black') +
# states outside
geom_point(data = . %>% filter(outside_ci),col='red') +
geom_text_repel(data = . %>% filter(outside_ci),
aes(label=state),col='red',min.segment.length = 0) +
geom_segment(data = . %>% filter(outside_ci),
aes(x=mean*100,xend=mean*100,y=low*100,yend=high*100),col='red') +
facet_wrap(~year,ncol=3) +
theme_minimal() +
theme(panel.grid.minor = element_blank()) +
labs(
# title='Model with normal(0, sigma) distributions for polling error and state priors',
subtitle='Results in highlighted states fall outside our uncertainty intervals',
x='Predicted Democratic share of the two-party vote',
y='Actual Democratic share'
) +
coord_cartesian(xlim=c(25,75),ylim=c(25,75))
```
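Together, the two new charts check coverage from both directions: the histogram of tail probabilities should look roughly uniform if the intervals are honest, and the scatter flags states whose actual result fell outside the 95% interval, which should happen for roughly 5% of state-years. A small follow-on check one could run on `calibration_data` (not part of this commit):

```r
# Observed vs nominal 95% interval coverage across the three back-tested years
misses <- sum(calibration_data$outside_ci)
n_obs  <- nrow(calibration_data)
c(observed_miss_rate = misses / n_obs, nominal_miss_rate = 0.05)

# Is the observed miss count consistent with a 5% miss rate?
binom.test(misses, n_obs, p = 0.05)
```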
```{r echo=F,message=F,warning=F,fig.width = 9, fig.asp = .42}
print(preds_p_value_scatter.gg)
```
```{r echo=F,message=F,warning=F}
print(p_value_hist.gg)
```
# Licence
This software is published by _[The Economist](https://www.economist.com)_ under the [MIT licence](https://opensource.org/licenses/MIT). The data generated by _The Economist_ are available under the [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/).