-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtidy.R
201 lines (165 loc) · 7.84 KB
/
tidy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
library(tidyr)
library(stringr)
library(dplyr)
library(tibble)
library(ggplot2)
################
### Examples ###
################
# table1: one row per country-year, one column per variable.
table1 <- tibble(
  country = rep(c("Afghanistan", "Brazil", "China"), each = 2),
  year = rep(c(1999, 2000), times = 3),
  cases = c(745, 2666, 37737, 80488, 212258, 213766),
  population = c(
    19987071, 20595360, 172006362, 174504898, 1272915272, 1280428583
  )
)

# table2: one row per country-year-variable; `type` names the variable and
# `count` holds its value.
table2 <- tibble(
  country = rep(c("Afghanistan", "Brazil", "China"), each = 4),
  year = rep(rep(c(1999, 2000), each = 2), times = 3),
  type = rep(c("cases", "population"), times = 6),
  count = c(
    745, 19987071, 2666, 20595360,
    37737, 172006362, 80488, 174504898,
    212258, 1272915272, 213766, 1280428583
  )
)

# table3: cases and population packed into a single "cases/population" string.
table3 <- tibble(
  country = rep(c("Afghanistan", "Brazil", "China"), each = 2),
  year = rep(c(1999, 2000), times = 3),
  rate = c(
    "745/19987071", "2666/20595360",
    "37737/172006362", "80488/174504898",
    "212258/1272915272", "213766/1280428583"
  )
)

# Spread across two tibbles
# table4a: cases, one column per year.
table4a <- tibble(
  country = c("Afghanistan", "Brazil", "China"),
  `1999` = c(745, 37737, 212258),
  `2000` = c(2666, 80488, 213766)
)
# table4b: population, one column per year.
table4b <- tibble(
  country = c("Afghanistan", "Brazil", "China"),
  `1999` = c(19987071, 172006362, 1272915272),
  `2000` = c(20595360, 174504898, 1280428583)
)
# 1. Compute the rate for table2, and table4a + table4b. You will need to perform four operations:
# 1. Extract the number of TB cases per country per year.
# 2. Extract the matching population per country per year.
# 3. Divide cases by population, and multiply by 10000.
# 4. Store back in the appropriate place.
# 5. Which representation is easiest to work with? Which is hardest? Why?
# --- table2: rate per 10,000 people ---------------------------------------
# Split the long table into its case rows and its population rows.
tab2_cases <- filter(table2, type == "cases")
tab2_pop <- filter(table2, type == "population")

# The division below pairs rows by position, so make sure both subsets list
# the same country/year combinations in the same order.
stopifnot(
  identical(tab2_cases$country, tab2_pop$country),
  identical(tab2_cases$year, tab2_pop$year)
)

# Cases per 10,000 people (the exercise asks for the x 10000 scaling).
tab2_rates <- tab2_cases$count / tab2_pop$count * 10000

# Store back as additional rows of type "rates", reusing the country/year
# columns of the case rows.
temptab <- tab2_cases
temptab$type <- "rates"
temptab$count <- tab2_rates
table2 <- bind_rows(table2, temptab)

# --- table4a + table4b: rate per 10,000 people ------------------------------
# Both tables must list the countries in the same order for the column-wise
# division to pair the right rows.
stopifnot(identical(table4a$country, table4b$country))
table4c <- tibble(
  country = table4a$country,
  `1999` = table4a$`1999` / table4b$`1999` * 10000,
  `2000` = table4a$`2000` / table4b$`2000` * 10000
)

# --- table1: for comparison, the tidy layout needs a single mutate() --------
table1 %>%
  mutate(rate = cases / population * 10000)
# 2. Recreate the plot showing change in cases over time using table2 instead of table1. What do you need to do first?
# Total number of cases per year, computed from table1.
count(table1, year, wt = cases)

# Reference plot from table1: one grey line per country, points coloured by
# country.
table1 %>%
  ggplot(aes(x = year, y = cases)) +
  geom_line(aes(group = country), colour = "grey50") +
  geom_point(aes(colour = country))

# Same plot from table2: keep only the "cases" rows first, then plot `count`
# where table1 used `cases`.
ggplot(filter(table2, type == "cases"), aes(x = year, y = count)) +
  geom_line(aes(group = country), colour = "grey50") +
  geom_point(aes(colour = country))
#########################
### Gather and Spread ###
#########################
# 1. Why are gather() and spread() not perfectly symmetrical?
# Carefully consider the following example:
# Half-yearly returns for two years. Built with tibble(); data_frame() is the
# deprecated alias for the same constructor.
stocks <- tibble(
  year = c(2015, 2015, 2016, 2016),
  half = c(1, 2, 1, 2),
  return = c(1.88, 0.59, 0.92, 0.17)
)

# Round trip: spread the years into columns, then gather them back into a
# year/return pair of columns.
stocks %>%
  spread(year, return) %>%
  gather("year", "return", `2015`:`2016`)
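# Not symmetrical: spread() turns the year values into column names, and gather() reads them back as
# character, so `year` changes type; the columns also come back in a different order (half, year, return).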
# (Hint: look at the variable types and think about column names.)
# Both spread() and gather() have a convert argument. What does it do?
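# convert = TRUE runs type.convert() on the key column (gather) or on the new columns (spread),
# turning character values into logical / integer / numeric where possible.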
# 2. Why does this code fail?
# The failing variant presumably passes the years unquoted (1999, 2000), so
# they are treated as column positions rather than names:
#> Error in eval(expr, envir, enclos): Position must be between 0 and n
# Wrapping the non-syntactic column names in backticks selects them by name.
gather(table4a, key = "year", value = "cases", `1999`, `2000`)
# 3. Why does spreading this tibble fail? How could you add a new column to fix the problem?
# Row-wise construction with tribble(); frame_data() is its deprecated alias.
people <- tribble(
  ~name, ~key, ~value,
  #-----------------|--------|------
  "Phillip Woods", "age", 45,
  "Phillip Woods", "height", 186,
  "Phillip Woods", "age", 50,
  "Jessica Cordero", "age", 37,
  "Jessica Cordero", "height", 156
)

# Demonstrates the failure: "Phillip Woods" has two (different) "age" rows, so
# name + key does not identify a single value. try() reports the error without
# aborting the rest of the script.
try(spread(people, key = key, value = value))

# Fix: number the repeated observations within each name/key pair so that
# name + obs + key is unique, then spread. No value is lost; the second
# Phillip Woods row simply has no height.
people %>%
  group_by(name, key) %>%
  mutate(obs = row_number()) %>%
  ungroup() %>%
  spread(key = key, value = value)
# 4. Tidy the simple tibble below. Do you need to spread or gather it? What are the variables?
# Row-wise construction with tribble(); frame_data() is its deprecated alias.
preg <- tribble(
  ~pregnant, ~male, ~female,
  "yes", NA, 10,
  "no", 20, 12
)

# The column names `male` / `female` are values of a variable, so the table
# needs gathering. Resulting variables: pregnant, gender, number.
preg %>%
  gather(male, female, key = "gender", value = "number")
##########################
### Separate and unite ###
##########################
# 1. What do the extra and fill arguments do in separate()?
# Experiment with the various options for the following two toy datasets.
# Extra: what if too many pieces? Either "warn", "drop", or "merge"
# Fill: what if not enough pieces? Either "warn", "right" or "left"
# Too many pieces in the second row ("d,e,f,g"): extra = "merge" splits at
# most three times, so the surplus stays attached to the last column.
separate(
  tibble::tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")),
  x,
  into = c("one", "two", "three"),
  extra = "merge"
)

# Too few pieces in the second row ("d,e"): fill = "left" pads with NA on the
# left-hand side.
separate(
  tibble::tibble(x = c("a,b,c", "d,e", "f,g,i")),
  x,
  into = c("one", "two", "three"),
  fill = "left"
)
# 2. Both unite() and separate() have a remove argument. What does it do?
# Why would you set it to FALSE?
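# If TRUE (the default), the input column(s) are removed from the output data frame, which is handy when the
# messy original is no longer needed. Set it to FALSE to keep the original next to the new columns, e.g. to check the result.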
# 3. Compare and contrast separate() and extract(). Why are there three variations of separation,
# but only one unite?
# Stuff can go wrong with separation, but not really with unite
######################
### Missing values ###
######################
# 1. Compare and contrast the fill arguments to spread() and complete().
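# Both supply replacement values for missing values. spread() takes a single value that is used for every
# missing cell, explicit or implicit; complete() takes a named list with one replacement value per column
# (by default it fills the newly created combinations and pre-existing NAs alike).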
# 2. What does the direction argument to fill() do?
# Direction to fill (previous or next non-missing value)
##################
### Case study ###
##################
# Print the raw WHO dataset for reference.
who

# Tidy the WHO data:
#   1. gather the new_sp_m014 ... newrel_f65 columns into key/cases pairs,
#      dropping missing counts;
#   2. rename "newrel" to "new_rel" so every key has the same
#      underscore-separated pieces;
#   3. split the key on "_" into new / type / sexage;
#   4. drop the `new`, `iso2` and `iso3` columns;
#   5. split sexage after its first character into sex and age.
who1 <- who %>%
  gather(
    key = "key", value = "cases",
    new_sp_m014:newrel_f65,
    na.rm = TRUE
  ) %>%
  mutate(key = str_replace(key, "newrel", "new_rel")) %>%
  separate(key, c("new", "type", "sexage"), sep = "_") %>%
  select(-new, -iso2, -iso3) %>%
  separate(sexage, c("sex", "age"), sep = 1)
# 1. In this case study I set na.rm = TRUE just to make it easier to check that we had the correct values.
# Is this reasonable? Think about how missing values are represented in this dataset. Are there implicit missing values?
# What’s the difference between an NA and zero?
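# na.rm = TRUE seems reasonable as long as NA means "not reported" rather than "no cases" (not sure, to be confirmed).
# The NAs in the original table are explicit missing values (there are a LOT of them); implicit missing values
# would be country/year rows that are absent from the table altogether.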
# Gather WITHOUT dropping the NAs (na.rm = FALSE), so that summary() can
# report how many of the gathered cells are missing.
who_na <- who %>%
  gather(
    key = "key", value = "cases",
    new_sp_m014:newrel_f65,
    na.rm = FALSE
  )
summary(who_na) # 329394 out of 405440 is missing. There are 11080 0-s though...
# NA is not data, zero is no new cases in that cohort. Not sure about this either.
# 2. What happens if you neglect the mutate() step?
# Warning at separate, as separator sometimes missing.
# 3. I claimed that iso2 and iso3 were redundant with country. Confirm my claim by creating a table that uniquely
# maps from country to iso2 and iso3.
# One row per distinct (country, iso2, iso3) combination.
country_codes <- who %>%
  distinct(country, iso2, iso3)

# Check if unique: each query lists the values that occur in more than one
# combination. Three empty results mean the mapping is one-to-one.
country_codes %>%
  count(country) %>%
  filter(n > 1)
country_codes %>%
  count(iso2) %>%
  filter(n > 1)
country_codes %>%
  count(iso3) %>%
  filter(n > 1)
# 4. For each country, year, and sex compute the total number of cases of TB. Make an informative visualisation of the data.
# Total cases per country, year and sex. summarise() removes only the last
# grouping level, so ungroup() explicitly to hand on a plain tibble.
total_cases <- who1 %>%
  group_by(country, year, sex) %>%
  summarise(cases = sum(cases)) %>%
  ungroup()

# Number of country/sex rows available for each year.
total_cases %>%
  count(year)

# Example visualisation: cases over time for one country, one line per sex.
total_cases %>%
  filter(country == "Afghanistan") %>%
  ggplot(aes(x = year, y = cases, color = sex)) +
  geom_point() +
  geom_line()