# File-Name: ufo_sightings.R
# Date: 2012-02-10
# Author: Drew Conway ([email protected])
# Purpose: Code for Chapter 1. In this case we will review some of the basic
#          R functions and coding paradigms we will use throughout this book.
#          This includes loading, viewing, and cleaning raw data, as well as
#          some basic visualization. In this specific case we use data from
#          reported UFO sightings to investigate what, if any, seasonal trends
#          exist in the data.
# Data Used: http://www.infochimps.com/datasets/60000-documented-ufo-sightings-with-text-descriptions-and-metada
# Packages Used: ggplot2, plyr, scales
# All source code is copyright (c) 2012, under the Simplified BSD License.
# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php
# All images and materials produced by this code are licensed under the Creative Commons
# Attribution-Share Alike 3.0 United States License: http://creativecommons.org/licenses/by-sa/3.0/us/
# All rights reserved.
# NOTE: If you are running this in the R console you must use the 'setwd' command to set the
#       working directory for the console to wherever you have saved this file prior to running.
#       Otherwise you will see errors when loading data or saving figures!
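# For example, a minimal setup step (the path below is only a placeholder; point it at
# wherever you saved this script along with its 'exercise' and 'image' directories):
# setwd("/path/to/your/copy/of/this/project")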
# Load libraries and data
library(ggplot2) # We'll use ggplot2 for all of our visualizations
library(plyr) # For data manipulation
library(scales) # We'll need to fix date formats in plots
# This is a tab-delimited file, so we use 'read.delim', whose default separator is the tab
# character. We also have to alter two defaults: first, we do not want strings to be
# converted to factor types; and second, this data does not have header labels in the
# first row, so we want to keep the first row as data.
ufo <- read.delim("exercise/Exercise2/data/ufo_awesome.tsv",
                  stringsAsFactors = FALSE,
                  header = FALSE,
                  na.strings = "")
# This is a large text file (75MB), so this may take a moment
# Inspect the data frame
summary(ufo)
head(ufo)
# From the data's description file, we will set the column names accordingly using
# the 'names' function
names(ufo) <- c("DateOccurred", "DateReported",
                "Location", "ShortDescription",
                "Duration", "LongDescription")
# To work with the dates, we will need to convert the YYYYMMDD strings to an R Date
# type using 'as.Date' (which takes a 'strptime'-style format string).
# But something has gone wrong with the data. For now, we'll simply ignore the malformed
# entries by removing those rows that will not parse correctly. We know that well-formed
# date strings are always 8 characters long, and any deviation from this indicates
# a row to ignore. We will use the 'ifelse' function to construct a vector of
# Booleans indicating the problem rows
good.rows <- ifelse(nchar(ufo$DateOccurred) != 8 |
                    nchar(ufo$DateReported) != 8,
                    FALSE,
                    TRUE)
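# Before dropping them, it can be worth a quick look at a few of the malformed rows to
# confirm they really are unusable (an optional check, not part of the original walkthrough)
head(ufo[!good.rows, c("DateOccurred", "DateReported")])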
length(which(!good.rows)) # While 731 rows may seem like a lot, out of over 60K records
ufo <- ufo[good.rows, ]   # it is only a bit over 1% of the total.
# Now we can convert the strings to Date objects and work with them properly
ufo$DateOccurred <- as.Date(ufo$DateOccurred, format = "%Y%m%d")
ufo$DateReported <- as.Date(ufo$DateReported, format = "%Y%m%d")
# It will be useful to create separate columns for both town and state from the Location
# column. To do so we will use the 'strsplit' function to split each entry on the comma.
# Note: not every entry in Location is of the form 'City, State'. We use the
# 'tryCatch' function to return [NA, NA] when 'strsplit' fails (e.g., on missing values),
# and the length check below handles entries with more than one comma. Finally, we remove
# the leading white-space from both the city and state strings with 'gsub'
get.location <- function(l)
{
  split.location <- tryCatch(strsplit(l, ",")[[1]],
                             error = function(e) return(c(NA, NA)))
  clean.location <- gsub("^ ", "", split.location)
  if (length(clean.location) > 2)
  {
    return(c(NA, NA))
  }
  else
  {
    return(clean.location)
  }
}
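# A quick illustration on single values (the inputs here are made-up examples, not rows
# from the data): a well-formed location splits into city and state, while a value that
# 'strsplit' cannot handle comes back as NA's.
get.location("Iowa City, IA") # returns c("Iowa City", "IA")
get.location(NA)              # returns c(NA, NA)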
# We use 'lapply' to return a list with a [City, State] vector as each element
city.state <- lapply(ufo$Location, get.location)
# We use 'do.call' to collapse the list to an N-by-2 matrix
location.matrix <- do.call(rbind, city.state)
# Add the city and state data to ufo data frame. We can do this using the 'transform'
# function.
ufo <- transform(ufo,
                 USCity = location.matrix[, 1],
                 USState = location.matrix[, 2],
                 stringsAsFactors = FALSE)
# Next, we will strip out non-US incidents by inserting NA's wherever the USState
# value does not match a US state abbreviation
ufo$USState <- state.abb[match(ufo$USState, state.abb)]
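# A quick tally (an optional check, not in the original walkthrough) of how many rows
# matched a US state abbreviation versus how many will be dropped as non-US:
table(is.na(ufo$USState))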
# Finally, we'll use 'subset' to keep only those events that occurred in the United States.
# (The state column is converted to a factor, i.e., a categorical variable, later in the script.)
ufo.us <- subset(ufo, !is.na(USState))
# Now, we are ready to do some analysis! First, take a look at the post-processed data
summary(ufo.us)
head(ufo.us)
# The summary function shows us that the data actually go back a very long time (1440!). So,
# we will want to take a quick look at the dates to see where the majority of the data lies.
# We can do this by creating a histogram of frequencies for UFO sightings over time
quick.hist <- ggplot(ufo.us, aes(x = DateOccurred)) +
  geom_histogram() +
  scale_x_date(breaks = date_breaks("50 years"))
ggsave(plot = quick.hist,
       filename = file.path("image", "quick_hist.pdf"),
       height = 6,
       width = 8)
# First, we notice that there are many very old entries in the data. For our purposes, we will only look
# at incidents that occurred from 1990 to the most recent
ufo.us <- subset(ufo.us, DateOccurred >= as.Date("1990-01-01"))
# Let's look at the histogram now
new.hist <- ggplot(ufo.us, aes(x = DateOccurred)) +
  geom_histogram(fill = "white", color = "red") +
  scale_x_date(breaks = date_breaks("50 years"))
ggsave(plot = new.hist,
       filename = file.path("image", "new_hist.pdf"),
       height = 6,
       width = 8)
# Now that we have the data we want, let's look at some aggregations. We will use
# the 'ddply' function in the plyr package. But first, we create a column with just
# the Year-Month of each incident.
ufo.us$YearMonth <- strftime(ufo.us$DateOccurred, format = "%Y-%m")
# This will return the number of UFO sightings by Year-Month and state for the whole time-series
sightings.counts <- ddply(ufo.us, .(USState,YearMonth), nrow)
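# The count column produced by passing 'nrow' to ddply is named "V1" by default; we
# rename it to "Sightings" after the merge below.
head(sightings.counts)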
# As we might expect, there are several Year-Month and state combinations for which there are no
# UFO sightings. We need to go back and fill those in with a count of zero.
# First, we will create a new vector containing all of the Year-Month dates that span the
# range of our time-series (1990-2010)
date.range <- seq.Date(from = as.Date(min(ufo.us$DateOccurred)),
                       to = as.Date(max(ufo.us$DateOccurred)),
                       by = "month")
date.strings <- strftime(date.range, "%Y-%m")
# To fill in the missing dates in the 'sightings.counts' data frame we will need to create a
# separate data frame containing every combination of state and Year-Month.
states.dates <- lapply(state.abb, function(s) cbind(s, date.strings))
states.dates <- data.frame(do.call(rbind, states.dates),
                           stringsAsFactors = FALSE)
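# Note that the columns of this data frame are named "s" and "date.strings"; these are
# the names referenced by the 'by.x' argument of the merge below.
head(states.dates)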
# We use 'merge' to take the counts we have and merge them with the full set of state and
# Year-Month combinations. Note, we have to specify the columns from each data frame we are
# using to do the merge, and set 'all' to TRUE, which will fill in the combinations missing
# from the original counts data frame with NA.
all.sightings <- merge(states.dates,
                       sightings.counts,
                       by.x = c("s", "date.strings"),
                       by.y = c("USState", "YearMonth"),
                       all = TRUE)
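# Optional check: state/month combinations with no reported sightings now carry NA in the
# count column (still named "V1" at this point); we convert these to zero below.
sum(is.na(all.sightings$V1))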
# Now we just need to clean up the merged data frame a bit
# Set the column names to something meaningful
names(all.sightings) <- c("State", "YearMonth", "Sightings")
# Convert the NAs to 0's, which is what we really want
all.sightings$Sightings[is.na(all.sightings$Sightings)] <- 0
# Reset the character Year-Month back to Date objects
all.sightings$YearMonth <- as.Date(rep(date.range, length(state.abb)))
# Set the state abbreviation as a factor (the abbreviations are already uppercase)
all.sightings$State <- as.factor(all.sightings$State)
# There are lots of ways we could test the seasonality of these sightings, but one basic method is to
# inspect the trends visually. We now construct a plot that will show these trends for all 50 U.S. states
# over the time-series.
# First we have to create a ggplot2 object and then add a geom layer, which in this case is a line.
# Additional points of note:
# (1) facet_wrap() will create separate plots for each state on a 10-by-5 grid.
# (2) theme_bw() changes the default ggplot2 style from grey to white (personal preference).
# (3) scale_color_manual() sets the line color to dark blue.
# (4) scale_x_date() scales the x-axis as a date, with major lines every 5 years.
# (5) xlab() and ylab() set the axis labels.
# (6) ggtitle() sets a title for the plot.
state.plot <- ggplot(all.sightings, aes(x = YearMonth, y = Sightings)) +
  geom_line(aes(color = "darkblue")) +
  facet_wrap(~State, nrow = 10, ncol = 5) +
  theme_bw() +
  scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
  scale_x_date(breaks = date_breaks("5 years"), labels = date_format("%Y")) +
  xlab("Years") +
  ylab("Number of Sightings") +
  ggtitle("Number of UFO sightings by Month-Year and U.S. State (1990-2010)")
# Save the plot as a PDF
ggsave(plot = state.plot,
       filename = file.path("image", "ufo_sightings.pdf"),
       width = 14,
       height = 8.5)
# Create a new graph where the number of sightings is normalized by the state population
# Read in the state populations from the census file (the 2000 figures are used below) and
# map each full state name to its two-letter abbreviation using R's built-in 'state.name'
# and 'state.abb' vectors
state.pop <- read.csv("exercise/Exercise2/data/census.csv", stringsAsFactors = FALSE)
state.pop$abbs <- sapply(state.pop$State, function(x) state.abb[grep(paste('^', x, sep = ''), state.name)])
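# Optional sanity check (assumes the census file's State column holds full state names that
# match R's built-in 'state.name'): if every name matched exactly one abbreviation, 'abbs'
# is a plain character vector rather than a list.
stopifnot(is.character(state.pop$abbs))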
all.sightings$Sightings.Norm <- sapply(1:nrow(all.sightings),
                                       function(i) all.sightings$Sightings[i] / state.pop$X2000[which(state.pop$abbs == all.sightings$State[i])])
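# An equivalent vectorized alternative (a sketch, assuming each state abbreviation appears
# exactly once in state.pop$abbs), shown here only as a comment:
# all.sightings$Sightings.Norm <-
#   all.sightings$Sightings / state.pop$X2000[match(all.sightings$State, state.pop$abbs)]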
state.plot.norm <- ggplot(all.sightings, aes(x = YearMonth, y = Sightings.Norm)) +
  geom_line(aes(color = "darkblue")) +
  facet_wrap(~State, nrow = 10, ncol = 5) +
  theme_bw() +
  scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
  scale_x_date(breaks = date_breaks("5 years"), labels = date_format("%Y")) +
  xlab("Years") +
  ylab("Per Capita Number of Sightings (2000 Census)") +
  ggtitle("Per capita number of UFO sightings by Month-Year and U.S. State (1990-2010)")
# Save the plot as a PDF
ggsave(plot = state.plot.norm,
       filename = file.path("image", "ufo_sightings_norm.pdf"),
       width = 14,
       height = 8.5)