-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_Seattle_Library_Checkouts.R
88 lines (70 loc) · 2.16 KB
/
02_Seattle_Library_Checkouts.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
library(arrow)
library(dplyr)
# First pass: open the CSV as an Arrow dataset and let Arrow infer the schema.
seattle_csv <- open_dataset(
  sources = here::here("data/seattle-library-checkouts.csv"),
  format = "csv"
)

# Inspect the dataset and the schema that was inferred for it.
seattle_csv
schema(seattle_csv)

# Emit R code that recreates the inferred schema. The output can be pasted
# into an open_dataset() call and edited to override the type of individual
# columns.
seattle_csv$schema$code()
# Reopen the CSV with an explicit schema instead of relying on type inference.
# Because the column names are supplied by the schema, `skip = 1` drops the
# file's header row so that it is not read as a data row.
seattle_csv <- open_dataset(
  sources = here::here("data/seattle-library-checkouts.csv"),
  format = "csv",
  skip = 1,
  schema = schema(
    UsageClass = utf8(),
    CheckoutType = utf8(),
    MaterialType = utf8(),
    CheckoutYear = int64(),
    CheckoutMonth = int64(),
    Checkouts = int64(),
    Title = utf8(),
    # ISBN is pinned to a string type explicitly. In arrow, string() is an
    # alias for utf8(), so utf8() is used here for consistency with the other
    # text columns.
    # NOTE(review): presumably type inference mis-types this column, which is
    # why it is overridden -- confirm against the inferred schema.
    ISBN = utf8(),
    Creator = utf8(),
    Subjects = utf8(),
    Publisher = utf8(),
    PublicationYear = utf8()
  )
)
# Time a per-year row count computed directly against the CSV-backed dataset.
system.time(
  seattle_csv |>
    group_by(CheckoutYear) |>
    count() |>
    arrange(CheckoutYear) |>
    collect()
)
# Rewrite the CSV-backed dataset in Parquet format under `seattle_parquet`,
# then reopen the result as a new Arrow dataset.
seattle_parquet <- here::here("data/seattle-library-checkouts-parquet")

write_dataset(
  seattle_csv,
  path = seattle_parquet,
  format = "parquet"
)

seattle_parquet_obj <- open_dataset(sources = seattle_parquet)
# Time the same per-year row count, this time against the Parquet dataset.
system.time(
  seattle_parquet_obj |>
    group_by(CheckoutYear) |>
    count() |>
    arrange(CheckoutYear) |>
    collect()
)
# Write the data out again as Parquet, partitioned by CheckoutYear, and reopen
# the partitioned directory as an Arrow dataset.
seattle_parquet_part <- here::here("data/seattle-library-checkouts")

write_dataset(
  seattle_csv,
  path = seattle_parquet_part,
  format = "parquet",
  # Equivalent to grouping by CheckoutYear before writing: write_dataset()
  # defaults `partitioning` to the dataset's group_by() columns.
  partitioning = "CheckoutYear"
)

seattle_parquet_part_obj <- open_dataset(sources = seattle_parquet_part)
# Time the per-year row count against the year-partitioned Parquet dataset.
system.time(
  seattle_parquet_part_obj |>
    group_by(CheckoutYear) |>
    count() |>
    arrange(CheckoutYear) |>
    collect()
)
# Time a filtered aggregation against the Parquet dataset stored under
# data/seattle-library-checkouts: total checkouts for September 2019.
# The dataset is opened inside the timed expression, so the measurement
# includes the cost of open_dataset() as well as the query itself.
system.time(
  open_dataset(here::here("data/seattle-library-checkouts")) |>
    filter(CheckoutYear == 2019, CheckoutMonth == 9) |>
    group_by(CheckoutMonth) |>
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    summarize(TotalCheckouts = sum(Checkouts, na.rm = TRUE)) |>
    arrange(desc(CheckoutMonth)) |>
    collect()
)
# Read a single Parquet file straight into memory with read_parquet(), wrap
# the result in a tibble, and report how much memory it occupies.
# NOTE(review): this path points at an NYC taxi dataset rather than the
# Seattle checkouts data used in the rest of this script -- confirm that the
# file exists and is the one intended.
parquet_file <- here::here("data/nyc-taxi/year=2019/month=9/part-0.parquet")
taxi_df <- tibble(read_parquet(parquet_file))
object.size(taxi_df)