-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path00.Ready.R
320 lines (292 loc) · 16.4 KB
/
00.Ready.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# This script is written for reading CSMAR data batched,
# and then store them as tables in SQLite database and RData file.
# Importing data through querying the SQL database will solve the problem
# of memory shortage and process timeout, but the relative shortcoming is
# the data type defined in R will vanish, especially for the date and factor,
# and we need to transform the data type of them carefully.
library(tidyverse)
library(magrittr)
library(lubridate)
library(DBI)
library(RSQLite)
library(dbplyr)
library(parallel)
library(doParallel)
CL <- makeCluster(detectCores(logical = FALSE) - 1L)
registerDoParallel(CL)
# working directory
setwd("~/OneDrive/Data.backup/QEAData")
# link to SQLite database
QEA_db <- dbConnect(RSQLite::SQLite(), "./QEA_db.sqlite")
# Part I, create tables with annual, quarter, and daily data ------------
# annual =====
annual_path <- list.files(path = "./Acc_Annual",
pattern = "^(CSR|FAR|MNM).*[^]]\\.txt$",
full.names = TRUE)
annual <- foreach(i = 1:length(annual_path),
# the combine approach is taking collection
.combine = full_join,
# will be used package, `readr` to read data
.packages = c("readr")
) %dopar%
read_delim(annual_path[i],
delim = "\t", na = '',
col_types = cols(.default = col_double(),
Stkcd = col_character(),
Accper = col_date('%Y-%m-%d'),
Annodt = col_date('%Y-%m-%d'))
)
arrange(annual, Accper, Stkcd) %>%
dbWriteTable(QEA_db, "annual", .)
# quarter =====
quarter_path <- list.files(path = "./Acc_Quarter",
pattern = "[0-9]\\.txt$",
full.names = TRUE)
quarter <- foreach(i = 1:length(quarter_path),
.combine = full_join,
.packages = c("readr")
) %dopar%
read_delim(quarter_path[i],
delim = "\t", na = '',
col_types = cols(.default = col_double(),
Stkcd = col_character(),
Accper = col_date('%Y-%m-%d'),
Typrep = col_character(),
Indcd = col_character())
)
arrange(quarter, Accper, Stkcd) %>%
dbWriteTable(QEA_db, "quarter", .)
stopCluster(CL)
# all the accounting indicators in earnings report of listed firms ====
# cash flow statement
CashFlow <- read_delim(file = "./CashFlow_Statement/FS_Comscfd.txt",
delim = '\t', na = '',
col_types = cols(.default = col_double(),
Stkcd = col_character(),
Accper = col_date(format = '%Y-%m-%d'),
Typrep = col_character())
)
arrange(CashFlow, Stkcd, Accper, Typrep) %>%
dbWriteTable(QEA_db, "CashFlow_Statement", .)
# Income statement
Income <- read_delim(file = "./Income_Statement/FS_Comins.txt",
delim = '\t', na = '',
col_types = cols(.default = col_double(),
Stkcd = col_character(),
Accper = col_date(format = '%Y-%m-%d'),
Typrep = col_character())
)
arrange(Income, Accper, Stkcd, Typrep) %>%
dbWriteTable(QEA_db, "Income_Statement", .)
# Balance sheet
Balance <- read_delim(file = "./Balance_Sheet/FS_Combas.txt",
delim = '\t', na = '',
col_types = cols(.default = col_double(),
Stkcd = col_character(),
Accper = col_date(format = '%Y-%m-%d'),
Typrep = col_character())
)
arrange(Balance, Accper, Stkcd, Typrep) %>%
dbWriteTable(QEA_db, "Balance_Sheet", .)
rm(annual, quarter, CashFlow, Income, Balance); gc()
# daily trading data =====
TRD_Dalyr <- read_delim(file = './Acc_Daily/TRD_Dalyr.txt',
delim = '\t', na = '',
col_types = cols(.default = col_double(),
Stkcd = col_character(), # stocks symbol
Trddt = col_date(format = '%Y-%m-%d'), # trading date
# the last date of stock capitalization changed
Dretwd = col_skip(), Capchgdt = col_skip(),
Adjprcwd = col_skip(), Adjprcnd = col_skip(),
# 1=SH-A, 2=SH-B, 4=SZ-A, 8=SZ-B, 16=startup, 32=Tech
Markettype = col_factor(levels = c(1,2,4,8,16,32)),
Trdsta = col_factor(levels = c(1:16))) # trading status
) %>%
# only the category of 1 indicate that stocks were traded normally
filter(Trdsta == "1") %>% select(-Trdsta) %>%
# focus on Chinese A-Share markets
filter(Markettype %in% as.character(c(1, 4, 16))) %>% droplevels() %>%
# rename the column of trading date
rename("TradingDate" = Trddt) %>%
# sorting by stock and trading date
arrange(Markettype, Stkcd, TradingDate) %>%
# confirm the daily returns of stocks we calculated is same with CSMAR's ====
# when any value of daily returns calculated by ourselves
# is equal to the return data form CSMAR, we select CSMAR's data
group_nest(Stkcd) %>% # generate the time series lists sort by stocks
mutate('DR' = map_lgl(data, # R_{it} = ln(P_{it}) - ln(P_{it-1})
~ `/`(.x$Clsprc[-1], .x$Clsprc[-length(.x$Clsprc)]) %>%
log(base = exp(1)) %>%
near(.x$Dretnd[-1]) %>%
any()
)
) %>%
subset(DR == TRUE, select = -DR) %>%
unnest(cols = 'data')
# Import data of the transaction derivative index of stocks
MKT_Dalyr <- read_delim(file = "./Acc_Daily/STK_MKT_Dalyr.txt",
delim = "\t", na = '',
col_types = cols(.default = col_double(),
Symbol = col_character(),
TradingDate = col_date("%Y-%m-%d"),
ShortName = col_skip(),
SecurityID = col_skip(),
Ret = col_skip())
) %>%
# stocks symbol, need to rename
rename('Stkcd' = Symbol)
# Import the daily one-year-deposit-interest-rate, free-risk return in market
Nrrate <- read_delim(file = './TRD_Nrrate.txt',
delim = '\t', na = '',
col_types = cols_only(Clsdt = col_date(format = '%Y-%m-%d'),
Nrr1 = col_factor(levels = c('NRI01', 'TBC')),
Nrrdaydt = col_double() # risk-free interest
)
) %>%
# select the one-year deposit interest rate as the free-risk return
filter(Nrr1 == 'NRI01') %>% select(-Nrr1) %>%
# # we could either select the Treasury rate as risk-free interest
# filter(Nrr1 == "TBC") %>% select(- Nrr1) %>%
rename('TradingDate' = Clsdt)
left_join(TRD_Dalyr, MKT_Dalyr, by = c("Stkcd", "TradingDate")) %>%
left_join(Nrrate, by = 'TradingDate') %>%
dbWriteTable(QEA_db, "daily", .)
rm(TRD_Dalyr, MKT_Dalyr); gc()
# Part II, prepare the data to reproduce Fama-French factor model --------
# Input the listing date and industry of stocks for wiping out the stocks listed newly
AF_Co <- read_delim(file = 'AF_Co.csv',
delim = '\t', na = '',
col_types = cols_only(Stkcd = col_character(),
IndClaCd = col_factor(), # industry divide standard
Indus = col_character(), # industry symbol
Listdt = col_date('%Y-%m-%d'))
) %>%
# just using the 2012 Edition Industry Classification
# published by China Securities Regulatory Commission
filter(IndClaCd == 2) %>% select(-IndClaCd)
# the paying attention resource disclosure by stock exchange, yearly data
# the market size, the number of analysis team following,
# and how much survey papers are issued (opacity of company)
AF_Cfeature <- read_delim('./Acc_Annual/AF_CFEATUREPROFILE.txt',
delim = '\t', na = '',
col_types = cols_only(Stkcd = col_character(),
Accper = col_date(format = "%Y-%m-%d"),
CompanySize = col_double(),
# the number of analysis teams
AnaAttention = col_integer(),
# the number of analysis reports
ReportAttention = col_integer(),
# the information opacity of company
CompanyOpacity = col_character())
)
save(AF_Co, AF_Cfeature, file = "./FirmAttr.RData")
# Import the daily trading in China A-Share markets
trddat <- dbGetQuery(QEA_db,
"SELECT Markettype, Stkcd, TradingDate, Clsprc, Dretnd, Dsmvosd
FROM daily"
) %>%
# transform the record of trading date to date form
mutate(TradingDate = as.Date(TradingDate, origin = "1970-01-01")) %>%
group_nest(Markettype, Stkcd) %>%
# join the information of stocks listed date and industry to trading data
inner_join(AF_Co, by = 'Stkcd')
dbDisconnect(QEA_db)
# Setup trading day in Chinese A-share markets
trdday <- read_delim(file = 'TRD_Cale.csv', delim = '\t', na = '',
col_types = cols(# 1=SH-A, 2=SH-B, 4=SZ-A, 8=SZ-B, 16=startup, 32=Tech
Markettype = col_factor(c(1,2,4,8,16,32)),
Clddt = col_date(format = "%Y-%m-%d"),
# c(0:6): Sun < Mon < Tue < Wed < Thu < Fri < Sat
Daywk = col_factor(c(0:6)),
# "Open" or "Closed", stock share were traded or not
State = col_factor(levels = c('C', 'O')))
) %>%
rename('TradingDate' = Clddt) %>%
# trading date in China A-Share markets
filter(Markettype %in% as.character(c(1,4,16))) %>% droplevels() %>%
# columns by market class
spread(Markettype, State) %>%
na.omit()
# confirm the trade calendar is same among different kinds of share market
if (`&`(all.equal(trdday$`1`, trdday$`4`),
all.equal(trdday$`4`, trdday$`16`))
) { # we take the trading dates of Shanghai A-share as China A-share market's
trdday %<>% subset(`1` == "O", select = TradingDate, drop = TRUE)
} else print("Attention! The trading dates among China A-share markets are different.")
# the structure of shares of stocks (annual capitalization)
Nshr <- read_delim(file = "./Acc_Annual/CG_Capchg.txt",
delim = '\t', na = '',
col_types = cols_only(Stkcd = col_character(),
# annual interval
Reptdt = col_date(format = "%Y-%m-%d"),
Nshrnn = col_double(), # non-circulation
Nshra = col_double() # A-shares under circulation
)
) %>%
# take a sum of non-circulation shares and A-shares
transmute(Stkcd, Reptdt, shrttl = Nshrnn + Nshra) %>%
# for merge convenience, just extraction the year from report date
# be sure that the capitalization is a annual data in earning reports
mutate(Reptdt = year(Reptdt)) %>%
arrange(Reptdt)
# store above data that will be used in next script as a image
save(trddat, # the daily trading data, the listed date and industry of stocks
trdday, # the trading date of Chinese-A-Share markets
Nrrate, # the daily risk-free interests of capital market
Nshr, # the yearly situation of structure of shares of stocks
file = "./PrePotfol.RData")
# Prepare data relative with quarterly reports in advance ====
# Import the status data of quarterly financial report
ReptInfo <- read_delim('./Acc_Quarter/IAR_Rept.txt',
delim = '\t', na = '',
col_types = cols_only(Stkcd = col_character(),
# 1:4, first quarter, mid of year, third quarter,
# and year earnings report
Reptyp = col_factor(levels = c(1:4)),
# the deadline of accounting cycle
Accper = col_date(format = "%Y-%m-%d"),
# the date when report was discolsed
Annodt = col_date(format = "%Y-%m-%d"),
# the day of week when report was discolsed
# c(0:6): Sun < Mon < Tue < Wed < Thu < Fri < Sat
Annowk = col_factor(levels = c(0:6)),
# net profits and earnings per share
Profita = col_double(), Erana = col_double())
)
# we could observe that the symbols of stocks belong to China A-Share markets
# are begin with number c(0, 3, 6),
# so we use regular expression and string function 'grepl' to filter others
if (nrow(problems(ReptInfo)) >= 0L) {
ReptInfo %<>% `[`(-unique(problems(.)$row), ) %>%
filter(grepl('^[0-6]', Stkcd)) %>%
arrange(Stkcd, Accper)
}
# structure the sub-sample according to pre-earnings report ====
# ForecFinReportType - Nine categories including turn to loss, continued loss,
# turn to gain profit from loss, continued profitability, large increase,
# large decrease, slight increase, slight decrease, and uncertainty
PreRept <- read_delim('./Acc_Quarter/FIN_F_ForecFin.txt',
delim = '\t', na = '',
col_types = cols_only(# the symbol of stocks, need to be rename
StockCode = col_character(),
PubliDate = col_date(format = "%Y-%m-%d"),
AccPeriod = col_date(format = "%Y-%m-%d"),
# 0 = performance advance expose,
# 1 = regular disclosure
Source = col_factor(),
# the categories of report source
ForecFinReportType = col_factor())
) %>%
rename('Stkcd' = StockCode) %>%
filter(grepl('^[0-6]', Stkcd))
save(ReptInfo, # the key accounting index in quarterly earnings reports
PreRept, # the pre-disclosure report of stocks which operated unexpected
file = "./ReportInfo.RData")
# take a look at the numbers of stocks listed in a year
ggplot(data = count(AF_Co, year = year(Listdt), Industry = str_sub(Indus, 1, 1))) +
geom_col(aes(x = year, y = n, fill = Industry)) +
labs(x = "Year", y = "The count of firms listed in this year") +
theme_bw()
# export this image
ggsave(filename = glue::glue("./Stock-Indus-His.pdf"),
width = 16, height = 9)