forked from Al-Murphy/MungeSumstats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_allele_flip.R
248 lines (245 loc) · 10.6 KB
/
check_allele_flip.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#' Ensure A1 & A2 are correctly named, if GWAS SNP constructed as
#' Alternative/Reference or Risk/Nonrisk alleles these SNPs will need to be
#' converted to Reference/Alternative or Nonrisk/Risk. Here non-risk is defined
#' as what's on the reference genome (this may not always be the case).
#'
#' @inheritParams format_sumstats
#' @param log_files list of log file locations
#' @param standardise_headers Run
#' \code{standardise_sumstats_column_headers_crossplatform} first.
#'
#' @return A list containing two data tables:
#' \itemize{
#' \item \code{sumstats_dt}: the modified summary statistics
#' \code{data.table} object.
#' \item \code{rsids}: snpsById, filtered to SNPs of interest if
#' loaded already. Or else NULL.
#' \item \code{log_files}: log file list
#' }
#' @keywords internal
#' @import R.utils
#' @importFrom data.table setkey
#' @importFrom data.table :=
#' @importFrom data.table setnames
#' @importFrom data.table set
#' @importFrom data.table setorder
#' @importFrom data.table copy
check_allele_flip <- function(sumstats_dt, path,
ref_genome, rsids,
allele_flip_check,
allele_flip_drop,
allele_flip_z,
allele_flip_frq,
bi_allelic_filter,
flip_frq_as_biallelic,
imputation_ind,
log_folder_ind,
check_save_out,
tabix_index,
nThread,
log_files,
standardise_headers = FALSE,
mapping_file,
dbSNP) {
# # GenomicSEM' allele flipping strategy:
# file.path("https://github.com/GenomicSEM/GenomicSEM/blob",
# "fc8f17a817a8022d6900acf41824d27b3676f9c4/R/munge.R#L151")
# #example
# path <- system.file("extdata","eduAttainOkbay.txt",
# package="MungeSumstats")
# sumstats_dt <- read_sumstats(path = path)
# sumstats_return <- check_allele_flip(sumstats_dt = sumstats_dt,
# path=path,
# ref_genome="GRCh37",
# rsids=NULL,
# allele_flip_check=TRUE,
# standardise_headers=TRUE)
## Set variables to be used in in place data.table functions to NULL
## to avoid confusing BiocCheck.
SNP <- i.seqnames <- CHR <- BP <- i.pos <- LP <- P <- A1 <- A2 <- eff_i <-
i.A1 <- i.A2 <- ss_A1 <- ss_A2 <- i.ref_allele <-
ref_gen_allele <- match_type <-
tmp <- flipped <- NULL
if (standardise_headers) {
sumstats_dt <-
standardise_sumstats_column_headers_crossplatform(
sumstats_dt = sumstats_dt,
mapping_file = mapping_file
)[["sumstats_dt"]]
}
# If SNP present but no A1/A2 then need to find them
col_headers <- names(sumstats_dt)
if (sum(c("A1", "A2") %in% col_headers) == 2 && allele_flip_check) {
message(
"Checking for correct direction of A1 (reference) ",
"and A2 (alternative allele)."
)
# check if rsids loaded if not do so
if (is.null(rsids)) {
rsids <-
load_ref_genome_data(
data.table::copy(sumstats_dt$SNP),
ref_genome = ref_genome,
dbSNP = dbSNP, msg=NULL
)
}
# ensure rsids is up-to-date with filtered sumstats_dt
rsids <- rsids[unique(sumstats_dt$SNP), , nomatch = NULL]
data.table::setkey(rsids, SNP)
data.table::setkey(sumstats_dt, SNP)
# Append reference genome to data and check matches to both A1 or A2
# For each SNP, if ref genome matches A1, leave it (TRUE)
# if ref genome matches A2, flip it (FALSE)
# if ref genome doesn't match A1 or A2, leave it (TRUE)
sumstats_dt[rsids, ref_gen_allele := i.ref_allele]
sumstats_dt[is.na(ref_gen_allele), match_type := TRUE]
sumstats_dt[A1 == ref_gen_allele, match_type := TRUE]
sumstats_dt[A2 == ref_gen_allele, match_type := FALSE]
print(sumstats_dt)
# drop cases that don't match either
if (allele_flip_drop &&
nrow(sumstats_dt[A1 != ref_gen_allele &
A2 != ref_gen_allele, ]) > 0) {
print_msg0 <-
paste0(
"There are ",
formatC(nrow(sumstats_dt[A1 != ref_gen_allele &
A2 != ref_gen_allele, ]), big.mark = ","),
" SNPs where neither A1 nor A2 match the reference genome.",
" These will be removed."
)
message(print_msg0)
# If user wants log, save it to there
if (log_folder_ind) {
name <- "alleles_dont_match_ref_gen"
name <- get_unique_name_log_file(
name = name,
log_files = log_files
)
write_sumstats(
sumstats_dt = sumstats_dt[(A1 != ref_gen_allele &
A2 != ref_gen_allele), ],
save_path =
paste0(
check_save_out$log_folder,
"/", name,
check_save_out$extension
),
sep = check_save_out$sep,
#don't tab indx as could be miss values & cause err
#tabix_index = tabix_index,
nThread = nThread
)
log_files[[name]] <-
paste0(
check_save_out$log_folder, "/", name,
check_save_out$extension
)
}
sumstats_dt <- sumstats_dt[!(A1 != ref_gen_allele &
A2 != ref_gen_allele), ]
} else {
sumstats_dt[
A1 != ref_gen_allele & A2 != ref_gen_allele,
match_type := TRUE
]
}
# continue if flipping necessary
if (nrow(sumstats_dt[match_type == FALSE, ]) > 0) {
print_msg <- paste0(
"There are ",
formatC(nrow(sumstats_dt[match_type == FALSE, ]),
big.mark = ","),
" SNPs where A1 doesn't match the reference genome.",
"\nThese will be flipped with their effect columns."
)
message(print_msg)
# swap A1 and A2 for those SNPs needing to be flipped
sumstats_dt[match_type == FALSE, tmp := A2]
sumstats_dt[match_type == FALSE, A2 := A1]
sumstats_dt[match_type == FALSE, A1 := tmp]
sumstats_dt[, tmp := NULL]
# flip effect column(s) - BETA, OR, Z, log_odds, SIGNED_SUMSTAT, FRQ
effect_columns <- c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT")
if (allele_flip_z) {
effect_columns <- c(effect_columns, "Z")
}
if (allele_flip_frq) {
effect_columns <- c(effect_columns, "FRQ")
}
effect_columns <-
effect_columns[effect_columns %in% names(sumstats_dt)]
# if FRQ present and needs to be flipped for SNPs ensure
# bi_allelic_filter is TRUE
stp_msg <- paste0(
"Certain SNPs need to be flipped along with their ",
"effect columns and frequency column. However to flip ",
"the\nFRQ column, only bi-allelic SNPs can be ",
"considered. It is recommended to set ",
"bi_allelic_filter to TRUE so\nnon-bi-allelic SNPs are",
" removed. Otherwise, set allele_flip_frq to FALSE to ",
"not flip the FRQ column but note\nthis could lead to ",
"incorrect FRQ values.\nA new option added is to flip ",
"non-bi-allelic SNPs as if they were bi-allelic (1-FRQ), ",
"to do this set\nflip_frq_as_biallelic to TRUE but note ",
"these values will not be exact down to the missing ",
"frequencies of the other\nalternative allele(s)."
)
if (nrow(sumstats_dt[match_type == FALSE, ]) > 0 &&
"FRQ" %in% effect_columns &&
!bi_allelic_filter && !flip_frq_as_biallelic) {
stop(stp_msg)
}
#if set flip_frq_as_biallelic = TRUE, let them know
if (nrow(sumstats_dt[match_type == FALSE, ]) > 0 &&
"FRQ" %in% effect_columns &&
!bi_allelic_filter && flip_frq_as_biallelic) {
print_msg <- paste0(
"Note: You have set flip_frq_as_biallelic to TRUE meaning of ",
"the ",
formatC(nrow(sumstats_dt[match_type == FALSE, ]),
big.mark = ","),
" SNPs to be flipped, ",
"any non-bi-allelic SNPs\nwill have their frequencies flipped ",
"as (1-FRQ) essentially ignoring the frequencies of any other ",
"alternative alleles."
)
message(print_msg)
}
for (eff_i in effect_columns) { # set updates quicker for DT
# conversion done in case, VCF beta column may not be numeric
if (eff_i == "FRQ") {
sumstats_dt[
match_type == FALSE,
(eff_i) := (1 - as.numeric(get(eff_i)))
]
} else if(eff_i == "OR"){
sumstats_dt[
match_type == FALSE,
(eff_i) := (1/as.numeric(get(eff_i)))
]
}
else {
sumstats_dt[
match_type == FALSE,
(eff_i) := as.numeric(get(eff_i)) * -1
]
}
}
if (imputation_ind) {
sumstats_dt[match_type == FALSE, flipped := TRUE]
}
}
# remove extra created columns and return
sumstats_dt[, ref_gen_allele := NULL]
sumstats_dt[, match_type := NULL]
return(list(
"sumstats_dt" = sumstats_dt,
"rsids" = rsids, "log_files" = log_files
))
}
return(list(
"sumstats_dt" = sumstats_dt,
"rsids" = rsids, "log_files" = log_files
))
}