forked from Al-Murphy/MungeSumstats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_vcf.R
159 lines (158 loc) · 6.1 KB
/
read_vcf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#' Read in VCF file
#'
#' Read in a VCF file as a \link[VariantAnnotation]{VCF} or a
#' \link[data.table]{data.table}.
#' Can optionally save the VCF/data.table as well.
#'
#' @param path Path to local or remote VCF file.
#' @param as_datatable Return the data as a
#' \link[data.table]{data.table} (default: \code{TRUE})
#' or a \link[VariantAnnotation]{VCF} (\code{FALSE}).
#' @param samples Which samples to use:
#' \itemize{
#' \item{1 : }{Only the first sample will be used (\emph{DEFAULT}).}
#' \item{NULL : }{All samples will be used.}
#' \item{c("<sample_id1>","<sample_id2>",...) : }{
#' Only user-selected samples will be used (case-insensitive).}
#' }
#' @param which Genomic ranges to be added if supplied. Default is NULL.
#' @param use_params
#' When \code{TRUE} (default), increases the speed of reading in the VCF by
#' omitting columns that are empty based on the head of the VCF (NAs only).
#' NOTE that that this requires the VCF to be sorted, bgzip-compressed,
#' tabix-indexed, which \link[MungeSumstats]{read_vcf} will attempt to do.
#' @param download Download the VCF (and its index file)
#' to a temp folder before reading it into R.
#' This is important to keep \code{TRUE} when \code{nThread>1} to avoid
#' making too many queries to remote file.
#' @param mt_thresh When the number of rows (variants) in the VCF is
#' \code{< mt_thresh}, only use single-threading for reading in the VCF.
#' This is because the overhead of parallelisation outweighs the speed benefits
#' when VCFs are small.
#' @param verbose Print messages.
#' @inheritParams check_empty_cols
#' @inheritParams format_sumstats
#' @inheritParams downloader
#' @inheritParams download_vcf
#' @inheritParams import_sumstats
#' @inheritParams VariantAnnotation::ScanVcfParam
#'
#' @return The VCF file in data.table format.
#' @export
#' @importFrom VariantAnnotation writeVcf
#' @importFrom data.table as.data.table setnames fwrite
#' @source
#' \code{
#' #### Benchmarking ####
#' library(VCFWrenchR)
#' library(VariantAnnotation)
#' path <- "https://gwas.mrcieu.ac.uk/files/ubm-a-2929/ubm-a-2929.vcf.gz"
#' vcf <- VariantAnnotation::readVcf(file = path)
#' N <- 1e5
#' vcf_sub <- vcf[1:N,]
#' res <- microbenchmark::microbenchmark(
#' "vcf2df"={dat1 <- MungeSumstats:::vcf2df(vcf = vcf_sub)},
#' "VCFWrenchR"= {dat2 <- as.data.frame(x = vcf_sub)},
#' "VRanges"={dat3 <- data.table::as.data.table(
#' methods::as(vcf_sub, "VRanges"))},
#' times=1
#' )
#' }
#' @source \href{https://github.com/Bioconductor/VariantAnnotation/issues/57}{
#' Discussion on VariantAnnotation GitHub}
#' @source \href{https://github.com/Bioconductor/VariantAnnotation/issues/59}{
#' Discussion on VariantAnnotation GitHub}
#' @examples
#' #### Local file ####
#' path <- system.file("extdata","ALSvcf.vcf", package="MungeSumstats")
#' sumstats_dt <- read_vcf(path = path)
#'
#' #### Remote file ####
#' ## Small GWAS (0.2Mb)
#' # path <- "https://gwas.mrcieu.ac.uk/files/ieu-a-298/ieu-a-298.vcf.gz"
#' # sumstats_dt2 <- read_vcf(path = path)
#'
#' ## Large GWAS (250Mb)
#' # path <- "https://gwas.mrcieu.ac.uk/files/ubm-a-2929/ubm-a-2929.vcf.gz"
#' # sumstats_dt3 <- read_vcf(path = path, nThread=11)
#'
#' ### Very large GWAS (500Mb)
#' # path <- "https://gwas.mrcieu.ac.uk/files/ieu-a-1124/ieu-a-1124.vcf.gz"
#' # sumstats_dt4 <- read_vcf(path = path, nThread=11)
read_vcf <- function(path,
as_datatable = TRUE,
save_path = NULL,
tabix_index = FALSE,
samples = 1,
which = NULL,
use_params = TRUE,
sampled_rows = 1e4L,
download = TRUE,
vcf_dir = tempdir(),
download_method = "download.file",
force_new = FALSE,
mt_thresh = 1e5L,
nThread = 1,
verbose = TRUE) {
#### Read VCF ####
## Returns either a VCF or a data.table, depending on as_datatable arg.
vcf_dt <- read_vcf_parallel(path = path,
samples = samples,
which = which,
use_params = use_params,
as_datatable = as_datatable,
sampled_rows = sampled_rows,
download = download,
vcf_dir = vcf_dir,
download_method = download_method,
force_new = force_new,
nThread = nThread,
verbose = verbose)
#### Check save path ####
if (!is.null(save_path)) {
check_save_path_out <- check_save_path(
### dummy arg, not used here
log_folder = "NULL",
### dummy arg, not used here
log_folder_ind = FALSE,
save_path = save_path,
write_vcf = !as_datatable,
tabix_index = tabix_index)
save_path <- check_save_path_out$save_path
}
#### Save as VCF ####
if (isFALSE(as_datatable)) {
if (!is.null(save_path)) {
VariantAnnotation::writeVcf(
obj = vcf_dt,
filename = check_save_path_out$save_path,
index = tabix_index
)
}
messager("Returning as VCF.")
return(vcf_dt)
}
#### Prepare SNP column ####
read_vcf_markername(sumstats_dt = vcf_dt)
#### Prepare/un-log P col ####
sumstats_dt <- read_log_pval(sumstats_dt = vcf_dt,
return_list=FALSE)
#### Prepare INFO col ####
read_vcf_info(sumstats_dt = vcf_dt)
#### Rename start col #####
data.table::setnames(vcf_dt, "start", "BP")
#### Write new data ####
if (!is.null(save_path)) {
messager("Storing intermediate file before proceeding ==>", save_path,
v = verbose)
data.table::fwrite(
x = vcf_dt,
file = save_path,
sep = "\t",
nThread = nThread,
na = "NA",
quote = FALSE
)
}
return(vcf_dt)
}