forked from Al-Murphy/MungeSumstats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_multi_gwas.R
119 lines (118 loc) · 5.24 KB
/
check_multi_gwas.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#' Ensure that only one model in GWAS sumstats or only one trait tested
#'
#' @param sumstats_dt data table obj of the summary statistics file for the GWAS
#' @param path Filepath for the summary statistics file to be formatted
#' @param analysis_trait If multiple traits were studied, name of the trait for
#' analysis from the GWAS. Default is NULL
#' @param mapping_file MungeSumstats has a pre-defined column-name mapping file
#' which should cover the most common column headers and their interpretations.
#' However, if a column header that is in youf file is missing of the mapping we
#' give is incorrect you can supply your own mapping file. Must be a 2 column
#' dataframe with column names "Uncorrected" and "Corrected". See
#' data(sumstatsColHeaders) for default mapping and necessary format.
#' @return list containing sumstats_dt, the modified summary statistics data
#' table object
#' @keywords internal
check_multi_gwas <- function(sumstats_dt, path, analysis_trait,
ignore_multi_trait,mapping_file) {
if(isFALSE(ignore_multi_trait)){
message("Checking for multi-GWAS.")
col_headers <- names(sumstats_dt)
# load synonym mapping - internal data no loading
# Check for multiple p values and other columns,
# if found choose 1 to continue
# Use P value to find - first load all p-value synonyms
p_syn <- sumstatsColHeaders[sumstatsColHeaders$Corrected == "P", 1]
# So P_value columns found have already been corrected to "P" so if you find
# any in p_syn in colheaders then we found multiple
# NOTE THIS ASSUMES AN '_' or '.' SEPARATES P VALUE SYN FROM TRAIT/MODEL
# NAME AND COLUMN NAME STARTS WITH P VAULE SYM
# Sort p_syn so longest ones checked first, once 1 found stop
# so don't get wrong subset with shorter ones
p_syn <- p_syn[order(nchar(p_syn), p_syn, decreasing = TRUE)]
for (pattern_i in p_syn) {
p_val_multi <- grepl(paste0(
"^", pattern_i, "_", "|",
"^", pattern_i, "[.]"
), col_headers)
if (any(p_val_multi)) {
break
}
}
# So note we could have 0 (do nothing), 1 (just update names),
# 2+ (remove all but one) traits
if (any(p_val_multi)) { # multiple traits
# first get trait name by removing specific p_syn
traits <- gsub(paste0(
"^", pattern_i, "_", "|",
"^", pattern_i, "[.]"
), "", col_headers[p_val_multi])
if (length(traits) > 1) {
if (!is.null(analysis_trait)) {
analysis_trait <- as.character(analysis_trait)
} # ensure character
msg2 <- paste0(
"WARNING: Multiple traits found in sumstats file only one",
" of which can be analysed: \n",
paste(traits, collapse = ", ")
)
message(msg2)
stp_msg <- paste0(
"Inputted analysis_trait not one of these traits. Pleas",
"e input one of the traits above.\n Or set",
"`ignore_multi_trait=TRUE` to ignore these multi-trait ",
"p-values and rerun `format_sumstats()`."
)
if (is.null(analysis_trait) ||
!toupper(analysis_trait) %in% toupper(traits)) {
stop(stp_msg)
}
# get right case for choice
chosen_trait <- traits[toupper(traits) %in% toupper(analysis_trait)]
} else { # just one trait
msg <- paste0(
"WARNING: A single multi-trait was found in sumstats file: \n",
paste(traits, collapse = ", "),"\n",
"If you don't want to use this value for P, set ",
"`ignore_multi_trait=TRUE` and rerun `format_sumstats()`"
)
message(msg)
chosen_trait <- traits
}
# just 1 trait from start or 1 chosen by user,
# remove the trait from col name
# then get the correct synonyms
# NOTE ASSUMES TRAIT NAME STARTS WITH '_' or '.' &
# IS AT THE END OF COLUMN
chnge_header_names <- col_headers[grepl(
paste0(
"_", chosen_trait, "$", "|",
"[.]", chosen_trait, "$"
),
col_headers
)]
new_names <- gsub(paste0(
"_", chosen_trait, "$", "|",
".", chosen_trait, "$"
), "", chnge_header_names)
#if names present already, rename
for(new_i in new_names){
if(new_i %in% colnames(sumstats_dt)){
data.table::setnames(sumstats_dt, new_i, paste0(new_i,"_input"))
}
}
data.table::setnames(sumstats_dt, chnge_header_names, new_names)
# RE-standardise headers for all OS
sumstats_return <-
standardise_sumstats_column_headers_crossplatform(
sumstats_dt = sumstats_dt,
mapping_file = mapping_file
)
return(sumstats_return)
} else {
return(list("sumstats_dt" = sumstats_dt))
}
} else {
return(list("sumstats_dt" = sumstats_dt))
}
}