forked from Al-Murphy/MungeSumstats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_four_step_col.R
134 lines (132 loc) · 5.66 KB
/
check_four_step_col.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#' Ensure that CHR:BP:A2:A1 aren't merged into 1 column
#'
#' @param sumstats_dt data table obj of the summary statistics file for the GWAS
#' @param path Filepath for the summary statistics file to be formatted
#' @return list containing sumstats_dt, the modified
#' summary statistics data table object
#' @keywords internal
#' @importFrom data.table tstrsplit
#' @importFrom data.table :=
check_four_step_col <- function(sumstats_dt, path) {
A2 <- A1 <- NULL
# get col headers
col_headers <- names(sumstats_dt)
# Obtain a row of the actual data
row_of_data <- as.character(sumstats_dt[1, ])
# Check if there is a column of data with CHR:BP:A2:A1 format
fourStepCol <- grep(".*:.*:\\w:\\w", row_of_data)
# in case there are more than one column with ":", just take first one
if (length(fourStepCol) > 1) {
# sort to get most recent genome build by default
# (cols: SNP_hg19, SNP_hg18)
keep_col <- sort(col_headers[fourStepCol], decreasing = TRUE)[1]
drop_cols <- sort(col_headers[fourStepCol], decreasing = TRUE)[-1]
msg <- paste0(
"Warning: Multiple columns in the sumstats file seem to ",
"relate to Chromosome:Base Pair position:A2:A1.\nThe column",
" ", keep_col, " will be kept whereas the column(s) ",
drop_cols, " will be removed.\nIf this is not the correct ",
"column to keep, please remove all incorrect columns from ",
"those listed here before \nrunning `format_sumstats()`."
)
message(msg)
# Get data without dropped
sumstats_dt[, (drop_cols) := NULL]
fourStepCol <- which(col_headers == keep_col)
}
if (length(fourStepCol)) {
keep_col <- col_headers[fourStepCol]
# split out col into separate values, keep names
format <- strsplit(keep_col, ":")[[1]]
if (length(format) != 4) { # check : and underscore in name
format <- strsplit(keep_col, "_")[[1]]
}
if (length(format) != 4) { # If neither found
# first check if allele col exists and assign based on that
fourStepCol_val <- row_of_data[fourStepCol]
if (sum("A2" %in% col_headers) == 1 &&
sum("A1" %in% col_headers) == 1){
A2_val <- sumstats_dt[1,A2]
A1_val <- sumstats_dt[1,A1]
split_fourStepCol <- strsplit(fourStepCol_val,split=":")[[1]]
A1_fnd <- any(split_fourStepCol==A1_val)
A2_fnd <- any(split_fourStepCol==A2_val)
A1_ind <- which(split_fourStepCol==A1_val)
A2_ind <- which(split_fourStepCol==A2_val)
}
#make sure you got a hit for A1 and A2 vals
if(sum("A2" %in% col_headers) == 1 &&
sum("A1" %in% col_headers) == 1 &&
A1_fnd && A2_fnd){
#bought A1 and A2 are present and we know their position
format <- vector(mode="character", length=4)
othrs <- c('CHR','BP')
j <- 1
for (i in seq_len(4)){
if (i == A1_ind){
format[[i]] <- 'A1'
} else if (i == A2_ind){
format[[i]] <- 'A2'
} else{
format[[i]] <- othrs[[j]]
j = j+1
}
}
} else if(sum("A2" %in% col_headers) == 1){
A2_val <- sumstats_dt[1,A2]
split_fourStepCol <- strsplit(fourStepCol_val,split=":")[[1]]
A2_fnd <- any(split_fourStepCol==A2_val)
A2_ind <- which(split_fourStepCol==A2_val)
if (A2_ind == 4){
format <- c("CHR", "BP", "A1", "A2")
} else if(A2_ind == 3){
format <- c("CHR", "BP", "A2", "A1")
} else if(A2_ind == 2){
format <- c("CHR", "A2","BP", "A1")
} else if(A2_ind == 1){
format <- c("A2", "CHR","BP", "A1")
} else{
#not found....
format <- c("CHR", "BP", "A2", "A1")
}
} else if(sum("A1" %in% col_headers) == 1){
A1_val <- sumstats_dt[1,A1]
split_fourStepCol <- strsplit(fourStepCol_val,split=":")[[1]]
A1_fnd <- any(split_fourStepCol==A1_val)
A1_ind <- which(split_fourStepCol==A1_val)
if (A1_ind == 4){
format <- c("CHR", "BP", "A2", "A1")
} else if(A1_ind == 3){
format <- c("CHR", "BP", "A1", "A2")
} else if(A1_ind == 2){
format <- c("CHR", "A1","BP", "A2")
} else if(A1_ind == 1){
format <- c("A1", "CHR","BP", "A2")
} else{
#not found....
format <- c("CHR", "BP", "A2", "A1")
}
}
else{
#otherwise, just use this as default order
format <- c("CHR", "BP", "A1", "A2")
}
}
sumstats_dt[, (format) := data.table::tstrsplit(get(keep_col),
split = ":", fixed = TRUE,
type.convert = TRUE
)]
# remove combined column
sumstats_dt[, (keep_col) := NULL]
msg <- paste0(
"Column ", keep_col, " has been separated into the columns ",
paste(format, collapse = ", "),"\nIf this is the incorrect ",
"format for the column, update the column name to the correct ",
"format e.g.`CHR:BP:A2:A1` and format_sumstats()."
)
message(msg)
return(list("sumstats_dt" = sumstats_dt))
} else {
return(list("sumstats_dt" = sumstats_dt))
}
}