-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathextract_features.py
230 lines (201 loc) · 13.8 KB
/
extract_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
import joblib
import numpy as np
import pandas as pd
import pybedtools
import pysam
import re
import sys
from typing import Tuple
# Make a few dictionaries/tables from the pedigree file
def make_pedigree_dicts(ped_filepath):
pedigree_dict = {}
offspring_parents_dict = {}
sample_sex_and_phenotype_dict = {}
with open(ped_filepath, "r") as f:
for line in f:
linesplit = line.rstrip().split("\t")
family_id = linesplit[0]
sample_id = linesplit[1]
father_id = linesplit[2]
mother_id = linesplit[3]
sex_code = linesplit[4] # ('1' = male, '2' = female, '0' = unknown)
phenotype_code = linesplit[5] # ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control)
if family_id not in pedigree_dict: pedigree_dict[family_id] = {"father_id": "", "mother_id": "", "offspring": []}
if father_id == "0" and mother_id == "0":
if sex_code == "1": pedigree_dict[family_id]["father_id"] = sample_id
elif sex_code == "2": pedigree_dict[family_id]["mother_id"] = sample_id
else:
pedigree_dict[family_id]["offspring"].append(sample_id)
offspring_parents_dict[sample_id] = (father_id, mother_id)
sample_sex_and_phenotype_dict[sample_id] = {"sex_code": sex_code, "phenotype_code": phenotype_code}
return pedigree_dict, offspring_parents_dict, sample_sex_and_phenotype_dict
def make_sample_index_dicts(vcf_filepath):
vcf_iterator = pysam.VariantFile(vcf_filepath, mode = "r")
index_sample_dict = dict(enumerate(vcf_iterator.header.samples))
sample_index_dict = {v: k for k, v in index_sample_dict.items()}
return sample_index_dict, index_sample_dict
def make_offspring_index_dict(offspring_parents_dict, sample_index_dict):
offspring_index_id_dict = {}
for sample in offspring_parents_dict:
offspring_index_id_dict[sample_index_dict[sample]] = sample
return offspring_index_id_dict
def get_ratio(feature):
# Check that it's tuple of size 2
buffer_ = 1.0
return float(feature[0]) / (float(feature[1]) + buffer_)
def get_log2_ratio(feature):
# Check that it's a tuple of size 2
buffer_ = 1.0
feature_log2_coverage_ratio = np.log2( (float(feature[0]) + float(feature[1]) + buffer_) / (np.median([float(feature[0]), float(feature[1])]) + buffer_) )
return feature_log2_coverage_ratio
# Extract putative de novo mutations from the VCF
def make_features_dict(vcf_filepath, offspring_index_id_dict, offspring_parents_dict, sample_index_dict, sample_sex_and_phenotype_dict, features_file = None, region = None):
func_name_dict = {"get_ratio": get_ratio, "get_log2_ratio": get_log2_ratio}
dnm_features_dict = {}
vcf_iterator = pysam.VariantFile(vcf_filepath, mode = "r")
excluded_info_features = set(["AC", "AF", "AN", "ClippingRankSum", "DP", "DS", "END", "ExcessHet", "HaplotypeScore", "MLEAC", "MLEAF", "NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE", "RAW_MQ", "VariantType", "culprit"])
info_features = []
format_features = []
format_features_for_custom_features = [] # custom features may use these features as inputs (but they should not be saved in the output table)
custom_features = []
custom_feature_function_dict = {}
custom_feature_input_dict = {}
if features_file:
with open(features_file, "r") as f:
# Check that these features exist in the header...
info_features = f.readline().replace(" ", "").strip().split(",")
format_features = f.readline().replace(" ", "").strip().split(",")
for line in f:
regex_match = re.match("(^\w+):(int|float)=(\w+)\((\w+)\)", line.rstrip().replace(" ", ""))
feature_name = regex_match.group(1)
return_type = regex_match.group(2)
function_name = regex_match.group(3)
input_feature_name = regex_match.group(4)
func = func_name_dict[function_name]
custom_feature_function_dict[feature_name] = func
custom_feature_input_dict[feature_name] = input_feature_name
if input_feature_name not in format_features: format_features_for_custom_features.append(input_feature_name)
else:
# Default features to extract
for info_field in vcf_iterator.header.info:
if info_field in excluded_info_features: continue
if vcf_iterator.header.info[info_field].number != 1: continue # Exclude non-scalar values for now
info_features.append(info_field)
for format_field in vcf_iterator.header.formats:
if format_field == "GT": continue
format_features.append(format_field)
if "AD" in format_features:
custom_features.append("AR")
custom_feature_function_dict["AR"] = func_name_dict["get_ratio"]
custom_feature_input_dict["AR"] = "AD"
custom_features.append("AD_log2_coverage_ratio")
custom_feature_function_dict["AD_log2_coverage_ratio"] = func_name_dict["get_log2_ratio"]
custom_feature_input_dict["AD_log2_coverage_ratio"] = "AD"
number_of_samples = len(vcf_iterator.header.samples)
# When we get the format-level features, we want to determine what types (of numbers) they are using the VCF header.
# If a particular format feature is single-valued (1), then we'll get 3 values for it (for offspring, mother, father).
# If a particular format feature has one value for each possible genotype ("G"), then we'll get 3 values per sample (and so 9 total for a trio pedigree).
# For each format feature, get the offspring and their parents' values
format_features_dict = {}
vcf_header = vcf_iterator.header
for record in vcf_iterator:
# Skip multiallelic variants
if len(record.alts) > 1: continue
# Skip X and Y for now (not anymore...)
# if "X" in record.chrom or "Y" in record.chrom: continue
# Get INFO-level features
dnm_info_features = {}
for feature in info_features:
if feature in record.info:
dnm_info_features[feature] = record.info[feature]
else: dnm_info_features[feature] = np.nan # In error log, say that this feature doesn't exist for this record
for i in range(number_of_samples):
if i not in offspring_index_id_dict: continue
offspring_id = offspring_index_id_dict[i]
(father_id, mother_id) = offspring_parents_dict[offspring_id]
offspring_GT = record.samples[i]["GT"]
father_GT = record.samples[sample_index_dict[father_id]]["GT"]
mother_GT = record.samples[sample_index_dict[mother_id]]["GT"]
# Filter out non-putative de novo mutations
if "X" in record.chrom and sample_sex_and_phenotype_dict[offspring_id] == "1":
if offspring_GT != (0,1): continue
if mother_GT != (0, 0): continue
elif "Y" in record.chrom and sample_sex_and_phenotype_dict[offspring_id] == "1":
if offspring_GT != (0,1): continue
if father_GT != (0, 0): continue
else:
if offspring_GT != (0, 1): continue
if father_GT != (0, 0): continue
if mother_GT != (0, 0): continue
key = (record.chrom, record.pos, record.ref, record.alts[0], offspring_id, sample_sex_and_phenotype_dict[offspring_id]["sex_code"], sample_sex_and_phenotype_dict[offspring_id]["phenotype_code"])
dnm_features_dict[key] = {}
dnm_features_dict[key]["offspring_GT"] = offspring_GT
dnm_features_dict[key]["father_GT"] = father_GT
dnm_features_dict[key]["mother_GT"] = mother_GT
# Getting format fields
for format_feature in format_features:
dnm_features_dict[key]["{}_{}".format("offspring", format_feature)] = record.samples[i][format_feature]
dnm_features_dict[key]["{}_{}".format("father", format_feature)] = record.samples[sample_index_dict[father_id]][format_feature]
dnm_features_dict[key]["{}_{}".format("mother", format_feature)] = record.samples[sample_index_dict[mother_id]][format_feature]
for format_feature in format_features_for_custom_features:
dnm_features_dict[key]["{}_{}".format("offspring", format_feature)] = record.samples[i][format_feature]
dnm_features_dict[key]["{}_{}".format("father", format_feature)] = record.samples[sample_index_dict[father_id]][format_feature]
dnm_features_dict[key]["{}_{}".format("mother", format_feature)] = record.samples[sample_index_dict[mother_id]][format_feature]
# Add INFO-level features for this row
# dnm_features_dict[key] |= dnm_info_features
dnm_features_dict[key].update(dnm_info_features)
df_dnm_features_dict = pd.DataFrame.from_dict(dnm_features_dict).transpose().reset_index()
for feature_name in custom_features:
func = custom_feature_function_dict[feature_name]
input_feature_name = custom_feature_input_dict[feature_name]
df_dnm_features_dict["offspring_{}".format(feature_name)] = df_dnm_features_dict["offspring_{}".format(input_feature_name)].apply(func)
df_dnm_features_dict["father_{}".format(feature_name)] = df_dnm_features_dict["father_{}".format(input_feature_name)].apply(func)
df_dnm_features_dict["mother_{}".format(feature_name)] = df_dnm_features_dict["mother_{}".format(input_feature_name)].apply(func)
# Get min and max of format features
for feature in format_features:
if vcf_header.formats[feature].number == 1:
df_dnm_features_dict["min_parent_{}".format(feature)] = df_dnm_features_dict[["father_{}".format(feature), "mother_{}".format(feature)]].min(axis = 1)
df_dnm_features_dict["max_parent_{}".format(feature)] = df_dnm_features_dict[["father_{}".format(feature), "mother_{}".format(feature)]].max(axis = 1)
elif vcf_header.formats[feature].number == "G":
for i in range(0, 3):
df_dnm_features_dict["offspring_{}_{}".format(feature, str(i))] = df_dnm_features_dict["offspring_{}".format(feature)].str.get(i)
df_dnm_features_dict["father_{}_{}".format(feature, str(i))] = df_dnm_features_dict["father_{}".format(feature)].str.get(i)
df_dnm_features_dict["mother_{}_{}".format(feature, str(i))] = df_dnm_features_dict["mother_{}".format(feature)].str.get(i)
df_dnm_features_dict["min_parent_{}_{}".format(feature, str(i))] = df_dnm_features_dict[["father_{}_{}".format(feature, str(i)), "mother_{}_{}".format(feature, str(i))]].min(axis = 1)
df_dnm_features_dict["max_parent_{}_{}".format(feature, str(i))] = df_dnm_features_dict[["father_{}_{}".format(feature, str(i)), "mother_{}_{}".format(feature, str(i))]].max(axis = 1)
elif vcf_header.formats[feature].number == "R":
for i in range(0, 2):
df_dnm_features_dict["offspring_{}_{}".format(feature, str(i))] = df_dnm_features_dict["offspring_{}".format(feature)].str.get(i)
df_dnm_features_dict["father_{}_{}".format(feature, str(i))] = df_dnm_features_dict["father_{}".format(feature)].str.get(i)
df_dnm_features_dict["mother_{}_{}".format(feature, str(i))] = df_dnm_features_dict["mother_{}".format(feature)].str.get(i)
df_dnm_features_dict["min_parent_{}_{}".format(feature, str(i))] = df_dnm_features_dict[["father_{}_{}".format(feature, str(i)), "mother_{}_{}".format(feature, str(i))]].min(axis = 1)
df_dnm_features_dict["max_parent_{}_{}".format(feature, str(i))] = df_dnm_features_dict[["father_{}_{}".format(feature, str(i)), "mother_{}_{}".format(feature, str(i))]].max(axis = 1)
for feature in custom_features:
df_dnm_features_dict["min_parent_{}".format(feature)] = df_dnm_features_dict[["father_{}".format(feature), "mother_{}".format(feature)]].min(axis = 1)
df_dnm_features_dict["max_parent_{}".format(feature)] = df_dnm_features_dict[["father_{}".format(feature), "mother_{}".format(feature)]].max(axis = 1)
# Only retaining necessary features now...
retained_features = []
for feature in info_features: retained_features.append(feature)
for feature in format_features:
if vcf_header.formats[feature].number == 1:
retained_features.append("{}_{}".format("offspring", feature))
retained_features.append("{}_{}".format("max_parent", feature))
retained_features.append("{}_{}".format("min_parent", feature))
elif vcf_header.formats[feature].number == "G":
for i in range(0, 3):
retained_features.append("{}_{}_{}".format("offspring", feature, i))
retained_features.append("{}_{}_{}".format("max_parent", feature, i))
retained_features.append("{}_{}_{}".format("min_parent", feature, i))
elif vcf_header.formats[feature].number == "R":
for i in range(0, 2):
retained_features.append("{}_{}_{}".format("offspring", feature, i))
retained_features.append("{}_{}_{}".format("max_parent", feature, i))
retained_features.append("{}_{}_{}".format("min_parent", feature, i))
for feature in custom_features:
retained_features.append("{}_{}".format("offspring", feature))
retained_features.append("{}_{}".format("max_parent", feature))
retained_features.append("{}_{}".format("min_parent", feature))
df_dnm_features_dict = df_dnm_features_dict.rename(columns = {"level_0": "chrom", "level_1": "pos", "level_2": "ref", "level_3": "alt", "level_4": "iid", "level_5": "sex", "level_6": "phenotype"})
key = ["chrom", "pos", "ref", "alt", "iid", "sex", "phenotype"]
df_dnm_features_dict = df_dnm_features_dict[key + retained_features]
return df_dnm_features_dict