-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfind_interactions.py
100 lines (65 loc) · 3.38 KB
/
find_interactions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import sys
from scipy.sparse import csr_matrix
# load de-duplicated library to csv
nextera_unique = pd.read_csv(sys.argv[1])
# Calculate total counts of each barcode across both entry columns
barcode_counts = nextera_unique[["R1_full_bc", "R2_full_bc"]].stack().value_counts()
# Filter for barcodes that appear at least x times (look at kneeplot)
total_bc_cutoff = int(sys.argv[2])
print(total_bc_cutoff)
real_barcodes = barcode_counts[barcode_counts >= total_bc_cutoff].index.tolist() #used total bc counts of R1 + R2
print(f"Number of barcodes after filtering: {len(real_barcodes)}")
filtered_df = nextera_unique[nextera_unique['R1_full_bc'].isin(real_barcodes) & nextera_unique['R2_full_bc'].isin(real_barcodes)]
# Filter for barcodes that appear in both entry columns
filtered_barcodes = []
barcode_set_entry1 = set(filtered_df['R1_full_bc'])
barcode_set_entry2 = set(filtered_df['R2_full_bc'])
filtered_barcodes = list(barcode_set_entry1.intersection(barcode_set_entry2))
filtered_df = filtered_df[filtered_df['R1_full_bc'].isin(filtered_barcodes)]
filtered_df = filtered_df[filtered_df['R2_full_bc'].isin(filtered_barcodes)]
# Getting matrix of interactions
interaction_df = filtered_df.groupby(["R1_full_bc", "R2_full_bc"]).size().reset_index().rename(columns={0:'count'})
# Create a set to store unique barcodes
unique_barcodes = set()
# Mapping dictionary to store the relationship between barcode and integer
barcode_to_integer_mapping = {}
# Function to replace barcodes with unique integers and update the mapping dictionary
def replace_with_integer(barcode):
if barcode not in unique_barcodes:
unique_barcodes.add(barcode)
barcode_to_integer_mapping[barcode] = len(unique_barcodes) - 1
return barcode_to_integer_mapping[barcode]
# Apply the function to each element in the specified columns
interaction_df['R1_full_bc'] = interaction_df['R1_full_bc'].apply(replace_with_integer)
interaction_df['R2_full_bc'] = interaction_df['R2_full_bc'].apply(replace_with_integer)
# save file
interaction_df.to_csv(sys.argv[3], index=False)
# write a json file with barcode to numerical index matching
with open(sys.argv[4], "w") as f:
f.write(json.dumps(barcode_to_integer_mapping))
# Create a sparse matrix
sparse_matrix = csr_matrix((interaction_df['count'], (interaction_df['R1_full_bc'], interaction_df['R2_full_bc'])))
# Randomly sample 18 beads for plotting
sampled_num = 18
total_num_beads = sparse_matrix.shape[0]
random_sampled_beads = np.random.choice([*range(total_num_beads)], size=sampled_num, replace=False)
fig, axs = plt.subplots(3, 6, figsize=(16, 8))
axs = axs.flatten()
for i in range(sampled_num):
interactions = sparse_matrix.getrow(random_sampled_beads[i]).toarray().flatten()
non_zero_interactions = sorted(interactions[interactions != 0])[::-1]
axs[i].scatter([*range(len(non_zero_interactions))], non_zero_interactions)
axs[i].set_title(f'Bead {random_sampled_beads[i]}')
plt.tight_layout()
plt.savefig(sys.argv[5], format='png')
"""
sys.argv[1]: deduplicated lib to load (.csv)
sys.argv[2]: threshold for filtering barcodes (int)
sys.argv[3]: interactions.csv output filename (.csv)
sys.argv[4]: output .json file with barcode to numerical index matching (.json)
sys.argv[5]: output plot for the iteraction distribution of 18 randomly selected beads (.png)
"""