forked from fanagislab/EndHiC
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathshortCtgs_to_cluster.pl
executable file
·185 lines (137 loc) · 4.73 KB
/
shortCtgs_to_cluster.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/perl
=head1 Name
shortCtgs_to_cluster.pl -- assign the short contigs into cluster
=head1 Description
This program consider the max contact as well as the max / second times to cluster, in order to assign the short contigs to clusters
=head1 Version
Author: Fan Wei, [email protected]
Version: 1.0, Date: 2021/7/6
Note:
=head1 Usage
shortCtgs_to_cluster.pl [options] <cluster_file> <contig_length_file> <ctgContact_file>
--contact <float> cutoff for the contact value of the max cluster, default=0
--times <float> cutoff for the max contact / second contact, default=1
--verbose output running progress information to screen
--help output help information to screen
=head1 Exmple
perl ../../shortCtgs_to_cluster.pl --contact 500 --times 2 formal_1000000_iced.matrix.ctgcontact.gfa.cluster contigs_200kb.len formal_1000000_iced.matrix.ctgcontact > contigs_200kb.len.cluster
=cut
use strict;
use Getopt::Long;
use FindBin qw($Bin $Script);
use File::Basename qw(basename dirname);
use Data::Dumper;
##get options from command line into variables and set default values
my $max_contact_cutoff;
my $times_max_second;
my ($Verbose,$Help);
GetOptions(
"contact:f"=>\$max_contact_cutoff,
"times:f"=>\$times_max_second,
"verbose"=>\$Verbose,
"help"=>\$Help
);
$max_contact_cutoff ||= 0;
$times_max_second ||= 1;
die `pod2text $0` if (@ARGV == 0 || $Help);
#print STDERR "max_contact_cutoff: $max_contact_cutoff\n";
#print STDERR "max / second cutoff: $times_max_second\n";
my $cluster_result_file = shift;
my $contig_len_file = shift;
my $ctgContact_file = shift; ##from ctgEnd or maxBin, either is OK, only keep the largest contact for each contig pair
my @Cluster;
my @ClusterId;
my %ClusteredCtgs;
my @Contig;
my %Contact;
my %CtgLen;
open IN, $cluster_result_file || die "fail open $cluster_result_file";
while (<IN>) {
next if(/^\#/);
chomp;
my @t = split /\s+/;
my $cluster_id = $t[0];
my $ctg_count = $t[1];
my $cluster_len = $t[2];
my $robustness = $t[3];
my $ctg_str = $t[4];
my @ary;
while ($ctg_str =~ /(\w+)[+-]/g) {
push @ary, $1;
$ClusteredCtgs{$1} = 1;
}
push @Cluster, \@ary;
push @ClusterId, $cluster_id;
}
close IN;
#print Dumper \@ClusterId;
#print Dumper \%ClusteredCtgs;
open IN, $contig_len_file || die "fail open $contig_len_file\n";
while (<IN>) {
chomp;
my @t = split /\s+/;
my $ctg_id = $t[0];
my $ctg_len = $t[1];
$CtgLen{$ctg_id} = $ctg_len;
push @Contig, $ctg_id if(! exists $ClusteredCtgs{$ctg_id});
}
close IN;
#print Dumper \@Contig;
open IN, $ctgContact_file || die "fail open $ctgContact_file\n";
while (<IN>) {
next if(/^\#/);
chomp;
my @t = split /\s+/;
my $ctg1 = $t[0];
my $ctg2 = $t[1];
my $contact = $t[2]; #normalized contact
##only keep the max value from head vs head, head vs tail, tail vs head, tail vs tail
if (!exists $Contact{$ctg1}{$ctg2} || $Contact{$ctg1}{$ctg2} < $contact) {
$Contact{$ctg1}{$ctg2} = $contact;
}
if (!exists $Contact{$ctg2}{$ctg1} || $Contact{$ctg2}{$ctg1} < $contact) {
$Contact{$ctg2}{$ctg1} = $contact;
}
}
close IN;
#print Dumper \%Contact;
print "#ContigId\tContigLen\tClusterId\tContact\tTimes\tmax_cluster_id:contig_id:contact_value\tsecond_cluster_id:contig_id:contact_value\n";
my @Output;
foreach my $this_ctg (@Contig) {
my @stored;
for (my $i = 0; $i < @Cluster; $i ++) {
my $p = $Cluster[$i];
my $max_contig = "";
my $max_contact = 0;
foreach my $cluter_ctg (@$p) {
#print $this_ctg."\t".$cluter_ctg."\n";
if(exists $Contact{$this_ctg}{$cluter_ctg}){
my $contact_value = $Contact{$this_ctg}{$cluter_ctg};
if ($contact_value > $max_contact) {
$max_contig = $cluter_ctg;
$max_contact = $contact_value;
}
}
}
push @stored, [$max_contact, $max_contig, $i];
}
@stored = sort {$b->[0] <=> $a->[0]} @stored;
my ($max_contact , $max_contig , $max_cluster ) = ($stored[0][0], $stored[0][1], $stored[0][2]);
my ($second_contact , $second_contig , $second_cluster ) = ($stored[1][0], $stored[1][1], $stored[1][2]);
my $max_cluster_id = $ClusterId[$max_cluster];
my $second_cluster_id = $ClusterId[$second_cluster];
my $assigned_cluster = "Unclustered";
$max_contact = 1 if($max_contact == 0);
$second_contact = $max_contact if($second_contact == 0);
my $times = sprintf("%.2f", $max_contact / $second_contact);
if ($max_contact >= $max_contact_cutoff && $times >= $times_max_second) {
$assigned_cluster = $max_cluster_id;
}
push @Output, [$this_ctg, $CtgLen{$this_ctg}, $assigned_cluster, $max_contact, $times, "$max_cluster_id:$max_contig:$max_contact", "$second_cluster_id:$second_contig:$second_contact"];
}
##output the sorted results
@Output = sort {$a->[2] cmp $b->[2]} @Output;
foreach my $p (@Output) {
my $line = join("\t", @$p);
print $line."\n";
}