forked from fanagislab/EndHiC
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster_merge.pl
executable file
·90 lines (84 loc) · 2 KB
/
cluster_merge.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/perl
use strict;
use Getopt::Long;
# parse input options
my ($help, $unclu);
GetOptions (
'un|unclustered' => \$unclu,
'h|help!' => \$help
);
if (not $ARGV[0] or $help) {
die "
Description: read GFA clusters at contig and cluster level and output the new connected clusters by their belonging contigs.
Author: Sen Wang, [email protected], 2021/8/16
Usage: perl cluster_merge.pl [--unclustered] contigs_gfa.cluster cluster_gfa.cluster > merged_gfa.cluster
\n";
}
# read GFA cluster at contig level
my (%cluster, %num, %len, %robust);
open IN, "<$ARGV[0]" or die "Cannot open $ARGV[0]!\n";
while (<IN>) {
chomp;
next if /^#/;
my @f = split(/\t/, $_);
my @l = split(/;/, $f[4]);
@{$cluster{$f[0]}} = @l;
$num{$f[0]} = $f[1];
$len{$f[0]} = $f[2];
$robust{$f[0]} = $f[3];
}
close IN;
# read GFA cluster at cluster level and output the connected clusters
open IN, "<$ARGV[1]" or die "Cannot open $ARGV[1]!\n";
while (<IN>) {
chomp;
if (/^#/) {
print "$_\n";
next;
}
my @f = split(/\t/, $_);
my @l = split(/;/, $f[4]);
if (@l > 1) {
my @tem;
foreach my $c (@l) {
if ($c =~ /\+$/) {
$c =~ s/[\+\-]//;
push @tem, @{$cluster{$c}};
delete $cluster{$c};
} elsif ($c =~ /\-$/) {
my @rev;
$c =~ s/[\+\-]//;
foreach my $ctg (reverse @{$cluster{$c}}) {
if ($ctg =~ /\+$/) {
$ctg =~ s/\+/\-/;
push @rev, $ctg;
} elsif ($ctg =~ /\-$/) {
$ctg =~ s/\-/\+/;
push @rev, $ctg;
}
}
push @tem, @rev;
delete $cluster{$c};
}
}
$f[4] = join(";", @tem);
$f[1] = @tem;
print join("\t", @f) . "\n";
} elsif (@l == 1) {
$f[4] =~ s/[\+\-]//;
my $k = $f[4];
my @tem = @{$cluster{$k}};
$f[1] = @tem;
$f[4] = join(";", @tem);
print join("\t", @f) . "\n";
delete $cluster{$k};
}
}
close IN;
# output the unconnected clusters
my @n = keys %cluster;
if ($unclu and @n > 0) {
foreach my $k (sort {$len{$b} <=> $len{$a}} keys %cluster) {
print "$k\t$num{$k}\t$len{$k}\t$robust{$k}\t" . join(";", @{$cluster{$k}}) . "\n";
}
}