-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscaffold_by_trueCtgContact.pl
executable file
·162 lines (113 loc) · 4.73 KB
/
scaffold_by_trueCtgContact.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/perl
=head1 Name
scaffold_by_trueCtgContact.pl -- scaffolding the contigs by using confident contig contacts derived from HiC
=head1 Description
Take the output file of ctgContact_from_maxBinContacts.pl or ctgContact_from_ctgEndContacts.pl as input data,
and only the first 5 columns of data are used.
The output is a gfa format file which can be viewed for graph topology in Bandage software.
Alert that the contact cutoff (--contacts ) must be carefully determined, which should
be around the turning point of the contact distribution, which will seperate real and noise signals.
The option "--reciprocalmax" means the contact count is reciprocal max between the ends of two contigs (head or tail),
this option can be chosen or not chosen.
=head1 Version
Author: Fan Wei, [email protected]
Version: 1.0, Date: 2021/7/6
Note:
=head1 Usage
scaffold_by_trueCtgContact.pl [options] <contig_length_file> <Contig_contact_tabular_file>
--contacts <int> the cutoff of HiC contact count between two contig ends, default=1
--reciprocalmax require the HiC contact count between two contig ends is reciprocal max
--verbose output running progress information to screen
--help output help information to screen
=head1 Exmple
perl scaffold_by_trueCtgContact.pl --contacts 1500 contigs_used.len formal_1000000.matrix.CtgContact > formal_1000000.matrix.CtgContact.gfa
perl scaffold_by_trueCtgContact.pl --reciprocalmax contigs_used.len formal_1000000.matrix.CtgContact > formal_1000000.matrix.CtgContact.gfa
perl scaffold_by_trueCtgContact.pl --contacts 1000 --reciprocalmax contigs_used.len formal_1000000.matrix.CtgContact > formal_1000000.matrix.CtgContact.gfa
perl scaffold_by_trueCtgContact.pl --contacts 1500 contigs_used.len formal_1000000_iced.matrix.CtgContact > formal_1000000_iced.matrix.CtgContact.gfa
perl scaffold_by_trueCtgContact.pl --reciprocalmax contigs_used.len formal_1000000_iced.matrix.CtgContact > formal_1000000_iced.matrix.CtgContact.gfa
perl scaffold_by_trueCtgContact.pl --contacts 1000 --reciprocalmax contigs_used.len formal_1000000_iced.matrix.CtgContact > formal_1000000_iced.matrix.CtgContact.gfa
=cut
use strict;
use Getopt::Long;
use FindBin qw($Bin $Script);
use File::Basename qw(basename dirname);
use Data::Dumper;
##get options from command line into variables and set default values
my $Contact_cutoff;
my $Reciprocal_max;
my ($Verbose,$Help);
GetOptions(
"contacts:f"=>\$Contact_cutoff,
"reciprocalmax"=>\$Reciprocal_max,
"verbose"=>\$Verbose,
"help"=>\$Help
);
$Contact_cutoff ||= 1;
die `pod2text $0` if (@ARGV == 0 || $Help);
my $Contig_used_file = shift;
my $CtgContact_file = shift;
my %UsedContigs;
my %ReciprocalMax;
##only do statistics for contigs included in $Contig_used_file
open IN, $Contig_used_file || die "fail open $Contig_used_file \n";
while (<IN>) {
if(/(\S+)\s+(\d+)/){
my $ctgId = $1;
my $ctgLen = $2;
$UsedContigs{$ctgId} = $ctgLen;
print "S\t$ctgId\t*\tLN:i:$ctgLen\n"; ##S s0.utg000001l * LN:i:636309
}
}
close IN;
##this file is generated by ctgContact_from_maxBinContacts.pl
##To get the max Hic conatact count for each contig ends
open IN, $CtgContact_file || die "fail open $CtgContact_file\n";;
while (<IN>) {
next if(/^\#/);
chomp;
my @t = split /\s+/;
my $ctg1 = $t[0];
my $ctg2 = $t[1];
my $contact = $t[2];
my $ctg1Pos = $t[3];
my $ctg2Pos= $t[4];
##filter contig links that not locus in either head or tail
next if(! exists $UsedContigs{$ctg1} || ! exists $UsedContigs{$ctg2} );
if ( $ReciprocalMax{"$ctg1$ctg1Pos"} < $contact ) {
$ReciprocalMax{"$ctg1$ctg1Pos"} = $contact;
}
if ( $ReciprocalMax{"$ctg2$ctg2Pos"} < $contact ) {
$ReciprocalMax{"$ctg2$ctg2Pos"} = $contact;
}
}
close IN;
#print Dumper \%ReciprocalMax;
#exit;
##this file is generated by ctgContact_from_maxBinContacts.pl
open IN, $CtgContact_file || die "fail open $CtgContact_file";
while (<IN>) {
next if(/^\#/);
my $line = $_;
chomp;
my @t = split /\s+/;
my $ctg1 = $t[0];
my $ctg2 = $t[1];
my $contact = $t[2];
my $ctg1Pos = $t[3];
my $ctg2Pos= $t[4];
next if(! exists $UsedContigs{$ctg1} || ! exists $UsedContigs{$ctg2} );
if ($contact >= $Contact_cutoff) {
my $ctg1Strand = ($ctg1Pos =~ /head/i) ? "-" : "+";
my $ctg2Strand = ($ctg2Pos =~ /head/i) ? "+" : "-";
##L s95.ctg013321l - s95.ctg010300l + 13193M L1:i:649
if (! defined $Reciprocal_max) {
print "L\t$ctg1\t$ctg1Strand\t$ctg2\t$ctg2Strand\t0M\tL1:i:0\tRM:i:0\tContact:i:$contact\n";
}else
{
if ($contact eq $ReciprocalMax{"$ctg1$ctg1Pos"} && $contact eq $ReciprocalMax{"$ctg2$ctg2Pos"}) {
print "L\t$ctg1\t$ctg1Strand\t$ctg2\t$ctg2Strand\t0M\tL1:i:0\tRM:i:1\tContact:i:$contact\n";
}
}
}
}
close IN;