-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrc_kmer_freqs3.pl
executable file
·48 lines (37 loc) · 1.31 KB
/
rc_kmer_freqs3.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/perl
#v3.0
#Example: rc_kmer_freqs.pl kmer_counts_file_from_sa > kmer_counts_correcting4rc
no warnings 'deprecated';
my %kmer=();
open IN,"$ARGV[0]";
while (my $line=<IN>) {
chomp($line);
#$progress++;
# if ($progress % 10000 == 0) {
# $date=`date`;
#print "$date on line $progress\n";
#}
($seq,$freq)=split/\s+/,$line;
$rc=revcomp($seq);
# only store the canonical kmer, one that appears first in sorted order, but count the frequency for occurence in both directions.
# $seq is never converted into canonical capitalized format, which could
# impact this comparison?! TODO - JN
my @array=sort($seq,$rc);
# print "@array\n";
$kmer{$array[0]} += $freq;
}
close IN;
# This 'sort' in the foreach is probably painful. Faster in perl, or in shell?
# Also, if in shell is faster, there is an opportunity to reduce code.
foreach my $seq (sort keys %kmer) {
print "$seq $kmer{$seq}\n";
}
# Note, this 'reverse' step is non-trivial, results in all characters being
# capitalized, U's being lossily converted into A's, W, S, and N's not getting
# transformed, and other values being swapped (e.g. A for T). - JN
sub revcomp {
my ($s) = @_;
$s =~ tr/wsatugcyrkmbdhvnATUGCYRKMBDHVN/WSTAACGRYMKVHDBNTAACGRYMKVHDBN/;
$s = reverse $s;
return $s;
}