-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgetRepeats.pl
133 lines (108 loc) · 3.6 KB
/
getRepeats.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/perl -w
#Elzo de Wit
#Netherlands Cancer Institute
#Select sequences from the genome from a given
#fragment map and determine whether they are unique
#in the genome. If they are not unique store the
#position in the repeat map.
#You will need to create a new repeat map for every
#fragment map (i.e. restriction enzyme combination) and
#sequence length used in the 4C.
#Please provide an existing directory (e.g. repeats/). In
#this directory a new directory is created based on the
#sequence length option that is provided.
#The options:
#frag_map: the fragment map generated by generate_fragment_map.pl
#re: restriction site of the 1st restriction enzyme
#seq_len: the length of the sequence (without the read primer sequence)
# this means if you have sequence of 65, with a primer sequence of 20
# and a GATC as a restriction site (4), the sequence length should be
# 65 - 20 + 4 = 49
#reference: reference genome, should be the same as on which the fragment map is based
#out_dir: directory in which the repeat map directory should be created (e.g if directory
# is repeats/ and seq_len is 49 a directory repeats/49/ will be created
#threads: optional parameter, for the amount of threads used in bwa
use strict;
my $frag_map = shift @ARGV or usage();
my $re = shift @ARGV or usage();
my $seq_len = shift @ARGV or usage();
my $reference = shift @ARGV or usage();
my $out_dir = shift @ARGV or usage();
my $threads = shift @ARGV;
#create output directory with the correct sequence length
$out_dir .= "/" . $seq_len;
mkdir $out_dir;
my @files = glob "$frag_map/*txt";
#create a temporary bed file
`cat /dev/null > temp_frag.bed`;
print "Creating BED file from fragments\n";
#create a bed file
for my $file ( @files ){
createBed( $file, $seq_len );
}
$threads = 1 if not defined $threads;
print "Selecting sequences from the genome\n";
#select the sequences using fastaFromBed
system("fastaFromBed -fi $reference -bed temp_frag.bed -fo temp_frag_seq.fa");
print "Mapping sequences to the genome\n";
my $mapping_command = "bwa bwasw -t $threads $reference temp_frag_seq.fa > temp_frag_seq.sam";
system($mapping_command);
print "Parsing resulting SAM file\n";
my %files;
open SAM, "temp_frag_seq.sam" or die "Cannot open file: $!";
while(<SAM>){
chomp;
next if /^@/;
my ($id, $qual, $seq ) = (split /\t/)[0,4,9];
next if $qual > 0;
$id =~ s/^>//;
my ($chrom, $start, $end) = split /[-:]/,$id;
my $pos;
$seq = uc($seq);
if($seq =~ /^$re/){
$pos = $start+1;
}else{
$pos = $end;
}
if(not defined $files{$chrom}){
my $fh;
open $fh, ">$out_dir/$chrom\.txt" or die "Cannot create file: $!";
$files{$chrom} = $fh;
}
my $out_fh = $files{$chrom};
print $out_fh $pos, "\n";
}
#remove the temporary files since we got to this point
system("rm temp_frag.bed temp_frag_seq.fa temp_frag_seq.sam");
#generate tje reverse complement
sub revComp{
my $seq = shift;
$seq =~ tr/ACGT/TGCA/;
reverse $seq;
}
sub createBed{
my ($file, $len) = @_;
open FRAG, $file or die "Cannot open file: $!";
open OUT, ">>temp_frag.bed" or die "Cannot open stream: $!";
my ($chrom) = $file =~ /.*\/(chr.*).txt/;
while(<FRAG>){
chomp;
my ($start,$end,$ori) = split /\t/;
$start--;
if( $end - $start > $len ){
if($ori == 5){
print OUT join("\t", ($chrom, $start, $start+$len-1)), "\n";
}else{
print OUT join("\t", ($chrom, $end-$len+1, $end)), "\n";
}
}else{
print OUT join("\t", ($chrom, $start, $end)), "\n";
}
}
close OUT;
}
sub usage{
print "getRepeats.pl fragment_map_dir restriction_site sequence_length reference output_directory [threads]\n";
exit;
}