-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenerate_fragment_map.pl
102 lines (90 loc) · 2.97 KB
/
generate_fragment_map.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/perl -w
use strict;
#Elzo de Wit
#Netherlands Cancer Institute
#generate a map of the fragments that are used in the 4C analysis
#In 4C two restriction enzymes are used: RE1 and RE2. Using an inverse
#PCR DNA fragments are amplified that are ligated to a region of interest:
#the bait or viewpoint. The ligated fragments and the viewpoint are
#fragments that are between the first and second restriction enzyme.
#In the mapping pipeline the position of these fragments is used to
#create a mapping file.
#Note: in the 4C analysis the following fragments can be ligated to
#viewpoint:
#1: RE1-RE2 (or RE2-RE1)
#2: RE1-RE1 (blind fragments)
#The blind fragments have a lower performance compared to the
#non-blind fragments which is why we leave them out of the
#fragment map (and therefore the downstream analysis).
#Future versions may include an option to include blind fragments,
#although this impacts both the repeat finding and the mapping
#script, because you would need to discriminate blind from
#non-blind
#This script only generates fragment that are formed
#between the first and the second restriction enzyme.
my $fasta = shift @ARGV or usage(); #genome fasta file
my $re1 = shift @ARGV or usage(); #first restriction enzyme
my $re2 = shift @ARGV or usage(); #second restriction enzyme
my $output_dir = shift @ARGV or usage(); #where to store the fragment map
$|=1;
#create output directory with the fragment map
mkdir $output_dir;
my $chrom;
my $seq;
open FASTA, $fasta or die "Cannot open file: $!";
my $pos = 1;
my $lines = 0;
my $add_end;
my %seen;
while(<FASTA>){
chomp;
#fasta header
if(/^>(.*)/){
$chrom = $1;
#if there are whitespace characters in the header
#remove everything after the whitespace
$chrom =~ s/\s(.*)//;
if( defined $seen{$chrom} ){
print "$chrom has been seen before in this fasta file. Exiting\n";
exit;
}
$seen{$chrom} = 1;
#move into a new sequence and file for creating fragments for
$seq = "";
$pos = 1;
#create a new file to hold the positions of the fragments
open OUT, ">$output_dir/$chrom.txt" or die "Cannot create file: $!";
$add_end = 0;
}else{
if(/^N+$/){
$add_end += length($_);
}else{
$seq .= uc $_;
}
while($seq =~ /($re1.*?$re1)/g){
my ($sub_seq) = $seq =~ /.*?($re1.*?$re1)/;
my $end_pos = $pos + length($&) - 1 + $add_end;
if(length( $seq ) < 5e3){
#5' fragment
if($sub_seq =~ /^($re1.*?$re2)/ ){
my $sub_pos = $pos + length($1) - 1;
print OUT join("\t", ($pos,$sub_pos,5)), "\n";
}
#3' fragment
if($sub_seq =~ /.*($re2.*?$re1$)/){
my $sub_pos = $end_pos - length($1) + 1;
print OUT join("\t", ($sub_pos,$end_pos,3)), "\n";
}
}
#substitute the entire RE1 fragment to the sequence of RE1
$seq =~ s/.*?$re1.*?$re1/$re1/;
$pos = $end_pos - length($re1) + 1; #add one to get the GATC start position
$add_end = 0;
}
}
}
sub usage{
print "generate_fragment_map.pl genome_fasta re1 re2 output_directory\n";
exit;
}