4 # Verifies that Bowtie's paired-end mode gives alignments that are
5 # consistent with the alignments produced in single-end mode with -a
6 # and --nostrata options.
8 # Run this from the Bowtie directory.
10 # Usage: perl ../scripts/pe_verify.pl [mode] [index] [mates1] [nates2]
15 # [mates1] = reads/e_coli_1000_1.fq
16 # [mates2] = reads/e_coli_1000_2.fq
18 # E.g.: perl ../scripts/pe_verify.pl -v 0
19 # perl ../scripts/pe_verify.pl -v 0 e_coli reads/e_coli_1000_1.fq reads/e_coli_100000_e1_2.fq
24 use List::Util qw[min max];
26 Getopt::Long::Configure ("no_ignore_case");
44 GetOptions ("I=i" => \$I,
53 "verbose" => \$verbose,
54 "col-cseq" => \$colCseq,
55 "col-cqual" => \$colCqual,
57 ) || die "One or more errors parsing script arguments";
65 $extra_args = $e if defined($e);
66 my $match_mode = "-n 2";
67 $match_mode = "-v " . $v if defined($v);
68 $match_mode = "-n " . $n if defined($n);
69 $match_mode .= " -l " . $l if defined($l);
70 $match_mode .= " -e " . $e if defined($e);
71 $match_mode .= " -C" if defined($C);
72 $match_mode .= " -g" if defined($g);
73 $match_mode .= " --col-cseq" if defined($colCseq);
76 print "Using match mode: $match_mode\n";
79 my $bowtie_exe = "bowtie";
80 $bowtie_exe .= "-debug" if $d;
83 $index = $ARGV[0] if defined($ARGV[0]);
84 my $reads1 = "reads/e_coli_1000_1.fq";
85 $reads1 = $ARGV[1] if (defined($ARGV[1]));
86 my $reads2 = "reads/e_coli_1000_2.fq";
87 $reads2 = $ARGV[2] if (defined($ARGV[2]));
89 # Infer input type so we can provide Bowtie with appropriate option
90 if($reads1 =~ /\.fa/) {
91 $reads2 =~ /\.fa/ || die "Reads files $reads1 and $reads2 have different extensions";
92 $extra_args .= " -f ";
93 } elsif($reads1 =~ /\.raw/) {
94 $reads2 =~ /\.raw/ || die "Reads files $reads1 and $reads2 have different extensions";
95 $extra_args .= " -r ";
96 } elsif(!($reads1 =~ /\.fq/)) {
97 (!($reads2 =~ /\.fq/)) || die "Reads files $reads1 and $reads2 have different extensions";
98 $extra_args .= " -c ";
101 my $sesMustHavePes = 0; # force se pairs to have corresponding pe pairs
103 system("make -C $bowtie_dir $bowtie_exe") == 0 || die;
105 # Run Bowtie not in paired-end mode for first mate file
106 my $bowtie_se_cmd1 = "$bowtie_dir/$bowtie_exe $match_mode $args -y $extra_args -a --refidx $index $reads1";
108 # Run Bowtie not in paired-end mode for second mate file
109 my $bowtie_se_cmd2 = "$bowtie_dir/$bowtie_exe $match_mode $args -y $extra_args -a --refidx $index $reads2";
111 # Run Bowtie in paired-end mode
112 my $bowtie_pe_cmd = "$bowtie_dir/$bowtie_exe $match_mode $args -I $inner -X $outer -y $extra_args -a --refidx $index -1 $reads1 -2 $reads2";
113 print "$bowtie_pe_cmd\n";
114 open BOWTIE_PE, "$bowtie_pe_cmd |";
117 $inner = max(0, $inner-1);
118 $outer = max(0, $outer-1);
128 my $l2 = <BOWTIE_PE>;
131 my @l1s = split(/[\t]/, $l1);
132 my @l2s = split(/[\t]/, $l2);
133 $#l1s >= 5 || die "Paired-end alignment not formatted correctly: $l1";
134 $#l2s >= 5 || die "Paired-end alignment not formatted correctly: $l2";
135 # Split the read name
136 my @l1rs = split(/\//, $l1s[0]);
137 my @l2rs = split(/\//, $l2s[0]);
138 $#l1rs >= 1 || die "Read not formatted correctly: $l1s[0]";
139 $#l2rs >= 1 || die "Read not formatted correctly: $l2s[0]";
140 $l1rs[0] eq $l2rs[0] || die "Before-/ parts of read names don't match: $l1rs[0], $l2rs[0]";
141 my $mate1 = ($l1rs[$#l1rs] eq "1");
142 my $mate1str = $mate1 ? "1" : "0";
143 my $basename = $l1rs[0];
144 my $loff = int($l1s[3]);
145 my $roff = int($l2s[3]);
146 my $insLen = $roff - $loff + length($l2s[4]);
147 $insLen > length($l1s[4]) || die "Insert length did not exceed first mate length";
148 $insLen > length($l1s[5]) || die "Insert length did not exceed first mate length";
149 $insLen >= $inner || die "Insert length was $insLen < $inner\n";
150 $insLen <= $outer || die "Insert length was $insLen > $outer\n";
151 my $read1Short = $l1s[4];
152 my $qual1Short = $l1s[5];
153 my $read2Short = $l2s[4];
154 my $qual2Short = $l2s[5];
156 $read1Mms = $l1s[7] if defined($l1s[7]);
157 $read1Mms = "-" if $read1Mms eq "";
159 $read2Mms = $l2s[7] if defined($l2s[7]);
160 $read2Mms = "-" if $read2Mms eq "";
161 my $content = "$read1Short:$qual1Short $read2Short:$qual2Short";
162 $content = "" if $C && !$colCseq;
163 my $mcont = "$read1Mms $read2Mms";
165 my $val = "$basename $mate1str $l1s[2] $l1s[3] $l2s[3] $content $mcont";
166 #defined ($peHash{$basename}) && die "Already saw paired-end alignment for basename $basename";
167 $peHash{$basename} = $val;
168 if($mate1) { $pesFw++; } else { $pesRc++; }
175 my %seMatedHash = ();
176 my %unmatchedSe = ();
177 my $unmatchedSes = 0;
178 my $unmatchedSeReads = 0;
180 print "$bowtie_se_cmd1\n";
181 open BOWTIE_SE1, "$bowtie_se_cmd1 |";
182 while(<BOWTIE_SE1>) {
186 my @ls = split(/[\t]/, $_);
187 my @lrs = split(/\//, $ls[0]);
188 my $basename = $lrs[0];
190 my $off = int($ls[3]);
191 my $len = length($ls[4]);
192 my $readShort = $ls[4];
193 my $qualShort = $ls[5];
195 $mms = $ls[7] if defined($ls[7]);
196 $mms = "-" if $mms eq "";
197 my $content = "$readShort $qualShort $mms";
198 my $key = "$ref $ls[1] $off $len $content";
199 push @{ $seHash{$basename}{$ref} }, $key;
203 print "$bowtie_se_cmd2\n";
204 open BOWTIE_SE2, "$bowtie_se_cmd2 |";
205 open UNMATCHED_SE, ">.unmatched.se";
206 while(<BOWTIE_SE2>) {
210 my @ls = split(/[\t]/, $_);
211 my @lrs = split(/\//, $ls[0]);
212 my $basename = $lrs[0];
214 my $off = int($ls[3]);
215 my $len = length($ls[4]);
216 my $fw = $ls[1] eq "+" ? 1 : 0;
217 my $readShort = $ls[4];
218 my $qualShort = $ls[5];
220 $mms = $ls[7] if defined($ls[7]);
221 $mms = "-" if $mms eq "";
222 my $content = "$readShort $qualShort";
224 my $key = "$ref $ls[1] $off $len $content $mcont";
225 # Is the other mate already aligned?
226 if(defined($seHash{$basename}{$ref})) {
227 # Get all of the alignments for the mate
228 for my $om (@{ $seHash{$basename}{$ref} }) {
229 my @oms = split(/ /, $om);
230 $#oms == 6 || die "Wrong number of elements for oms: $#oms";
232 my $ofw = $oms[1] eq "+" ? 1 : 0;
233 my $ooff = int($oms[2]);
234 my $olen = int($oms[3]);
235 my $oreadShort = $oms[4];
236 my $oqualShort = $oms[5];
238 print "Trying $ref:$off and $oref:$ooff\n" if $verbose;
239 $omms = $oms[6] if defined($oms[6]);
240 $oref eq $ref || die "Refs don't match: $oref, $ref";
242 my $peKey = "$basename ";
244 # The #1 mate is on the right
245 my $my_m1fw = $m1fw ? 0 : 1;
246 my $my_m2fw = $m2fw ? 0 : 1;
247 $diff = $ooff - $off + $olen;
248 if ($diff <= $olen || $diff <= $len) {
249 print "diff $diff is <= $olen and $len\n" if $verbose;
252 # upstream mate contains downstream one?
253 #next if $off + $len >= $ooff + $olen;
254 # mates are at the same position?
256 print "overlapping offsets: $ooff, $off\n" if $verbose;
259 if ($diff < $inner || $diff > $outer) {
260 print "diff $diff is outside of inner/outer: [$inner, $outer]\n" if $verbose;
263 if($ofw != $my_m1fw) {
264 print "orientation of other $ofw doesn't match expected $my_m1fw\n" if $verbose;
267 if($fw != $my_m2fw) {
268 print "orientation of anchor $fw doesn't match expected $my_m2fw\n" if $verbose;
271 $content = "$readShort:$qualShort $oreadShort:$oqualShort";
272 $content = "" if $C && !$colCseq;
273 $mcont = "$mms $omms";
275 $peKey .= "0 $ref $off $ooff $content $mcont";
277 # The #1 mate is on the left
278 $diff = $off - $ooff + $len;
279 if ($diff <= $olen || $diff <= $len) {
280 print "diff $diff is <= $olen and $len\n" if $verbose;
283 # upstream mate contains downstream one?
284 #next if $ooff + $olen >= $off + $len;
285 # mates are at the same position?
287 print "overlapping offsets: $ooff, $off\n" if $verbose;
290 if ($diff < $inner || $diff > $outer) {
291 print "diff $diff is outside of inner/outer: [$inner, $outer]\n" if $verbose;
295 print "orientation of other $ofw doesn't match expected $m1fw\n" if $verbose;
299 print "orientation of anchor $fw doesn't match expected $m2fw\n" if $verbose;
302 $content = "$oreadShort:$oqualShort $readShort:$qualShort";
303 $content = "" if $C && !$colCseq;
304 $mcont = "$omms $mms";
306 $peKey .= "1 $ref $ooff $off $content $mcont";
308 # Found a legitimate paired-end alignment using a pair of
309 # single-end alignments
310 if($seMatedHash{$basename}) {
311 print "already found corresponding paired-end\n" if $verbose;
314 if($sesMustHavePes) {
315 defined($peHash{$basename}) ||
316 die "Found single-end alignment for $basename, but no paired-end";
318 if(!defined($peHash{$basename})) {
319 if(!defined($unmatchedSe{$basename})) {
320 $unmatchedSe{$basename} = 0;
323 $unmatchedSe{$basename}++;
325 print UNMATCHED_SE "Read $basename:\n$om\n$key\n";
328 if(defined($peHash{$basename}) && $peHash{$basename} eq $peKey) {
329 delete $peHash{$basename};
330 $seMatedHash{$basename} = 1;
332 print "No matchup:\n$peHash{$basename}\n$peKey\n" if $verbose;
334 #print "Found alignment for mate $otherMate of $ls[0]; diff: $diff\n";
342 for my $peKey (keys %peHash) {
343 print "Paired-end $peKey has a paired-end alignment without single-end support\n";
344 print "[ $peHash{$peKey} ]\n";
347 $die && die "Found $die paired-end reads with no corresponding single-end mates";
349 if($unmatchedSes > 0) {
350 print "Total of $unmatchedSes unmatched single-end alignments found for $unmatchedSes distinct pairs\n";
351 print "Ref orientation off len seq quals mms\n";
352 system("cat .unmatched.se");
356 print "PASSED; analyzed $pes paired-end ($pesFw fw, $pesRc rc) and $ses single-end alignments\n";