4 # reconcile_alignments_pe.pl
9 # Reconcile and sanity-check an pair of input files, an output
10 # alignment file (the default, verbose kind), and a pair of output
11 # unaligned-read files (generated with the --un option) to be sure
12 # they're consistent with each other. If the --max option was used
13 # during alignment, then the --max files should also be specified. If
14 # the --al option was also used, specify the --al files as the last
15 # arguments to include it in the sanity check.
17 # Usage: perl reconcile_alignments_pe.pl \
18 # [-k <int>] [-a] [-m <int>] \
19 # [-u <int>] [-v] [-q | -f | -r] \
20 # <input mate1 file> \
21 # <input mate2 file> \
34 Getopt::Long::Configure ("no_ignore_case");
50 GetOptions("m=i" => \$m,
59 "col-cseq" => \$colCseq,
60 "col-cedit" => \$colCedit,
61 "col-cqual" => \$colCqual,
62 "e" => \$raw) || die "Bad arguments";
66 $fastq = 1 if ($fastq + $fasta + $raw == 0);
68 $khits = 999999 if $allHits;
70 print "khits: $khits\nmaxhits: $m\nnum_reads: $u\n" if $verbose;
72 # Utility function that returns the reverse complement of its argument
76 $r =~ tr/aAcCgGtT/tTgGcCaA/;
80 defined($ARGV[0]) || die "Must specify input read file 1 as first arg";
81 defined($ARGV[1]) || die "Must specify input read file 2 as first arg";
82 defined($ARGV[2]) || die "Must specify alignment file as second arg";
83 defined($ARGV[3]) || die "Must specify unaligned-read file 1 as third arg";
84 defined($ARGV[4]) || die "Must specify unaligned-read file 2 as third arg";
86 my $read1_file = $ARGV[0];
87 my $read2_file = $ARGV[1];
88 my $hits_file = $ARGV[2];
89 my $un1_file = $ARGV[3];
90 my $un2_file = $ARGV[4];
92 $max1_file = $ARGV[5] if defined($ARGV[5]);
94 $max2_file = $ARGV[6] if defined($ARGV[6]);
96 $al1_file = $ARGV[7] if defined($ARGV[7]);
98 $al2_file = $ARGV[8] if defined($ARGV[8]);
101 print "read1: $read1_file\nread2: $read2_file\nalignments: $hits_file\n";
102 print "unaligned reads 1: $un1_file\nunaligned reads 2: $un2_file\n";
103 print "max reads 1: $max1_file\nmax reads 2: $max2_file\n";
108 my ($name, $seq, $qual) = ("", "", "");
111 return ("", "", "") unless defined($name);
113 $name = substr($name, 1);
121 return ("", "", "") unless defined($name);
123 $name = substr($name, 1);
128 die "Raw format not supported in reconcile_alignment_pe.pl; read names required";
130 return ($name, $seq, $qual);
134 # Encode colors as proxy nucleotides
138 my %nmap = ("0" => "A", "1" => "C", "2" => "G", "3" => "T", "." => "N",
139 "A" => "A", "C" => "C", "G" => "G", "T" => "T", "N" => "N");
141 for(my $i = 0; $i < length($s); $i++) {
142 my $c = uc substr($s, $i, 1);
143 defined($nmap{$c}) || die;
154 # Go through Bowtie-produced paired-end alignments
156 my $distinctHits = 0;
157 open(HITS, $hits_file);
161 # Read two alignments at a time, cleave off the trailing /1 or
162 # /2 and assert that they have the same base name.
165 my @s1 = split(/[\t]/, $l1);
168 my @s2 = split(/[\t]/, $l2);
172 my @name1s = split(/\//, $name1);
173 my @name2s = split(/\//, $name2);
174 $#name1s == $#name2s || die "Read names formatted differently: $name1, $name2";
175 $name1s[$#name1s] eq "1" || $name1s[$#name1s] eq "2" || die "Bad suffix on read name: $name1";
176 $name2s[$#name2s] eq "1" || $name2s[$#name2s] eq "2" || die "Bad suffix on read name: $name2";
177 $name1s[$#name1s] ne $name2s[$#name2s] || die "Read names have same suffix: $name1, $name2";
178 $name1s[0] eq $name2s[0] || die "Names don't match: $name1,$name2";
180 $swap = 1 if $name1s[$#name1s] eq "2";
181 my $read_name = $name1s[0];
183 # Check that the number of times that the basename has appeared
184 # in a paired alignment does not violate the -k or -m policies.
187 if(defined($num_hash{$read_name})) {
188 $num_hash{$read_name}++;
189 $add_read = 0; # already added this one
191 $num_hash{$read_name} = 1;
194 # Check whether num_hash violates -k
195 if($num_hash{$read_name} > $khits) {
196 die "Number of alignments for read $read_name exceeds -k limit of $khits";
198 # Check whether num_hash violates -m
199 if($num_hash{$read_name} > $m) {
200 die "Number of alignments for read $read_name exceeds -m limit of $m";
203 defined($hits_hash{$read_name}) &&
204 die "Read w/ name $read_name appears twice in alignment file";
208 my $fw1 = ($s1[1] eq "+");
212 # Reverse / reverse-comp
214 $seq1 = reverse $seq1;
216 $seq1 = reverseComp($seq1);
218 $qual1 = reverse $qual1;
220 my $fw2 = ($s2[1] eq "+");
224 # Reverse / reverse-comp
226 $seq2 = reverse $seq2;
228 $seq2 = reverseComp($seq2);
230 $qual2 = reverse $qual2;
233 $hits_hash{$read_name}{seq1} = $seq1;
234 $hits_hash{$read_name}{qual1} = $fasta ? "" : $qual1;
235 $hits_hash{$read_name}{seq2} = $seq2;
236 $hits_hash{$read_name}{qual2} = $fasta ? "" : $qual2;
238 $hits_hash{$read_name}{seq1} = $seq2;
239 $hits_hash{$read_name}{qual1} = $fasta ? "" : $qual2;
240 $hits_hash{$read_name}{seq2} = $seq1;
241 $hits_hash{$read_name}{qual2} = $fasta ? "" : $qual1;
245 # Drop %num_hash - don't need it anymore
249 # Go through entries of the FASTQ file for the unaligned reads
252 open $UN1, $un1_file;
253 open $UN2, $un2_file;
255 my ($name1, $seq1, $qual1) = get_read($UN1);
256 my ($name2, $seq2, $qual2) = get_read($UN2);
261 my @ls1 = split(/\//, $name1);
262 my @ls2 = split(/\//, $name2);
264 $#ls1 == $#ls2 || die "Differently-formatted names: $name1, $name2";
265 $ls1[0] eq $ls2[0] || die "Different names for paired alignments: $ls1[0], $ls2[0]";
267 # Where have we seen it before? Nowhere, hopefully
269 if(defined($hits_hash{$name})) {
270 delete $hits_hash{$name};
274 defined($hits_hash{$name}) &&
275 die "Read $name appears both in hits file $hits_file and in --un file $un1_file/$un2_file";
277 defined($un_hash{$name}) &&
278 die "Read $name appears in --un file $un1_file/$un2_file more than once";
279 # Insert summary of the pair that didn't align
280 $un_hash{$name}{seq1} = $seq1;
281 $un_hash{$name}{qual1} = $qual1;
282 $un_hash{$name}{seq2} = $seq2;
283 $un_hash{$name}{qual2} = $qual2;
289 # Go through entries of the FASTQ file for the reads with a number of
290 # reportable alignments that exceeded the -m limit.
293 if($max1_file ne "") {
294 $max2_file ne "" || die;
297 open ($MAX1, $max1_file) || ($bad = 1);
298 open ($MAX2, $max2_file) || ($bad = 1);
300 my ($name1, $seq1, $qual1) = get_read($MAX1);
301 my ($name2, $seq2, $qual2) = get_read($MAX2);
307 my @ls1 = split(/\//, $name1);
308 my @ls2 = split(/\//, $name2);
309 $#ls1 == $#ls2 || die "Differently-formatted names: $name1, $name2";
310 $ls1[0] eq $ls2[0] || die "Different names for paired alignments: $ls1[0], $ls2[0]";
312 # Where have we seen it before? Nowhere, hopefully
314 defined($hits_hash{$name}) ||
315 die "Read $name appears in --max file $max1_file/$max2_file but not in hits file $hits_file in -M mode";
316 delete $hits_hash{$name};
319 defined($hits_hash{$name}) &&
320 die "Read $name appears in hits file $hits_file and in --max file $max1_file/$max2_file";
322 defined($un_hash{$name}) &&
323 die "Read $name appears in --un file $un1_file/$un2_file and in --max file $max1_file/$max2_file";
324 defined($max_hash{$name}) &&
325 die "Read $name appears in --max file $max1_file/$max2_file more than once";
326 # Insert summary of the pair that didn't align
327 $max_hash{$name}{seq1} = $seq1;
328 $max_hash{$name}{qual1} = $qual1;
329 $max_hash{$name}{seq2} = $seq2;
330 $max_hash{$name}{qual2} = $qual2;
337 # Go through entries of the
340 if($al1_file ne "") {
341 $al2_file ne "" || die;
344 open ($AL1, $al1_file) || ($bad = 1);
345 open ($AL2, $al2_file) || ($bad = 1);
347 my ($name1, $seq1, $qual1) = get_read($AL1);
348 my ($name2, $seq2, $qual2) = get_read($AL2);
354 my @ls1 = split(/\//, $name1);
355 my @ls2 = split(/\//, $name2);
356 $#ls1 == $#ls2 || die "Differently-formatted names: $name1, $name2";
357 $ls1[0] eq $ls2[0] || die "Different names for paired alignments: $ls1[0], $ls2[0]";
359 defined($hits_hash{$name}) ||
360 die "Read $name appears in --al file $al1_file/$al2_file but not in hits file $hits_file";
361 defined($un_hash{$name}) &&
362 die "Read $name appears in --un file $un1_file/$un2_file and in --al file $al1_file/$al2_file";
363 defined($max_hash{$name}) &&
364 die "Read $name appears in --max file $max1_file/$max2_file and in --al file $al1_file/$al2_file";
365 defined($al_hash{$name}) &&
366 die "Read $name appears in --al file $al1_file/$al2_file more than once";
367 # Insert summary of the pair that didn't align
368 $al_hash{$name}{seq1} = $seq1;
369 $al_hash{$name}{qual1} = $qual1;
370 $al_hash{$name}{seq2} = $seq2;
371 $al_hash{$name}{qual2} = $qual2;
377 # Go through entries of the FASTQ file for the input reads and make
378 # sure that each entry is mirrored by an entry either in the alignment
379 # file or in the unaligned-read file.
381 open $READ1, $read1_file;
382 open $READ2, $read2_file;
386 my ($name1, $seq1, $qual1) = get_read($READ1);
387 my ($name2, $seq2, $qual2) = get_read($READ2);
393 my @ls1 = split(/\//, $name1);
394 my @ls2 = split(/\//, $name2);
395 $#ls1 == $#ls2 || die "Differently-formatted names: $name1, $name2";
396 $ls1[0] eq $ls2[0] || die "Different names for paired alignments: $ls1[0], $ls2[0]";
398 if(defined($hits_hash{$name})) {
402 $alseq1 = nucencode($seq1);
403 $alseq2 = nucencode($seq2);
405 if(!$C || $colCseq) {
406 $hits_hash{$name}{seq1} eq $alseq1 ||
407 die "Read $name in alignment file has different _1 sequence than in ".
408 "input read file.\nAlgn: \"$hits_hash{$name}{seq1}\"\nInput: \"$alseq1\"";
410 # Qualities can be legitimately different
411 #$hits_hash{$name}{qual1} eq $qual1 ||
412 # die "Read $name in alignment file has different _1 quals than in ".
413 # "input read file.\nAlgn: $hits_hash{$name}{qual1}\nInput: $qual1";
414 if(!$C || $colCseq) {
415 $hits_hash{$name}{seq2} eq $alseq2 ||
416 die "Read $name in alignment file has different _2 sequence than in ".
417 "input read file.\nAlgn: \"$hits_hash{$name}{seq2}\"\nInput: \"$alseq2\"";
419 #$hits_hash{$name}{qual2} eq $qual2 ||
420 # die "Read $name in alignment file has different _2 quals than in ".
421 # "input read file.\nAlgn: $hits_hash{$name}{qual2}\nInput: $seq2";
423 elsif(defined($un_hash{$name})) {
424 $un_hash{$name}{seq1} eq $seq1 ||
425 die "Read $name in --un file has different _1 sequence than in ".
426 "input read file.\nAlgn: $un_hash{$name}{seq1}\nInput: $seq1";
427 $un_hash{$name}{qual1} eq $qual1 ||
428 die "Read $name in --un file has different _1 quals than in ".
429 "input read file.\nAlgn: $un_hash{$name}{qual1}\nInput: $qual1";
430 $un_hash{$name}{seq2} eq $seq2 ||
431 die "Read $name in --un file has different _2 sequence than in ".
432 "input read file.\nAlgn: $un_hash{$name}{seq2}\nInput: $seq2";
433 $un_hash{$name}{qual2} eq $qual2 ||
434 die "Read $name in --un file has different _2 quals than in ".
435 "input read file.\nAlgn: $un_hash{$name}{qual2}\nInput: $seq2";
437 elsif(defined($max_hash{$name})) {
438 $max_hash{$name}{seq1} eq $seq1 ||
439 die "Read $name in --max file has different _1 sequence than in ".
440 "input read file.\nAlgn: $max_hash{$name}{seq1}\nInput: $seq1";
441 $max_hash{$name}{qual1} eq $qual1 ||
442 die "Read $name in --max file has different _1 quals than in ".
443 "input read file.\nAlgn: $max_hash{$name}{qual1}\nInput: $qual1";
444 $max_hash{$name}{seq2} eq $seq2 ||
445 die "Read $name in --max file has different _2 sequence than in ".
446 "input read file.\nAlgn: $max_hash{$name}{seq2}\nInput: $seq2";
447 $max_hash{$name}{qual2} eq $qual2 ||
448 die "Read $name in --max file has different _2 quals than in ".
449 "input read file.\nAlgn: $max_hash{$name}{qual2}\nInput: $seq2";
452 die "Read $name does not appear in hits, --un or ---max file";
455 if(defined($al_hash{$name})) {
456 $al_hash{$name}{seq1} eq $seq1 ||
457 die "Read $name in --al file has different _1 sequence than in ".
458 "input read file.\nAlgn: $al_hash{$name}{seq1}\nInput: $seq1";
459 $al_hash{$name}{qual1} eq $qual1 ||
460 die "Read $name in --al file has different _1 quals than in ".
461 "input read file.\nAlgn: $al_hash{$name}{qual1}\nInput: $qual1";
462 $al_hash{$name}{seq2} eq $seq2 ||
463 die "Read $name in --al file has different _2 sequence than in ".
464 "input read file.\nAlgn: $al_hash{$name}{seq2}\nInput: $seq2";
465 $al_hash{$name}{qual2} eq $qual2 ||
466 die "Read $name in --al file has different _2 quals than in ".
467 "input read file.\nAlgn: $al_hash{$name}{qual2}\nInput: $seq2";
471 last if $patid == $u;
476 if($al1_file ne "") {
477 $als == $distinctHits || die "number of --al $als does not match number of distinct hits $distinctHits";
479 $distinctHits + $uns + $maxs == $reads ||
480 die "distinct hits $distinctHits, --un $uns, --max $maxs doesn't add to reads $reads";
482 print "PASSED; processed $hits hits, $reads reads, $uns --un, $maxs --max, $als --al\n";