4 # reconcile_alignments.pl
9 # Reconcile and sanity-check an input read file, an output hits file
10 # (the default, verbose kind), and an output unaligned-read file
11 # (generated with the --un option) to be sure they're consistent with
12 # each other. If the --max option was used during alignment, then the
13 # --max file should also be specified. If the --al option was also
14 # used, specify the --al file as the last argument to include it in the
17 # Usage: perl reconcile_alignments.pl \
18 # [-k <int>] [-a] [-m <int>] \
19 # [-u <int>] [-f|-q|-r] \
30 Getopt::Long::Configure ("no_ignore_case");
45 GetOptions ("k=i" => \$k,
53 "col-cseq" => \$colCseq,
54 "col-cedit" => \$colCedit,
55 "col-cqual" => \$colCqual,
56 "r" => \$r) || die "One or more errors parsing script arguments";
59 $khits = int($k) if defined($k);
60 $khits = 999999 if $a;
62 $maxhits = int($m) if defined($m);
63 $maxhits = int($M) if defined($M);
65 $num_reads = $u if defined($u);
68 $format = "fasta" if $f;
69 $format = "raw" if $r;
71 # Utility function that returns the reverse complement of its argument
75 $r =~ tr/aAcCgGtT/tTgGcCaA/;
80 # Encode colors as proxy nucleotides
84 my %nmap = ("0" => "A", "1" => "C", "2" => "G", "3" => "T", "." => "N",
85 "A" => "A", "C" => "C", "G" => "G", "T" => "T", "N" => "N");
87 for(my $i = 0; $i < length($s); $i++) {
88 my $c = uc substr($s, $i, 1);
89 defined($nmap{$c}) || die;
95 defined($ARGV[0]) || die "Must specify input read file as first arg";
96 defined($ARGV[1]) || die "Must specify alignment file as second arg";
97 defined($ARGV[2]) || die "Must specify unaligned-read file as third arg";
99 my $read_files = $ARGV[0];
100 my $algn_file = $ARGV[1];
101 my $un_file = $ARGV[2];
103 $max_file = $ARGV[3] if defined($ARGV[3]);
105 $al_file = $ARGV[4] if defined($ARGV[4]);
112 # Go through Bowtie-produced alignments
114 my $distinctHits = 0;
116 open(ALGN, $algn_file) || die "Could not open alignment file $algn_file\n";
117 my %num_hash = (); # for checking -k/-m
124 if(defined($num_hash{$name})) {
126 $add_read = 0; # already added this one
128 $num_hash{$name} = 1;
131 if($num_hash{$name} > $khits) {
132 die "Number of alignments for read $name exceeds -k limit of $khits";
133 } elsif($num_hash{$name} > $maxhits) {
134 die "Number of alignments for read $name exceeds -m limit of $maxhits";
137 defined($hits_hash{$name}{seq}) &&
138 die "Read w/ name $name appears twice in alignment file";
142 my $fw = ($s[1] eq "+");
146 # Reverse / reverse-comp
150 $seq = reverseComp($seq);
152 $qual = reverse $qual;
154 $hits_hash{$name}{seq} = $seq;
155 $hits_hash{$name}{qual} = (($format eq "fasta")? "" : $qual);
163 my ($name, $seq, $qual) = ("", "", "");
164 if($format eq "fastq") {
166 return ("", "", "") unless defined($name);
168 $name = substr($name, 1);
174 } elsif($format eq "fasta") {
176 return ("", "", "") unless defined($name);
178 $name = substr($name, 1);
182 $format eq "raw" || die;
183 die "Raw format not supported in reconcile_alignment.pl; read names required";
185 return ($name, $seq, $qual);
188 # Go through entries of the FASTQ file for the unaligned reads
192 open $UN, $un_file || die "Could not open unaligned-read file $un_file\n";
194 my ($name, $seq, $qual) = get_read($UN);
198 defined($hits_hash{$name}) &&
199 die "Read $name appears both in hits file $algn_file and in --un file $un_file";
200 } elsif(defined($hits_hash{$name})) {
202 delete $hits_hash{$name};
204 defined($un_hash{$name}) &&
205 die "Read $name appears more than once in --un file $un_file";
206 $un_hash{$name}{seq} = $seq;
207 $un_hash{$name}{qual} = $qual;
213 if($max_file ne "" && -f $max_file) {
215 open $MAX, $max_file || die "Could not open maxed-read file $max_file\n";
216 # Go through entries of the MAX file for the unaligned reads
218 my ($name, $seq, $qual) = get_read($MAX);
222 defined($hits_hash{$name}) ||
223 die "Read $name appears in --max file $max_file but not in alignment file $algn_file";
225 delete $hits_hash{$name};
227 defined($hits_hash{$name}) &&
228 die "Read $name appears both in hits file $algn_file and in --max file $max_file";
230 defined($un_hash{$name}) &&
231 die "Read $name appears both in --un file $un_file and in --max file $max_file";
232 defined($max_hash{$name}) &&
233 die "Read $name appears in --max file $max_file more than once";
234 $max_hash{$name}{seq} = $seq;
235 $max_hash{$name}{qual} = $qual;
244 # Go through entries of the MAX file for the unaligned reads
246 my ($name, $seq, $qual) = get_read($AL);
249 defined($hits_hash{$name}) ||
250 die "Read $name appears --al file $al_file but not in hits file $algn_file";
251 defined($un_hash{$name}) &&
252 die "Read $name appears both in --un file $un_file and in --al file $al_file";
253 defined($max_hash{$name}) &&
254 die "Read $name appears both in --max file $max_file and in --al file $al_file";
255 defined($al_hash{$name}) &&
256 die "Read $name appears in --al file $al_file more than once";
257 $al_hash{$name}{seq} = $seq;
258 $al_hash{$name}{qual} = $qual;
263 my @read_list = split(/,/, $read_files);
265 for my $read_file (@read_list) {
266 # Go through entries of the FASTQ file for the input reads and make
267 # sure that each entry is mirrored by an entry either in the alignment
268 # file or in the unaligned-read file.
270 open $READ, $read_file;
273 my ($name, $seq, $qual) = get_read($READ);
276 if(defined($hits_hash{$name})) {
279 $alseq = nucencode($seq);
281 if(!$C || $colCseq) {
282 $hits_hash{$name}{seq} eq $alseq ||
283 die "Read $name in hits file $algn_file has different sequence ".
284 "from input read.\nHit: \"$hits_hash{$name}{seq}\"\nInput: \"$alseq\"";
286 # Qualities can be legitimately different
287 #$hits_hash{$name}{qual} eq $qual ||
288 # die "Read $name in hits file $algn_file has different sequence ".
289 # "from input read.\nHit: $hits_hash{$name}{qual}\nInput: $qual";
291 elsif(defined($un_hash{$name})) {
292 $un_hash{$name}{seq} eq $seq ||
293 die "Read $name in --un file $un_file has different sequence ".
294 "from input read.\nHit: \"$un_hash{$name}{seq}\"\nInput: \"$seq\"";
295 $un_hash{$name}{qual} eq $qual ||
296 die "Read $name in --un file $un_file has different sequence ".
297 "from input read.\nHit: \"$un_hash{$name}{qual}\"\nInput: \"$qual\"";
299 elsif(defined($max_hash{$name})) {
300 $max_hash{$name}{seq} eq $seq ||
301 die "Read $name in --max file $max_file has different sequence ".
302 "from input read.\nHit: \"$max_hash{$name}{seq}\"\nInput: \"$seq\"";
303 $max_hash{$name}{qual} eq $qual ||
304 die "Read $name in --max file $max_file has different sequence ".
305 "from input read.\nHit: \"$max_hash{$name}{qual}\"\nInput: \"$qual\"";
308 die "Read with name $name appears in input, but not in any of the output files";
310 if(defined($al_hash{$name})) {
311 $al_hash{$name}{seq} eq $seq ||
312 die "Read $name in --al file $al_file has different sequence ".
313 "from input read.\nHit: \"$al_hash{$name}{seq}\"\nInput: \"$seq\"";
314 $al_hash{$name}{qual} eq $qual ||
315 die "Read $name in --al file $al_file has different sequence ".
316 "from input read.\nHit: \"$al_hash{$name}{qual}\"\nInput: \"$qual\"";
319 last if $patid == $num_reads;
325 $als == $distinctHits || die "number of --al $als does not match number of distinct hits $distinctHits";
327 $distinctHits + $uns + $maxs == $reads ||
328 die "distinct hits $distinctHits, --un $uns, --max $maxs doesn't add to reads $reads";
330 print "PASSED; processed $hits hits, $reads reads, $uns --un, $maxs --max, $als --al\n";