scripts/reconcile_alignments.pl

   1 #!/usr/bin/perl -w
   2
   3 #
   4 # reconcile_alignments.pl
   5 #
   6 #  Author: Ben Langmead
   7 #    Date: 6/14/2009
   8 #
   9 # Reconcile and sanity-check an input read file, an output hits file
  10 # (the default, verbose kind), and an output unaligned-read file
  11 # (generated with the --un option) to be sure they're consistent with
  12 # each other.  If the --max option was used during alignment, then the
  13 # --max file should also be specified.  If the --al option was also
  14 # used, specify the --al file as the last argument to include it in the
  15 # sanity check.
  16 #
  17 # Usage: perl reconcile_alignments.pl \
  18 #            [-k <int>] [-a] [-m <int>] \
  19 #            [-u <int>] [-f|-q|-r] \
  20 #            <input read file> \
  21 #            <hits file> \
  22 #            <--un file> \
  23 #            <--max file> \
  24 #            <--al file>
  25 #
  26
  27 use strict;
  28 use warnings;
  29 use Getopt::Long;
  30 Getopt::Long::Configure ("no_ignore_case");
  31
  32 my $k = undef;
  33 my $m = undef;
  34 my $M = undef;
  35 my $u = undef;
  36 my $f = undef;
  37 my $r = undef;
  38 my $a = undef;
  39 my $q = undef;
  40 my $C = undef;
  41 my $colCseq = undef;
  42 my $colCqual = undef;
  43 my $colCedit = undef;
  44
  45 GetOptions ("k=i" => \$k,
  46             "m=i" => \$m,
  47             "M=i" => \$M,
  48             "u=i" => \$u,
  49             "q"   => \$q,
  50             "f"   => \$f,
  51             "a"   => \$a,
  52             "C"   => \$C,
  53             "col-cseq" => \$colCseq,
  54             "col-cedit" => \$colCedit,
  55             "col-cqual" => \$colCqual,
  56             "r"   => \$r) || die "One or more errors parsing script arguments";
  57
  58 my $khits = 1;
  59 $khits = int($k) if defined($k);
  60 $khits = 999999 if $a;
  61 my $maxhits = 999999;
  62 $maxhits = int($m) if defined($m);
  63 $maxhits = int($M) if defined($M);
  64 my $num_reads = -1;
  65 $num_reads = $u if defined($u);
  66
  67 my $format = "fastq";
  68 $format = "fasta" if $f;
  69 $format = "raw" if $r;
  70
  71 # Utility function that returns the reverse complement of its argument
  72 sub reverseComp($) {
  73         my $r = shift;
  74         $r = reverse($r);
  75         $r =~ tr/aAcCgGtT/tTgGcCaA/;
  76         return $r;
  77 }
  78
  79 ##
  80 # Encode colors as proxy nucleotides
  81 #
  82 sub nucencode($) {
  83         my $s = shift;
  84         my %nmap = ("0" => "A", "1" => "C", "2" => "G", "3" => "T", "." => "N",
  85                     "A" => "A", "C" => "C", "G" => "G", "T" => "T", "N" => "N");
  86         my $ret = "";
  87         for(my $i = 0; $i < length($s); $i++) {
  88                 my $c = uc substr($s, $i, 1);
  89                 defined($nmap{$c}) || die;
  90                 $ret .= $nmap{$c};
  91         }
  92         return $ret;
  93 }
  94
  95 defined($ARGV[0]) || die "Must specify input read file as first arg";
  96 defined($ARGV[1]) || die "Must specify alignment file as second arg";
  97 defined($ARGV[2]) || die "Must specify unaligned-read file as third arg";
  98
  99 my $read_files  = $ARGV[0];
 100 my $algn_file   = $ARGV[1];
 101 my $un_file     = $ARGV[2];
 102 my $max_file    = "";
 103 $max_file = $ARGV[3] if defined($ARGV[3]);
 104 my $al_file     = "";
 105 $al_file = $ARGV[4] if defined($ARGV[4]);
 106
 107 my %hits_hash = ();
 108 my %al_hash = ();
 109 my %un_hash = ();
 110 my %max_hash = ();
 111
 112 # Go through Bowtie-produced alignments
 113 my $hits = 0;
 114 my $distinctHits = 0;
 115 if(-f $algn_file) {
 116         open(ALGN, $algn_file) || die "Could not open alignment file $algn_file\n";
 117         my %num_hash = (); # for checking -k/-m
 118         while(<ALGN>) {
 119                 my @s = split /\t/;
 120                 my $name = $s[0];
 121                 my $add_read = 1;
 122                 $hits++;
 123                 if($khits > 1) {
 124                         if(defined($num_hash{$name})) {
 125                                 $num_hash{$name}++;
 126                                 $add_read = 0; # already added this one
 127                         } else {
 128                                 $num_hash{$name} = 1;
 129                         $distinctHits++;
 130                         }
 131                         if($num_hash{$name} > $khits) {
 132                                 die "Number of alignments for read $name exceeds -k limit of $khits";
 133                         } elsif($num_hash{$name} > $maxhits) {
 134                                 die "Number of alignments for read $name exceeds -m limit of $maxhits";
 135                         }
 136                 } else {
 137                         defined($hits_hash{$name}{seq}) &&
 138                                 die "Read w/ name $name appears twice in alignment file";
 139                         $distinctHits++;
 140                 }
 141                 if($add_read) {
 142                         my $fw = ($s[1] eq "+");
 143                         my $seq = $s[4];
 144                         my $qual = $s[5];
 145                         if(!$fw) {
 146                                 # Reverse / reverse-comp
 147                                 if($C && $colCseq) {
 148                                         $seq = reverse $seq;
 149                                 } else {
 150                                         $seq = reverseComp($seq);
 151                                 }
 152                                 $qual = reverse $qual;
 153                         }
 154                         $hits_hash{$name}{seq} = $seq;
 155                         $hits_hash{$name}{qual} = (($format eq "fasta")? "" : $qual);
 156                 }
 157         }
 158         close(ALGN);
 159 }
 160
 161 sub get_read($) {
 162         my $fh = shift;
 163         my ($name, $seq, $qual) = ("", "", "");
 164         if($format eq "fastq") {
 165                 $name = <$fh>;
 166                 return ("", "", "") unless defined($name);
 167                 chomp($name);
 168                 $name = substr($name, 1);
 169                 $seq = <$fh>;
 170                 chomp($seq);
 171                 my $tmp = <$fh>;
 172                 $qual = <$fh>;
 173                 chomp($qual);
 174         } elsif($format eq "fasta") {
 175                 $name = <$fh>;
 176                 return ("", "", "") unless defined($name);
 177                 chomp($name);
 178                 $name = substr($name, 1);
 179                 $seq = <$fh>;
 180                 chomp($seq);
 181         } else {
 182                 $format eq "raw" || die;
 183                 die "Raw format not supported in reconcile_alignment.pl; read names required";
 184         }
 185         return ($name, $seq, $qual);
 186 }
 187
 188 # Go through entries of the FASTQ file for the unaligned reads
 189 my $uns = 0;
 190 if(-f $un_file) {
 191         my $UN;
 192         open $UN, $un_file || die "Could not open unaligned-read file $un_file\n";
 193         while(1) {
 194                 my ($name, $seq, $qual) = get_read($UN);
 195                 last if $name eq "";
 196                 $uns++;
 197                 unless($M) {
 198                         defined($hits_hash{$name}) &&
 199                                 die "Read $name appears both in hits file $algn_file and in --un file $un_file";
 200                 } elsif(defined($hits_hash{$name})) {
 201                         $distinctHits--;
 202                         delete $hits_hash{$name};
 203                 }
 204                 defined($un_hash{$name}) &&
 205                         die "Read $name appears more than once in --un file $un_file";
 206                 $un_hash{$name}{seq} = $seq;
 207                 $un_hash{$name}{qual} = $qual;
 208         }
 209         close($UN);
 210 }
 211
 212 my $maxs = 0;
 213 if($max_file ne "" && -f $max_file) {
 214         my $MAX;
 215         open $MAX, $max_file || die "Could not open maxed-read file $max_file\n";
 216         # Go through entries of the MAX file for the unaligned reads
 217         while(1) {
 218                 my ($name, $seq, $qual) = get_read($MAX);
 219                 last if $name eq "";
 220                 $maxs++;
 221                 if($M) {
 222                         defined($hits_hash{$name}) ||
 223                                 die "Read $name appears in --max file $max_file but not in alignment file $algn_file";
 224                         $distinctHits--;
 225                         delete $hits_hash{$name};
 226                 } else {
 227                         defined($hits_hash{$name}) &&
 228                                 die "Read $name appears both in hits file $algn_file and in --max file $max_file";
 229                 }
 230                 defined($un_hash{$name})   &&
 231                         die "Read $name appears both in --un file $un_file and in --max file $max_file";
 232                 defined($max_hash{$name})  &&
 233                         die "Read $name appears in --max file $max_file more than once";
 234                 $max_hash{$name}{seq} = $seq;
 235                 $max_hash{$name}{qual} = $qual;
 236         }
 237         close($MAX);
 238 }
 239
 240 my $als = 0;
 241 if($al_file ne "") {
 242         my $AL;
 243         open $AL, $al_file;
 244         # Go through entries of the MAX file for the unaligned reads
 245         while(1) {
 246                 my ($name, $seq, $qual) = get_read($AL);
 247                 last if $name eq "";
 248                 $als++;
 249                 defined($hits_hash{$name}) ||
 250                         die "Read $name appears --al file $al_file but not in hits file $algn_file";
 251                 defined($un_hash{$name})   &&
 252                         die "Read $name appears both in --un file $un_file and in --al file $al_file";
 253                 defined($max_hash{$name})  &&
 254                         die "Read $name appears both in --max file $max_file and in --al file $al_file";
 255                 defined($al_hash{$name})  &&
 256                         die "Read $name appears in --al file $al_file more than once";
 257                 $al_hash{$name}{seq} = $seq;
 258                 $al_hash{$name}{qual} = $qual;
 259         }
 260         close($AL);
 261 }
 262
 263 my @read_list = split(/,/, $read_files);
 264 my $reads = 0;
 265 for my $read_file (@read_list) {
 266         # Go through entries of the FASTQ file for the input reads and make
 267         # sure that each entry is mirrored by an entry either in the alignment
 268         # file or in the unaligned-read file.
 269         my $READ;
 270         open $READ, $read_file;
 271         my $patid = 0;
 272         while(1) {
 273                 my ($name, $seq, $qual) = get_read($READ);
 274                 last if $name eq "";
 275                 $reads++;
 276                 if(defined($hits_hash{$name})) {
 277                         my $alseq = $seq;
 278                         if($C && $colCseq) {
 279                                 $alseq = nucencode($seq);
 280                         }
 281                         if(!$C || $colCseq) {
 282                                 $hits_hash{$name}{seq} eq $alseq ||
 283                                         die "Read $name in hits file $algn_file has different sequence ".
 284                                             "from input read.\nHit: \"$hits_hash{$name}{seq}\"\nInput: \"$alseq\"";
 285                         }
 286                         # Qualities can be legitimately different
 287                         #$hits_hash{$name}{qual} eq $qual ||
 288                         #       die "Read $name in hits file $algn_file has different sequence ".
 289                         #           "from input read.\nHit: $hits_hash{$name}{qual}\nInput: $qual";
 290                 }
 291                 elsif(defined($un_hash{$name})) {
 292                         $un_hash{$name}{seq} eq $seq ||
 293                                 die "Read $name in --un file $un_file has different sequence ".
 294                                     "from input read.\nHit: \"$un_hash{$name}{seq}\"\nInput: \"$seq\"";
 295                         $un_hash{$name}{qual} eq $qual ||
 296                                 die "Read $name in --un file $un_file has different sequence ".
 297                                     "from input read.\nHit: \"$un_hash{$name}{qual}\"\nInput: \"$qual\"";
 298                 }
 299                 elsif(defined($max_hash{$name})) {
 300                         $max_hash{$name}{seq} eq $seq ||
 301                                 die "Read $name in --max file $max_file has different sequence ".
 302                                     "from input read.\nHit: \"$max_hash{$name}{seq}\"\nInput: \"$seq\"";
 303                         $max_hash{$name}{qual} eq $qual ||
 304                                 die "Read $name in --max file $max_file has different sequence ".
 305                                     "from input read.\nHit: \"$max_hash{$name}{qual}\"\nInput: \"$qual\"";
 306                 }
 307                 else {
 308                         die "Read with name $name appears in input, but not in any of the output files";
 309                 }
 310                 if(defined($al_hash{$name})) {
 311                         $al_hash{$name}{seq} eq $seq ||
 312                                 die "Read $name in --al file $al_file has different sequence ".
 313                                     "from input read.\nHit: \"$al_hash{$name}{seq}\"\nInput: \"$seq\"";
 314                         $al_hash{$name}{qual} eq $qual ||
 315                                 die "Read $name in --al file $al_file has different sequence ".
 316                                     "from input read.\nHit: \"$al_hash{$name}{qual}\"\nInput: \"$qual\"";
 317                 }
 318                 $patid++;
 319                 last if $patid == $num_reads;
 320         }
 321         close($READ);
 322 }
 323
 324 if($al_file ne "") {
 325         $als == $distinctHits || die "number of --al $als does not match number of distinct hits $distinctHits";
 326 }
 327 $distinctHits + $uns + $maxs == $reads ||
 328         die "distinct hits $distinctHits, --un $uns, --max $maxs doesn't add to reads $reads";
 329
 330 print "PASSED; processed $hits hits, $reads reads, $uns --un, $maxs --max, $als --al\n";