scripts/reconcile_alignments_pe.pl

   1 #!/usr/bin/perl -w
   2
   3 #
   4 # reconcile_alignments_pe.pl
   5 #
   6 #  Author: Ben Langmead
   7 #    Date: 6/14/2009
   8 #
   9 # Reconcile and sanity-check an pair of input files, an output
  10 # alignment file (the default, verbose kind), and a pair of output
  11 # unaligned-read files (generated with the --un option) to be sure
  12 # they're consistent with each other.  If the --max option was used
  13 # during alignment, then the --max files should also be specified.  If
  14 # the --al option was also used, specify the --al files as the last
  15 # arguments to include it in the sanity check.
  16 #
  17 # Usage: perl reconcile_alignments_pe.pl \
  18 #            [-k <int>] [-a] [-m <int>] \
  19 #            [-u <int>] [-v] [-q | -f | -r] \
  20 #            <input mate1 file> \
  21 #            <input mate2 file> \
  22 #            <alignment-file> \
  23 #            <--un file 1> \
  24 #            <--un file 2> \
  25 #            <--max file 1> \
  26 #            <--max file 2> \
  27 #            <--al file 1> \
  28 #            <--al file 2>
  29 #
  30
  31 use strict;
  32 use warnings;
  33 use Getopt::Long;
  34 Getopt::Long::Configure ("no_ignore_case");
  35
  36 my $khits = 1;
  37 my $allHits = 0;
  38 my $m = 999999;
  39 my $M = undef;
  40 my $u = -1;
  41 my $verbose = 0;
  42 my $fastq = 0;
  43 my $fasta = 0;
  44 my $raw = 0;
  45 my $C = undef;
  46 my $colCseq = undef;
  47 my $colCqual = undef;
  48 my $colCedit = undef;
  49
  50 GetOptions("m=i" => \$m,
  51            "M=i" => \$M,
  52            "k=i" => \$khits,
  53            "u=i" => \$u,
  54            "a"   => \$allHits,
  55            "v"   => \$verbose,
  56            "q"   => \$fastq,
  57            "f"   => \$fasta,
  58            "C"   => \$C,
  59            "col-cseq" => \$colCseq,
  60            "col-cedit" => \$colCedit,
  61            "col-cqual" => \$colCqual,
  62            "e"   => \$raw) || die "Bad arguments";
  63
  64 $m = $M if $M;
  65
  66 $fastq = 1 if ($fastq + $fasta + $raw == 0);
  67
  68 $khits = 999999 if $allHits;
  69
  70 print "khits: $khits\nmaxhits: $m\nnum_reads: $u\n" if $verbose;
  71
  72 # Utility function that returns the reverse complement of its argument
  73 sub reverseComp($) {
  74         my $r = shift;
  75         $r = reverse($r);
  76         $r =~ tr/aAcCgGtT/tTgGcCaA/;
  77         return $r;
  78 }
  79
  80 defined($ARGV[0]) || die "Must specify input read file 1 as first arg";
  81 defined($ARGV[1]) || die "Must specify input read file 2 as first arg";
  82 defined($ARGV[2]) || die "Must specify alignment file as second arg";
  83 defined($ARGV[3]) || die "Must specify unaligned-read file 1 as third arg";
  84 defined($ARGV[4]) || die "Must specify unaligned-read file 2 as third arg";
  85
  86 my $read1_file   = $ARGV[0];
  87 my $read2_file   = $ARGV[1];
  88 my $hits_file    = $ARGV[2];
  89 my $un1_file = $ARGV[3];
  90 my $un2_file = $ARGV[4];
  91 my $max1_file = "";
  92 $max1_file = $ARGV[5] if defined($ARGV[5]);
  93 my $max2_file = "";
  94 $max2_file = $ARGV[6] if defined($ARGV[6]);
  95 my $al1_file = "";
  96 $al1_file  = $ARGV[7] if defined($ARGV[7]);
  97 my $al2_file = "";
  98 $al2_file = $ARGV[8] if defined($ARGV[8]);
  99
 100 if($verbose) {
 101         print "read1: $read1_file\nread2: $read2_file\nalignments: $hits_file\n";
 102         print "unaligned reads 1: $un1_file\nunaligned reads 2: $un2_file\n";
 103         print "max reads 1: $max1_file\nmax reads 2: $max2_file\n";
 104 }
 105
 106 sub get_read($) {
 107         my $fh = shift;
 108         my ($name, $seq, $qual) = ("", "", "");
 109         if($fastq) {
 110                 $name = <$fh>;
 111                 return ("", "", "") unless defined($name);
 112                 chomp($name);
 113                 $name = substr($name, 1);
 114                 $seq = <$fh>;
 115                 chomp($seq);
 116                 my $tmp = <$fh>;
 117                 $qual = <$fh>;
 118                 chomp($qual);
 119         } elsif($fasta) {
 120                 $name = <$fh>;
 121                 return ("", "", "") unless defined($name);
 122                 chomp($name);
 123                 $name = substr($name, 1);
 124                 $seq = <$fh>;
 125                 chomp($seq);
 126         } else {
 127                 $raw || die;
 128                 die "Raw format not supported in reconcile_alignment_pe.pl; read names required";
 129         }
 130         return ($name, $seq, $qual);
 131 }
 132
 133 ##
 134 # Encode colors as proxy nucleotides
 135 #
 136 sub nucencode($) {
 137         my $s = shift;
 138         my %nmap = ("0" => "A", "1" => "C", "2" => "G", "3" => "T", "." => "N",
 139                     "A" => "A", "C" => "C", "G" => "G", "T" => "T", "N" => "N");
 140         my $ret = "";
 141         for(my $i = 0; $i < length($s); $i++) {
 142                 my $c = uc substr($s, $i, 1);
 143                 defined($nmap{$c}) || die;
 144                 $ret .= $nmap{$c};
 145         }
 146         return $ret;
 147 }
 148
 149 my %hits_hash = ();
 150 my %un_hash = ();
 151 my %max_hash = ();
 152 my %al_hash = ();
 153
 154 # Go through Bowtie-produced paired-end alignments
 155 my $hits = 0;
 156 my $distinctHits = 0;
 157 open(HITS, $hits_file);
 158 {
 159         my %num_hash = ();
 160         while(<HITS>) {
 161                 # Read two alignments at a time, cleave off the trailing /1 or
 162                 # /2 and assert that they have the same base name.
 163                 my $l1 = $_;
 164                 chomp($l1);
 165                 my @s1 = split(/[\t]/, $l1);
 166                 my $l2 = <HITS>;
 167                 chomp($l2);
 168                 my @s2 = split(/[\t]/, $l2);
 169                 my $name1 = $s1[0];
 170                 my $name2 = $s2[0];
 171                 $hits++;
 172                 my @name1s = split(/\//, $name1);
 173                 my @name2s = split(/\//, $name2);
 174                 $#name1s == $#name2s || die "Read names formatted differently: $name1, $name2";
 175                 $name1s[$#name1s] eq "1" || $name1s[$#name1s] eq "2" || die "Bad suffix on read name: $name1";
 176                 $name2s[$#name2s] eq "1" || $name2s[$#name2s] eq "2" || die "Bad suffix on read name: $name2";
 177                 $name1s[$#name1s] ne $name2s[$#name2s] || die "Read names have same suffix: $name1, $name2";
 178                 $name1s[0] eq $name2s[0] || die "Names don't match: $name1,$name2";
 179                 my $swap = 0;
 180                 $swap = 1 if $name1s[$#name1s] eq "2";
 181                 my $read_name = $name1s[0];
 182                 my $add_read = 1;
 183                 # Check that the number of times that the basename has appeared
 184                 # in a paired alignment does not violate the -k or -m policies.
 185                 if($khits > 1) {
 186                         # Update num_hash
 187                         if(defined($num_hash{$read_name})) {
 188                                 $num_hash{$read_name}++;
 189                                 $add_read = 0; # already added this one
 190                         } else {
 191                                 $num_hash{$read_name} = 1;
 192                                 $distinctHits++;
 193                         }
 194                         # Check whether num_hash violates -k
 195                         if($num_hash{$read_name} > $khits) {
 196                                 die "Number of alignments for read $read_name exceeds -k limit of $khits";
 197                         }
 198                         # Check whether num_hash violates -m
 199                         if($num_hash{$read_name} > $m) {
 200                                 die "Number of alignments for read $read_name exceeds -m limit of $m";
 201                         }
 202                 } else {
 203                         defined($hits_hash{$read_name}) &&
 204                                 die "Read w/ name $read_name appears twice in alignment file";
 205                         $distinctHits++;
 206                 }
 207                 if($add_read) {
 208                         my $fw1 = ($s1[1] eq "+");
 209                         my $seq1 = $s1[4];
 210                         my $qual1 = $s1[5];
 211                         if(!$fw1) {
 212                                 # Reverse / reverse-comp
 213                                 if($C && $colCseq) {
 214                                         $seq1 = reverse $seq1;
 215                                 } else {
 216                                         $seq1 = reverseComp($seq1);
 217                                 }
 218                                 $qual1 = reverse $qual1;
 219                         }
 220                         my $fw2 = ($s2[1] eq "+");
 221                         my $seq2 = $s2[4];
 222                         my $qual2 = $s2[5];
 223                         if(!$fw2) {
 224                                 # Reverse / reverse-comp
 225                                 if($C && $colCseq) {
 226                                         $seq2 = reverse $seq2;
 227                                 } else {
 228                                         $seq2 = reverseComp($seq2);
 229                                 }
 230                                 $qual2 = reverse $qual2;
 231                         }
 232                         if(!$swap) {
 233                                 $hits_hash{$read_name}{seq1} = $seq1;
 234                                 $hits_hash{$read_name}{qual1} = $fasta ? "" : $qual1;
 235                                 $hits_hash{$read_name}{seq2} = $seq2;
 236                                 $hits_hash{$read_name}{qual2} = $fasta ? "" : $qual2;
 237                         } else {
 238                                 $hits_hash{$read_name}{seq1} = $seq2;
 239                                 $hits_hash{$read_name}{qual1} = $fasta ? "" : $qual2;
 240                                 $hits_hash{$read_name}{seq2} = $seq1;
 241                                 $hits_hash{$read_name}{qual2} = $fasta ? "" : $qual1;
 242                         }
 243                 }
 244         }
 245         # Drop %num_hash - don't need it anymore
 246 }
 247 close(HITS);
 248
 249 # Go through entries of the FASTQ file for the unaligned reads
 250 my $uns = 0;
 251 my ($UN1, $UN2);
 252 open $UN1, $un1_file;
 253 open $UN2, $un2_file;
 254 while(1) {
 255         my ($name1, $seq1, $qual1) = get_read($UN1);
 256         my ($name2, $seq2, $qual2) = get_read($UN2);
 257         if($name1 eq "") {
 258                 $name2 eq "" || die;
 259                 last;
 260         }
 261         my @ls1 = split(/\//, $name1);
 262         my @ls2 = split(/\//, $name2);
 263         $uns++;
 264         $#ls1 == $#ls2 || die "Differently-formatted names: $name1, $name2";
 265         $ls1[0] eq $ls2[0] || die "Different names for paired alignments: $ls1[0], $ls2[0]";
 266         my $name = $ls1[0];
 267         # Where have we seen it before?  Nowhere, hopefully
 268         if($M) {
 269                 if(defined($hits_hash{$name})) {
 270                         delete $hits_hash{$name};
 271                         $distinctHits--;
 272                 }
 273         } else {
 274                 defined($hits_hash{$name}) &&
 275                         die "Read $name appears both in hits file $hits_file and in --un file $un1_file/$un2_file";
 276         }
 277         defined($un_hash{$name}) &&
 278                 die "Read $name appears in --un file $un1_file/$un2_file more than once";
 279         # Insert summary of the pair that didn't align
 280         $un_hash{$name}{seq1} = $seq1;
 281         $un_hash{$name}{qual1} = $qual1;
 282         $un_hash{$name}{seq2} = $seq2;
 283         $un_hash{$name}{qual2} = $qual2;
 284 }
 285 close($UN1);
 286 close($UN2);
 287
 288 #
 289 # Go through entries of the FASTQ file for the reads with a number of
 290 # reportable alignments that exceeded the -m limit.
 291 #
 292 my $maxs = 0;
 293 if($max1_file ne "") {
 294         $max2_file ne "" || die;
 295         my ($MAX1, $MAX2);
 296         my $bad = 0;
 297         open ($MAX1, $max1_file) || ($bad = 1);
 298         open ($MAX2, $max2_file) || ($bad = 1);
 299         while(!$bad) {
 300                 my ($name1, $seq1, $qual1) = get_read($MAX1);
 301                 my ($name2, $seq2, $qual2) = get_read($MAX2);
 302                 if($name1 eq "") {
 303                         $name2 eq "" || die;
 304                         last;
 305                 }
 306                 $maxs++;
 307                 my @ls1 = split(/\//, $name1);
 308                 my @ls2 = split(/\//, $name2);
 309                 $#ls1 == $#ls2 || die "Differently-formatted names: $name1, $name2";
 310                 $ls1[0] eq $ls2[0] || die "Different names for paired alignments: $ls1[0], $ls2[0]";
 311                 my $name = $ls1[0];
 312                 # Where have we seen it before?  Nowhere, hopefully
 313                 if($M) {
 314                         defined($hits_hash{$name}) ||
 315                                 die "Read $name appears in --max file $max1_file/$max2_file but not in hits file $hits_file in -M mode";
 316                         delete $hits_hash{$name};
 317                         $distinctHits--;
 318                 } else {
 319                         defined($hits_hash{$name}) &&
 320                                 die "Read $name appears in hits file $hits_file and in --max file $max1_file/$max2_file";
 321                 }
 322                 defined($un_hash{$name}) &&
 323                         die "Read $name appears in --un file $un1_file/$un2_file and in --max file $max1_file/$max2_file";
 324                 defined($max_hash{$name}) &&
 325                         die "Read $name appears in --max file $max1_file/$max2_file more than once";
 326                 # Insert summary of the pair that didn't align
 327                 $max_hash{$name}{seq1} = $seq1;
 328                 $max_hash{$name}{qual1} = $qual1;
 329                 $max_hash{$name}{seq2} = $seq2;
 330                 $max_hash{$name}{qual2} = $qual2;
 331         }
 332         close($MAX1);
 333         close($MAX2);
 334 }
 335
 336 #
 337 # Go through entries of the
 338 #
 339 my $als = 0;
 340 if($al1_file ne "") {
 341         $al2_file ne "" || die;
 342         my ($AL1, $AL2);
 343         my $bad = 0;
 344         open ($AL1, $al1_file) || ($bad = 1);
 345         open ($AL2, $al2_file) || ($bad = 1);
 346         while(!$bad) {
 347                 my ($name1, $seq1, $qual1) = get_read($AL1);
 348                 my ($name2, $seq2, $qual2) = get_read($AL2);
 349                 if($name1 eq "") {
 350                         $name2 eq "" || die;
 351                         last;
 352                 }
 353                 $als++;
 354                 my @ls1 = split(/\//, $name1);
 355                 my @ls2 = split(/\//, $name2);
 356                 $#ls1 == $#ls2 || die "Differently-formatted names: $name1, $name2";
 357                 $ls1[0] eq $ls2[0] || die "Different names for paired alignments: $ls1[0], $ls2[0]";
 358                 my $name = $ls1[0];
 359                 defined($hits_hash{$name}) ||
 360                         die "Read $name appears in --al file $al1_file/$al2_file but not in hits file $hits_file";
 361                 defined($un_hash{$name}) &&
 362                         die "Read $name appears in --un file $un1_file/$un2_file and in --al file $al1_file/$al2_file";
 363                 defined($max_hash{$name}) &&
 364                         die "Read $name appears in --max file $max1_file/$max2_file and in --al file $al1_file/$al2_file";
 365                 defined($al_hash{$name}) &&
 366                         die "Read $name appears in --al file $al1_file/$al2_file more than once";
 367                 # Insert summary of the pair that didn't align
 368                 $al_hash{$name}{seq1} = $seq1;
 369                 $al_hash{$name}{qual1} = $qual1;
 370                 $al_hash{$name}{seq2} = $seq2;
 371                 $al_hash{$name}{qual2} = $qual2;
 372         }
 373         close($AL1);
 374         close($AL2);
 375 }
 376
 377 # Go through entries of the FASTQ file for the input reads and make
 378 # sure that each entry is mirrored by an entry either in the alignment
 379 # file or in the unaligned-read file.
 380 my ($READ1, $READ2);
 381 open $READ1, $read1_file;
 382 open $READ2, $read2_file;
 383 my $patid = 0;
 384 my $reads = 0;
 385 while(1) {
 386         my ($name1, $seq1, $qual1) = get_read($READ1);
 387         my ($name2, $seq2, $qual2) = get_read($READ2);
 388         if($name1 eq "") {
 389                 $name2 eq "" || die;
 390                 last;
 391         }
 392         $reads++;
 393         my @ls1 = split(/\//, $name1);
 394         my @ls2 = split(/\//, $name2);
 395         $#ls1 == $#ls2 || die "Differently-formatted names: $name1, $name2";
 396         $ls1[0] eq $ls2[0] || die "Different names for paired alignments: $ls1[0], $ls2[0]";
 397         my $name = $ls1[0];
 398         if(defined($hits_hash{$name})) {
 399                 my $alseq1 = $seq1;
 400                 my $alseq2 = $seq2;
 401                 if($C && $colCseq) {
 402                         $alseq1 = nucencode($seq1);
 403                         $alseq2 = nucencode($seq2);
 404                 }
 405                 if(!$C || $colCseq) {
 406                         $hits_hash{$name}{seq1} eq $alseq1 ||
 407                                 die "Read $name in alignment file has different _1 sequence than in ".
 408                                     "input read file.\nAlgn: \"$hits_hash{$name}{seq1}\"\nInput: \"$alseq1\"";
 409                 }
 410                 # Qualities can be legitimately different
 411                 #$hits_hash{$name}{qual1} eq $qual1 ||
 412                 #       die "Read $name in alignment file has different _1 quals than in ".
 413                 #           "input read file.\nAlgn: $hits_hash{$name}{qual1}\nInput: $qual1";
 414                 if(!$C || $colCseq) {
 415                         $hits_hash{$name}{seq2} eq $alseq2 ||
 416                                 die "Read $name in alignment file has different _2 sequence than in ".
 417                                     "input read file.\nAlgn: \"$hits_hash{$name}{seq2}\"\nInput: \"$alseq2\"";
 418                 }
 419                 #$hits_hash{$name}{qual2} eq $qual2 ||
 420                 #       die "Read $name in alignment file has different _2 quals than in ".
 421                 #           "input read file.\nAlgn: $hits_hash{$name}{qual2}\nInput: $seq2";
 422         }
 423         elsif(defined($un_hash{$name})) {
 424                 $un_hash{$name}{seq1} eq $seq1 ||
 425                         die "Read $name in --un file has different _1 sequence than in ".
 426                             "input read file.\nAlgn: $un_hash{$name}{seq1}\nInput: $seq1";
 427                 $un_hash{$name}{qual1} eq $qual1 ||
 428                         die "Read $name in --un file has different _1 quals than in ".
 429                             "input read file.\nAlgn: $un_hash{$name}{qual1}\nInput: $qual1";
 430                 $un_hash{$name}{seq2} eq $seq2 ||
 431                         die "Read $name in --un file has different _2 sequence than in ".
 432                             "input read file.\nAlgn: $un_hash{$name}{seq2}\nInput: $seq2";
 433                 $un_hash{$name}{qual2} eq $qual2 ||
 434                         die "Read $name in --un file has different _2 quals than in ".
 435                             "input read file.\nAlgn: $un_hash{$name}{qual2}\nInput: $seq2";
 436         }
 437         elsif(defined($max_hash{$name})) {
 438                 $max_hash{$name}{seq1} eq $seq1 ||
 439                         die "Read $name in --max file has different _1 sequence than in ".
 440                             "input read file.\nAlgn: $max_hash{$name}{seq1}\nInput: $seq1";
 441                 $max_hash{$name}{qual1} eq $qual1 ||
 442                         die "Read $name in --max file has different _1 quals than in ".
 443                             "input read file.\nAlgn: $max_hash{$name}{qual1}\nInput: $qual1";
 444                 $max_hash{$name}{seq2} eq $seq2 ||
 445                         die "Read $name in --max file has different _2 sequence than in ".
 446                             "input read file.\nAlgn: $max_hash{$name}{seq2}\nInput: $seq2";
 447                 $max_hash{$name}{qual2} eq $qual2 ||
 448                         die "Read $name in --max file has different _2 quals than in ".
 449                             "input read file.\nAlgn: $max_hash{$name}{qual2}\nInput: $seq2";
 450         }
 451         else {
 452                 die "Read $name does not appear in hits, --un or ---max file";
 453         }
 454
 455         if(defined($al_hash{$name})) {
 456                 $al_hash{$name}{seq1} eq $seq1 ||
 457                         die "Read $name in --al file has different _1 sequence than in ".
 458                             "input read file.\nAlgn: $al_hash{$name}{seq1}\nInput: $seq1";
 459                 $al_hash{$name}{qual1} eq $qual1 ||
 460                         die "Read $name in --al file has different _1 quals than in ".
 461                             "input read file.\nAlgn: $al_hash{$name}{qual1}\nInput: $qual1";
 462                 $al_hash{$name}{seq2} eq $seq2 ||
 463                         die "Read $name in --al file has different _2 sequence than in ".
 464                             "input read file.\nAlgn: $al_hash{$name}{seq2}\nInput: $seq2";
 465                 $al_hash{$name}{qual2} eq $qual2 ||
 466                         die "Read $name in --al file has different _2 quals than in ".
 467                             "input read file.\nAlgn: $al_hash{$name}{qual2}\nInput: $seq2";
 468         }
 469
 470         $patid++;
 471         last if $patid == $u;
 472 }
 473 close($READ1);
 474 close($READ2);
 475
 476 if($al1_file ne "") {
 477         $als == $distinctHits || die "number of --al $als does not match number of distinct hits $distinctHits";
 478 }
 479 $distinctHits + $uns + $maxs == $reads ||
 480         die "distinct hits $distinctHits, --un $uns, --max $maxs doesn't add to reads $reads";
 481
 482 print "PASSED; processed $hits hits, $reads reads, $uns --un, $maxs --max, $als --al\n";