X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=misc%2Fexport2sam.pl;h=ec6dacfec4dbd20cbf0e9c4ae3513c2fb4216789;hp=8e3e2800c3ac82cf9d3d3581b02768ddfd9a16f2;hb=HEAD;hpb=b27e00385f41769d03a8cca4dbd71275fc9fa906 diff --git a/misc/export2sam.pl b/misc/export2sam.pl index 8e3e280..ec6dacf 100755 --- a/misc/export2sam.pl +++ b/misc/export2sam.pl @@ -1,107 +1,545 @@ -#!/usr/bin/perl -w - -# Contact: lh3 -# Version: 0.1.2 (03JAN2009) - -use strict; -use warnings; -use Getopt::Std; - -&export2sam; -exit; - -sub export2sam { - my ($fh1, $fh2, $is_paired); - $is_paired = (@ARGV >= 2); - die("export2sam.pl []\n") if (@ARGV == 0); - open($fh1, $ARGV[0]) || die; - if ($is_paired) { - open($fh2, $ARGV[1]) || die; - } - # conversion table - my @conv_table; - for (-64..64) { - $conv_table[$_+64] = chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499)); - } - # core loop - while (<$fh1>) { - my (@s1, @s2); - &export2sam_aux($_, \@s1, \@conv_table, $is_paired); - if ($is_paired) { - $_ = <$fh2>; - &export2sam_aux($_, \@s2, \@conv_table, $is_paired); - if (@s1 && @s2) { # then set mate coordinate - my $isize = 0; - if ($s1[2] ne '*' && $s1[2] eq $s2[2]) { # then calculate $isize - my $x1 = ($s1[1] & 0x10)? $s1[3] + length($s1[9]) : $s1[3]; - my $x2 = ($s2[1] & 0x10)? $s2[3] + length($s2[9]) : $s2[3]; - $isize = $x2 - $x1; - } - # update mate coordinate - if ($s2[2] ne '*') { - @s1[6..8] = (($s2[2] eq $s1[2])? "=" : $s2[2], $s2[3], $isize); - $s1[1] |= 0x20 if ($s2[1] & 0x10); - } else { - $s1[1] |= 0x8; - } - if ($s1[2] ne '*') { - @s2[6..8] = (($s1[2] eq $s2[2])? "=" : $s1[2], $s1[3], -$isize); - $s2[1] |= 0x20 if ($s1[1] & 0x10); - } else { - $s2[1] |= 0x8; - } - } - } - print join("\t", @s1), "\n" if (@s1); - print join("\t", @s2), "\n" if (@s2 && $is_paired); - } - close($fh1); - close($fh2) if ($is_paired); -} - -sub export2sam_aux { - my ($line, $s, $ct, $is_paired) = @_; - chomp($line); - my @t = split("\t", $line); - @$s = (); - return if ($t[21] ne 'Y'); - # read name - $s->[0] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]"; - # initial flag (will be updated later) - $s->[1] = 0; - $s->[1] |= 1 | 1<<(5 + $t[7]) if ($is_paired); - # read & quality - $s->[9] = $t[8]; $s->[10] = $t[9]; - if ($t[13] eq 'R') { # then reverse the sequence and quality - $s->[9] = reverse($t[8]); - $s->[9] =~ tr/ACGTacgt/TGCAtgca/; - $s->[10] = reverse($t[9]); - } - $s->[10] =~ s/(.)/$ct->[ord($1)]/eg; # change coding - # cigar - $s->[5] = length($s->[9]) . "M"; - # coor - my $has_coor = 0; - $s->[2] = "*"; - if ($t[10] eq 'NM' || $t[10] eq 'QC') { - $s->[1] |= 0x4; # unmapped - } elsif ($t[10] =~ /(\d+):(\d+):(\d+)/) { - $s->[1] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? - push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") - } else { - $s->[2] = $t[10]; - $has_coor = 1; - } - $s->[3] = $has_coor? $t[12] : 0; - $s->[1] |= 0x10 if ($has_coor && $t[13] eq 'R'); - # mapQ (TODO: should I choose the larger between $t[15] and $t[16]?) - $s->[4] = 0; - $s->[4] = $t[15] if ($t[15] ne ''); - $s->[4] = $t[16] if ($t[16] ne '' && $s->[4] < $t[16]); - # mate coordinate - $s->[6] = '*'; $s->[7] = $s->[8] = 0; - # aux - push(@$s, "BC:Z:$t[6]") if ($t[6]); - push(@$s, "MD:Z:$t[14]") if ($has_coor); - push(@$s, "SM:i:$t[15]") if ($is_paired && $has_coor); -} +#!/usr/bin/env perl +# +# +# export2sam.pl converts GERALD export files to SAM format. +# +# +# +########## License: +# +# The MIT License +# +# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. +# Modified SAMtools work copyright (c) 2010 Illumina, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# +# +# +########## ChangeLog: +# +# Version: 2.3.1 (18MAR2011) +# +# - Restore file '-' as stdin input. +# +# Version: 2.3.0 (24JAN2011) +# +# - Add support for export reserved chromosome name "CONTROL", +# which is translated to optional field "XC:Z:CONTROL". +# - Check for ".gz" file extension on export files and open +# these as gzip pipes when the extension is found. +# +# Version: 2.2.0 (16NOV2010) +# +# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y +# - For export records with reserved chromosome name identifiers +# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM" +# to the SAM record, so that these cases can be distinguished +# from other unmatched reads. +# +# Version: 2.1.0 (21SEP2010) +# +# - Additional export record error checking. +# - Convert export records with chromomsome value of "RM" to unmapped +# SAM records. +# +# Version: 2.0.0 (15FEB2010) +# +# Script updated by Illumina in conjunction with CASAVA 1.7.0 +# release. +# +# Major changes are as follows: +# - The CIGAR string has been updated to include all gaps from +# ELANDv2 alignments. +# - The ELAND single read alignment score is always stored in the +# optional "SM" field and the ELAND paired read alignment score +# is stored in the optional "AS" field when it exists. +# - The MAPQ value is set to the higher of the two alignment scores, +# but no greater than 254, i.e. min(254,max(SM,AS)) +# - The SAM "proper pair" bit (0x0002) is now set for read pairs +# meeting ELAND's expected orientation and insert size criteria. +# - The default quality score translation is set for export files +# which contain Phread+64 quality values. An option, +# "--qlogodds", has been added to translate quality values from +# the Solexa+64 format used in export files prior to Pipeline +# 1.3 +# - The export match descriptor is now reverse-complemented when +# necessary such that it always corresponds to the forward +# strand of the reference, to be consistent with other +# information in the SAM record. It is now written to the +# optional 'XD' field (rather than 'MD') to acknowledge its +# minor differences from the samtools match descriptor (see +# additional detail below). +# - An option, "--nofilter", has been added to include reads which +# have failed primary analysis quality filtration. Such reads +# will have the corresponding SAM flag bit (0x0200) set. +# - Labels in the export 'contig' field are preserved by setting +# RNAME to "$export_chromosome/$export_contig" when the contig +# label exists. +# +# +# Contact: lh3 +# Version: 0.1.2 (03JAN2009) +# +# +# +########## Known Conversion Limitations: +# +# - Export records for reads that map to a position < 1 (allowed +# in export format), are converted to unmapped reads in the SAM +# record. +# - Export records contain the reserved chromosome names: "NM", +# "QC","RM" and "CONTROL". "NM" indicates that the aligner could +# not map the read to the reference sequence set. "QC" means that +# the aligner did not attempt to map the read due to some +# technical limitation. "RM" means that the read mapped to a set +# of 'contaminant' sequences specified in GERALD's RNA-seq +# workflow. "CONTROL" means that the read is a control. All of +# these alignment types are collapsed to the single unmapped +# alignment state in the SAM record, but the optional SAM "XC" +# field is used to record the original reserved chromosome name of +# the read for all but the "NM" case. +# - The export match descriptor is slightly different than the +# samtools match descriptor. For this reason it is stored in the +# optional SAM field 'XD' (and not 'MD'). Note that the export +# match descriptor differs from the samtools version in two +# respects: (1) indels are explicitly closed with the '$' +# character and (2) insertions must be enumerated in the match +# descriptor. For example a 35-base read with a two-base insertion +# is described as: 20^2$14 +# +# +# + +my $version = "2.3.1"; + +use strict; +use warnings; + +use Getopt::Long; +use File::Spec; +use List::Util qw(min max); + + +use constant { + EXPORT_MACHINE => 0, + EXPORT_RUNNO => 1, + EXPORT_LANE => 2, + EXPORT_TILE => 3, + EXPORT_X => 4, + EXPORT_Y => 5, + EXPORT_INDEX => 6, + EXPORT_READNO => 7, + EXPORT_READ => 8, + EXPORT_QUAL => 9, + EXPORT_CHROM => 10, + EXPORT_CONTIG => 11, + EXPORT_POS => 12, + EXPORT_STRAND => 13, + EXPORT_MD => 14, + EXPORT_SEMAP => 15, + EXPORT_PEMAP => 16, + EXPORT_PASSFILT => 21, + EXPORT_SIZE => 22, +}; + + +use constant { + SAM_QNAME => 0, + SAM_FLAG => 1, + SAM_RNAME => 2, + SAM_POS => 3, + SAM_MAPQ => 4, + SAM_CIGAR => 5, + SAM_MRNM => 6, + SAM_MPOS => 7, + SAM_ISIZE => 8, + SAM_SEQ => 9, + SAM_QUAL => 10, +}; + + +# function prototypes for Richard's code +sub match_desc_to_cigar($); +sub match_desc_frag_length($); +sub reverse_compl_match_descriptor($); +sub write_header($;$;$); + + +&export2sam; +exit; + + + + +sub export2sam { + + my $cmdline = $0 . " " . join(" ",@ARGV); + my $arg_count = scalar @ARGV; + my $progname = (File::Spec->splitpath($0))[2]; + + my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values + my $is_nofilter = 0; + my $read1file; + my $read2file; + my $print_version = 0; + my $help = 0; + + my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, + "nofilter" => \$is_nofilter, + "read1=s" => \$read1file, + "read2=s" => \$read2file, + "version" => \$print_version, + "help" => \$help ); + + my $usage = <) { + $export_line_count++; + my (@s1, @s2); + &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); + if ($is_paired) { + my $read2line = <$fh2>; + if(not $read2line){ + die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); + } + &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); + + if (@s1 && @s2) { # then set mate coordinate + if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ + die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); + } + + my $isize = 0; + if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize + my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; + my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; + $isize = $x2 - $x1; + } + + foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ + my ($sa,$sb,$is) = @{$_}; + if ($sb->[SAM_RNAME] ne '*') { + $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME]; + $sa->[SAM_MPOS] = $sb->[SAM_POS]; + $sa->[SAM_ISIZE] = $is; + $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); + } else { + $sa->[SAM_FLAG] |= 0x8; + } + } + } + } + print join("\t", @s1), "\n" if (@s1); + print join("\t", @s2), "\n" if (@s2 && $is_paired); + } + close($fh1); + if($is_paired) { + while(my $read2line = <$fh2>){ + $export_line_count++; + die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); + } + close($fh2); + } +} + +sub export2sam_aux { + my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; + chomp($line); + my @t = split("\t", $line); + if(scalar(@t) < EXPORT_SIZE) { + my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n"; + $msg.="\t...erroneous export record:\n" . $line . "\n\n"; + die($msg); + } + @$s = (); + my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); + return if(not ($isPassFilt or $is_nofilter)); + # read name + my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : ""); + $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]), + int($t[EXPORT_X]), int($t[EXPORT_Y])); + # initial flag (will be updated later) + $s->[SAM_FLAG] = 0; + if($is_paired) { + if($t[EXPORT_READNO] != $read_no){ + die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n"); + } + $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); + } + $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); + + # read & quality + my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); + if ($is_export_rev) { # then reverse the sequence and quality + $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); + $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; + $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); + } else { + $s->[SAM_SEQ] = $t[EXPORT_READ]; + $s->[SAM_QUAL] = $t[EXPORT_QUAL]; + } + my @convqual = (); + foreach (unpack('C*', $s->[SAM_QUAL])){ + my $val=$ct->[$_]; + if(not defined $val){ + my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; + if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } + die($msg . "\n"); + } + push @convqual,$val; + } + + $s->[SAM_QUAL] = pack('C*',@convqual); # change coding + + + # coor + my $has_coor = 0; + $s->[SAM_RNAME] = "*"; + if (($t[EXPORT_CHROM] eq 'NM') or + ($t[EXPORT_CHROM] eq 'QC') or + ($t[EXPORT_CHROM] eq 'RM') or + ($t[EXPORT_CHROM] eq 'CONTROL')) { + $s->[SAM_FLAG] |= 0x4; # unmapped + push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM'); + } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { + $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? + push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") + } elsif ($t[EXPORT_POS] < 1) { + $s->[SAM_FLAG] |= 0x4; # unmapped + } else { + $s->[SAM_RNAME] = $t[EXPORT_CHROM]; + $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); + $has_coor = 1; + } + $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; + +# print STDERR "t[14] = " . $t[14] . "\n"; + my $matchDesc = ''; + $s->[SAM_CIGAR] = "*"; + if($has_coor){ + $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; + + if($matchDesc =~ /\^/){ + # construct CIGAR string using Richard's function + $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing + } else { + $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; + } + } + +# print STDERR "cigar_string = $cigar_string\n"; + + $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); + if($has_coor){ + my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; + my $pemap = 0; + if($is_paired) { + $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; + + # set `proper pair' bit if non-blank, non-zero PE alignment score: + $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); + } + $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); + } else { + $s->[SAM_MAPQ] = 0; + } + # mate coordinate + $s->[SAM_MRNM] = '*'; + $s->[SAM_MPOS] = 0; + $s->[SAM_ISIZE] = 0; + # aux + push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); + if($has_coor){ + # The export match descriptor differs slightly from the samtools match descriptor. + # In order for the converted SAM files to be as compliant as possible, + # we put the export match descriptor in optional field 'XD' rather than 'MD': + push(@$s, "XD:Z:$matchDesc"); + push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); + push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); + } +} + + + +# +# the following code is taken from Richard Shaw's sorted2sam.pl file +# +sub reverse_compl_match_descriptor($) +{ +# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; + my ($match_desc) = @_; + my $rev_compl_match_desc = reverse($match_desc); + $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; + + # Unreverse the digits of numbers. + $rev_compl_match_desc = join('', + map {($_ =~ /\d+/) + ? join('', reverse(split('', $_))) + : $_} split(/(\d+)/, + $rev_compl_match_desc)); + + return $rev_compl_match_desc; +} + + + +sub match_desc_to_cigar($) +{ + my ($match_desc) = @_; + + my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); + my $cigar_str = ''; + my $cigar_del_ch = 'D'; + my $cigar_ins_ch = 'I'; + my $cigar_match_ch = 'M'; + + foreach my $match_desc_part (@match_desc_parts) { + next if (!$match_desc_part); + + if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { + # Deletion + $cigar_str .= (length($1) . $cigar_del_ch); + } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { + # Insertion + $cigar_str .= ($1 . $cigar_ins_ch); + } else { + $cigar_str .= (match_desc_frag_length($match_desc_part) + . $cigar_match_ch); + } + } + + return $cigar_str; +} + + +#------------------------------------------------------------------------------ + +sub match_desc_frag_length($) + { + my ($match_desc_str) = @_; + my $len = 0; + + my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); + + foreach my $match_desc_field (@match_desc_fields) { + next if ($match_desc_field eq ''); + + $len += (($match_desc_field =~ /(\d+)/) + ? $1 : length($match_desc_field)); + } + + return $len; +} + + +# argument holds the command line +sub write_header($;$;$) +{ + my ($progname,$version,$cl) = @_; + my $complete_header = ""; + $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; + + return $complete_header; +}