converts the HTSW align format (derived from Eland format) into ENCODE 'TagAlign...
author Rami Rauch <rrauch@stanford.edu>
Wed, 19 Nov 2008 19:01:53 +0000 (19:01 +0000)
committer Rami Rauch <rrauch@stanford.edu>
Wed, 19 Nov 2008 19:01:53 +0000 (19:01 +0000)
htswanalysis/scripts/align2TagAlign.pm [new file with mode: 0755]

diff --git a/htswanalysis/scripts/align2TagAlign.pm b/htswanalysis/scripts/align2TagAlign.pm
new file mode 100755 (executable)
index 0000000..9bf60b1
--- /dev/null
@@ -0,0 +1,60 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+# Transform Align.txt format, example: TTTTTCTTTCTTTTCTCTCTTTCTT 12500 1 chr9:19863256 F TTTTTCTTTTCTTTCTCTCTTTCTT 11453
+# to
+# ENCODE "TagAlign" format: chrom | chromStart | chromEnd | Sequence | Score | Strand (+/-)|
+# See online documentation of ENCODE data submission formats at http://encodewiki.ucsc.edu/EncodeDCC/index.php/File_Formats
+#
+# Usage: align2TagAlign.pm <align file>    (output is written to "<align file>.TagAlign")
+# Only lines with exactly seven whitespace-separated fields whose fourth field looks
+# like "chr...:<digits>" are converted; every other line is skipped.
+# NOTE(review): chromStart is written exactly as read from the input - confirm whether
+# the input position is already the 0-based start that BED-style formats expect.
+
+# Alignment score => mismatch count written to the Score column (unknown scores => 0).
+# A plain hash replaces the Switch module, a source filter removed from core in Perl 5.14.
+my %mismatches_for = (12500 => 0, 11453 => 1, 10406 => 2);
+
+@ARGV or die "Usage: $0 <align file>\n";
+my $in_name  = $ARGV[0];
+my $out_name = "$in_name.TagAlign";
+
+# Three-argument open: the file name can never be parsed as an open mode or a pipe.
+open(my $in,  '<', $in_name)  or die "Can't open file $in_name: $!";
+open(my $out, '>', $out_name) or die "Can't open output file $out_name: $!";
+
+my $i = 0;    # number of records written to the output file
+while (my $line = <$in>)
+{
+  chomp $line;
+  my @fields = split(/\s+/, $line);
+  next unless @fields == 7;    # field count is a number, so compare numerically
+
+  my ($sbjseq, $score, $reps, $chr_pos, $strand, $genseq, $score2) = @fields;
+  next unless $chr_pos =~ /^chr/;
+
+  my ($chr, $pos) = split(/:/, $chr_pos);
+  # Without a numeric position the end coordinate cannot be computed.
+  next unless defined $pos && $pos =~ /^\d+$/;
+  my $end = $pos + length($sbjseq);
+  my $mismatches = exists $mismatches_for{$score} ? $mismatches_for{$score} : 0;
+
+  $strand =~ s/F/+/i;
+  $strand =~ s/R/-/i;
+
+  print {$out} "$chr\t$pos\t$end\t$sbjseq\t$mismatches\t$strand\n";
+  $i++;
+}
+
+close $in;
+# Buffered write errors (e.g. a full disk) only surface when the output is closed.
+close $out or die "Can't close output file $out_name: $!";
+print "\n==== F I N I S H E D processing $i records ===========";
+print "\n==== INPUT FILE NAME: $in_name  ===============";
+print "\n==== OUTPUT FILE NAME: $out_name  ===============\n";
+
+exit;
+