From: Rami Rauch Date: Mon, 22 Sep 2008 22:33:19 +0000 (+0000) Subject: converts QuEST peakcalls output file into narrowPeak ENCODE format X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=c7cf17f07df74015e0703cf3469470ec86d5a9d6 converts QuEST peakcalls output file into narrowPeak ENCODE format --- diff --git a/htswanalysis/scripts/QuEST_2_narrowPeak.pm b/htswanalysis/scripts/QuEST_2_narrowPeak.pm new file mode 100755 index 0000000..3545317 --- /dev/null +++ b/htswanalysis/scripts/QuEST_2_narrowPeak.pm @@ -0,0 +1,118 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + + +##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Transform QuEST peakcalling .out file to naarowPeak format (see doc at http://genomewiki.ucsc.edu/EncodeDCC/index.php/File_Formats) +# Input rec example: chr8 w: 433 locmax: 5.68905 at: 126511693 nulls: 0.145256 ef: 27.6411204454239 +# w: we don't need. +# locmax:is the absolute height of the peak. it does not correspond to the actual tag reads, because not all tags contribute equally to the peak determination. +# at: + 1 is the start position (to adjust for 0 base open, which means start at 1) +# at: + 2is the end position. +# nulls: is the control value +# ef: is the foldof enrichment at that peak, one base +##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# 1. Format into Tab delimited and remove field names inside records + +my $track_name = $ARGV[1] || "QuEST Peak Calling"; + +############################################################################## +# 2. Find EF Max and nulls MIN +my $INTFNAME = $ARGV[0].'.INT'; +open (IFILE, "< $ARGV[0]") or die "Can't open file $ARGV[0]"; +open (INTFILE, "> $INTFNAME") or die "Can't open INTERMIDIATE output file"; + +## vars +my $i = 0; +my $delimit = '[\t\s]+'; +my $name = ''; +my $score = 0; +my $end = 0; +my $logef = 0; +my $INTrec = ''; +my $max=-999; # the max ef value +my $min_ctrl=999; # the minimum background value +my $seqwindow = 50; +## print INTFILE "chr\tstart\tend\tname\tef\tlog(ef)\tscore(=rand(log(ef))\n"; +my $r=0; + +while() +{ + chomp; + next if($_ =~ /^#/ || $_ =~ /^$/); + ## chr8 w: 433 locmax: 5.68905 at: 126511693 nulls: 0.145256 ef: 27.6411204454239 + $r++; + $i++; + # print "\nRead record $i: $_"; + if(!defined($_)) { $i++; print "\nRecord $i not defined."; next; } + $INTrec=''; + my($chr,$x1,$w,$x2,$locmax,$x3,$start,$x4,$nulls,$x5,$ef) = split(/ /,$_); + $start++; # to adjust for 0 base open + # apply arbitrary range of 50 bases around the peak + + $end = $start + $seqwindow; + $start -= $seqwindow; + + $name = "peak$i"; + if($ef eq "NA") { + $score="NA"; + } else { + $score = log($ef)/log(2); + if($max<$score){$max=$score;} + if($min_ctrl>$nulls){$min_ctrl=$nulls;} + } + + $INTrec = "$chr\t$start\t$end\t$name\t$locmax\t$ef\t$logef\t$score"; + print INTFILE "$INTrec\n"; +} +close IFILE; +close INTFILE; +print "\n ==== Max EF = $max ===\n"; + +################################################# + +##### Generate 0-1000 score and the narrowPeak score +my $NPFNAME = $ARGV[0].'.narrowPeak'; +open (IFILE, "< $INTFNAME") or die "Can't open file $INTFNAME"; +open (NPFILE, "> $NPFNAME") or die "Can't open NARROWPEAK output file $NPFNAME"; +my $b=0; +my $NPrec = ''; +my $bigscore = 0; # 0-1000 score +my $graphscore = 0; + +while() +{ + $b++; + if(!defined($_)) { $b++; print "\nRecord $b not defined."; next; } + chomp; + $NPrec = ''; + my($chr,$start,$end,$name,$locmax,$ef,$logef,$score) = split(/$delimit/,$_); + $bigscore = $score; + $graphscore = $score; + + if($graphscore =~ /^NA$/) { $graphscore = log($locmax/$min_ctrl)/log(2);} + + if($graphscore > $max){$max = $graphscore;} + $bigscore = int( ($graphscore/$max)*1000 ); + + ##$BEDrec = "$chr\t$start\t$end\t$name\t$bigscore"; + $NPrec = sprintf("%s\t%d\t%d\t%s\t%d\t%s\t%0.3f\t%s\t%d",$chr,$start,$end,$name,$bigscore,'.',$graphscore,'-1',$seqwindow); ## DBG: ,$bigscore,$score,$locmax,$ef); + print NPFILE "$NPrec\n"; +} + +close NPFILE; +close IFILE; + +print "\n==== F I N I S H E D. Created NARROWPEAK file from QuEST peaks output file, processing $i records ===============\n"; +print "\n ==== Max EF(After clac NA values) = $max ===\n"; +print "\n ==== Min Background = $min_ctrl ===\n"; +print "\n( Temp file is $INTFNAME )"; +print "\n\nOUTPUT file name is $NPFNAME\n"; + +if($r != $b){ print "\n!!! WARNING !!! SOMETHING'S WRONG: different number of records processed into NARROWPEAK ($b) is different from number of records in INPUT file ($r).\n";} + +exit; +