7 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8 # Transform QuEST peakcalling .out file to naarowPeak format (see doc at http://genomewiki.ucsc.edu/EncodeDCC/index.php/File_Formats)
9 # Input rec example: chr8 w: 433 locmax: 5.68905 at: 126511693 nulls: 0.145256 ef: 27.6411204454239
11 # locmax:is the absolute height of the peak. it does not correspond to the actual tag reads, because not all tags contribute equally to the peak determination.
12 # at: + 1 is the start position (to adjust for 0 base open, which means start at 1)
13 # at: + 2is the end position.
14 # nulls: is the control value
15 # ef: is the foldof enrichment at that peak, one base
16 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
18 # 1. Format into Tab delimited and remove field names inside records
20 my $track_name = $ARGV[1] || "QuEST Peak Calling";
22 ##############################################################################
23 # 2. Find EF Max and nulls MIN
24 my $INTFNAME = $ARGV[0].'.INT';
25 open (IFILE, "< $ARGV[0]") or die "Can't open file $ARGV[0]";
26 open (INTFILE, "> $INTFNAME") or die "Can't open INTERMIDIATE output file";
30 my $delimit = '[\t\s]+';
36 my $max=-999; # the max ef value
37 my $min_ctrl=999; # the minimum background value
39 ## print INTFILE "chr\tstart\tend\tname\tef\tlog(ef)\tscore(=rand(log(ef))\n";
45 next if($_ =~ /^#/ || $_ =~ /^$/);
46 ## chr8 w: 433 locmax: 5.68905 at: 126511693 nulls: 0.145256 ef: 27.6411204454239
49 # print "\nRead record $i: $_";
50 if(!defined($_)) { $i++; print "\nRecord $i not defined."; next; }
52 my($chr,$x1,$w,$x2,$locmax,$x3,$start,$x4,$nulls,$x5,$ef) = split(/ /,$_);
53 $start++; # to adjust for 0 base open
54 # apply arbitrary range of 50 bases around the peak
56 $end = $start + $seqwindow;
63 $score = log($ef)/log(2);
64 if($max<$score){$max=$score;}
65 if($min_ctrl>$nulls){$min_ctrl=$nulls;}
68 $INTrec = "$chr\t$start\t$end\t$name\t$locmax\t$ef\t$logef\t$score";
69 print INTFILE "$INTrec\n";
73 print "\n ==== Max EF = $max ===\n";
75 #################################################
77 ##### Generate 0-1000 score and the narrowPeak score
78 my $NPFNAME = $ARGV[0].'.narrowPeak';
79 open (IFILE, "< $INTFNAME") or die "Can't open file $INTFNAME";
80 open (NPFILE, "> $NPFNAME") or die "Can't open NARROWPEAK output file $NPFNAME";
83 my $bigscore = 0; # 0-1000 score
89 if(!defined($_)) { $b++; print "\nRecord $b not defined."; next; }
92 my($chr,$start,$end,$name,$locmax,$ef,$logef,$score) = split(/$delimit/,$_);
96 if($graphscore =~ /^NA$/) { $graphscore = log($locmax/$min_ctrl)/log(2);}
98 if($graphscore > $max){$max = $graphscore;}
99 $bigscore = int( ($graphscore/$max)*1000 );
101 ##$BEDrec = "$chr\t$start\t$end\t$name\t$bigscore";
102 $NPrec = sprintf("%s\t%d\t%d\t%s\t%d\t%s\t%0.3f\t%s\t%d",$chr,$start,$end,$name,$bigscore,'.',$graphscore,'-1',$seqwindow); ## DBG: ,$bigscore,$score,$locmax,$ef);
103 print NPFILE "$NPrec\n";
109 print "\n==== F I N I S H E D. Created NARROWPEAK file from QuEST peaks output file, processing $i records ===============\n";
110 print "\n ==== Max EF(After clac NA values) = $max ===\n";
111 print "\n ==== Min Background = $min_ctrl ===\n";
112 print "\n( Temp file is $INTFNAME )";
113 print "\n\nOUTPUT file name is $NPFNAME\n";
115 if($r != $b){ print "\n!!! WARNING !!! SOMETHING'S WRONG: different number of records processed into NARROWPEAK ($b) is different from number of records in INPUT file ($r).\n";}