7 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 # Transform MACS peakcalling output to broadPeak format (see doc at http://genomewiki.ucsc.edu/EncodeDCC/index.php/File_Formats)
10 # Input can be BED file like: chr11 35886059 35886527 3228.29 (chr number | start pos | end pos | score)
11 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
14 open (IFILE, "< $ARGV[0]") or die "Can't open INPUT file $ARGV[0]";
17 my $delimit = '[\t\s]+';
23 my $max=-99999; # the max value
28 next if($_ =~ /^#/ || $_ =~ /^$/);
30 # print "\nRead record $i: $_";
31 if(!defined($_)) { $i++; print "\nRecord $i not defined."; next; }
33 my($chr,$start,$end,$signalValue) = split(/$delimit/,$_);
36 if($max<$signalValue){$max=$signalValue;}
39 ##print "\n ==== Max signalValue = $max ===\n";
41 #################################################
43 ##### Generate 0-1000 score.
44 my $BPFNAME = $ARGV[0].'.broadPeak';
45 open (IFILE, "< $ARGV[0]") or die "Can't open INPUT file $ARGV[0]";
46 open (BPFILE, "> $BPFNAME") or die "Can't open BROADPEAK output file $BPFNAME";
48 my $bigscore = 0; # 0-1000 score
54 if(!defined($_)) { $i++; print "\nRecord $i not defined."; next; }
58 my($chr,$start,$end,$signalValue) = split(/$delimit/,$_);
63 $bigscore = int( ($signalValue/$max)*1000 );
64 $rec = sprintf("%s\t%d\t%d\t%s\t%d\t%s\t%0.3f\t%s\t%s",$chr,$start,$end,$name,$bigscore,'.',$signalValue,'-1','-1');
65 print BPFILE "$rec\n";
69 print "\n!! Invalid line ($i) skipped: $_\n";
76 print "\n==== F I N I S H E D. Created BROADPEAK file ($BPFNAME) from MACS BED file=====\n";
77 print "\nMax signalValue = $max (corresponding to 1000)";
78 print "\nTotal processed records = $i";
79 print "\nNumber of valid peak records = $p";
85 #Taken from http://encodewiki.ucsc.edu/EncodeDCC/index.php/File_Formats#broadPeak:_Broad_Peaks_.28or_Regions.29_Format ON JUN/26/09
86 #broadPeak: Broad Peaks (or Regions) Format
87 #This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data. It is a BED 6+3 format.
88 # field type description
90 # chrom string Name of the chromosome.
91 # chromStart int Starting position on the chromosome.
92 # chromEnd int Ending position on the chromosome.
93 # name string Name given to a region (preferably unique). Use `.` if no name is assigned.
94 # score int Indicates how dark the peak will be displayed in the browser (1-1000). If `0`, the DCC will assign this based on signal value. Ideally average signalValue per base spread between 100-1000.
95 # strand char +/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
96 # signalValue float Measurement of overall (usually, average) enrichment for the region.
97 # pValue float Measurement of statistical signficance (-log10). Use -1 if no pValue is assigned.
98 # qValue float Measurement of statistical significance using false discovery rate. Use -1 if no qValue is assigned.