--- /dev/null
+ROOT = .
+SRC_DIR = $(ROOT)/src/
+MISC_DIR = $(ROOT)/misc/
+BIN_DIR = $(ROOT)/bin/
+#../misc_files/
+CC = g++
+#CCFLAGS = -Wall -ansi -pedantic -D DEBUG -O4 -funroll-loops -pthread
+INCLUDE = $(MISC_DIR)
+COPY = cp
+
+include config.mk
+
+LOCAL_TARGETS = generate_peak_profile peak_caller quick_window_scan calibrate_peak_shift
+
+LOCAL_EXECS = $(LOCAL_TARGETS)
+LOCAL_OBJECTS = $(LOCAL_TARGETS:=.o)
+OTHER_DEPENDENCIES = params utils seq_contig sequence_utility_functions
+OTHER_OBJECTS = $(OTHER_DEPENDENCIES:=.o)
+
+EXPANDED_OTHER_OBJECTS = $(addprefix $(MISC_DIR), $(OTHER_OBJECTS))
+
+default: all
+
+.PHONY: default all clean
+
+
+all: other_dependencies local_objects local_execs
+
+place:
+ mv $(LOCAL_TARGETS) $(BIN_DIR)
+
+local_execs: $(LOCAL_EXECS)
+
+local_objects:
+ cd $(SRC_DIR) && $(MAKE) $(LOCAL_OBJECTS)
+
+$(LOCAL_EXECS) : %: $(SRC_DIR)/%.o
+ echo $(INCLUDE)
+ $(CC) -I$(INCLUDE) $(CCFLAGS) $(SRC_DIR)$@.o $(EXPANDED_OTHER_OBJECTS) -o $@
+
+other_dependencies: other_objects
+
+other_objects:
+ cd $(MISC_DIR) && $(MAKE) $(OTHER_OBJECTS)
+
+clean:
+ rm -rf $(SRC_DIR)*.o
+ cd $(MISC_DIR) && $(MAKE) clean
+
+all_clean:
+ rm -rf $(SRC_DIR)*.o
+ cd $(MISC_DIR) && $(MAKE) clean
--- /dev/null
+current version: 0.3
+current development status: beta
+
+Disclosure:
+
+QuEST is in its beta state and is expected to run,
+but may contain minor bugs, glitches, and inconsistencies.
+If you discover any bugs or have any suggestions,
+please contact Anton Valouev (valouev@stanford.edu).
+
+QuEST is a statistical package that is designed to analyze ChIP-Seq data
+and to produce positions of ChIP enrichment.
+
+QuEST is free of charge. You can use it for any purpose you like
+and modify/redistribute it in any way you like without permission of the
+authors.
+
+If you use QuEST or any deriviatives of its results, as well as
+the QuEST code in part or in whole for any purpose, please cite
+in your publications, patents and code:
+
+Valouev A, Johnson DS, Sundquist A, Medina C, Anton E, Batzoglou S,
+Myers RM, Sidow A. A Statistical Framework for Genome-Wide
+Identification of Transcription Factor Binding Sites Based on ChIP-Seq
+Data. 2008 (under review)
+
+===============================================================
+
+Target OS:
+
+QuEST was tested on Linux ES with kernel 2.6.9, and Mac OS 10.5
+with Perl and g++ compiler. You will also need a fair amount
+of disc space (~20 Gb for a run on a human) to store the score files.
+
+===============================================================
+
+What you will need to run QuEST:
+
+1. ChIP data
+
+ Reads from the ChIP-Seq experiment.
+ Supported formats:
+ - Solexa align
+ - QuEST align
+
+ You will only need to specify one file.
+
+2. RX_noIP data
+
+ Reads from the RX_noIP experiment. This is also known as total chromatin or input.
+ Supported formats:
+ - Solexa align
+ - QuEST align
+
+ You will only need to specify one file.
+
+3. Reference genome or genome table
+
+ For the reference genome, there should be one contig per .fa file.
+ Alternatively, you can provide a genome table containing contig names
+ and their sizes (see below). Supported formats: fasta; genome table.
+ Note that you need either the analysis path or the genome table. But not
+ both, or else the script will not run.
+
+==============================================================
+
+How to run QuEST:
+
+1. Configure compilation flags
+
+ This is done by changing into the QuEST dirstribution directory and
+ typing "./configure.pl".
+
+2. Compile QuEST:
+
+ cd into the QuEST distribution directory and type "make", no extra
+ work is needed (you need to have g++ compiler).
+
+3. Configure QuEST to run with your data. This will generate all the necessary
+ configuration files/thresholds/etc:
+
+ command: generate_QuEST_parameters.pl
+
+ flags:
+ -solexa_align_ChIP <file containing ChIP reads in solexa align format (see below)>
+ -solexa_align_RX_noIP <file containing RX_noIP (aka input/total chromatin) reads
+ in solexa align format (see below)>
+ -rp <path to the reference genome containing .fa files>
+ -ap <analysis path where the QuEST results and all files will be located>
+
+ example:
+
+ $/home/agent_smith/QuEST/generate_QuEST_parameters.pl -solexa_align_ChIP /Volumes/Projects/GABP/GABP.align_25.hg18.txt -solexa_align_RX_noIP /Volumes/Projects/background/Jurkat_RX_noIP.align_25.hg18.txt -rp /Volumes/Projects/Genomes/Human/hg18/ -ap /Volumes/Projects/GABP/analysis/
+
+ flags:
+
+ to list flags call "generate_QuEST_parameters.pl"
+
+4. Run QuEST:
+
+ command: run_QuEST_with_param_file.pl
+
+ flags:
+ -p <QuEST batch param file>
+
+ example:
+
+ $/home/agent_smith/QuEST/run_QuEST_with_param_file.pl -p /Volumes/Projects/GABP/analysis/QuEST.batch.pars
+
+5. QuEST will output the following files of interest:
+
+ ChIP peak calls: peak_caller.ChIP.out
+
+ pseudo_ChIP peak calls: peak_caller.pseudo_ChIP.out
+
+ QuEST summary: QuEST.out
+
+===================================================================
+
+File formats:
+
+1. Read position files:
+
+ - QuEST align file:
+
+ These files contain five prime end coordinates and orientation
+ of reads (starting with 0) after mapping to the genome.
+
+ <contig> <coordinate> <orientation>
+
+ chr2 13839273 -
+
+ forward/reverse orientation is indicated by having + or - in the last field
+
+ - Solexa align file:
+
+ These files are outputs of Solexa analysis pipeline and can be used
+ directly as inputs to QuEST software. The file should contain one
+ alignment per line according to the following format:
+
+ <read> <NA> <NA> <contig:coordinate> <orientation>
+ AAAACCACACAA.CACAACACCCAA 1250 1 chr11:56352492 F
+
+ the rest of the fields can be anything you want (NA means the program doesn't
+ care about the value of these fields).
+
+2. Reference genome:
+
+ - FASTA files:
+
+ The program was tested with Human hg18 and D. melanogaster dm3 releases
+ downloaded from the UCSC Genome browser. Files are expected to contain
+ one contig per file. QuEST will convert the genome to a table file
+ (see below) which you can use for further QuEST runs. The table will
+ be saved in the analysis path.
+
+ - Genome Table file:
+
+ This file contains the description of genome one contig per line using
+ the following format:
+
+ <contig_name> <contig_size>
+ chr1 247249719
+
+3. QuEST peak calls:
+
+ - peak_caller.ChIP.out; peak_caller.pseudo_ChIP.out
+
+ Files containg peak calls for ChIP and pseudo_ChIP analysis
+ using the following format:
+
+ <contig> w: <width of the region of enrichment> locmax: <ChIP enrichment value> at: <position relative to the contig position 0> nulls: <background enrichment value> ef: <enrichment fold normalized for the sequencing depth>
+
+ chr2L w: 754 locmax: 8.81752 at: 195765587 nulls: 0.627292 ef: 162.382
+
+=============================================================
+
+Detailed description of individual QuEST modules and scripts will follow in
+later versions. Execute Perl scripts without parameters to list available options.
--- /dev/null
+CCFLAGS = -Wall -ansi -pedantic -D DEBUG -O4 -funroll-loops -m64 -pthread
--- /dev/null
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $makefile_fname = "./Makefile";
+my $misc_path = "./misc/";
+my $misc_makefile_fname = $misc_path . "Makefile";
+my $config_out_fname = "./config.mk";
+
+my $uname_out = `uname -a`;
+
+my @uname_out_fields = split(/ /, $uname_out);
+
+my $OS = $uname_out_fields[0];
+my $Darwin_ver = "";
+if(scalar(@uname_out_fields) >= 3){
+ my @Darwin_ver_fields = split(/\./, $uname_out_fields[2]);
+ if(scalar(@Darwin_ver_fields) >= 3){
+ $Darwin_ver = $Darwin_ver_fields[0];
+ }
+}
+
+print "Testing OS. OS = $OS.\n";
+print "Testing compiler."; $|++;
+my $gpp_out = `which g++`;
+chomp($gpp_out);
+
+if( $gpp_out eq ""){
+ print "Error: g++ compiler not found.\n";
+}
+else{
+ print " Ok.\n";
+}
+
+#print "Testing pthread flag."; $|++;
+#my $tst_cpp_fname = "./test.cpp";
+#my $tst_exec = "./tst.out";
+#unlink ($tst_cpp_fname);
+#open test_cpp_file, "> $tst_cpp_fname" || die "$tst_cpp_fname: $!\n";
+#print test_cpp_file "\#include <iostream>\n";
+#print test_cpp_file "\#include <pthread.h>\n";
+#print test_cpp_file "using namespace std;\n";
+#print test_cpp_file "int main(){ cout<<\"pthreads work\"<<endl; return 0;}\n";
+
+#close test_cpp_file;
+#my @dummy = `g++ -pthread $tst_cpp_fname -o $tst_exec`;
+#my $tst_exec_out = `$tst_exec`;
+#chomp($tst_exec_out);
+
+#if($tst_exec_out ne "pthreads work"){
+# print " pthreads failed.\n";
+#}
+#else{
+# print " Ok.\n";
+#}
+#unlink($tst_exec_out);
+#unlink($tst_cpp_fname);
+
+
+if(-e $config_out_fname){
+ unlink($config_out_fname);
+}
+open config_out_file, "> $config_out_fname" || die "$config_out_fname: $!\n";
+
+my $CCFLAGS = "-Wall -ansi -pedantic -D DEBUG -O4 -funroll-loops"; # -pthread";
+
+if( $OS eq "Darwin" ){
+ if($Darwin_ver >= 9){
+ $CCFLAGS = $CCFLAGS . " -m64 -pthread";
+ }
+}
+else{
+ $CCFLAGS = $CCFLAGS . " -pthread"
+}
+
+print config_out_file "CCFLAGS = $CCFLAGS\n";
+
+close config_out_file;
--- /dev/null
+#!/usr/bin/perl
+
+## this program takes peak calls files and converts them
+## to bed file for UCSC browser where peak calls are
+## displayed as features
+
+use strict;
+use warnings;
+use diagnostics;
+
+#use lib '/home/shogo/tools/ucsc_tracks';
+use lib './ucsc_tracks';
+
+use rgb_colors;
+
+my $usage = qq{
+ peakcalls2bed
+
+ This program will take peak calls file and output
+ the reported peak calls as bed file that is usable
+ within UCSC genome browser.
+
+ -----------------------
+ mandatory parameters:
+
+ -p peak_calls_file file containeding peak calls
+ -o output_file bed output file
+
+ -----------------------
+ optional parameters:
+
+ -w feature_width feature width in the browser
+ -s feature_shift feature shift in the browser
+ -n track_name track name
+ -c feature_color bed feature color
+
+};
+
+if(scalar(@ARGV) == 0){
+ print $usage;
+ exit(0);
+}
+
+my $peak_calls_fname = "";
+my $output_fname = "";
+
+## default arguments
+
+my $feature_width = 5;
+my $feature_shift = 0;
+my $track_name = "TF peak calls";
+#my $rgb_color_fname = "/home/shogo/dev/chip_seq/perl_code/rgb.colors";
+
+## parse command line arguments
+
+while(scalar(@ARGV) > 0){
+ my $this_arg = shift @ARGV;
+ if ( $this_arg eq '-h') {die "$usage\n";}
+ if ( $this_arg eq '-p') {$peak_calls_fname = shift @ARGV;}
+ if ( $this_arg eq '-w') {$feature_width = shift @ARGV;}
+ if ( $this_arg eq '-s') {$feature_shift = shift @ARGV;}
+ if ( $this_arg eq '-o') {$output_fname = shift @ARGV;}
+ if ( $this_arg eq '-n') {$track_name = shift @ARGV;}
+}
+
+
+if ( $peak_calls_fname eq '' || $feature_width eq '' || $feature_shift eq '' || $output_fname eq '' ||
+ $track_name eq ''){
+ die "$usage\n";
+}
+
+my $track_color="";
+$track_color = main::get_random_color();
+
+print "track_color: $track_color\n";
+print "track_name: $track_name\n";
+
+open peak_calls_file, "< $peak_calls_fname" || die "$peak_calls_fname: $\n";
+
+unlink($output_fname);
+unlink($output_fname.".gz");
+open output_file, "> $output_fname" || die "$output_fname: $\n";
+
+print output_file "track name=\"$track_name\" description=\"$track_name\" visibility=2 itemRgb=\"On\"\n";
+
+while(<peak_calls_file>){
+ chomp;
+ if(length($_) > 0){
+ if(substr($_,0,1) ne "#"){
+ my @cur_entries = split(/ /, $_);
+ if(scalar(@cur_entries) > 5){
+ my $this_chr_id = $cur_entries[0];
+ my $this_coord = $cur_entries[6];
+ my $this_peak_value = $cur_entries[4];
+ my $browser_start_coord = $this_coord + 1 + $feature_shift - ($feature_width-1)/2;
+ my $browser_end_coord = $this_coord + 1 + $feature_shift + ($feature_width-1)/2;
+
+ printf output_file "%s\t%d\t%d\t%.2f\t%.2f\t+\t%d\t%d\t%s\n", $this_chr_id, $browser_start_coord,
+ $browser_end_coord, $this_peak_value, $this_peak_value, $browser_start_coord,
+ $browser_end_coord, $track_color;
+
+ #print output_file "$this_chr_id\t$browser_start_coord\t$browser_end_coord\t$this_peak_value\t0\t+";
+ #print output_file "\t$browser_start_coord\t$browser_end_coord\t$track_color\n";
+ #print "$this_chr_id: $this_coord $this_peak_value\n";
+ }
+ }
+ }
+}
+
+close peak_calls_file;
+close output_file;
+
+system("gzip $output_fname");
--- /dev/null
+use strict;
+use warnings;
+
+my @color_alias;
+my @color_value;
+
+sub init_colors{
+ #push(@color_alias,"snow"); push(@color_value, "255,250,250");
+ #push(@color_alias,"GhostWhite"); push(@color_value, "248,248,255");
+ #push(@color_alias,"WhiteSmoke"); push(@color_value, "245,245,245");
+ #push(@color_alias,"gainsboro"); push(@color_value, "220,220,220");
+ #push(@color_alias,"FloralWhite"); push(@color_value, "255,250,240");
+ #push(@color_alias,"OldLace"); push(@color_value, "253,245,230");
+ #push(@color_alias,"linen"); push(@color_value, "250,240,230");
+ #push(@color_alias,"AntiqueWhite"); push(@color_value, "250,235,215");
+ #push(@color_alias,"PapayaWhip"); push(@color_value, "255,239,213");
+ #push(@color_alias,"BlanchedAlmond"); push(@color_value, "255,235,205");
+ #push(@color_alias,"bisque"); push(@color_value, "255,228,196");
+ #push(@color_alias,"PeachPuff"); push(@color_value, "255,218,185");
+ #push(@color_alias,"NavajoWhite"); push(@color_value, "255,222,173");
+ #push(@color_alias,"moccasin"); push(@color_value, "255,228,181");
+ #push(@color_alias,"cornsilk"); push(@color_value, "255,248,220");
+ #push(@color_alias,"LemonChiffon"); push(@color_value, "255,250,205");
+ #push(@color_alias,"seashell"); push(@color_value, "255,245,238");
+ #push(@color_alias,"honeydew"); push(@color_value, "240,255,240");
+ #push(@color_alias,"MintCream"); push(@color_value, "245,255,250");
+ #push(@color_alias,"azure"); push(@color_value, "240,255,255");
+ #push(@color_alias,"AliceBlue"); push(@color_value, "240,248,255");
+ #push(@color_alias,"lavender"); push(@color_value, "230,230,250");
+ #push(@color_alias,"LavenderBlush"); push(@color_value, "255,240,245");
+ #push(@color_alias,"MistyRose"); push(@color_value, "255,228,225");
+ #push(@color_alias,"white"); push(@color_value, "255,255,255");
+ #push(@color_alias,"black"); push(@color_value, "000,000,000");
+ push(@color_alias,"DarkSlateGray"); push(@color_value, "047,079,079");
+ push(@color_alias,"DarkSlateGrey"); push(@color_value, "047,079,079");
+ push(@color_alias,"DimGray"); push(@color_value, "105,105,105");
+ push(@color_alias,"DimGrey"); push(@color_value, "105,105,105");
+ push(@color_alias,"SlateGray"); push(@color_value, "112,128,144");
+ push(@color_alias,"SlateGrey"); push(@color_value, "112,128,144");
+ push(@color_alias,"LightSlateGray"); push(@color_value, "119,136,153");
+ push(@color_alias,"LightSlateGrey"); push(@color_value, "119,136,153");
+ push(@color_alias,"gray"); push(@color_value, "190,190,190");
+ push(@color_alias,"grey"); push(@color_value, "190,190,190");
+ #push(@color_alias,"LightGrey"); push(@color_value, "211,211,211");
+ #push(@color_alias,"LightGray"); push(@color_value, "211,211,211");
+ push(@color_alias,"MidnightBlue"); push(@color_value, "025,025,112");
+ push(@color_alias,"navy"); push(@color_value, "000,000,128");
+ push(@color_alias,"NavyBlue"); push(@color_value, "000,000,128");
+ push(@color_alias,"CornflowerBlue"); push(@color_value, "100,149,237");
+ push(@color_alias,"DarkSlateBlue"); push(@color_value, "072,061,139");
+ push(@color_alias,"SlateBlue"); push(@color_value, "106,090,205");
+ push(@color_alias,"MediumSlateBlue"); push(@color_value, "123,104,238");
+ push(@color_alias,"LightSlateBlue"); push(@color_value, "132,112,255");
+ push(@color_alias,"MediumBlue"); push(@color_value, "000,000,205");
+ push(@color_alias,"RoyalBlue"); push(@color_value, "065,105,225");
+ push(@color_alias,"blue"); push(@color_value, "000,000,255");
+ push(@color_alias,"DodgerBlue"); push(@color_value, "030,144,255");
+ push(@color_alias,"DeepSkyBlue"); push(@color_value, "000,191,255");
+ push(@color_alias,"SkyBlue"); push(@color_value, "135,206,235");
+ push(@color_alias,"LightSkyBlue"); push(@color_value, "135,206,250");
+ push(@color_alias,"SteelBlue"); push(@color_value, "070,130,180");
+ push(@color_alias,"LightSteelBlue"); push(@color_value, "176,196,222");
+ push(@color_alias,"LightBlue"); push(@color_value, "173,216,230");
+ push(@color_alias,"PowderBlue"); push(@color_value, "176,224,230");
+ push(@color_alias,"PaleTurquoise"); push(@color_value, "175,238,238");
+ push(@color_alias,"DarkTurquoise"); push(@color_value, "000,206,209");
+ push(@color_alias,"MediumTurquoise"); push(@color_value, "072,209,204");
+ push(@color_alias,"turquoise"); push(@color_value, "064,224,208");
+ push(@color_alias,"cyan"); push(@color_value, "000,255,255");
+ push(@color_alias,"LightCyan"); push(@color_value, "224,255,255");
+ push(@color_alias,"CadetBlue"); push(@color_value, "095,158,160");
+ push(@color_alias,"MediumAquamarine"); push(@color_value, "102,205,170");
+ push(@color_alias,"aquamarine"); push(@color_value, "127,255,212");
+ push(@color_alias,"DarkGreen"); push(@color_value, "000,100,000");
+ push(@color_alias,"DarkOliveGreen"); push(@color_value, "085,107,047");
+ push(@color_alias,"DarkSeaGreen"); push(@color_value, "143,188,143");
+ push(@color_alias,"SeaGreen"); push(@color_value, "046,139,087");
+ push(@color_alias,"MediumSeaGreen"); push(@color_value, "060,179,113");
+ push(@color_alias,"LightSeaGreen"); push(@color_value, "032,178,170");
+ push(@color_alias,"PaleGreen"); push(@color_value, "152,251,152");
+ push(@color_alias,"SpringGreen"); push(@color_value, "000,255,127");
+ push(@color_alias,"LawnGreen"); push(@color_value, "124,252,000");
+ push(@color_alias,"green"); push(@color_value, "000,255,000");
+ push(@color_alias,"chartreuse"); push(@color_value, "127,255,000");
+ push(@color_alias,"MediumSpringGreen"); push(@color_value, "000,250,154");
+ # push(@color_alias,"GreenYellow"); push(@color_value, "173,255,047");
+ push(@color_alias,"LimeGreen"); push(@color_value, "050,205,050");
+ push(@color_alias,"YellowGreen"); push(@color_value, "154,205,050");
+ push(@color_alias,"ForestGreen"); push(@color_value, "034,139,034");
+ push(@color_alias,"OliveDrab"); push(@color_value, "107,142,035");
+ push(@color_alias,"dark"); push(@color_value, "khaki");
+ push(@color_alias,"DarkKhaki"); push(@color_value, "189,183,107");
+ #push(@color_alias,"khaki"); push(@color_value, "240,230,140");
+ push(@color_alias,"PaleGoldenrod"); push(@color_value, "238,232,170");
+ push(@color_alias,"LightGoldenrodYellow"); push(@color_value, "250,250,210");
+ push(@color_alias,"LightYellow"); push(@color_value, "255,255,224");
+ push(@color_alias,"yellow"); push(@color_value, "255,255,000");
+ push(@color_alias,"gold"); push(@color_value, "255,215,000");
+ push(@color_alias,"LightGoldenrod"); push(@color_value, "238,221,130");
+ push(@color_alias,"goldenrod"); push(@color_value, "218,165,032");
+ push(@color_alias,"DarkGoldenrod"); push(@color_value, "184,134,011");
+ push(@color_alias,"RosyBrown"); push(@color_value, "188,143,143");
+ push(@color_alias,"IndianRed"); push(@color_value, "205,092,092");
+ push(@color_alias,"SaddleBrown"); push(@color_value, "139,069,019");
+ push(@color_alias,"sienna"); push(@color_value, "160,082,045");
+ push(@color_alias,"peru"); push(@color_value, "205,133,063");
+ push(@color_alias,"burlywood"); push(@color_value, "222,184,135");
+ #push(@color_alias,"beige"); push(@color_value, "245,245,220");
+ push(@color_alias,"wheat"); push(@color_value, "245,222,179");
+ push(@color_alias,"SandyBrown"); push(@color_value, "244,164,096");
+ push(@color_alias,"tan"); push(@color_value, "210,180,140");
+ push(@color_alias,"chocolate"); push(@color_value, "210,105,030");
+ push(@color_alias,"firebrick"); push(@color_value, "178,034,034");
+ push(@color_alias,"brown"); push(@color_value, "165,042,042");
+ push(@color_alias,"DarkSalmon"); push(@color_value, "233,150,122");
+ push(@color_alias,"salmon"); push(@color_value, "250,128,114");
+ push(@color_alias,"LightSalmon"); push(@color_value, "255,160,122");
+ push(@color_alias,"orange"); push(@color_value, "255,165,000");
+ push(@color_alias,"DarkOrange"); push(@color_value, "255,140,000");
+ push(@color_alias,"coral"); push(@color_value, "255,127,080");
+ push(@color_alias,"LightCoral"); push(@color_value, "240,128,128");
+ push(@color_alias,"tomato"); push(@color_value, "255,099,071");
+ push(@color_alias,"OrangeRed"); push(@color_value, "255,069,000");
+ push(@color_alias,"red"); push(@color_value, "255,000,000");
+ push(@color_alias,"HotPink"); push(@color_value, "255,105,180");
+ push(@color_alias,"DeepPink"); push(@color_value, "255,020,147");
+ push(@color_alias,"pink"); push(@color_value, "255,192,203");
+ push(@color_alias,"LightPink"); push(@color_value, "255,182,193");
+ push(@color_alias,"PaleVioletRed"); push(@color_value, "219,112,147");
+ push(@color_alias,"maroon"); push(@color_value, "176,048,096");
+ push(@color_alias,"MediumVioletRed"); push(@color_value, "199,021,133");
+ push(@color_alias,"VioletRed"); push(@color_value, "208,032,144");
+ push(@color_alias,"magenta"); push(@color_value, "255,000,255");
+ push(@color_alias,"violet"); push(@color_value, "238,130,238");
+ push(@color_alias,"plum"); push(@color_value, "221,160,221");
+ push(@color_alias,"orchid"); push(@color_value, "218,112,214");
+ push(@color_alias,"MediumOrchid"); push(@color_value, "186,085,211");
+ push(@color_alias,"DarkOrchid"); push(@color_value, "153,050,204");
+ push(@color_alias,"DarkViolet"); push(@color_value, "148,000,211");
+ push(@color_alias,"BlueViolet"); push(@color_value, "138,043,226");
+ #push(@color_alias,"purple"); push(@color_value, "160,032,240");
+ push(@color_alias,"MediumPurple"); push(@color_value, "147,112,219");
+ push(@color_alias,"thistle"); push(@color_value, "216,191,216");
+}
+
+sub get_color{
+ my $cur_color_alias = shift;
+ for(my $i=0; $i<scalar(@color_alias); $i++){
+ if($cur_color_alias eq $color_alias[$i]){
+ return $color_value[$i];
+ }
+ }
+ return "not found";
+}
+
+sub get_random_color{
+ use POSIX qw(ceil floor);
+ my $cur_rand_int = floor(rand() * (scalar(@color_value) ) );
+ return $color_value[$cur_rand_int];
+}
+
+BEGIN{
+ init_colors();
+}
+
+#my $cur_color = get_random_color("azure");
+#print "$cur_color\n";
+
+
+
--- /dev/null
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like.
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## Valouev A, Johnson DS, Sundquist A, Median C, Anton E, Batzoglou S,
+## Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide
+## Identification of Transcription Factor Binding Sites Based on ChIP-Seq
+## Data. under review.
+
+## This program generates parameters for individual QuEST modules
+
+use strict;
+use warnings;
+use diagnostics;
+
+use Cwd qw(realpath);
+
+my $usage = qq{
+ generate_QuEST_parameters.pl
+
+ --------------------------------
+
+ This program is a part of QuEST analytical pipeline. If you use
+ this software by itself or as a part of another software or use
+ results obtained using this software,
+ for purposes of publication, patents, or development, you must cite the
+ publication anouncing QuEST release:
+
+ Valouev A, Johnson DS, Sundquist A, Medina C, Anton E, Batzoglou S,
+ Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide
+ Identification of Transcription Factor Binding Sites Based on ChIP-Seq
+ Data. under review.
+
+ -------------------------------
+
+ This script generates parameters for individual QuEST modules
+
+ -----------------------
+ mandatory arguments:
+ -----------------------
+
+ for the ChIP data use one of the following:
+
+ -QuEST_align_ChIP <QuEST_align_file>
+ -solexa_align_ChIP <solexa_ChIP_align_file>
+
+ ( solexa align file containing mapped ChIP reads )
+
+ - - - - - - - - - - - -
+
+ for the background data (RX_noIP, input, total chromatin) use one of the following:
+
+ -QuEST_align_RX_noIP <QuEST_align_RX_noIP_file>
+ -solexa_align_RX_noIP <solexa_RX_noIP_align_file>
+
+ ( solexa align file containing mapped RX_noIP control experiment )
+ ( reads (aka total chromatin) )
+
+ - - - - - - - - - - -
+
+ for the reference genome use one of the following:
+
+ -rp <reference_path> a path to the reference genome fasta files
+ formated in UCSC Genome Browser database fashion
+ (typical naming convention is chr1.fa chr2.fa ...
+ chrM.fa)
+
+ -gt <genome_table> a genome table containing contig names followed by
+ their sizes
+
+ - - - - - - - - - - -
+
+ for the analysis:
+
+ -ap <analysis_path> specify a directory where all QuEST analysis results
+ will be reported
+
+ -----------------------
+ optional arguments:
+
+ -silent assume all default parameters and do not alert
+ to override any values (good for automation)
+
+};
+
+if(scalar(@ARGV) == 0){
+ print $usage;
+ exit(0);
+}
+
+## mandatory argmuments
+my $ChIP_align_fname = "** missing **";
+my $ChIP_file_type = "";
+
+my $RX_noIP_align_fname = "** missing **";
+my $RX_noIP_file_type = "";
+
+my $reference_path = "** missing **";
+my $genome_table_source_fname = "** missing **";
+
+my $analysis_path = "** missing **";
+
+## /mandatory arguments
+
+## optional arguments
+
+## /optional arguments
+
+## flag options
+
+my $solexa_align_ChIP_flag = "off";
+my $QuEST_align_ChIP_flag = "off";
+my $solexa_align_RX_noIP_flag = "off";
+my $QuEST_align_RX_noIP_flag = "off";
+my $rp_flag = "off";
+my $gt_flag = "off";
+my $ap_flag = "off";
+my $silent_flag = "off";
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+ my $this_arg = shift @ARGV;
+ if ( $this_arg eq '-h') {die "$usage\n";}
+ elsif ( $this_arg eq '-solexa_align_ChIP') {$ChIP_align_fname = shift @ARGV; $ChIP_file_type = "solexa_align"; $solexa_align_ChIP_flag = "on";}
+ elsif ( $this_arg eq '-QuEST_align_ChIP') {$ChIP_align_fname = shift @ARGV; $ChIP_file_type = "QuEST_align"; $QuEST_align_ChIP_flag = "on";}
+ elsif ( $this_arg eq '-solexa_align_RX_noIP') {$RX_noIP_align_fname = shift @ARGV; $RX_noIP_file_type = "solexa_align"; $solexa_align_RX_noIP_flag = "on";}
+ elsif ( $this_arg eq '-QuEST_align_RX_noIP') {$RX_noIP_align_fname = shift @ARGV; $RX_noIP_file_type = "QuEST_align"; $QuEST_align_RX_noIP_flag = "on";}
+ elsif ( $this_arg eq '-rp') {$reference_path = shift @ARGV; $rp_flag = "on";}
+ elsif ( $this_arg eq '-gt') { $genome_table_source_fname = shift @ARGV; $gt_flag = "on";}
+ elsif ( $this_arg eq '-ap') {$analysis_path = shift @ARGV; $ap_flag = "on";}
+ elsif ( $this_arg eq '-silent') {$silent_flag = "on";}
+ else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+if($rp_flag eq "on" && $gt_flag eq "on"){
+ print "You need to specify either the reference path (-rp) or the genome table (-gt), but not both. Aborting.\n";
+ exit(1);
+}
+
+if($solexa_align_ChIP_flag eq "on" && $QuEST_align_ChIP_flag eq "on"){
+ print "You need to specify either the solexa_align_ChIP file or QuEST_align_ChIP file but not both. Aborting.\n";
+ exit(1);
+}
+
+if($solexa_align_RX_noIP_flag eq "on" && $QuEST_align_RX_noIP_flag eq "on"){
+ print "You need to specify either the solexa_align_ChIP or QuEST_align_ChIP file, but not both. Aborting.\n";
+ exit(1);
+}
+my $die_pars_bad = "false"; ## die if parameters are bad
+my $bad_par = ""; ## store bad parameter here
+
+my $cur_path = "";
+
+my $fullpath = realpath(".");
+
+my @fullpath_fields = split(/\//,$fullpath);
+for(my $i=0; $i<scalar(@fullpath_fields); $i++){
+ if($fullpath_fields[$i] ne ""){
+ $cur_path = $cur_path."/".$fullpath_fields[$i];
+ }
+}
+
+if( substr($analysis_path,0,2) eq "./" ){
+ $analysis_path = $cur_path . "/" . substr($analysis_path,2,length($analysis_path)-2);
+}
+
+print "\n=====================\n\n";
+print "parameters: \n\n";
+print "ChIP_align_file: $ChIP_align_fname\n";
+print "ChIP_file_type: $ChIP_file_type\n";
+print "\n";
+print "RX_noIP_align_file: $RX_noIP_align_fname\n";
+print "RX_noIP_file_type: $RX_noIP_file_type\n";
+print "\n";
+
+print "reference path: $reference_path\n";
+print "genome_table: $genome_table_source_fname\n";
+
+print "\n";
+
+print "analysis_path: $analysis_path\n";
+
+print "\n";
+
+print "silent: $silent_flag\n";
+
+
+
+if( $analysis_path eq "** missing **"){ $die_pars_bad = "true"; $bad_par = $bad_par." "."analysis_path"; }
+
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+ print "$usage";
+ print "Bad parameters: $bad_par\n";
+ exit(0);
+}
+## /reading arguments
+
+if( $reference_path eq "** missing **" and $genome_table_source_fname eq "** missing **"){
+ print "You should provide either the path to the reference genome or \n";
+ print "the genome table containing contig names and their sizes. Aborting.\n";
+ exit(0);
+}
+
+if( $ChIP_file_type eq ""){
+ print "You must supply ChIP align file. Aborting.\n";
+ exit(0);
+}
+
+if( $RX_noIP_file_type eq ""){
+ print "You must supply RX_noIP align_file. Aborting.\n";
+ exit(0);
+}
+
+## print disclaimer
+
+print "\n";
+print "********* DISCLAIMER ************\n";
+print "\n";
+print "The recoomendation provided below are based on our\n";
+print "limited experience of analyzing ChIP-Seq data, and are\n";
+print "not guaranteed to yield optimal results. Therefore, all\n";
+print "recommendations should be treated as a good staring point\n";
+print "for setting QuEST parameters and should be refined if needed.\n";
+print "\n";
+print "This software is free for use and redistribution, however\n";
+print "any information, results, or software derived from from or using QuEST\n";
+print "should cite the following publication: \n";
+print "\n";
+
+print " Valouev A, Johnson DS, Sundquist A, Medina C, Anton E, Batzoglou S,\n";
+print " Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide\n";
+print " Identification of Transcription Factor Binding Sites Based on ChIP-Seq\n";
+print " Data. under review.\n\n";
+print "press Enter to continue.\n";
+
+my $dummy_kbhit;
+if($silent_flag ne "on"){
+ $dummy_kbhit = <STDIN>;
+}
+## /print disclaimer
+
+## setting up analysis directory
+if(-d $analysis_path){
+}
+else{
+ mkdir $analysis_path or die "Failed to create the analysis directory\n";
+}
+## /setting up analysis directory
+
+print "analysis directory.....................OK\n";
+
+## setting up analysis subdirectories
+
+print "analysis subdirectories................OK\n";
+## /setting up analysis subdirectories
+
+## reading reference files
+
+my @contig_names;
+my @contig_sizes;
+my $contig_counts = 0;
+
+if( $reference_path ne "** missing **" ){
+ opendir(REF_DIR, $reference_path) or die "can't open $reference_path: $!";
+
+ my @fasta_fnames;
+ my $fasta_fnames_counter = 0;
+ while (defined(my $cur_fname = readdir(REF_DIR))){
+ if ( substr($cur_fname,length($cur_fname)-3,3) eq ".fa" ){
+ $fasta_fnames[$fasta_fnames_counter] = $cur_fname;
+ $fasta_fnames_counter++;
+ }
+ }
+ close(REF_DIR);
+ print "\n";
+ print "The following fasta files were found in your reference path:\n\n";
+ for(my $i=0; $i<scalar(@fasta_fnames); $i++){
+
+ print "$fasta_fnames[$i]\n";
+ }
+ print "\nEvaluating the size of the genome. This may take several minutes.\n";
+
+ print "\nFound the following contigs: \n\n";
+
+
+ for(my $i=0; $i<scalar(@fasta_fnames); $i++){
+ my $cur_contig_fname = $reference_path . "/$fasta_fnames[$i]";
+ my $cur_contig_name = "";
+ my $cur_contig_size = 0;
+
+ if(-e $cur_contig_fname){
+ open cur_contig_file, "< $cur_contig_fname";
+ while(<cur_contig_file>){
+ chomp;
+ if( substr($_,0,1) eq ">" ){
+ if($cur_contig_name eq ""){
+ # first contig, don't save anything
+ }
+ else{
+ $contig_names[$contig_counts] = $cur_contig_name;
+ $contig_sizes[$contig_counts] = $cur_contig_size;
+ $contig_counts++;
+
+ print "$cur_contig_name: $cur_contig_size bps\n";
+ }
+ $cur_contig_name = substr($_,1,length($_)-1);
+ }
+ elsif( substr($_,0,1) eq "#" ){
+ # skip, comment
+ }
+ else{
+ $cur_contig_size = $cur_contig_size + length($_);
+ #increment the size
+ }
+ }
+ if($cur_contig_name ne ""){
+ $contig_names[$contig_counts] = $cur_contig_name;
+ $contig_sizes[$contig_counts] = $cur_contig_size;
+ $contig_counts++;
+
+ print "$cur_contig_name: $cur_contig_size bps\n";
+ }
+ close cur_contig_file;
+ }
+ }
+}
+else{
+ print "\nFound the following contigs in the genome table: \n\n";
+ if( ! -e $genome_table_source_fname ){
+ print "Failed to locate genome table: $genome_table_source_fname\n";
+ print "Aborting.\n";
+ }
+
+ open genome_table_source_file, "< $genome_table_source_fname" || die "$genome_table_source_fname: $!\n";
+
+ my @gt = <genome_table_source_file>;
+
+ for(my $j=0; $j < scalar(@gt); $j++){
+ my $cur_gt_entry = $gt[$j];
+ chomp($cur_gt_entry);
+ if(substr($cur_gt_entry,0,1) ne "#"){
+ my @cur_gt_entry_fields = split(/ /, $cur_gt_entry);
+ if( scalar(@cur_gt_entry_fields) == 2){
+ $contig_names[$contig_counts] = $cur_gt_entry_fields[0];
+ $contig_sizes[$contig_counts] = $cur_gt_entry_fields[1];
+ $contig_counts++;
+
+ print "$cur_gt_entry_fields[0] $cur_gt_entry_fields[1]\n";
+ }
+ }
+ }
+
+ close genome_table_source_file;
+}
+
+if (scalar(@contig_names) == 0){
+ print "There were not contigs found. Please correct reference path or genome table.\n";
+ exit(1);
+}
+
+my $contigs_string = "";
+
+my $genome_size = 0;
+
+for(my $i=0; $i<scalar(@contig_names); $i++){
+ if($i!=0){ $contigs_string = $contigs_string . ","; }
+ $contigs_string = $contigs_string . "$contig_names[$i]";
+ $genome_size = $genome_size + $contig_sizes[$i];
+}
+
+printf "\nEstimated size of the genome is %.1f Kb ( %.2f Gbps ).\n", $genome_size/1000, $genome_size/1000000000;
+
+my $contig_table_fname = $analysis_path . "genome_table";
+
+if( -e $contig_table_fname ){
+ unlink($contig_table_fname);
+}
+
+open contig_table_file, "> $contig_table_fname";
+
+for(my $i=0; $i<scalar(@contig_sizes); $i++){
+ print contig_table_file "$contig_names[$i] $contig_sizes[$i]\n";
+}
+
+close contig_table_file;
+
+my $updated_ChIP_align_fname = $analysis_path . "/ChIP.align.txt";
+
+if(-e "$updated_ChIP_align_fname"){
+ my $answer = "";
+ while( $answer ne "y" && $answer ne "n" ){
+ print "$updated_ChIP_align_fname already exists. \n";
+ while( $answer ne "y" && $answer ne "n" ){
+ print "Do you want to overwrite? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "y\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+ if( $answer eq "y"){
+ unlink($updated_ChIP_align_fname);
+ }
+ }
+ }
+}
+
+if( ! -e $ChIP_align_fname ){
+ print "ChIP_align_file: $ChIP_align_fname does not exsist. Aborting.\n";
+ exit(1);
+}
+
+my $ChIP_tags = 0;
+
+print "\nConverting ChIP reads file into the QuEST format.\n";
+print "If the counter below stays at zero, the program can't\n";
+print "match the ChIP alignments to the reference genome that you provided.\n";
+
+if($ChIP_file_type eq "solexa_align" || $ChIP_file_type eq "QuEST_align"){
+ open ChIP_align_file, "< $ChIP_align_fname" || die "Failed to open $ChIP_align_fname\n";
+ open updated_ChIP_align_file, "> $updated_ChIP_align_fname" ||
+ die "Failed to open $updated_ChIP_align_fname\n";
+ while(<ChIP_align_file>){
+
+ if($ChIP_tags % 10000 == 0){
+ printf "\rfound %.2f M reads", $ChIP_tags / 1000000;
+ $|++; ## flush the buffer, that's all
+ }
+
+ my $cur_contig = "";
+ my $cur_fivep_coord = "";
+ my $cur_QuEST_orient = "";
+ my $read_ok = "true";
+
+ chomp;
+ my @cur_fields = split(/ /,$_);
+
+
+ if( $ChIP_file_type eq "solexa_align"){
+ if( scalar(@cur_fields) > 4 ){
+ my $cur_seq_length = length($cur_fields[0]);
+ my @coord_fields = split(/:/,$cur_fields[3]);
+ my $cur_orient = $cur_fields[4];
+ if( scalar(@coord_fields) == 2 &&
+ ( $cur_orient eq "F" or $cur_orient eq "R") ){
+
+ $cur_contig = $coord_fields[0];
+ my $cur_coord = $coord_fields[1];
+
+ if($cur_orient eq "F"){
+ $cur_fivep_coord = $cur_coord - 1;
+ $cur_QuEST_orient = "+";
+ }
+ elsif($cur_orient eq "R"){
+ $cur_fivep_coord = $cur_coord + $cur_seq_length - 1 - 1;
+ $cur_QuEST_orient = "-";
+ }
+ }
+ else{ $read_ok = "false"; }
+ }
+ else{ $read_ok = "false"; }
+ }
+ elsif( $ChIP_file_type eq "QuEST_align" ){
+ if( scalar(@cur_fields) == 3 ){
+ $cur_contig = $cur_fields[0];
+ $cur_fivep_coord = $cur_fields[1];
+ $cur_QuEST_orient = $cur_fields[2];
+ if($cur_QuEST_orient eq "+" || $cur_QuEST_orient eq "-"){
+ # do nothing
+ }
+ else{ $read_ok = "false"; }
+ }
+ }
+
+ if( $read_ok eq "true" ){
+ my $cur_contig_ind = -1;
+
+ for(my $i=0; $i<scalar(@contig_names); $i++){
+ if( $contig_names[$i] eq $cur_contig ){
+ $cur_contig_ind = $i;
+ }
+ }
+
+ if($cur_contig_ind != -1){
+ ## Solexa coords start with 1, ours with 0.
+ if( $cur_fivep_coord < 0 ||
+ $cur_fivep_coord >= $contig_sizes[$cur_contig_ind] ){
+ print "Warning: read start out of boundaries.\n";
+ print "Expected [ 0,$contig_sizes[$cur_contig_ind] ] but found $cur_fivep_coord. Skipping.\n";
+ $read_ok = "false";
+ }
+ else{
+ print updated_ChIP_align_file "$cur_contig $cur_fivep_coord $cur_QuEST_orient\n";
+ $ChIP_tags++;
+ }
+ }
+ else{
+ # couldn't locate the read
+ }
+ }
+ }
+
+ close ChIP_align_file;
+ close updated_ChIP_align_file;
+ printf "\n";
+}
+
+print "\n";
+## /creating ChIP file
+
+## creating RX_noIP file
+
+my $updated_RX_noIP_align_fname = $analysis_path . "/RX_noIP.align.txt";
+
+if(-e "$updated_RX_noIP_align_fname"){
+ print "$updated_RX_noIP_align_fname already exists.\n";
+ my $answer = "";
+ while( $answer ne "y" && $answer ne "n" ){
+ print "Do you want to overwrite? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+ if( $answer eq "y"){
+ unlink($updated_RX_noIP_align_fname);
+ }
+ }
+}
+
+my $RX_noIP_tags = 0;
+
+print "\nConverting RX_noIP reads file into the QuEST format.\n";
+print "If the counter below stays at zero, the program can't\n";
+print "match the RX_noIP alignments to the reference genome that you provided.\n";
+
+if( ! -e $RX_noIP_align_fname ){
+ print "RX_noIP_align_file $RX_noIP_align_fname does not exist. Aborting.\n";
+ exit(1);
+}
+
+if($RX_noIP_file_type eq "solexa_align" || $RX_noIP_file_type eq "QuEST_align"){
+ open RX_noIP_align_file, "< $RX_noIP_align_fname" || die "Failed to open $RX_noIP_align_fname\n";
+ open updated_RX_noIP_align_file, "> $updated_RX_noIP_align_fname" ||
+ die "Failed to open $updated_RX_noIP_align_fname\n";
+ while(<RX_noIP_align_file>){
+
+ if($RX_noIP_tags % 10000 == 0){
+ printf "\rfound %.2f M reads", $RX_noIP_tags / 1000000;
+ $|++; ## flush the buffer, that's all
+ }
+
+ my $cur_contig = "";
+ my $cur_fivep_coord = "";
+ my $cur_QuEST_orient = "";
+ my $read_ok = "true";
+
+ chomp;
+ my @cur_fields = split(/ /,$_);
+
+
+ if( $RX_noIP_file_type eq "solexa_align" ){
+ if( scalar(@cur_fields) > 4 ){
+ my $cur_seq_length = length($cur_fields[0]);
+ my @coord_fields = split(/:/,$cur_fields[3]);
+ my $cur_orient = $cur_fields[4];
+ if( scalar(@coord_fields) == 2 &&
+ ( $cur_orient eq "F" or $cur_orient eq "R") ){
+
+ $cur_contig = $coord_fields[0];
+ my $cur_coord = $coord_fields[1];
+
+ if($cur_orient eq "F"){
+ $cur_fivep_coord = $cur_coord - 1;
+ $cur_QuEST_orient = "+";
+ }
+ elsif($cur_orient eq "R"){
+ $cur_fivep_coord = $cur_coord + $cur_seq_length - 1 - 1;
+ $cur_QuEST_orient = "-";
+ }
+ }
+ else{ $read_ok = "false"; }
+ }
+ else{ $read_ok = "false"; }
+ }
+ elsif( $RX_noIP_file_type eq "QuEST_align" ){
+ if( scalar(@cur_fields) == 3 ){
+ $cur_contig = $cur_fields[0];
+ $cur_fivep_coord = $cur_fields[1];
+ $cur_QuEST_orient = $cur_fields[2];
+ if($cur_QuEST_orient eq "+" || $cur_QuEST_orient eq "-"){
+ # do nothing
+ }
+ else{ $read_ok = "false"; }
+ }
+ }
+
+ if( $read_ok eq "true" ){
+ my $cur_contig_ind = -1;
+
+ for(my $i=0; $i<scalar(@contig_names); $i++){
+ if( $contig_names[$i] eq $cur_contig ){
+ $cur_contig_ind = $i;
+ }
+ }
+
+ if($cur_contig_ind != -1){
+ ## Solexa coords start with 1, ours with 0.
+ if( $cur_fivep_coord < 0 ||
+ $cur_fivep_coord >= $contig_sizes[$cur_contig_ind] ){
+ print "Warning: read start out of boundaries.\n";
+ print "Expected [ 0,$contig_sizes[$cur_contig_ind] ] but found $cur_fivep_coord. Skipping.\n";
+ $read_ok = "false";
+ }
+ else{
+ print updated_RX_noIP_align_file "$cur_contig $cur_fivep_coord $cur_QuEST_orient\n";
+ $RX_noIP_tags++;
+ }
+ }
+ else{
+ # couldn't locate the read
+ }
+ }
+ }
+
+ close RX_noIP_align_file;
+ close updated_RX_noIP_align_file;
+ printf "\n";
+}
+
+print "\n";
+
+## creating RX_noIP file
+
+## /creating RX_noIP file
+
+## make assessment to the number of tags
+
+my $ChIP_recommended_tag_count = ( $genome_size / 3080436051 ) * 5000000;
+my $ChIP_excess_fold = $ChIP_tags / $ChIP_recommended_tag_count;
+
+my $RX_noIP_recommended_tag_count = ( $genome_size / 3080436051 ) * 7000000;
+my $RX_noIP_excess_fold = $RX_noIP_tags / $RX_noIP_recommended_tag_count;
+my $RX_noIP_excess_fold_with_FDR = ($RX_noIP_tags - $ChIP_tags ) / $RX_noIP_recommended_tag_count;
+
+printf "You seem to have at least %.2f fold the recommended amount of ChIP reads.\n", $ChIP_excess_fold;
+printf "You seem to have at least %.2f fold the recommended amount of RX_noIP reads without FDR estimate.\n", $RX_noIP_excess_fold;
+printf "You seem to have at least %.2f fold the recommended amount of RX_noIP reads with FDR estimate\n.", $RX_noIP_excess_fold_with_FDR;
+print "\n";
+
+if($ChIP_excess_fold < 1){
+ print "\nWarning: you seem to not have sufficiently many ChIP sequencing reads.\n";
+ print "We recommend at least $ChIP_recommended_tag_count aligned reads,\n";
+ print "however you only provided $ChIP_tags. QuEST will still work, but your results\n";
+ print "may not be as accurate. \n";
+ my $answer = "";
+ while( $answer ne "y" && $answer ne "n" ){
+ print "Do you want to continue? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+ if($answer eq "y"){
+ # do nothing
+ }
+ elsif($answer eq "n"){
+ print "You have aborted QuEST configuration.\n";
+ exit(0);
+ }
+ }
+}
+else{
+ printf "You seem to have at least %.2f fold the recommended amount of ChIP sequence reads.\n", $ChIP_excess_fold;
+}
+
+if($RX_noIP_excess_fold < 1){
+
+ print "\nWarning: you we seem to not have sufficiently many RX_noIP sequencing reads.\n";
+ printf "We recommend at least %.0f aligned reads,\n", $RX_noIP_recommended_tag_count;
+ print "however you only provided $RX_noIP_tags. QuEST will still work, but your results\n";
+ print "may not be as accurate. \n";
+
+ my $answer = "";
+ while( $answer ne "y" && $answer ne "n" ){
+ print "Do you want to continue? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+ if($answer eq "y"){
+ # do nothing
+ }
+ elsif($answer eq "n"){
+ print "You have aborted QuEST configuration.\n";
+ exit(0);
+ }
+ }
+}
+else{
+ if( $RX_noIP_excess_fold_with_FDR >=1 ){
+ print "You have enough RX_noIP reads to run the analysis with FDR.\n";
+ }
+ else{
+ print "You don't have enought RX_noIP reads to run the analysis with FDR.\n";
+ }
+}
+
+my $analysis_with_FDR = "not defined";
+my $background_tags = -1;
+my $pseudo_ChIP_tags = -1;
+
+
+if($RX_noIP_tags < $ChIP_tags){
+ print "Since the number of Rx_noIP tags is less than ChIP tags, there is no\n";
+ print "possibility to provide an FDR estimate. \n";
+ print "QuEST will be configured without FDR estimation.\n";
+ $analysis_with_FDR = "false";
+ $background_tags = $RX_noIP_tags;
+}
+else{
+ my $answer = "";
+ while( $answer ne "y" && $answer ne "n" ){
+ print "\nDo you want to allocate Rx_noIP reads for FDR estimation? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if($answer eq "y"){
+ $analysis_with_FDR = "true";
+ $pseudo_ChIP_tags = $ChIP_tags;
+ $background_tags = $RX_noIP_tags - $pseudo_ChIP_tags;
+
+ print "\nWe recommend using $background_tags for the background tags\n";
+ print "and $pseudo_ChIP_tags for the pseudo_ChIP tags. \n";
+
+ my $cur_answer = "";
+ while( $cur_answer ne "y" && $cur_answer ne "n" ){
+ print "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $cur_answer = "y";
+ print "$cur_answer\n";
+ }
+ else{
+ $cur_answer = <STDIN>;
+ }
+ chomp($cur_answer);
+
+ if($cur_answer eq "y"){
+ print "\nAllocating reads. \n\n";
+ }
+ elsif($cur_answer eq "n"){
+ print "\nPlease specify the number of background and pseudo_ChIP tags.\n";
+ print "These two numbers should add to the total number of RX_noIP tags ($RX_noIP_tags)\n";
+ print "and we recommend using the same number of tags for pseudoChIP as you have for\n";
+ print "your ChIP (ChIP tags)\n";
+
+ print "Please enter the number of pseudo_ChIP tags between 0 and $RX_noIP_tags (recommended $ChIP_tags): ";
+ my $tags_entered = <STDIN>;
+ chomp($tags_entered);
+ if($tags_entered <= 0 or $tags_entered > $RX_noIP_tags){
+ print "The number that you have specified is not in the correct range. Aborting\n";
+ exit(0);
+ }
+
+ $pseudo_ChIP_tags = $tags_entered;
+ my $RX_noIP_tags_remaining = $RX_noIP_tags - $pseudo_ChIP_tags;
+ print "\nPlease enter the number of background tags between 0 and $RX_noIP_tags_remaining (recommended $RX_noIP_tags_remaining): ";
+ $tags_entered = <STDIN>;
+
+ chomp($tags_entered);
+ if($tags_entered <= 0 or $tags_entered > $RX_noIP_tags_remaining){
+ print "The number you have specified is not in the correct range. Aborting\n";
+ }
+ $background_tags = $tags_entered;
+ }
+ }
+ }
+ elsif($answer eq "n"){
+ print "QuEST will be configured without FDR estimation.\n";
+ $analysis_with_FDR = "false";
+ $background_tags = $RX_noIP_tags;
+ }
+ }
+}
+
+## /make assessment to the number of tags
+
+## allocating RX_noIP_tags
+my $background_RX_noIP_fname = "";
+my $pseudo_ChIP_RX_noIP_fname = "";
+
+if($analysis_with_FDR eq "true"){
+ $background_RX_noIP_fname = $analysis_path . "/background_RX_noIP.align.txt";
+ $pseudo_ChIP_RX_noIP_fname = $analysis_path . "/pseudo_ChIP_RX_noIP.align.txt";
+
+ if(-e $background_RX_noIP_fname){
+ my $answer = "";
+ while( $answer ne "y" && $answer ne "n" ){
+ print "File $background_RX_noIP_fname already exists. Do you want to overwrite it? (y/n): ";
+ $answer = <STDIN>;
+ chomp($answer);
+ if($answer eq "y"){
+ unlink($background_RX_noIP_fname);
+ }
+ elsif($answer eq "n"){
+ print "Aborting QuEST configuration.\n";
+ exit(0);
+ }
+ }
+ }
+
+ if(-e $pseudo_ChIP_RX_noIP_fname){
+ my $answer = "";
+ while( $answer ne "y" && $answer ne "n" ){
+ print "File $pseudo_ChIP_RX_noIP_fname already exists. Do you want to overwrite it? (y/n): ";
+ $answer = <STDIN>;
+ chomp($answer);
+ if($answer eq "y"){
+ unlink($pseudo_ChIP_RX_noIP_fname);
+ }
+ elsif($answer eq "n"){
+ print "Aborting QuEST configuration. \n";
+ exit(0);
+ }
+ }
+ }
+
+ open background_RX_noIP_file, "> $background_RX_noIP_fname" ||
+ die "Failed to open $background_RX_noIP_fname\n";
+ open pseudo_ChIP_RX_noIP_file, "> $pseudo_ChIP_RX_noIP_fname" ||
+ die "Failed to open $pseudo_ChIP_RX_noIP_fname\n";
+ open updated_RX_noIP_align_file, "< $updated_RX_noIP_align_fname" ||
+ die "Failed to open $updated_RX_noIP_align_fname\n";
+
+ my $cur_line_counter = 0;
+ while(<updated_RX_noIP_align_file>){
+ chomp;
+ if($cur_line_counter < $background_tags){
+ print background_RX_noIP_file "$_\n";
+ }
+ else{
+ if($cur_line_counter < $background_tags + $pseudo_ChIP_tags){
+ print pseudo_ChIP_RX_noIP_file "$_\n";
+ }
+ }
+ $cur_line_counter++;
+ }
+ close background_RX_noIP_file;
+ close pseudo_ChIP_RX_noIP_file;
+ close updated_RX_noIP_align_file;
+}
+else{
+ $background_tags = $RX_noIP_tags;
+ $background_RX_noIP_fname = $updated_RX_noIP_align_fname;
+}
+## /allocating RX_noIP_tags
+
+## calculating cutoffs/thresholds
+
+print "\nWe will now walk through threshold setup for different software modules.\n";
+
+## calculating Quick Window Scan thresholds
+
+print "\n";
+print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
+print "~~~~~~~~~~~~~~ Quick Window Scan Module ~~~~~~~~~~~~~~~~~~~~~\n";
+
+my $quick_window_scan_calc_window = -1;
+my $quick_window_scan_count_threshold = -1;
+my $recommended_quick_window_scan_calc_window = 300;
+my $recommended_quick_window_scan_count_threshold =
+ 50 * ( $ChIP_tags / 10000000 ) * ( $recommended_quick_window_scan_calc_window / 300 ) *
+ ( 3080436051 / $genome_size );
+
+print "\n";
+print "Quick window scan is the module that identifies a training\n";
+print "set of potential peaks for estimating peak shift parameter.\n";
+print "To do so, it needs the size of the window to count the tags in\n";
+print "and a count threshold to identify positive regions where that\n";
+print "threshold is exceeded.\n\n";
+print "Based on the size of the genome and the number of ChIP tags, we\n";
+print "recommend using $recommended_quick_window_scan_calc_window for the window size and ";
+printf "%.0f for the the ", $recommended_quick_window_scan_count_threshold;
+print "tag count threshold.\n";
+
+my $answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ printf "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if($answer eq "y"){
+ $quick_window_scan_calc_window = $recommended_quick_window_scan_calc_window;
+ $quick_window_scan_count_threshold = $recommended_quick_window_scan_count_threshold;
+ }
+ elsif($answer eq "n"){
+ print "Please enter the the size of the count window (recommended $recommended_quick_window_scan_calc_window): ";
+ my $cur_answer1 = <STDIN>;
+ chomp($cur_answer1);
+ if($cur_answer1 >= 0){
+ $quick_window_scan_calc_window = $cur_answer1;
+ }
+ else{
+ print "The value cannot be negative. Aborting.\n";
+ exit(0);
+ }
+
+ print "Please enter the count threshold for the number of tags: ";
+ $cur_answer1 = <STDIN>;
+ chomp($cur_answer1);
+
+ if($cur_answer1 >= 0){
+ $quick_window_scan_count_threshold = $cur_answer1;
+ }
+ }
+}
+
+print "\n";
+print "Specified parameters:\n\n";
+print "quick_window_scan_calc_window: $quick_window_scan_calc_window\n";
+print "quick_window_scan_count_threshold: $quick_window_scan_count_threshold\n";
+
+print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
+
+## /calculating Quick Window Scan thresholds
+
+## calculating Peak Shift Calibrator thresholds
+
+print "\n";
+print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
+print "~~~~~~~~~~~~~ Peak Shift Calibrator Module ~~~~~~~~~~~~~~~~~~\n";
+
+my $recommended_KDE_bandwidth = 30;
+my $recommended_scan_window = 1000;
+my $recommended_quick_scan_tag_count_threshold = 0;
+my $recommended_top_count = 200;
+my $recommended_ChIP_to_background_fold = 10;
+my $recommended_best_to_second_best_peak_ratio = 10;
+my $recommended_ChIP_threshold_for_PSC = 0.3 * ( $ChIP_tags / 7000000 ) * ( 3080436051 / $genome_size );
+my $recommended_ChIP_for_rev_peak_max_min_ratio = 1.5;
+
+my $KDE_bandwidth = -1;
+my $scan_window = -1;
+my $quick_scan_tag_count_threshold = 0;
+my $top_count = -1;
+my $ChIP_to_background_fold = -1;
+my $best_to_second_best_peak_ratio = -1;
+my $ChIP_threshold_for_PSC = -1;
+my $ChIP_for_rev_peak_max_min_ratio = -1;
+my $peak_shift_estimation_method = "NA";
+
+print "\nCalibrate Peak Shift Module estimates the value of peak shift based on the\n";
+print "the list of the regions provided by the Quick Window Scan Module.\n\n";
+
+
+print "Kernel Density Estimation requires a smoothing parameter called KDE bandwidth.\n";
+print "For sequence-specific TFs we recommend using $recommended_KDE_bandwidth bps for the KDE bandwidth.\n";
+print "If your factor is not sequence specific or library has a large range of inserts,\n";
+print "consider increasing the value of KDE bandwidth to achieve more smoothing.\n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if( $answer eq "y" ){
+ $KDE_bandwidth = $recommended_KDE_bandwidth;
+ }
+ elsif( $answer eq "n" ){
+ print "Please specify the value of KDE bandwidth: ";
+ my $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ if( $cur_answer > 0 ){
+ $KDE_bandwidth = $cur_answer;
+ }
+ else{ print "You should specify a positive number. Aborting.\n"; exit(0); }
+ }
+}
+
+$answer = "";
+
+print "\nScan window defines a region within which the peak is expected to be found located.\n";
+print "We recommend using sufficiently large window size such as $recommended_scan_window.\n";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if( $answer eq "y" ){
+ $scan_window = $recommended_scan_window;
+ }
+ elsif( $answer eq "n" ){
+ print "Please specify the value of scan window: ";
+ my $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ if( $cur_answer > 0 ){
+ $scan_window = $cur_answer;
+ }
+ else{ print "You should specify a positive number. Aborting.\n"; exit(0); }
+ }
+}
+
+print "\nYou can limit the regions by the number of tags found by the \n";
+print "Quick Window Scan Module. We suggest not doing that. \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if( $answer eq "y" ){
+ $quick_scan_tag_count_threshold = $recommended_quick_scan_tag_count_threshold;
+ }
+ elsif( $answer eq "n" ){
+ print "Please specify the value of quick scan tag count threshold: ";
+ my $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ if( $cur_answer >= 0 ){
+ $quick_scan_tag_count_threshold = $cur_answer;
+ }
+ else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }
+ }
+}
+
+
+print "\nYou can limit the total number of regions to consider to save time. We\n";
+print "recommend using at least $recommended_top_count regions. \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if( $answer eq "y" ){
+ $top_count = $recommended_top_count;
+ }
+ elsif( $answer eq "n" ){
+ print "Please specify the number of regions to consider: ";
+ my $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ if( $cur_answer >= 0 ){
+ $top_count = $cur_answer;
+ }
+ else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }
+ }
+}
+
+print "\nFor each peak to be qualified it must be enriched over the background.\n";
+print "We recommend using $recommended_ChIP_to_background_fold fold enrichment to identify\n";
+print "positive regions. \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if( $answer eq "y" ){
+ $ChIP_to_background_fold = $recommended_ChIP_to_background_fold;
+ }
+ elsif( $answer eq "n" ){
+ print "Please specify the ChIP to background fold enrichment: ";
+ my $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ if( $cur_answer >= 0 ){
+ $ChIP_to_background_fold = $cur_answer;
+ }
+ else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }
+ }
+}
+
+print "\nMultiple peaks may exist within the same window. To assign them correctly,\n";
+print "we need to reject close value peaks on the same strand.\n";
+print "We suggest using the value of $recommended_best_to_second_best_peak_ratio ";
+print "to reject the second best peaks.\n";
+print "Higher value is more stringent, lower will pass more peaks.\n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree to this value? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if( $answer eq "y" ){
+ $best_to_second_best_peak_ratio = $recommended_best_to_second_best_peak_ratio;
+ }
+ elsif( $answer eq "n" ){
+ print "Please specify the best-to-second-best peak ratio: ";
+ my $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ if( $cur_answer >= 0 ){
+ $best_to_second_best_peak_ratio = $cur_answer;
+ }
+ else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }
+
+ }
+}
+
+print "\nMatching the peaks on opposite strands requires their heights to be very close.\n";
+print "We suggest rejecting all opposite strand peaks for which the ratio between\n";
+print "the larger to the smaller peak is greater than $recommended_ChIP_for_rev_peak_max_min_ratio.\n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if( $answer eq "y" ){
+ $ChIP_for_rev_peak_max_min_ratio = $recommended_ChIP_for_rev_peak_max_min_ratio;
+ }
+ elsif( $answer eq "n" ){
+ print "Please specify the forward/reverse ratio of the larger to the smaller peak: ";
+ my $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ if( $cur_answer >= 0 ){
+ $ChIP_for_rev_peak_max_min_ratio = $cur_answer;
+ }
+ else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }
+ }
+}
+
+print "\nLow peaks must be rejected since they are unlikely to be true binding events.\n";
+print "According to the number of ChIP tags and the size of the genome, we suggest\n";
+printf "using value of %.2f for the ChIP threshold. \n", $recommended_ChIP_threshold_for_PSC;
+
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if( $answer eq "y" ){
+ $ChIP_threshold_for_PSC = $recommended_ChIP_threshold_for_PSC;
+ }
+ elsif( $answer eq "n" ){
+ print "Please specify the ChIP threshold: ";
+ my $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ if( $cur_answer >= 0 ){
+ $ChIP_threshold_for_PSC = $cur_answer;
+ }
+ else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }
+ }
+}
+
+print "\nYou can choose between the median and the mean based estimates for the peak shift.\n";
+print "Mean based estimates tend to be more accurate, however mode is more robust\n";
+print "to errors. We recommend using the mean-based estimate. \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if($answer eq "y"){
+ $peak_shift_estimation_method = "mean";
+ }
+ elsif($answer eq "n"){
+ print "Setting the estimation method to mean.\n";
+ $peak_shift_estimation_method = "median";
+ }
+}
+
+print "\n";
+print "Specified parameters for the Peak Shift Calibration Module:\n\n";
+
+print "KDE_bandwidth: $KDE_bandwidth\n";
+print "scan_window: $scan_window\n";
+print "quick_scan_tag_count_threshold: $quick_scan_tag_count_threshold\n";
+print "top_count: $top_count\n";
+print "ChIP_to_background_fold: $ChIP_to_background_fold\n";
+print "best_to_second_best_peak_ratio: $best_to_second_best_peak_ratio\n";
+printf "ChIP_threshold: %.2f\n", $ChIP_threshold_for_PSC;
+print "ChIP_for_rev_peak_max_min_ratio: $ChIP_for_rev_peak_max_min_ratio\n";
+print "peak_shift_estimation_method: $peak_shift_estimation_method\n";
+
+## /calculating Peak Shift Calibrator thresholds
+
+
+print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
+
+## calculating Peak Caller thresholds
+
+print "\n";
+print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
+print "~~~~~~~~~~~~~~~~~~ Peak Caller Module ~~~~~~~~~~~~~~~~~~~~~~~\n";
+
+
+my ($ChIP_threshold, $background_threshold, $er, $rr, $dip_fraction);
+
+my $recommended_ChIP_threshold = 0.3 * ( 3080436051 / $genome_size ) * ( $ChIP_tags / 7000000 );
+my $recommended_enrichment_fold = 3.0;
+my $recommended_rescue_fold = 10;
+my $recommended_rr = $recommended_rescue_fold * ( $ChIP_tags / $background_tags );
+my $recommended_dip_fraction = 0.1;
+
+
+print "\n";
+print "\n";
+print "*** Important: this is where you dial in your detection stringency. ****\n";
+print "\n";
+
+print "Judging by the number of ChIP tags and size of the genome,\n";
+printf "we suggest using %.2f for the ChIP profile threshold.\n", $recommended_ChIP_threshold;
+print "Higher threshold will give more stringency). \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if($answer eq "y"){
+ # do nothing
+ $ChIP_threshold = $recommended_ChIP_threshold;
+ }
+ elsif($answer eq "n"){
+ print "Please specify the ChIP profile threshold (recommended $recommended_ChIP_threshold): ";
+ my $answer1 = <STDIN>;
+ chomp($answer1);
+ $ChIP_threshold = $answer1;
+ }
+}
+
+my $recommended_background_threshold
+ = $ChIP_threshold * ($background_tags / $ChIP_tags) / $recommended_enrichment_fold;
+
+
+print "\n";
+print "We recommend a value of $recommended_enrichment_fold for the enrichment fold.\n";
+print "This value is used to descriminate enrichment between ChIP and \n";
+print "background datasets at loci with low amount of background.\n";
+printf "With enrichment fold of %.2f, the corresponding background profile\n", $recommended_enrichment_fold;
+printf "threshold will be %.2f. \n", $recommended_background_threshold;
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+
+ print "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+ if($answer eq "y"){
+ # do nothing
+ $er = $recommended_enrichment_fold;
+ $background_threshold = $recommended_background_threshold;
+ }
+ elsif($answer eq "n"){
+ print "Please specify the enrichment fold (recommended 3.0 and up): ";
+ my $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ $background_threshold = $ChIP_threshold * ($background_tags / $ChIP_tags) / $cur_answer;
+
+ print "According to your selection the background threshold was calculated to be $background_threshold\n";
+ }
+}
+
+print "\n";
+print "Rescue fold specifies when peaks int the high-background regions are called.\n";
+print "We recommend using a value of $recommended_rescue_fold for the the value\n";
+printf "of rescue_fold. This will correspond to the value of %.2f for the rescue ratio.\n", $recommended_rr;
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if($answer eq "y"){
+ $rr = $recommended_rr;
+ }
+ elsif($answer eq "n"){
+ print "Please specify the rescue fold (we recommend $recommended_rescue_fold): ";
+ my $cur_answer2 = <STDIN>;
+ chomp($cur_answer2);
+ $rr = $cur_answer2 * ($ChIP_tags) / ($background_tags);
+ print "The corresponding value of rescue ratio will be $rr\n";
+ print "\n";
+ }
+}
+
+print "\nAdjacent peaks are distinguished if the profile dips between the two peaks\n";
+printf "by certain amount. We recommend using %.2f for the value of the dip fraction\n", $recommended_dip_fraction;
+print "between the two peaks. \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+ chomp($answer);
+
+ if($answer eq "y"){
+ $dip_fraction = $recommended_dip_fraction;
+ }
+ elsif($answer eq "n"){
+ print "Please specify an alternative dip fraction value: ";
+ my $cur_answer2 = <STDIN>;
+ chomp($cur_answer2);
+ $dip_fraction = $cur_answer2;
+ }
+}
+
+print "\n";
+print "Specified parameters for the Peak Caller Module:\n\n";
+
+printf "ChIP_threshold: %.2f\n", $ChIP_threshold;
+printf "background_threshold: %.2f\n", $background_threshold;
+printf "enrichment_ratio: %.2f\n", $er;
+printf "rescue_ratio: %.2f\n", $rr;
+printf "dip_fraction: %.2f\n", $dip_fraction;
+
+print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
+
+## /calculating Peak Caller thresholds
+
+## system info
+
+my $processors = -1;
+my $recommended_threads = -1;
+my $threads = -1;
+
+if(-e "/proc/cpuinfo"){
+ $processors = `cat /proc/cpuinfo | grep processor | wc -l`;
+ chomp($processors);
+}
+elsif(-e "/usr/sbin/sysctl"){
+ $processors = `/usr/sbin/sysctl -n hw.ncpu`;
+}
+else{
+ print "I failed to determine the number of processors on your platform.\n";
+ print "Please specify it manually: ";
+ $answer = <STDIN>;
+ chomp($answer);
+ if($answer <= 0){
+ print "You need to specify a positive number. Aborting.\n";
+ exit(0);
+ }
+ $processors = $answer;
+}
+
+if($processors == 0){
+ print "I failed to determine the number of processors on your platform.\n";
+ print "Please specify it manually: ";
+ $answer = <STDIN>;
+ chomp($answer);
+ if($answer <= 0){
+ print "You need to specify a positive number. Aborting.\n";
+ exit(0);
+ }
+ $processors = $answer;
+}
+
+print "\nNumber of processors: $processors\n";
+
+if($processors == 1){
+ $recommended_threads = 1;
+}
+else{ $recommended_threads = $processors - 1; }
+
+print "\nThe Profiler Module supports thread-level parallelism. We suggest utilizing \n";
+print "$recommended_threads for the number of threads to distribute the computation over.\n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+ print "Do you agree? (y/n): ";
+ if($silent_flag eq "on"){
+ $answer = "y";
+ print "$answer\n";
+ }
+ else{
+ $answer = <STDIN>;
+ }
+
+ chomp($answer);
+
+ if($answer eq "y"){
+ $threads = $recommended_threads;
+ }
+ elsif($answer eq "n"){
+ print "Please specify desired number of threads: ";
+ my $cur_answer2 = <STDIN>;
+ chomp($cur_answer2);
+ $threads = $cur_answer2;
+ }
+}
+
+## /system info
+
+## /calculating cutoffs/thresholds
+
+print "\n";
+
+## writing parameter files
+
+## writing Quick Window Scan parameters
+
+my $quick_window_scan_par_fname = $analysis_path . "/quick_window_scan.batch.pars";
+my $quick_window_scan_output_fname = $analysis_path . "/quick_window_scan.out";
+
+unlink($quick_window_scan_par_fname);
+
+open quick_window_scan_par_file, "> $quick_window_scan_par_fname"
+ or die "Failed to open $quick_window_scan_par_fname\n";
+print quick_window_scan_par_file "genome_table = $contig_table_fname\n";
+print quick_window_scan_par_file "align_file = $updated_ChIP_align_fname\n";
+print quick_window_scan_par_file "output_file = $quick_window_scan_output_fname\n";
+print quick_window_scan_par_file "calc_window = $quick_window_scan_calc_window\n";
+printf quick_window_scan_par_file "count_threshold = %d", $quick_window_scan_count_threshold;
+
+close quick_window_scan_par_file;
+
+print "Saved Quick Window Scan batch parameter file.\n";
+
+## /writing Quick Window Scan parameters
+
+## writing Calibrate Peak Shift parameters
+
+my $calibrate_peak_shift_par_fname = $analysis_path . "/calibrate_peak_shift.batch.pars";
+my $calibrate_peak_shift_output_fname = $analysis_path . "/calibrate_peak_shift.out";
+
+open calibrate_peak_shift_par_file, "> $calibrate_peak_shift_par_fname"
+ or die "Failed to open $calibrate_peak_shift_par_fname\n";
+
+if($silent_flag eq "on"){
+ print calibrate_peak_shift_par_file "verbose = off\n";
+}
+else{
+ print calibrate_peak_shift_par_file "verbose = yes\n";
+}
+print calibrate_peak_shift_par_file "quick_scan_file = $quick_window_scan_output_fname.sorted\n";
+print calibrate_peak_shift_par_file "align_chip_file = $updated_ChIP_align_fname\n";
+print calibrate_peak_shift_par_file "align_background_file = $updated_RX_noIP_align_fname\n";
+print calibrate_peak_shift_par_file "output_file = $calibrate_peak_shift_output_fname\n";
+print calibrate_peak_shift_par_file "kde_bandwidth = $KDE_bandwidth\n";
+print calibrate_peak_shift_par_file "scan_window = $scan_window\n";
+print calibrate_peak_shift_par_file "quick_scan_tag_count_threshold = $quick_scan_tag_count_threshold\n";
+print calibrate_peak_shift_par_file "top_count = $top_count\n";
+print calibrate_peak_shift_par_file "chip_to_background_fold = $ChIP_to_background_fold\n";
+print calibrate_peak_shift_par_file "best_to_second_best_peak_ratio = $best_to_second_best_peak_ratio\n";
+print calibrate_peak_shift_par_file "chip_peak_threshold = $ChIP_threshold_for_PSC\n";
+print calibrate_peak_shift_par_file "chip_for_rev_peak_max_min_ratio = $ChIP_for_rev_peak_max_min_ratio\n";
+print calibrate_peak_shift_par_file "method = $peak_shift_estimation_method\n";
+
+
+close calibrate_peak_shift_par_file;
+
+print "Saved Calibrate Peak Shift batch parameter file.\n";
+
+## /writing Calibrate Peak Shift parameters
+
+## writing Generate Peak Profile parameters
+
+my $Profiler_calc_window = $KDE_bandwidth * 10;
+
+my $scores_path = $analysis_path . "/scores";
+
+if(-d $scores_path){
+ print "Warning: score directory $scores_path already exists\n";
+}
+else{
+ mkdir($scores_path);
+}
+
+my $ChIP_scores_path = $scores_path . "/ChIP/";
+my $pseudo_ChIP_scores_path = $scores_path . "/pseudo_ChIP/";
+my $background_scores_path = $scores_path . "/background/";
+
+if(-d $ChIP_scores_path){
+ print "Warning: ChIP scores directory $ChIP_scores_path already exists\n";
+ print "When the Profiler Module will run the files will be overwritten\n";
+}
+else{ mkdir($ChIP_scores_path); }
+
+
+if( $analysis_with_FDR eq "true"){
+ if(-d $pseudo_ChIP_scores_path){
+ print "Warning: pseudo_ChIP scores directory $pseudo_ChIP_scores_path already exists\n";
+ print "When the Profiler Module will run the files will be overwritten\n";
+ }
+ else{ mkdir($pseudo_ChIP_scores_path); }
+}
+
+if(-d $background_scores_path){
+ print "Warning: backgrounds scores directory $background_scores_path already exists\n";
+ print "When the Profiler Module will run the files will be overwritten\n";
+}
+else{ mkdir($background_scores_path); }
+
+my $ChIP_generate_profile_par_fname = $analysis_path . "/generate_profile.ChIP.batch.pars";
+my $ChIP_generate_profile_output_fname = $analysis_path . "/generate_profile.ChIP.out";
+
+my $pseudo_ChIP_generate_profile_par_fname = $analysis_path . "/generate_profile.pseudo_ChIP.batch.pars";
+my $pseudo_ChIP_generate_profile_output_fname = $analysis_path . "/generate_profile.pseudo_ChIP.out";
+
+my $background_generate_profile_par_fname = $analysis_path . "/generate_profile.background.batch.pars";
+my $background_generate_profile_output_fname = $analysis_path . "/generate_profile.background.out";
+
+## writing ChIP Profiler param file
+
+unlink($ChIP_generate_profile_par_fname);
+
+open ChIP_generate_profile_ChIP_par_file, "> $ChIP_generate_profile_par_fname"
+ or die "Failed to open $ChIP_generate_profile_par_fname";
+
+print ChIP_generate_profile_ChIP_par_file "genome_table = $contig_table_fname\n";
+print ChIP_generate_profile_ChIP_par_file "align_file = $updated_ChIP_align_fname\n";
+print ChIP_generate_profile_ChIP_par_file "output_score_path = $ChIP_scores_path\n";
+print ChIP_generate_profile_ChIP_par_file "peak_shift_source_file = $calibrate_peak_shift_output_fname\n";
+print ChIP_generate_profile_ChIP_par_file "kde_bandwidth = $KDE_bandwidth\n";
+print ChIP_generate_profile_ChIP_par_file "calc_window = $Profiler_calc_window\n";
+print ChIP_generate_profile_ChIP_par_file "threads = $threads\n";
+
+close ChIP_generate_profile_ChIP_par_file;
+
+print "Saved Profiler batch parameter file for the ChIP data\n";
+
+## /writing ChIP Profiler param file
+
+## writing background Profiler param file
+
+unlink($background_generate_profile_par_fname);
+
+open background_generate_profile_ChIP_par_file, "> $background_generate_profile_par_fname"
+ or die "Failed to open $background_generate_profile_par_fname";
+print background_generate_profile_ChIP_par_file "genome_table = $contig_table_fname\n";
+print background_generate_profile_ChIP_par_file "align_file = $background_RX_noIP_fname\n";
+print background_generate_profile_ChIP_par_file "output_score_path = $background_scores_path\n";
+print background_generate_profile_ChIP_par_file "peak_shift_source_file = $calibrate_peak_shift_output_fname\n";
+print background_generate_profile_ChIP_par_file "kde_bandwidth = $KDE_bandwidth\n";
+print background_generate_profile_ChIP_par_file "calc_window = $Profiler_calc_window\n";
+print background_generate_profile_ChIP_par_file "threads = $threads\n";
+
+close background_generate_profile_ChIP_par_file;
+
+print "Saved Profiler batch parameter file for the background data.\n";
+
+
+## /writing background Profiler param file
+
+## writing pseudo_ChIP Profiler param file
+
+if($analysis_with_FDR eq "true"){
+
+ unlink($pseudo_ChIP_generate_profile_par_fname);
+
+ open pseudo_ChIP_generate_profile_ChIP_par_file, "> $pseudo_ChIP_generate_profile_par_fname"
+ or die "Failed to open $pseudo_ChIP_generate_profile_par_fname";
+
+ print pseudo_ChIP_generate_profile_ChIP_par_file "genome_table = $contig_table_fname\n";
+ print pseudo_ChIP_generate_profile_ChIP_par_file "align_file = $pseudo_ChIP_RX_noIP_fname\n";
+ print pseudo_ChIP_generate_profile_ChIP_par_file "output_score_path = $pseudo_ChIP_scores_path\n";
+ print pseudo_ChIP_generate_profile_ChIP_par_file "peak_shift_source_file = $calibrate_peak_shift_output_fname\n";
+ print pseudo_ChIP_generate_profile_ChIP_par_file "kde_bandwidth = $KDE_bandwidth\n";
+ print pseudo_ChIP_generate_profile_ChIP_par_file "calc_window = $Profiler_calc_window\n";
+ print pseudo_ChIP_generate_profile_ChIP_par_file "threads = $threads\n";
+
+ close pseudo_ChIP_generate_profile_ChIP_par_file;
+
+ print "Saved Profiler batch parameter file for the pseudo_ChIP data\n";
+}
+
+## /writing pseudo_ChIP Profiler param file
+
+## writing ChIP Peak Caller param file
+
+my $ChIP_peak_caller_par_fname = $analysis_path . "/peak_caller.ChIP.batch.pars";
+my $ChIP_peak_calls_fname = $analysis_path . "/peak_caller.ChIP.out";
+
+my $pseudo_ChIP_peak_caller_par_fname = $analysis_path . "/peak_caller.pseudo_ChIP.batch.pars";
+my $pseudo_ChIP_peak_calls_fname = $analysis_path . "/peak_caller.pseudo_ChIP.out";
+
+unlink($ChIP_peak_caller_par_fname);
+
+open ChIP_peak_caller_par_file, "> $ChIP_peak_caller_par_fname"
+ or die "Failed to open $ChIP_peak_caller_par_fname\n";
+
+print ChIP_peak_caller_par_file "genome_table = $contig_table_fname\n";
+print ChIP_peak_caller_par_file "chip_profile_path = $ChIP_scores_path\n";
+print ChIP_peak_caller_par_file "background_profile_path = $background_scores_path\n";
+print ChIP_peak_caller_par_file "output_file = $ChIP_peak_calls_fname\n";
+print ChIP_peak_caller_par_file "chip_threshold = $ChIP_threshold\n";
+print ChIP_peak_caller_par_file "background_threshold = $background_threshold\n";
+print ChIP_peak_caller_par_file "rescue_ratio = $rr\n";
+print ChIP_peak_caller_par_file "local_maximum_radius = 10\n";
+print ChIP_peak_caller_par_file "dip_fraction = $dip_fraction\n";
+print ChIP_peak_caller_par_file "ChIP_reads = $ChIP_tags\n";
+print ChIP_peak_caller_par_file "background_reads = $background_tags\n";
+
+close ChIP_peak_caller_par_file;
+
+print "Saved Peak Caller batch parameter file for the ChIP data\n";
+## /writing ChIP Peak Caller param file
+
+## writing pseudo_ChIP Peak Caller param file
+if($analysis_with_FDR eq "true"){
+ unlink($pseudo_ChIP_peak_caller_par_fname);
+
+ open pseudo_ChIP_peak_caller_par_file, "> $pseudo_ChIP_peak_caller_par_fname"
+ or die "Failed to open $pseudo_ChIP_peak_caller_par_fname\n";
+
+ print pseudo_ChIP_peak_caller_par_file "genome_table = $contig_table_fname\n";
+ print pseudo_ChIP_peak_caller_par_file "chip_profile_path = $pseudo_ChIP_scores_path\n";
+ print pseudo_ChIP_peak_caller_par_file "background_profile_path = $background_scores_path\n";
+ print pseudo_ChIP_peak_caller_par_file "output_file = $pseudo_ChIP_peak_calls_fname\n";
+ print pseudo_ChIP_peak_caller_par_file "chip_threshold = $ChIP_threshold\n";
+ print pseudo_ChIP_peak_caller_par_file "background_threshold = $background_threshold\n";
+ print pseudo_ChIP_peak_caller_par_file "rescue_ratio = $rr\n";
+ print pseudo_ChIP_peak_caller_par_file "local_maximum_radius = 10\n";
+ print pseudo_ChIP_peak_caller_par_file "dip_fraction = $dip_fraction\n";
+ print pseudo_ChIP_peak_caller_par_file "ChIP_reads = $pseudo_ChIP_tags\n";
+ print pseudo_ChIP_peak_caller_par_file "background_reads = $background_tags\n";
+
+ close pseudo_ChIP_peak_caller_par_file;
+
+ print "Saved Peak Caller batch parameter file for the pseudo_ChIP data.\n";
+}
+
+## /writing pseudo_ChIP Peak Caller param file
+
+## writing a superscript param file
+
+my $QuEST_param_fname = $analysis_path . "/QuEST.batch.pars";
+my $QuEST_log_fname = $analysis_path . "/QuEST.log";
+my $QuEST_output_fname = $analysis_path . "/QuEST.out";
+
+if( -e $QuEST_param_fname ){ unlink($QuEST_param_fname); }
+
+open QuEST_param_file, "> $QuEST_param_fname"
+ or die "Failed to open $QuEST_param_fname\n";
+
+print QuEST_param_file "analysis_with_FDR = $analysis_with_FDR\n";
+print QuEST_param_file "quick_window_scan_param_file = $quick_window_scan_par_fname\n";
+print QuEST_param_file "calibrate_peak_shift_param_file = $calibrate_peak_shift_par_fname\n";
+print QuEST_param_file "ChIP_generate_profile_param_file = $ChIP_generate_profile_par_fname\n";
+print QuEST_param_file "background_generate_profile_param_file = $background_generate_profile_par_fname\n";
+print QuEST_param_file "ChIP_peak_caller_param_file = $ChIP_peak_caller_par_fname\n";
+
+if($analysis_with_FDR eq "true"){
+ print QuEST_param_file "pseudo_ChIP_generate_profile_param_file = $pseudo_ChIP_generate_profile_par_fname\n";
+ print QuEST_param_file "pseudo_ChIP_peak_caller_param_file = $pseudo_ChIP_peak_caller_par_fname\n";
+}
+
+print QuEST_param_file "log_file = $QuEST_log_fname\n";
+print QuEST_param_file "output_file = $QuEST_output_fname\n";
+
+print QuEST_param_file "\n";
+print QuEST_param_file "# ** Pipeline steps, use \# to comment out **\n";
+print QuEST_param_file "\n";
+
+print QuEST_param_file "quick_window_scan: ChIP\n";
+print QuEST_param_file "calibrate_peak_shift: ChIP\n";
+print QuEST_param_file "generate_profile: ChIP\n";
+print QuEST_param_file "generate_profile: background\n";
+if($analysis_with_FDR eq "true"){
+ print QuEST_param_file "generate_profile: pseudo_ChIP\n";
+}
+print QuEST_param_file "peak_caller: ChIP vs background\n";
+if($analysis_with_FDR eq "true"){
+ print QuEST_param_file "peak_caller: pseudo_ChIP vs background\n";
+}
+
+close QuEST_param_file;
+
+## /writing a superscript param file
+
+## /writing parameter files
--- /dev/null
+CC = g++
+#CCFLAGS = -Wall -ansi -pedantic -D DEBUG -O4 -funroll-loops
+
+include ../config.mk
+
+TARGETS = seq_contig sequence_utility_functions ticker utils array_2d params
+SOURCES = $(TARGETS:=.cpp)
+HEADERS = $(TARGETS:=.h)
+OBJECTS = $(TARGETS:=.o)
+
+all: $(OBJECTS)
+
+%.o : %.cpp %.h
+ $(CC) $(INCLUDE) $(CCFLAGS) -c $<
+
+clean:
+ rm -rf *.o
+
+#all: seq_contig.o contig_set.o sequence_utility_functions.o ticker.o utils.o params.o array_2d.o
+
+#%.o:
+#%.o : %.cpp %.h
+# $(CC) $(INCLUDE) $(CCFLAGS) -c $<
+#-o $@.o
+#$(OBJ_DIR)/$@.o
+
+#$(TARGETS): $(SOURCES) $(HEADERS)
+# $(CC) $(INCLUDE) $(CCFLAGS) -c $(SOURCES)
+
+#params.o: params.cpp params.h
+# $(CC) $(INCLUDE) $(CCFLAGS) -c params.cpp
+#
+#seq_contig.o: seq_contig.cpp seq_contig.h
+# $(CC) $(INCLUDE) $(CCFLAGS) -c seq_contig.cpp
+#
+#contig_set.o: contig_set.cpp contig_set.h
+# $(CC) $(INCLUDE) $(CCFLAGS) -c contig_set.cpp
+#
+#sequence_utility_functions.o: sequence_utility_functions.cpp sequence_utility_functions.h
+# $(CC) $(INCLUDE) $(CCFLAGS) -c sequence_utility_functions.cpp
+#
+#ticker.o: ticker.cpp ticker.h
+# $(CC) $(INCLUDE) $(CCFLAGS) -c ticker.cpp
+#
+#utils.o: utils.cpp utils.h
+# $(CC) $(INCLUDE) $(CCFLAGS) -c utils.cpp
+#
+#array_2d.o: array_2d.cpp array_2d.h
+# $(CC) $(INCLUDE) $(CCFLAGS) -c array_2d.cpp
+#
+#clean:
+# rm -rf *.o
--- /dev/null
+#include <string>
+#include <iostream>
+#include <vector>
+
+#include "assert.h"
+
+using std::string;
+using std::vector;
+
+
+
+#include "params.h"
+
+namespace Error{
+ struct bad_argc{
+ int argc;
+ bad_argc(int _argc){ argc = _argc; }
+ };
+ struct bad_param{
+ const char* value;
+ bad_param(const char* _value){ value = _value; }
+ };
+ struct bad_param_type{
+ param_type bp_type;
+ bad_param_type(param_type _bp_type){ bp_type = _bp_type;}
+ };
+ struct param_missing{
+ const char* par_missing;
+ param_missing(const char* _par_missing){ par_missing = _par_missing; }
+ };
+}
+
+params::params(int _argc, char* _argv[]){
+ if(_argc < 1) throw Error::bad_argc(_argc);
+
+ par_names.resize(_argc);
+ par_values.resize(_argc);
+
+ par_names[0] = "executable";
+ par_values[0] = _argv[0];
+
+ for(int i=1; i<_argc; i++){
+ string cur_par_name="";
+ string cur_par_value="";
+ try{
+ parse(_argv[i], cur_par_name, cur_par_value);
+ par_names[i] = cur_par_name;
+ par_values[i] = cur_par_value;
+ }
+ catch(Error::bad_param bp){
+ std::cerr<<"Warning:\ncouldn't parse the parameter: "<<bp.value<<"\nskipping\n";
+ par_names[i] = "";
+ par_values[i] = "";
+ }
+ }
+ //std::cout<<"this can print\n";
+}
+
+void params::parse(const char* this_par, string& cur_par_name, string& cur_par_value){
+ string dummy_string = this_par;
+ unsigned int eq_pos = 0;
+ bool eq_pos_found = false;
+
+ for(unsigned int i=0; i<dummy_string.length() && !eq_pos_found; i++){
+ if(dummy_string[i] == '='){
+ eq_pos_found = true;
+ eq_pos = i;
+ }
+ }
+
+ if(!eq_pos_found || eq_pos == dummy_string.length()-1){
+ throw Error::bad_param(this_par);
+ }
+
+ cur_par_name = dummy_string.substr(0, eq_pos);
+ cur_par_value = dummy_string.substr(eq_pos+1, dummy_string.length()-eq_pos-1);
+
+}
+
+void params::require(const string& par_name, const string& par_legend, param_type par_type){
+ required_params.push_back(par_name);
+ required_params_legend.push_back(par_legend);
+ required_params_type.push_back(par_type);
+}
+
+void params::require(const char* this_par_name, const char* this_par_legend,
+ param_type this_par_type){
+ string this_par_name_str = this_par_name;
+ string this_par_legend_str = this_par_legend;
+
+ this->require(this_par_name_str, this_par_legend_str, this_par_type);
+}
+
+void params::optional(const char* this_par_name, const char* this_par_legend,
+ const char* default_value, param_type par_type){
+ string _this_par_name = this_par_name;
+ string _this_par_legend = this_par_legend;
+ string _default_value = default_value;
+
+ optional_params.push_back(_this_par_name);
+ optional_params_legend.push_back(_this_par_legend);
+ optional_params_type.push_back(par_type);
+ optional_params_default_value.push_back(default_value);
+
+ bool par_found = false;
+ unsigned int par_index = 0;
+ for(unsigned int i=0; i<par_names.size(); i++){
+ if(par_names.at(i) == _this_par_name){
+ par_found = true;
+ par_index = i;
+ }
+ }
+ if(!par_found){
+ par_names.push_back(_this_par_name);
+ par_values.push_back(default_value);
+ }
+}
+
+string stringify_type(param_type par_type){
+ string dummy_string;
+ switch(par_type){
+ case CHAR_TYPE:
+ dummy_string = "CHAR_TYPE";
+ break;
+
+ case STRING_TYPE:
+ dummy_string = "STRING_TYPE";
+ break;
+
+ case INT_TYPE:
+ dummy_string = "INT_TYPE";
+ break;
+
+ case DOUBLE_TYPE:
+ dummy_string = "DOUBLE_TYPE";
+ break;
+ default:
+ std::cerr<<"bad par_type: "<<par_type<<"\n";
+ throw Error::bad_param_type(par_type);
+ }
+ return dummy_string;
+}
+
+bool good_type(const string& cur_par, param_type req_type){
+ switch(req_type){
+ case CHAR_TYPE:
+ if(cur_par.length() == 1) return true; break;
+ case STRING_TYPE:
+ return true; break;
+ case INT_TYPE:
+ return true; break;
+ case DOUBLE_TYPE:
+ return true; break;
+ default:
+ std::cerr<<"bad param type: "<<req_type<<"\n";
+ throw Error::bad_param_type(req_type);
+ break;
+ }
+ return false;
+}
+
+bool params::enforce(){
+ bool all_enforced = true;
+ for(unsigned int i=0; i<required_params.size(); i++){
+ string cur_req_param = required_params.at(i);
+ string cur_par_legend = required_params_legend.at(i);
+ param_type cur_req_type = required_params_type.at(i);
+
+ //std::cout<<"enforcing "<<cur_req_param<<"\n";
+
+ bool param_found = false;
+ unsigned int param_pos = 0;
+
+ for(unsigned j=0; j<par_names.size() && !param_found; j++){
+ if(par_names.at(j) == cur_req_param){
+ param_found = true;
+ param_pos = j;
+ }
+ }
+ if(!param_found){
+ all_enforced = false;
+ std::cout<<" Required:\t"<<cur_req_param<<" = <"<<cur_par_legend<<">\t"<<stringify_type(cur_req_type)<<" [ parameter missing ]\n";
+ }
+ else{ //type_check now
+ string cur_par_value = par_values.at(param_pos);
+ if(!good_type(cur_par_value, cur_req_type)){
+ all_enforced = false;
+ std::cout<<" Required:\t"<<cur_req_param<<" = <"<<cur_par_legend<<">\t"<<stringify_type(cur_req_type)<<" [ unexpected type ]\n";
+ }
+ }
+ }
+ assert(optional_params.size() == optional_params_legend.size());
+ assert(optional_params.size() == optional_params_default_value.size());
+ assert(optional_params.size() == optional_params_type.size());
+
+ /*
+ for(unsigned int i=0; i<optional_params.size(); i++){
+ string cur_param = optional_params.at(i);
+ string cur_legend = optional_params_legend.at(i);
+ string cur_default = optional_params_default_value.at(i);
+ param_type cur_type = optional_params_type.at(i);
+
+ bool param_found = false;
+ unsigned int param_pos = 0;
+
+ for(unsigned j=0; j<par_names.size() && !param_found; j++){
+ if(par_names.at(j) == cur_param){
+ param_found = true;
+ param_pos = j;
+ }
+ }
+
+ //std::cout<<"param_found: "<<param_found<<"\n";
+
+ if(!param_found){
+ std::cerr<<" Optional:\t"<<cur_param<<" = <"<<cur_legend<<">\t"<<stringify_type(cur_type)<<" [ default: "<<cur_default<<" ]\n";
+ }
+ else{ //type_check now
+ string cur_par_value = par_values.at(param_pos);
+ if(!good_type(cur_par_value, cur_type)){
+ all_enforced = false;
+ std::cerr<<" Optional:\t"<<cur_param<<" = <"<<cur_legend<<">\t"<<stringify_type(cur_type)<<" [ unexpected type ]\n";
+ }
+ }
+ }
+ */
+
+ return all_enforced;
+}
+
+void params::list_all_params(){
+ assert(par_names.size() == par_values.size());
+ std::cout<<"Specified parameters:\n";
+ for(unsigned int i=0; i<par_names.size(); i++){
+ std::cout<<par_names.at(i)<<"\t"<<par_values.at(i)<<"\n";
+ }
+}
+
+int params::get_int_value(const char* par_name){
+ string dummy_string = par_name;
+ for(unsigned int i=0; i<par_names.size(); i++){
+ if(par_name == par_names.at(i)) return atoi(par_values.at(i).c_str());
+ }
+#ifdef DEBUG
+ std::cerr<<"bad par_name: "<<par_name<<"\n";
+#endif
+ throw Error::param_missing(par_name);
+}
+
+double params::get_double_value(const char* par_name){
+ string dummy_string = par_name;
+ for(unsigned int i=0; i<par_names.size(); i++){
+ if(par_name == par_names.at(i)) return atof(par_values.at(i).c_str());
+ }
+#ifdef DEBUG
+ std::cerr<<"bad par_name: "<<par_name<<"\n";
+#endif
+ throw Error::param_missing(par_name);
+}
+
+string params::get_string_value(const char* par_name){
+ string dummy_string = par_name;
+ for(unsigned int i=0; i<par_names.size(); i++){
+ if(par_name == par_names.at(i)) return par_values.at(i);
+ }
+#ifdef DEBUG
+ std::cerr<<"bad par_name: "<<par_name<<"\n";
+#endif
+ throw Error::param_missing(par_name);
+}
+char params::get_char_value(const char* par_name){
+ string dummy_string = par_name;
+ for(unsigned int i=0; i<par_names.size(); i++){
+ if(par_name == par_names.at(i)){
+ string cur_par_value = par_values.at(i);
+ return cur_par_value.at(0);
+ }
+ }
+ throw Error::param_missing(par_name);
+}
--- /dev/null
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#include <vector>
+#include <string>
+
+#ifndef param_type
+#define param_type unsigned int
+#endif
+
+#ifndef PARAM_TYPES
+#define PARAM_TYPES
+
+#define CHAR_TYPE 0
+#define STRING_TYPE 10
+#define INT_TYPE 20
+#define DOUBLE_TYPE 30
+
+#endif
+
+using std::vector;
+using std::string;
+
+class params{
+ private:
+ //unsigned int argc;
+ //vector<string> argv;
+
+ vector<string> required_params;
+ vector<string> required_params_legend;
+ vector<param_type> required_params_type;
+
+ vector<string> optional_params;
+ vector<string> optional_params_legend;
+ vector<param_type> optional_params_type;
+ vector<string> optional_params_default_value;
+
+ vector<string> par_names;
+ vector<string> par_values;
+
+ void parse(const char* this_par, string& cur_par_name, string& cur_par_value);
+ public:
+ params(int _argc, char* _argv[]);
+ void require(const char* this_par_name, const char* this_par_legend, param_type this_par_type); //specify what sort of parameters are required
+ void require(const string& par_name, const string& par_legend, param_type par_type); //same
+ void optional(const char* this_par_name, const char* this_par_legend, const char* default_value, param_type par_type);
+
+ void list_all_params();
+ bool enforce(); //enforces the required parameters
+
+ string get_string_value(const char* par_name);
+ char get_char_value(const char* par_name);
+ int get_int_value(const char* par_name);
+ double get_double_value(const char* par_name);
+};
+
+#endif
--- /dev/null
+#include <iostream>
+#include <fstream>
+#include <string>
+
+#ifndef _ASSERT_H
+#include "assert.h"
+#endif
+
+using std::string;
+using std::cerr;
+using std::endl;
+
+#include "seq_contig.h"
+#include "sequence_utility_functions.h"
+
+#ifndef FAILED_TO_LOAD_CONTIG_FROM_FILE
+#define FAILED_TO_LOAD_CONTIG_FROM_FILE -1
+#endif
+
+#ifndef SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE
+#define SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE 1
+#endif
+
+namespace Error{
+ struct bad_ifstream_in_the_function{
+ const char* thrown_from;
+ bad_ifstream_in_the_function(const char* _thrown_from){
+ thrown_from = _thrown_from;
+ }
+ };
+}
+
+seq_contig::seq_contig(unsigned int _size){
+ seq.reserve(_size);
+}
+void seq_contig::read_contig(std::ifstream& o){
+ string dummy_str;
+ getline(o, dummy_str);
+ assert(fa_header(dummy_str));
+
+ seq = "";
+ header = dummy_str;
+
+ bool continue_reading = true;
+ while(continue_reading){
+ getline(o, dummy_str);
+ if(o.good() && !fa_header(dummy_str)){
+ assert(valid_seq_string(dummy_str));
+ seq += dummy_str;
+ }
+ else{ continue_reading = false; }
+ }
+}
+
+void seq_contig::read_contig(const char* _fname){
+ std::ifstream _seq_fname(_fname);
+ if(!_seq_fname.good()){
+ cerr<<"Bad filename: "<<_fname<<endl;
+ }
+ read_contig(_seq_fname);
+}
+
+int seq_contig::load_contig_from_file(const string& contig_id, std::ifstream& o){
+ unsigned int contig_counter = 0;
+ if(!o.good()) throw Error::bad_ifstream_in_the_function("int seq_contig::load_contig_from_file");
+
+ std::ios::pos_type save_pos = o.tellg(); //save position to go back later
+ o.seekg(0, std::ios::beg);
+
+ std::ios::pos_type contig_start_pos;
+ bool continue_search = true;
+ while(continue_search){
+ string dummy_string;
+ std::ios::pos_type current_pos = o.tellg();
+ getline(o, dummy_string);
+ if(o.good()){
+ if(dummy_string[0] == '>'){ //fasta header
+ //std::cerr<<"header found: "<<dummy_string<<"\n";
+ if(dummy_string.substr(1, dummy_string.length()-1) == contig_id){
+ continue_search = false;
+ contig_start_pos = current_pos;
+ contig_ind = contig_counter;
+ }
+ contig_counter++;
+ }
+ }
+ else{
+ continue_search = false;
+ o.clear();
+ o.seekg(save_pos);
+ return FAILED_TO_LOAD_CONTIG_FROM_FILE;
+ }
+ }
+ o.seekg(contig_start_pos);
+ read_contig(o);
+ o.seekg(save_pos);
+ return SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE;
+}
+
+seq_contig::seq_contig(std::ifstream& o){
+ read_contig(o);
+}
+
+seq_contig::seq_contig(const char* _fname){
+ read_contig(_fname);
+}
+
+
+void seq_contig::ag_recode(){
+ string nseq = seq.substr(0,seq.size()-1);
+ for(unsigned int i=0; i<seq.size()-1; i++){
+ char cur_code = 'n';
+ if(valid_nucl(seq[i]) && valid_nucl(seq[i]))
+ cur_code = ag_encode(seq[i],seq[i+1]);
+ nseq[i] = cur_code;
+ }
+ seq = nseq;
+}
+
+void seq_contig::save(std::ofstream& ostr){
+ unsigned int str_length = 50;
+ ostr<<header<<endl;
+ for(unsigned int i=0; i<(seq.size()-seq.size()%str_length)/str_length; i++){
+ int cur_start = i*str_length;
+ string cur_str = seq.substr(cur_start, str_length);
+ ostr<<cur_str<<endl;
+ }
+ if(seq.size()%str_length != 0){
+ string cur_str = seq.substr(seq.size()-seq.size()%str_length, seq.size()%str_length);
+ ostr<<cur_str<<endl;
+ }
+}
+
+void seq_contig::save(std::ofstream& ostr, int region_start, int region_end){
+ assert(region_start >= 0);
+ assert(region_start < region_end);
+ assert(region_end <= (int)(seq.length()));
+ int region_size = region_end - region_start;
+ unsigned int str_length = 50;
+ ostr<<header<<endl;
+ for(int i=0; i<(int)((region_size-region_size%str_length)/str_length); i++){
+ int cur_start = region_start+i*str_length;
+ string cur_str = seq.substr(cur_start, str_length);
+ ostr<<cur_str<<endl;
+ }
+ if(seq.size()%str_length != 0){
+ string cur_str = seq.substr(region_start + region_size - region_size%str_length,
+ region_size%str_length);
+ ostr<<cur_str<<endl;
+ }
+}
+
+int seq_contig::size(){
+ return seq.length();
+}
+
+void seq_contig::validate(){
+ for(unsigned int i=0; i<seq.size(); i++){
+ assert(valid_letter(seq[i]));
+ }
+}
+
+char& seq_contig::operator[](unsigned int pos){
+#ifdef DEBUG
+ assert(pos < seq.size());
+#endif
+ return seq[pos];
+}
+
+
--- /dev/null
+#ifndef SEQ_CONTIG_H
+#define SEQ_CONTIG_H
+
+#define FAILED_TO_LOAD_CONTIG_FROM_FILE -1
+#define SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE 1
+
+
+class seq_contig{
+public:
+ std::string header;
+ std::string seq;
+ unsigned int contig_ind; //the index of contig in the file it's read from
+
+ char& operator[](unsigned int pos);
+
+ void read_contig(std::ifstream& o);
+ void read_contig(const char* fname);
+
+ int size();
+ void validate();
+ seq_contig(){}
+ seq_contig(unsigned int _size);//{ seq.reserve(_size);}
+ seq_contig(std::string _header, std::string _seq){ header = _header; seq = _seq; validate();}
+ seq_contig(std::ifstream& o);
+ seq_contig(const char* fname);
+
+ void ag_recode();
+ void save(std::ofstream& ostr);
+ void save(std::ofstream& ostr, int region_start, int region_end);
+ int load_contig_from_file(const std::string& contig_id, std::ifstream& o);
+ int get_contig_ind();//const std::string& contig_id, std::ifstream& o);
+};
+
+#endif
--- /dev/null
+#include <string>
+#include <iostream>
+
+#include <assert.h>
+
+using std::string;
+using std::cerr;
+using std::endl;
+
+#include "sequence_utility_functions.h"
+
+static unsigned int pow4[] =
+ {1,4,16,64,256,1024,4096,16384,65536,262144,1048576,
+ 4194304,16777216,67108864,268435456,1073741824};
+/*
+namespace Error{
+ struct Bad_ag_encode{
+ char first_char, second_char;
+ Bad_ag_encode(char _first, char _second){
+ first_char = _first; second_char = _second;
+ }
+ };
+ struct Bad_nucleotide{
+ char c;
+ Bad_nucleotide(char _c){ c=_c;}
+ };
+ struct Bad_fasta_letter{
+ char c;
+ Bad_fasta_letter(char _c){c=_c;}
+ };
+ struct Failed_to_compute_seq_tuple_index{
+ string bad_tuple;
+ Failed_to_compute_seq_tuple_index(const string& _str){
+ bad_tuple = _str;
+ }
+ };
+}
+*/
+char complement(char c){
+ switch(c){
+ case 'a':{ return 't'; break;}
+ case 't':{ return 'a'; break;}
+ case 'g':{ return 'c'; break;}
+ case 'c':{ return 'g'; break;}
+ case 'A':{ return 'T'; break;}
+ case 'T':{ return 'A'; break;}
+ case 'G':{ return 'C'; break;}
+ case 'C':{ return 'G'; break;}
+ case 'n':{ return 'n'; break;}
+ case 'N':{ return 'N'; break;}
+ default:{ cerr<<"bad char in complement: "<<c<<endl;
+ assert(false); return 'Q'; break;}
+ }
+ //assert(valid_letter(c));
+ //char nc = capitalize(c);
+ /*
+ if(nc == 'N') return 'N';
+ if(nc == 'T') return 'A';
+ if(nc == 'A') return 'T';
+ if(nc == 'C') return 'G';
+ if(nc == 'G') return 'C';
+ assert(false);
+ */
+}
+
+
+bool small(char c){
+ if(c == 'a' || c == 't' || c == 'g' || c == 'c' || c == 'n') return true;
+ return false;
+}
+
+string capitalize(const string& _seq){
+ string res = _seq;
+ for(unsigned int i=0; i<_seq.length(); i++){
+ switch(_seq[i]){
+ case 'a':{res[i] = 'A'; break;}
+ case 'A':{break;}
+ case 't':{res[i] = 'T'; break;}
+ case 'T':{break;}
+ case 'g':{res[i] = 'G'; break;}
+ case 'G':{break;}
+ case 'c':{res[i] = 'C'; break;}
+ case 'C':{break;}
+ case 'n':{res[i] = 'N'; break;}
+ case 'N':{break;}
+ default:{ throw Error::Bad_nucleotide(_seq[i]); break;}
+ }
+ }
+ return res;
+}
+char capitalize(char c){
+ if(c == 'n') return 'N';
+ if(c == 'a') return 'A';
+ if(c == 't') return 'T';
+ if(c == 'g') return 'G';
+ if(c == 'c') return 'C';
+
+ if(c == 'N') return 'N';
+ if(c == 'A') return 'A';
+ if(c == 'T') return 'T';
+ if(c == 'G') return 'G';
+ if(c == 'C') return 'C';
+
+ //cerr<<c;
+ assert(false);
+}
+
+string rev_compl(const string& _str){
+ //string dummy_str = _str;
+ //reverse(dummy_str.begin(), dummy_str.end());
+ string res;
+ for(int i=_str.size()-1; i>=0; i--){
+ char cur_char = _str[i];
+ //cout<<cur_char;
+ //assert(false);
+ res.push_back(complement(cur_char));
+ }
+ return res;
+}
+
+bool valid_nucl(char c){
+ if(c == 'a' || c == 'A' || c == 't' || c == 'T' ||
+ c == 'g' || c == 'G' || c == 'c' || c == 'C')
+ return true;
+ else return false;
+}
+bool valid_col(char c){
+ if(c == '0' || c == '1' || c == '2' || c == '3')
+ return true;
+ else return false;
+}
+bool valid_letter(char c){
+ if(c == 'a' || c == 'A' || c == 't' || c == 'T' ||
+ c == 'g' || c == 'G' || c == 'c' || c == 'C' ||
+ c == 'n' || c == 'N' || c == 'm' || c == 'M' ||
+ c == 'r' || c == 'R')
+ return true;
+ else return false;
+}
+
+bool seq_missing(char c){
+ //if(c== 'n' || c== 'N' || c=='m' || c=='M'|| c) return true;
+ if(!valid_nucl(c)) return true;
+ return false;
+}
+
+bool col_missing(char c){
+ //if(c== 'n' || c== 'N' || c=='m' || c=='M'|| c) return true;
+ if(! (valid_col(c))) return true;
+ return false;
+}
+
+bool fa_header(string _str){
+ if(_str[0] == '>') return true;
+ return false;
+}
+string ag_recode(const string& str){
+ string nseq =str.substr(0,str.size()-1);
+ for(unsigned int i=0; i<str.size()-1; i++){
+ char cur_code = 'n';
+ if(valid_nucl(str[i]) && valid_nucl(str[i]))
+ cur_code = ag_encode(str[i],str[i+1]);
+ nseq[i] = cur_code;
+ }
+ return nseq;
+}
+bool valid_seq_string(string _str){
+ for(unsigned int i=0; i<_str.size(); i++){
+ if(!valid_letter(_str[i])){
+
+ cerr<<"warning: bad letter ["<<_str[i]<<"] code: "<<(int) _str[i]<<" at position "<<i<<endl;
+ cerr<<" in the expected sequence string "<<_str<<endl;
+ throw(Error::Bad_fasta_letter(_str[i]));
+ return false;
+ }
+ }
+ return true;
+}
+
+char ag_encode(char c1, char c2){
+ if(c1 == 'N' || c1 == 'n' || c2 == 'N' || c2 == 'n') return 'N';
+
+ if(((c1 == 'A' || c1 == 'a') && (c2 == 'A' || c2 == 'a')) ||
+ ((c1 == 'C' || c1 == 'c') && (c2 == 'C' || c2 == 'c')) ||
+ ((c1 == 'G' || c1 == 'g') && (c2 == 'G' || c2 == 'g')) ||
+ ((c1 == 'T' || c1 == 't') && (c2 == 'T' || c2 == 't'))) return '0';
+
+ if(((c1 == 'A' || c1 == 'a') && (c2 == 'C' || c2 == 'c')) ||
+ ((c1 == 'C' || c1 == 'c') && (c2 == 'A' || c2 == 'a')) ||
+ ((c1 == 'G' || c1 == 'g') && (c2 == 'T' || c2 == 't')) ||
+ ((c1 == 'T' || c1 == 't') && (c2 == 'G' || c2 == 'g'))) return '1';
+
+ if(((c1 == 'A' || c1 == 'a') && (c2 == 'G' || c2 == 'g')) ||
+ ((c1 == 'C' || c1 == 'c') && (c2 == 'T' || c2 == 't')) ||
+ ((c1 == 'G' || c1 == 'g') && (c2 == 'A' || c2 == 'a')) ||
+ ((c1 == 'T' || c1 == 't') && (c2 == 'C' || c2 == 'c'))) return '2';
+
+ if(((c1 == 'A' || c1 == 'a') && (c2 == 'T' || c2 == 't')) ||
+ ((c1 == 'C' || c1 == 'c') && (c2 == 'G' || c2 == 'g')) ||
+ ((c1 == 'G' || c1 == 'g') && (c2 == 'C' || c2 == 'c')) ||
+ ((c1 == 'T' || c1 == 't') && (c2 == 'A' || c2 == 'a'))) return '3';
+
+ throw Error::Bad_ag_encode(c1,c2);
+
+ //cerr<<c1<<c2<<endl;
+ //assert(false);
+ //return '?';
+}string ag_encode(const string& str, int start, int length){
+ assert(start >= 0 && (unsigned int) (start+length) <= str.length());
+ string res;
+ for(int i=start; i<start+length-1; i++){
+ try{ res += ag_encode(str[i], str[i+1]); }
+ catch( Error::Bad_ag_encode e ){
+ cerr<<"failed to encode ( "<<e.first_char<<","<<e.second_char<<" ) ";
+ cerr<<" at pos: "<<i<<endl;
+ //skip();
+ //assert(false);
+ }
+ }
+ return res;
+}
+
+string ag_encode(const string& str){
+ return ag_encode(str, 0, str.length());
+}
+
+unsigned short encode_nucl(char c){
+ switch(c){
+ case 'A':{ return 0; break;}
+ case 'a':{ return 0; break;}
+ case 'C':{ return 1; break;}
+ case 'c':{ return 1; break;}
+ case 'G':{ return 2; break;}
+ case 'g':{ return 2; break;}
+ case 'T':{ return 3; break;}
+ case 't':{ return 3; break;}
+ case 'N':{ return 0; break;}
+ case '.':{ return 0; break;}
+ case 'n':{ return 0; break;}
+ default:{ throw Error::Bad_nucleotide(c);}
+ }
+}
+unsigned short encode_nucl_complement(char c){
+ switch(c){
+ case 'A':{ return 3; break;}
+ case 'a':{ return 3; break;}
+ case 'C':{ return 2; break;}
+ case 'c':{ return 2; break;}
+ case 'G':{ return 1; break;}
+ case 'g':{ return 1; break;}
+ case 'T':{ return 0; break;}
+ case 't':{ return 0; break;}
+ case 'N':{ return 3; break;}
+ case 'n':{ return 3; break;}
+ case '.':{ return 3; break;}
+ default:{ throw Error::Bad_nucleotide(c);}
+ }
+}
+
+char decode_nucl(unsigned int code){
+ assert(code < 4);
+ switch(code){
+ case 0: return 'A';
+ case 1: return 'C';
+ case 2: return 'G';
+ case 3: return 'T';
+ }
+ return -1;
+}
+
+unsigned int seq_space_tuple_index(const string& str, unsigned int starts_at_pos, unsigned int tuple_size){
+ // cerr<<"starts: "<<starts_at_pos<<" t_s: "<<tuple_size<<" l: "<<str.length()<<endl;
+ //a-0, c-1, g-2, t-3
+ assert(starts_at_pos >= 0);
+ assert(starts_at_pos + tuple_size <= (str.length()));
+
+ unsigned int res = 0;
+
+ for(unsigned int i = 0; i<tuple_size; i++){
+ try{
+ res += encode_nucl(str[starts_at_pos+i])*pow4[i];
+ }
+ catch(Error::Bad_nucleotide e){
+ cerr<<"seq_space_tuple_index: Error [ "<<e.c<<" ] in encode char at pos: "<<starts_at_pos<<" ";
+ for(unsigned int j=0; j<tuple_size; j++){
+ cerr<<str[starts_at_pos + j];
+ //cerr<<endl;
+ //throw Error::Failed_to_compute_seq_tuple_index(str.substr(starts_at_pos, tuple_size));
+ }
+ cerr<<endl;
+ throw Error::Failed_to_compute_seq_tuple_index(str.substr(starts_at_pos, tuple_size));
+ }
+ }
+
+ assert(res < pow4[tuple_size]);
+ return res;
+}
+
+unsigned int seq_space_tuple_index_rev_compl(const string& str, unsigned int starts_at_pos, unsigned int tuple_size){
+ // cerr<<"starts: "<<starts_at_pos<<" t_s: "<<tuple_size<<" l: "<<str.length()<<endl;
+ assert(starts_at_pos >= 0);
+ assert(starts_at_pos < (str.length()));
+ assert(starts_at_pos - tuple_size + 1 >= 0);
+
+ unsigned int res = 0;
+
+ for(unsigned int i = 0; i<tuple_size; i++){
+ try{
+ //setbits(res, str[starts_at_pos-i],i);
+ res += /*ag_encode_char*/encode_nucl_complement(str[starts_at_pos-i])*pow4[i];//int_pow(4,i);
+ }
+ catch(Error::Bad_nucleotide e){
+ cerr<<"seq_space_tuple_index_rev_compl: Error [ "<<e.c<<" ] in encode char at pos: "<<starts_at_pos<<" ";
+ for(unsigned int j=0; j<tuple_size; j++){
+ cerr<<str[starts_at_pos - j];
+ }
+ cerr<<endl;
+ }
+ }
+
+ assert(res < pow4[tuple_size]);
+ return res;
+}
+
+unsigned int color_space_tuple_index(const string& str, int starts_at_pos, int tuple_size){
+ // cerr<<"starts: "<<starts_at_pos<<" t_s: "<<tuple_size<<" l: "<<str.length()<<endl;
+ assert(starts_at_pos >= 0);
+ assert(starts_at_pos + tuple_size <= (int)(str.length()));
+
+ unsigned int res = 0;
+
+ for(int i = 0; i<tuple_size; i++){
+ try{
+ res += ag_encode_char(str[starts_at_pos+i])*pow4[i];//int_pow(4,i);
+ }
+ catch(int a){
+ if(a == 20){
+ cerr<<"Error in encode char at pos: "<<starts_at_pos<<" ";
+ for(int j=0; j<tuple_size; j++){
+ cerr<<str[starts_at_pos + j];
+ }
+ cerr<<endl;
+ throw 21;
+ }
+ }
+ }
+
+ assert(res < pow4[tuple_size]);
+ return res;
+}
+
+int color_space_tuple_index_rev(const string& str, int starts_at_pos, int tuple_size){
+ // cerr<<"starts: "<<starts_at_pos<<" t_s: "<<tuple_size<<" l: "<<str.length()<<endl;
+ assert(starts_at_pos >= 0);
+ assert(starts_at_pos < (int)(str.length()));
+ assert(starts_at_pos - tuple_size + 1 >= 0);
+
+ int res = 0;
+
+ for(int i = 0; i<tuple_size; i++){
+ try{
+ res += ag_encode_char(str[starts_at_pos-i])*pow4[i];//int_pow(4,i);
+ }
+ catch(Error::Bad_ag_encode e){
+ cerr<<"Failed to color encode: "<<e.first_char<<" "<<e.second_char<<endl;
+ assert(false);
+ }
+ }
+ return res;
+}
+
+short ag_encode_char(char c){
+ switch(c){
+ case '0':{ return 0; break;}
+ case '1':{ return 1; break;}
+ case '2':{ return 2; break;}
+ case '3':{ return 3; break;}
+ default:{throw 20; return -1; break;assert(false); break;} //you shouldn't be here
+ }
+}
+
+
+string invert(const string& str){
+ string res;
+ for(int i=str.length()-1; i>=0; i--){
+ res += str[i];
+ }
+ return res;
+}
--- /dev/null
+#ifndef SEQUENCE_UTILITY_FUNCTIONS_H_
+#define SEQUENCE_UTILITY_FUNCTIONS_H_
+
+using std::string;
+
+bool fa_header(string _str);
+string ag_recode(const string& str);
+bool valid_seq_string(string _str);
+bool valid_nucl(char c);
+bool valid_col(char c);
+bool valid_letter(char c);
+bool seq_missing(char c);
+bool col_missing(char c);
+
+char ag_encode(char c1, char c2);
+string ag_encode(const string& str, int start, int length);
+string ag_encode(const string& str);
+short ag_encode_char(char c);
+
+unsigned short encode_nucl(char c);
+unsigned short encode_nucl_complement(char c);
+
+char decode_nucl(unsigned int code);
+
+unsigned int seq_space_tuple_index(const string& str, unsigned int starts_at_pos, unsigned int tuple_size);
+unsigned int seq_space_tuple_index_rev_compl(const string& str, unsigned int starts_at_pos, unsigned int tuple_size);
+unsigned int color_space_tuple_index(const string& str, int starts_at_pos, int tuple_size);
+int color_space_tuple_index_rev(const string& str, int starts_at_pos, int tuple_size);
+char capitalize(char c);
+string capitalize(const string& _seq);
+string rev_compl(const string& str);
+char complement(char c);
+string invert(const string& str);
+
+namespace Error{
+ struct Bad_ag_encode{
+ char first_char, second_char;
+ Bad_ag_encode(char _first, char _second){
+ first_char = _first; second_char = _second;
+ }
+ };
+ struct Bad_nucleotide{
+ char c;
+ Bad_nucleotide(char _c){ c=_c;}
+ };
+ struct Bad_fasta_letter{
+ char c;
+ Bad_fasta_letter(char _c){c=_c;}
+ };
+ struct Failed_to_compute_seq_tuple_index{
+ string bad_tuple;
+ Failed_to_compute_seq_tuple_index(const string& _str){
+ bad_tuple = _str;
+ }
+ };
+}
+
+#endif
--- /dev/null
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+#include <assert.h>
+
+using std::vector;
+using std::string;
+using std::cerr;
+using std::endl;
+
+//#include "utils.h"
+/*
+std::vector<string> split(const string& inp_string, const string& sep){
+ std::vector<string> res;
+ if(inp_string.length() == 0) return res;
+ if(inp_string.length() < sep.length()){
+ res.push_back(inp_string);
+ return res;
+ }
+ unsigned int last_i = 0;
+ for(unsigned int i=0; i<inp_string.length()-sep.length(); i++){
+ //std::cout<<"comp: "<<inp_string.substr(i,sep.length())<<" to "<<sep<<std::endl;
+ if(inp_string.substr(i,sep.length()) == sep){
+ if(i>last_i+1){
+ res.push_back(inp_string.substr(last_i, i-last_i));
+ }
+ last_i = i+1;
+ //}
+ }
+ }
+ if(last_i < inp_string.length()-1)
+ res.push_back(inp_string.substr(last_i, inp_string.length()-last_i));
+
+ if(res.size() == 0 && inp_string[0] != sep[0]) res.push_back(inp_string);
+ return res;
+}
+*/
+std::vector<string> split(const string& inp_string, char sep){
+ std::vector<string> res;
+ if(inp_string.length() == 0) return res;
+
+ unsigned int last_i = 0;
+ for(unsigned int i=0; i<inp_string.length()/*-1*/; i++){
+ //if(inp_string.substr(i,sep.length()) == sep){
+ if(inp_string[i] == sep){
+ if(i>last_i){
+ res.push_back(inp_string.substr(last_i, i-last_i));
+ }
+ last_i = i+1;
+ //}
+ }
+ }
+ if(last_i < inp_string.length())
+ res.push_back(inp_string.substr(last_i, inp_string.length()-last_i));
+
+ if(res.size() == 0 && inp_string[0] != sep) res.push_back(inp_string);
+ return res;
+}
+
+unsigned int get_string_count(std::ifstream& ifs){
+ using std::ios;
+
+ assert(ifs.good());
+ std::ios::pos_type cur_pos = ifs.tellg();
+ ifs.seekg(0, ios::beg);
+
+ string dummy_string;
+ unsigned int string_counter = 0;
+ while(ifs.good()){
+ getline(ifs,dummy_string);
+ if(ifs.good()) string_counter++;
+ }
+
+ ifs.clear();
+ ifs.seekg(cur_pos);
+ return string_counter;
+}
+
+void chomp(string& str){
+ //erases bad characters
+ bool continue_chomp = true;
+
+ while(continue_chomp){
+ switch(str[str.length()-1]){
+ case char(13):{ str.erase(str.length()-1,1); break;}
+ default:{ continue_chomp = false; break; }
+ }
+ }
+}
+
+
+std::ios::pos_type perc_file_length(std::ifstream& ifs){
+ //assert(ifs.good());
+ if(!ifs.good()){ cerr<<"bad file in perc_file_length"<<endl; std::terminate();}
+ std::ios::pos_type cur_pos = ifs.tellg();
+ ifs.seekg(0, std::ios::end);
+ std::ios::pos_type _length = ifs.tellg();
+ ifs.seekg(cur_pos);
+ return (_length - _length%100) / 100;
+}
--- /dev/null
+#ifndef UTILS_H
+#define UTILS_H
+
+std::vector<std::string> split(const std::string& inp_string, const std::string& sep);
+std::vector<std::string> split(const std::string& inp_string, char sep);
+
+void chomp(std::string& str);
+std::ios::pos_type perc_file_length(std::ifstream& ifs);
+unsigned int get_string_count(std::ifstream& ifs);
+#endif
--- /dev/null
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like.
+## However, any redistribution of this software should
+## indicate that this code was derived from the QuEST
+## analytical software and a citation:
+
+## the citation goes here
+
+## Any publication that uses results from this code's
+## deriviative should cite this paper.
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## the citation goes here
+
+
+## This program is a wrapper that all modules of QuEST software pipeline
+## on the entire genome one contig at a time
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $usage = qq{
+ run_QuEST_with_param_file.pl
+
+ This program is a wrapper that runs QuEST analytical pipeline
+
+ -----------------------
+ mandatory parameters:
+ -p <params_file> a file containing QuEST batch parameters
+
+ -----------------------
+ optional parameters:
+
+ (Use directives below to override execution directives in QuEST batch parameter file)
+
+ -quick_window_scan supply to run quick window scan
+ -calibrate_peak_shift supply to estimate peak shift
+ -generate_profile_ChIP supply to generate ChIP profile
+ -generate_profile_background supply to generate background profile
+ -generate_profile_pseudo_ChIP supply to generate RX_noIP profile
+ -peak_call_ChIP call peaks in ChIP data
+ -peak_call_pseudo_CHIP call peaks in pseudo_ChIP_data
+
+ -h to display this help
+
+};
+
+if(scalar(@ARGV) == 0){
+ print $usage;
+ exit(0);
+}
+
+## mandatory argmuments
+my $params_fname = "";
+## /mandatory arguments
+
+## setting optional arguments
+use Cwd qw(realpath);
+my $fullpath = realpath($0);
+my @fullpath_fields = split(/\//,$fullpath);
+my $exec_path = "";
+
+for(my $i=0; $i<scalar(@fullpath_fields)-1; $i++){
+ if($fullpath_fields[$i] ne ""){
+ $exec_path = $exec_path."/".$fullpath_fields[$i];
+ }
+}
+
+
+my $override_QuEST_directives = "false";
+
+my $override_quick_window_scan = "false";
+my $override_calibrate_peak_shift = "false";
+my $override_generate_profile_ChIP = "false";
+my $override_generate_profile_background = "false";
+my $override_generate_profile_pseudo_ChIP = "false";
+my $override_peak_call_ChIP = "false";
+my $override_peak_call_pseudo_ChIP = "false";
+## /setting optional arguments
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+ my $this_arg = shift @ARGV;
+ if ( $this_arg eq '-h') {die "$usage\n";}
+ elsif ( $this_arg eq '-p') {$params_fname = shift @ARGV;}
+ elsif ( $this_arg eq '-quick_window_scan') {$override_quick_window_scan = "true";}
+ elsif ( $this_arg eq '-calibrate_peak_shift') {$override_calibrate_peak_shift = "true";}
+ elsif ( $this_arg eq '-generate_profile_ChIP') {$override_generate_profile_ChIP = "true";}
+ elsif ( $this_arg eq '-generate_profile_background') {$override_generate_profile_background = "true";}
+ elsif ( $this_arg eq '-generate_profile_pseudo_ChIP') {$override_generate_profile_pseudo_ChIP = "true";}
+ elsif ( $this_arg eq '-peak_call_ChIP') {$override_peak_call_ChIP = "true";}
+ elsif ( $this_arg eq '-peak_call_pseudo_ChIP') {$override_peak_call_pseudo_ChIP = "true";}
+# elsif ( $this_arg eq '-e') {$exec_path = shift @ARGV;}
+ else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+my $die_pars_bad = "false"; ## die if parameters are bad
+my $bad_par = ""; ## store bad parameter here
+
+print "\n=====================\n\n";
+print "mandatory parameters: \n\n";
+print "params_file: $params_fname\n";
+print "exec_path: $exec_path\n";
+if( $params_fname eq ''){ $die_pars_bad = "true"; $bad_par." "."$params_fname"; }
+if( $exec_path eq ''){ $die_pars_bad = "true"; $bad_par." "."$exec_path"; }
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+ print "$usage";
+ print "Bad parameters: $bad_par\n";
+ exit(0);
+}
+## /reading arguments
+
+if($override_quick_window_scan eq "true" ||
+ $override_calibrate_peak_shift eq "true" ||
+ $override_generate_profile_ChIP eq "true" ||
+ $override_generate_profile_background eq "true" ||
+ $override_generate_profile_pseudo_ChIP eq "true" ||
+ $override_peak_call_ChIP eq "true" ||
+ $override_peak_call_pseudo_ChIP eq "true"){
+ $override_QuEST_directives = "true";
+}
+
+## top-level script
+if( ! -e $params_fname){
+ print "Failed to open $params_fname. Aborting.\n";
+ exit(1);
+}
+open params_file, "< $params_fname" || die "$params_fname: $!\n";
+my @params = <params_file>;
+close params_file;
+
+my $analysis_with_FDR = "";
+my $quick_window_scan_param_fname = "";
+my $calibrate_peak_shift_param_fname = "";
+
+my $ChIP_generate_profile_param_fname = "";
+my $background_generate_profile_param_fname = "";
+my $pseudo_ChIP_generate_profile_param_fname = "";
+
+my $ChIP_peak_caller_param_fname = "";
+my $pseudo_ChIP_peak_caller_param_fname = "";
+
+my $quick_window_scan_ChIP = "false";
+my $calibrate_peak_shift_ChIP = "false";
+my $generate_profile_ChIP = "false";
+my $generate_profile_background = "false";
+my $generate_profile_pseudo_ChIP = "false";
+my $peak_caller_ChIP_vs_background = "false";
+my $peak_caller_pseudo_ChIP_vs_background = "false";
+
+my $log_fname = "";
+my $output_fname = "";
+
+
+for(my $i=0; $i<scalar(@params); $i++){
+ my $cur_param = $params[$i];
+ chomp($cur_param);
+
+ if( length($cur_param) > 0 ){
+ if( substr($cur_param,0,1) ne "#" ){
+ my @cur_par_fields = split(/ /, $cur_param);
+
+ if(scalar(@cur_par_fields >= 2)){
+ my $cur_par_name = $cur_par_fields[0];
+ if ($cur_par_name eq "analysis_with_FDR"){
+ $analysis_with_FDR = $cur_par_fields[2];
+ }
+ elsif( $cur_par_name eq "quick_window_scan:" ){
+ if( $cur_par_fields[1] eq "ChIP" ){
+ $quick_window_scan_ChIP = "true";
+ }
+ }
+ elsif( $cur_par_name eq "calibrate_peak_shift:" ){
+ if( $cur_par_fields[1] eq "ChIP" ){
+ $calibrate_peak_shift_ChIP = "true";
+ }
+ }
+ elsif( $cur_par_name eq "generate_profile:"){
+ if( $cur_par_fields[1] eq "ChIP" ){
+ $generate_profile_ChIP = "true";
+ }
+ elsif( $cur_par_fields[1] eq "background" ){
+ $generate_profile_background = "true";
+ }
+ elsif( $cur_par_fields[1] eq "pseudo_ChIP" ){
+ $generate_profile_pseudo_ChIP = "true";
+ }
+ }
+ elsif( $cur_par_name eq "peak_caller:" ){
+ if( scalar(@cur_par_fields) == 4 ){
+ if( $cur_par_fields[1] eq "ChIP" &&
+ $cur_par_fields[2] eq "vs" &&
+ $cur_par_fields[3] eq "background"){
+ $peak_caller_ChIP_vs_background = "true";
+ }
+ elsif( $cur_par_fields[1] eq "pseudo_ChIP" &&
+ $cur_par_fields[2] eq "vs" &&
+ $cur_par_fields[3] eq "background"){
+ $peak_caller_pseudo_ChIP_vs_background = "true";
+ }
+ }
+ }
+
+ elsif ($cur_par_name eq "quick_window_scan_param_file"){
+ $quick_window_scan_param_fname = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "calibrate_peak_shift_param_file"){
+ $calibrate_peak_shift_param_fname = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "ChIP_generate_profile_param_file"){
+ $ChIP_generate_profile_param_fname = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "background_generate_profile_param_file"){
+ $background_generate_profile_param_fname = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "pseudo_ChIP_generate_profile_param_file"){
+ $pseudo_ChIP_generate_profile_param_fname = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "ChIP_peak_caller_param_file"){
+ $ChIP_peak_caller_param_fname = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "pseudo_ChIP_peak_caller_param_file"){
+ $pseudo_ChIP_peak_caller_param_fname = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "log_file"){
+ $log_fname = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "output_file"){
+ $output_fname = $cur_par_fields[2];
+ }
+ else{
+ if($params[$i] ne ""){
+ print "Warning: unrecognized parameter: $cur_par_name";
+ }
+ }
+ }
+ }
+ }
+}
+
+if($override_QuEST_directives eq "true"){
+ if($override_quick_window_scan eq "true" ){
+ $quick_window_scan_ChIP = "true";
+ }
+ else{ $quick_window_scan_ChIP = "false"; }
+
+ if($override_calibrate_peak_shift eq "true" ){
+ $calibrate_peak_shift_ChIP = "true";
+ }
+ else{ $calibrate_peak_shift_ChIP = "false"; }
+
+ if($override_generate_profile_ChIP eq "true" ){
+ $generate_profile_ChIP = "true";
+ }
+ else{ $generate_profile_ChIP = "false"; }
+
+ if($override_generate_profile_background eq "true" ){
+ $generate_profile_background = "true";
+ }
+ else{ $generate_profile_background = "false"; }
+
+ if( $override_generate_profile_pseudo_ChIP eq "true" ){
+ $generate_profile_pseudo_ChIP = "true";
+ }
+ else{ $generate_profile_pseudo_ChIP = "false"; }
+
+ if( $override_peak_call_ChIP eq "true" ){
+ $peak_caller_ChIP_vs_background = "true";
+ }
+ else{ $peak_caller_ChIP_vs_background = "false"; }
+
+ if( $override_peak_call_pseudo_ChIP eq "true" ){
+ $peak_caller_pseudo_ChIP_vs_background = "true";
+ $analysis_with_FDR = "true";
+ }
+ else{ $peak_caller_pseudo_ChIP_vs_background = "false"; }
+}
+
+print "override_QuEST_directives: $override_QuEST_directives\n";
+print "quick_window_scan_ChIP: $quick_window_scan_ChIP\n";
+print "calibrate_peak_shift_ChIP: $calibrate_peak_shift_ChIP\n";
+print "generate_profile_ChIP: $generate_profile_ChIP\n";
+print "generate_profile_pseudo_ChIP: $generate_profile_pseudo_ChIP\n";
+print "generate_profile_background: $generate_profile_background\n";
+print "peak_caller_ChIP_vs_background: $peak_caller_ChIP_vs_background\n";
+print "peak_caller_pseudo_ChIP_vs_background: $peak_caller_pseudo_ChIP_vs_background\n";
+
+
+
+print "Read the following parameters: \n\n";
+
+print "analysis_with_FDR: $analysis_with_FDR\n";
+print "\n";
+print "quick_window_scan_param_file: $quick_window_scan_param_fname\n";
+print "calibrate_peak_shift_param_file: $calibrate_peak_shift_param_fname\n";
+print "\n";
+print "ChIP_generate_profile_param_file: $ChIP_generate_profile_param_fname\n";
+print "background_generate_profile_param_file: $background_generate_profile_param_fname\n";
+if($analysis_with_FDR eq "yes"){
+ print "pseudo_ChIP_generate_profile_param_file: $pseudo_ChIP_generate_profile_param_fname\n";
+}
+
+print "\n";
+print "ChIP_peak_caller_param_file: $ChIP_peak_caller_param_fname\n";
+if($analysis_with_FDR eq "yes"){
+ print "pseudo_ChIP_peak_caller_param_file: $pseudo_ChIP_peak_caller_param_fname\n";
+}
+
+print "\n";
+print "log_file: $log_fname\n";
+print "output_file: $output_fname\n";
+
+## QuEST execution
+
+## some necessary checks/setups
+if(-e $log_fname){
+ unlink($log_fname);
+}
+if(-e $output_fname){
+ unlink($output_fname);
+}
+## some necessary checks/setups
+
+my $quick_window_scan_batcher = $exec_path . "/run_quick_window_scan_with_param_file.pl";
+my $calibrate_peak_shift_batcher = $exec_path . "/run_calibrate_peak_shift_with_param_file.pl";
+my $generate_profile_batcher = $exec_path . "/run_generate_profile_with_param_file.pl";
+my $peak_caller_batcher = $exec_path . "/run_peak_caller_with_param_file.pl";
+
+if( $quick_window_scan_ChIP eq "true"){
+ my $error_code =
+ system("$quick_window_scan_batcher -p $quick_window_scan_param_fname\n");
+ if($error_code != 0){
+ print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+ exit(4);
+ }
+}
+if( $calibrate_peak_shift_ChIP eq "true" ){
+ my $error_code =
+ system("$calibrate_peak_shift_batcher -p $calibrate_peak_shift_param_fname\n");
+ if($error_code != 0){
+ print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+ exit(4);
+ }
+}
+if( $generate_profile_ChIP eq "true" ){
+ my $error_code =
+ system("$generate_profile_batcher -p $ChIP_generate_profile_param_fname\n");
+ if($error_code != 0){
+ print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+ exit(4);
+ }
+}
+if( $generate_profile_background eq "true" ){
+ my $error_code =
+ system("$generate_profile_batcher -p $background_generate_profile_param_fname\n");
+ if($error_code != 0){
+ print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+ exit(4);
+ }
+}
+if( $analysis_with_FDR eq "true" && $generate_profile_pseudo_ChIP eq "true"){
+ my $error_code =
+ system("$generate_profile_batcher -p $pseudo_ChIP_generate_profile_param_fname\n");
+ if($error_code != 0){
+ print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+ exit(4);
+ }
+}
+
+if( $peak_caller_ChIP_vs_background eq "true" ){
+ my $error_code =
+ system("$peak_caller_batcher -p $ChIP_peak_caller_param_fname\n");
+ if($error_code != 0){
+ print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+ exit(4);
+ }
+}
+
+if( $analysis_with_FDR eq "true" && $peak_caller_pseudo_ChIP_vs_background eq "true" ){
+ my $error_code =
+ system("$peak_caller_batcher -p $pseudo_ChIP_peak_caller_param_fname\n");
+ if($error_code != 0){
+ print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+ exit(4);
+ }
+}
+
+
+
+open log_file, "> $log_fname" || die "$log_fname: $\n";
+open output_file, "> $output_fname" || die "$output_fname: $\n";
+
+# creating QuEST summaries
+
+my $ChIP_peak_calls_fname = "";
+my $pseudo_ChIP_peak_calls_fname = "";
+
+if(-e $ChIP_peak_caller_param_fname){
+ open ChIP_peak_caller_param_file, "< $ChIP_peak_caller_param_fname";
+
+ while( <ChIP_peak_caller_param_file> ){
+ chomp;
+ my @cur_fields = split(/ /, $_);
+ if(scalar(@cur_fields) > 0){
+ if($cur_fields[0] eq "output_file"){
+ $ChIP_peak_calls_fname = $cur_fields[2];
+ }
+ }
+ }
+ close ChIP_peak_caller_param_file;
+}
+else{
+ print "Error in run_QuEST_with_param_file.pl: Failed to locate ChIP_peak_caller_param_fname: $ChIP_peak_caller_param_fname";
+ print log_file"Error in run_QuEST_with_param_file.pl: Failed to locate ChIP_peak_caller_param_fname: $ChIP_peak_caller_param_fname";
+}
+
+
+if( $analysis_with_FDR eq "true"){
+ if(-e $pseudo_ChIP_peak_caller_param_fname){
+ open pseudo_ChIP_peak_caller_param_file, "< $pseudo_ChIP_peak_caller_param_fname";
+
+ while( <pseudo_ChIP_peak_caller_param_file> ){
+ chomp;
+ my @cur_fields = split(/ /, $_);
+ if(scalar(@cur_fields) > 0){
+ if($cur_fields[0] eq "output_file"){
+ $pseudo_ChIP_peak_calls_fname = $cur_fields[2];
+ }
+ }
+ }
+ close pseudo_ChIP_peak_caller_param_file;
+ }
+ else{
+ print "Error in run_QuEST_with_param_file.pl: Failed to locate pseudo_ChIP_peak_caller_param_fname: $pseudo_ChIP_peak_caller_param_fname";
+ print log_file "Error in run_QuEST_with_param_file.pl: Failed to locate pseudo_ChIP_peak_caller_param_fname: $pseudo_ChIP_peak_caller_param_fname";
+ }
+}
+
+# counting ChIP peaks
+my $ChIP_peaks = -1;
+if( $peak_caller_ChIP_vs_background eq "true" ){
+ $ChIP_peaks = 0;
+ if(-e $ChIP_peak_calls_fname){
+ open ChIP_peak_calls_file, "< $ChIP_peak_calls_fname";
+
+ while( <ChIP_peak_calls_file> ){
+ chomp;
+ my @cur_fields = split(/ /,$_);
+ if(scalar(@cur_fields) >= 4){
+ if($cur_fields[3] eq "locmax:"){
+ $ChIP_peaks++;
+ }
+ }
+ }
+ close ChIP_peak_calls_file;
+ }
+ else{
+ print "Failed to locate ChIP_peak_calls_file: $ChIP_peak_calls_fname\n";
+ print log_file "Failed to locate ChIP_peak_calls_file: $ChIP_peak_calls_fname\n";
+ }
+}
+# /counting ChIP peaks
+
+# counting pseudo_ChIP peaks
+my $pseudo_ChIP_peaks = -1;
+if( $peak_caller_pseudo_ChIP_vs_background eq "true" ){
+ if( $analysis_with_FDR eq "true" ){
+ $pseudo_ChIP_peaks = 0;
+ if(-e $pseudo_ChIP_peak_calls_fname){
+ open pseudo_ChIP_peak_calls_file, "< $pseudo_ChIP_peak_calls_fname";
+
+ while( <pseudo_ChIP_peak_calls_file> ){
+ chomp;
+ my @cur_fields = split(/ /,$_);
+ if(scalar(@cur_fields) >= 4){
+ if($cur_fields[3] eq "locmax:"){
+ $pseudo_ChIP_peaks++;
+ }
+ }
+ }
+ close pseudo_ChIP_peak_calls_file;
+ }
+ else{
+ print "Failed to locate pseudo_ChIP_peak_calls_file: $pseudo_ChIP_peak_calls_fname\n";
+ print log_file "Failed to locate pseudo_ChIP_peak_calls_file: $pseudo_ChIP_peak_calls_fname\n";
+ }
+ }
+}
+# /counting pseudo_ChIP peaks
+
+# /creating QuEST summaries
+
+print output_file "\#\# please cite: \n";
+print output_file "\#\# Valouev A, Johnson DS, Sundquist A, Medina C, Anton E, Batzoglou S,\n";
+print output_file "\#\# Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide\n";
+print output_file "\#\# Identification of Transcription Factor Binding Sites Based on ChIP-Seq\n";
+print output_file "\#\# Data. under review.\n\n";
+
+if($ChIP_peaks >= 0){
+ print output_file "ChIP peaks: $ChIP_peaks\n";
+}
+else{
+ print output_file "ChIP peaks: not available\n";
+}
+if( $analysis_with_FDR eq "true" ){
+ if($pseudo_ChIP_peaks >= 0){
+ print output_file "pseudo_ChIP peaks: $pseudo_ChIP_peaks\n";
+ if($ChIP_peaks != 0){
+ printf output_file "FDR estimate: %.2f percent\n", 100 * $pseudo_ChIP_peaks / $ChIP_peaks;
+ }
+ else{
+ print output_file "FDR estimate: NA\n";
+ }
+ }
+ else{
+ print "FDR estimate: not available\n";
+ }
+}
+
+close output_file;
+close log_file;
+
+## /QuEST execution
+
--- /dev/null
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like.
+
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## the citation goes here
+
+## This program is a wrapper that runs calibrate_peak_shift
+## on the entire genome one contig at a time
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $usage = qq{
+ run_calibrate_peak_shift_with_param_file.pl
+
+ This program is a wrapper that runs the calibrate_peak_shift
+
+ -----------------------
+ mandatory parameters:
+ -p <params_file> a file containing parameters calibrate_peak_shift
+
+ -----------------------
+ optional parameters:
+ -e <exec_path> path to the calibrate_peak_shift executable
+ -h to display this help
+
+};
+
+if(scalar(@ARGV) == 0){
+ print $usage;
+ exit(0);
+}
+
+## mandatory argmuments
+my $params_fname = "";
+## /mandatory arguments
+
+## setting optional arguments
+use Cwd qw(realpath);
+my $fullpath = realpath($0);
+my @fullpath_fields = split(/\//,$fullpath);
+my $exec_path = "";
+
+for(my $i=0; $i<scalar(@fullpath_fields)-1; $i++){
+ if($fullpath_fields[$i] ne ""){
+ $exec_path = $exec_path."/".$fullpath_fields[$i];
+ }
+}
+
+$exec_path = $exec_path."/calibrate_peak_shift";
+## /setting optional arguments
+
+if( ! -e $exec_path ){
+ print "Error in calibrate peak shift master script.\n";
+ print "Couldn't locate executable $exec_path. Aborting.\n";
+ exit(0);
+}
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+ my $this_arg = shift @ARGV;
+ if ( $this_arg eq '-h') {die "$usage\n";}
+ elsif ( $this_arg eq '-p') {$params_fname = shift @ARGV;}
+ elsif ( $this_arg eq '-e') {$exec_path = shift @ARGV;}
+ else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+my $die_pars_bad = "false"; ## die if parameters are bad
+my $bad_par = ""; ## store bad parameter here
+
+print "\n=====================\n\n";
+print "mandatory parameters: \n\n";
+print "params_file: $params_fname\n";
+print "exec_path: $exec_path\n";
+if( $params_fname eq ''){ $die_pars_bad = "true"; $bad_par." "."$params_fname"; }
+if( $exec_path eq ''){ $die_pars_bad = "true"; $bad_par." "."$exec_path"; }
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+ print "$usage";
+ print "Bad parameters: $bad_par\n";
+ exit(0);
+}
+## /reading arguments
+
+## top-level script
+open params_file, "< $params_fname" || die "$params_fname: $\n";
+my @params = <params_file>;
+close params_file;
+
+my $silent_flag = "off";
+my $quick_scan_file = "** missing **";
+my $QuEST_align_chip_file = "** missing **";
+my $QuEST_align_background_file = "** missing **";
+my $output_fname = "** missing **";
+
+my $kde_bandwidth = ""; # optional
+my $scan_window = ""; # optional
+my $quick_scan_tag_count_threshold = ""; # optional
+#my $aligned_part_length = ""; # optional
+my $top_count = ""; # optional
+my $chip_to_background_fold = ""; # optional
+my $best_to_second_best_peak_ratio = ""; # optional
+my $chip_peak_threshold = ""; # optional
+my $chip_for_rev_peak_max_min_ratio = ""; # optional
+my $estimation_method = "";
+
+my $verbose = "yes";
+
+for(my $i=0; $i<scalar(@params); $i++){
+ my $cur_param = $params[$i];
+ chomp($cur_param);
+
+ my @cur_par_fields = split(/ /, $cur_param);
+ if(scalar(@cur_par_fields >= 2)){
+ my $cur_par_name = $cur_par_fields[0];
+ if ($cur_par_name eq "quick_scan_file"){
+ $quick_scan_file = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "align_chip_file"){
+ $QuEST_align_chip_file = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "align_background_file"){
+ $QuEST_align_background_file = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "output_file"){
+ $output_fname = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "kde_bandwidth"){
+ $kde_bandwidth = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "scan_window"){
+ $scan_window = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "quick_scan_tag_count_threshold"){
+ $quick_scan_tag_count_threshold = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "top_count"){
+ $top_count = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "chip_to_background_fold"){
+ $chip_to_background_fold = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "best_to_second_best_peak_ratio"){
+ $best_to_second_best_peak_ratio = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "chip_peak_threshold"){
+ $chip_peak_threshold = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "chip_for_rev_peak_max_min_ratio"){
+ $chip_for_rev_peak_max_min_ratio = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "method"){
+ $estimation_method = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "verbose"){
+ $verbose = $cur_par_fields[2];
+ }
+ else{
+ if($params[$i] ne ""){
+ print "Warning: unrecognized parameter: $cur_par_name";
+ }
+ }
+ }
+}
+
+print "Read the following parameters: \n\n";
+
+print "quick_scan_file: $quick_scan_file\n";
+print "QuEST_align_chip_file: $QuEST_align_chip_file\n";
+print "QuEST_align_background_file: $QuEST_align_background_file\n";
+print "output_file: $output_fname\n";
+print "kde_bandwidth: $kde_bandwidth\n"; # optional
+print "scan_window: $scan_window\n"; # optional
+print "quick_scan_tag_count_threshold: $quick_scan_tag_count_threshold\n"; # optional
+print "top_count: $top_count\n"; # optional
+print "chip_to_background_fold: $chip_to_background_fold\n"; # optional
+print "best_to_second_best_peak_ratio: $best_to_second_best_peak_ratio\n"; # optional
+print "chip_peak_threshold: $chip_peak_threshold\n"; # optional
+print "chip_for_rev_peak_max_min_ratio: $chip_for_rev_peak_max_min_ratio\n"; # optional
+print "verbose: $verbose\n";
+print "\n";
+
+my $peak_shift_recommended_max = 100;
+my $peak_shift_recommended_value = 50;
+my $peak_shift_recommended_min = 35;
+my $recommended_used_regions = 10;
+my $recommended_used_fraction = 0.33;
+
+my $output_fname_tmp = $output_fname . ".tmp";
+
+my $optional_param_string = "";
+if($kde_bandwidth ne ""){
+ $optional_param_string = $optional_param_string . " kde_bandwidth=$kde_bandwidth";
+}
+if($scan_window ne ""){
+ $optional_param_string = $optional_param_string . " scan_window=$scan_window";
+}
+if($quick_scan_tag_count_threshold ne ""){
+ $optional_param_string = $optional_param_string . " quick_scan_tag_count_threshold=$quick_scan_tag_count_threshold";
+}
+#if($aligned_part_length ne ""){
+# $optional_param_string = $optional_param_string . " aligned_part_length=$aligned_part_length";
+#}
+if($top_count ne ""){
+ $optional_param_string = $optional_param_string . " top_count=$top_count";
+}
+if($chip_to_background_fold ne ""){
+ $optional_param_string = $optional_param_string . " chip_to_background_fold=$chip_to_background_fold";
+}
+if($best_to_second_best_peak_ratio ne ""){
+ $optional_param_string = $optional_param_string . " best_to_second_best_peak_ratio=$best_to_second_best_peak_ratio";
+}
+if($chip_peak_threshold ne ""){
+ $optional_param_string = $optional_param_string . " chip_peak_threshold=$chip_peak_threshold";
+}
+if($chip_for_rev_peak_max_min_ratio ne ""){
+ $optional_param_string = $optional_param_string . " chip_for_rev_peak_max_min_ratio=$chip_for_rev_peak_max_min_ratio";
+}
+if($estimation_method ne ""){
+ $optional_param_string = $optional_param_string . " method=$estimation_method";
+}
+
+my $mandatory_param_string = "quick_scan_file=$quick_scan_file align_chip_file=$QuEST_align_chip_file align_background_file=$QuEST_align_background_file output_file=$output_fname_tmp";
+
+my $system_command = $exec_path . " " . $mandatory_param_string . $optional_param_string;
+
+#print "$system_command\n";
+my $error_code = system("$system_command\n");
+
+if($error_code != 0){
+ print "run_calibrate_peak_shift_with_param_file.pl: Caught an error message (code $error_code), passing the error message to the top script.\n";
+ exit(2);
+}
+
+if(!-e $output_fname_tmp){
+ print "Error in run_calibrate_peak_shift_with_param_file.pl: Failed to locate $output_fname_tmp. Aborting.\n";
+ exit(1);
+}
+
+open output_file_tmp, "< $output_fname_tmp";
+
+my $regions = -1;
+my $used_regions = -1;
+my $peak_shift_estimate = -1;
+my $estimation_method_detected = "";
+
+while( <output_file_tmp> ){
+ chomp;
+ if(length($_) > 0){
+ if( substr($_,0,1) ne "#" ){
+ my @cur_entry_fields = split(/ /, $_);
+ if( scalar(@cur_entry_fields) >= 2){
+ if($cur_entry_fields[0] eq "regions:"){
+ $regions = $cur_entry_fields[1];
+ }
+ elsif($cur_entry_fields[0] eq "used:"){
+ $used_regions = $cur_entry_fields[1];
+ }
+ elsif($cur_entry_fields[0] eq "peak_shift_estimate:"){
+ $peak_shift_estimate = $cur_entry_fields[1];
+ }
+ elsif($cur_entry_fields[0] eq "estimation_method:"){
+ $estimation_method_detected = $cur_entry_fields[1];
+ }
+ }
+ }
+ }
+}
+
+close output_file_tmp;
+
+if( $verbose eq "yes" ){
+ my $suggest_override = "false";
+ if( $used_regions <= $recommended_used_fraction * $regions ){
+ printf "\nQuEST has determined that the fraction of used regions ( %.2f ) for peak\n",
+ $used_regions / $regions;
+ print "estimate is low. It does not mean that the estimate is bad, but\n";
+ print "the peak shift estimate may not be very accurate.\n";
+ $suggest_override = "true";
+ }
+ if( $used_regions <= $recommended_used_regions ){
+ print "\nQuEST has determined that the number of used regions ( $used_regions ) for peak\n";
+ print "estimate is low. It does not mean that the estimate is bad, but\n";
+ print "the peak shift estimate may not be very accurate.\n";
+ $suggest_override = "true";
+ }
+ if( $peak_shift_estimate < $peak_shift_recommended_min ){
+ printf "\nQuEST has determined that the peak shift estimate ( %.0f )\n", $peak_shift_estimate;
+ print "is too low. This may be normal in some cases, but also may be indicative\n";
+ print "of read mapping artifacts into repetetive regions. You might consider\n";
+ print "revising the list of candidate regions to exclude the ones falling into the\n";
+ print "regions annotated as repeats or try using median-based estimate for the peak shift\n";
+ print "to achieve a more robust estimate.\n";
+ $suggest_override = "true";
+ }
+
+ if( $suggest_override eq "true" ){
+ my $cur_answer = "";
+ while($cur_answer ne "y" && $cur_answer ne "n" ){
+ print "Do you want to override the value of peak shift? (y/n): ";
+ $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ }
+ if( $cur_answer eq "y"){
+ $estimation_method_detected = "user_input";
+ print "Please enter the value of peak shift (recommended $peak_shift_recommended_value): ";
+ $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ $peak_shift_estimate = $cur_answer;
+ }
+ }
+
+ if( $peak_shift_estimate > $peak_shift_recommended_max ){
+ printf "\nQuEST has determined that the peak shift estimate ( %.0f )\n", $peak_shift_estimate;
+ print "is too high. This may be normal in some cases especially \n";
+ print "if the insert size in the library is large, however \n";
+ print "typically values around $peak_shift_recommended_value are expected.\n";
+ print "regions annotated as repeats or try using median-based estimate for the peak shift.\n";
+ my $cur_answer = "";
+ while($cur_answer ne "y" && $cur_answer ne "n" ){
+ print "Do you want to override the value of peak shift? (y/n): ";
+ $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ }
+ if( $cur_answer eq "y"){
+ $estimation_method_detected = "user_input";
+ print "Please enter the value of peak shift (recommended $peak_shift_recommended_value): ";
+ $cur_answer = <STDIN>;
+ chomp($cur_answer);
+ $peak_shift_estimate = $cur_answer;
+ }
+ }
+}
+
+if(-e $output_fname){
+ unlink($output_fname);
+}
+
+if(-e $output_fname_tmp){
+ unlink($output_fname_tmp);
+}
+
+open output_file, "> $output_fname";
+print output_file "regions: $regions\n";
+print output_file "used: $used_regions\n";
+print output_file "peak_shift_estimate: $peak_shift_estimate\n";
+print output_file "estimation_method: $estimation_method_detected\n";
+close output_file;
--- /dev/null
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like.
+
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## the citation goes here
+
+## This program is a wrapper that runs generate_peak_profile program
+## on the entire genome one contig at a time
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $usage = qq{
+ run_generate_peak_profile_with_param_file.pl
+
+ This program is a wrapper that runs the generate_peak_profile
+
+ -----------------------
+ mandatory parameters:
+ -p <params_file> a file containing parameters calibrate_peak_shift
+
+ -----------------------
+ optional parameters:
+ -e <exec_path> path to the calibrate_peak_shift executable
+ -h to display this help
+
+};
+
+if(scalar(@ARGV) == 0){
+ print $usage;
+ exit(0);
+}
+
+## mandatory argmuments
+my $params_fname = "";
+## /mandatory arguments
+
+## setting optional arguments
+use Cwd qw(realpath);
+my $fullpath = realpath($0);
+my @fullpath_fields = split(/\//,$fullpath);
+my $exec_path = "";
+
+for(my $i=0; $i<scalar(@fullpath_fields)-1; $i++){
+ if($fullpath_fields[$i] ne ""){
+ $exec_path = $exec_path."/".$fullpath_fields[$i];
+ }
+}
+
+$exec_path = $exec_path."/generate_peak_profile";
+
+if( ! -e $exec_path ){
+ print "Error in generate profile batch script:\n";
+ print "Failed to locate executable $exec_path.\n";
+ print "Aborting.\n";
+ exit(0);
+}
+
+## /setting optional arguments
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+ my $this_arg = shift @ARGV;
+ if ( $this_arg eq '-h') {die "$usage\n";}
+ elsif ( $this_arg eq '-p') {$params_fname = shift @ARGV;}
+ elsif ( $this_arg eq '-e') {$exec_path = shift @ARGV;}
+ else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+my $die_pars_bad = "false"; ## die if parameters are bad
+my $bad_par = ""; ## store bad parameter here
+
+print "\n=====================\n\n";
+print "mandatory parameters: \n\n";
+print "params_file: $params_fname\n";
+print "exec_path: $exec_path\n";
+if( $params_fname eq ''){ $die_pars_bad = "true"; $bad_par." "."$params_fname"; }
+if( $exec_path eq ''){ $die_pars_bad = "true"; $bad_par." "."$exec_path"; }
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+ print "$usage";
+ print "Bad parameters: $bad_par\n";
+ exit(0);
+}
+## /reading arguments
+
+## top-level script
+open params_file, "< $params_fname" || die "$params_fname: $\n";
+my @params = <params_file>;
+close params_file;
+
+my $QuEST_align_file = "** missing **";
+# my @contigs;
+# my $reference_path = "** missing **";
+my $genome_table_fname = "** missing **";
+my $output_score_path = "** missing **";
+
+my $peak_shift = ""; # optional
+my $peak_shift_source_fname = ""; # optional
+my $kde_bandwidth = ""; # optional
+my $calc_window = ""; # optional
+my $threads = ""; # optional
+#my $aligned_part_length = ""; # optional
+
+for(my $i=0; $i<scalar(@params); $i++){
+ my $cur_param = $params[$i];
+ chomp($cur_param);
+
+ my @cur_par_fields = split(/ /, $cur_param);
+ if(scalar(@cur_par_fields >= 2)){
+ my $cur_par_name = $cur_par_fields[0];
+ if ($cur_par_name eq "align_file"){
+ $QuEST_align_file = $cur_par_fields[2];
+ }
+ #elsif($cur_par_name eq "contigs"){
+ # @contigs = split(/\,/, $cur_par_fields[2]);
+ #}
+ #elsif($cur_par_name eq "reference_path"){
+ # $reference_path = $cur_par_fields[2];
+ #}
+ elsif( $cur_par_name eq "genome_table" ){
+ $genome_table_fname= $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "output_score_path"){
+ $output_score_path = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "peak_shift"){
+ $peak_shift = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "peak_shift_source_file"){
+ $peak_shift_source_fname = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "kde_bandwidth"){
+ $kde_bandwidth = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "calc_window"){
+ $calc_window = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "threads"){
+ $threads = $cur_par_fields[2];
+ }
+ #elsif($cur_par_name eq "aligned_part_length"){
+ # $aligned_part_length = $cur_par_fields[2];
+ #}
+ else{
+ if($params[$i] ne ""){
+ print "Warning: unrecognized parameter: $cur_par_name";
+ }
+ }
+ }
+}
+
+if($peak_shift eq "" and $peak_shift_source_fname ne ""){
+ print "Extracting peak shift value from the $peak_shift_source_fname file\n";
+ open peak_shift_source_file, "< $peak_shift_source_fname" ||
+ die "Failed to open $peak_shift_source_fname\n";
+
+ while(<peak_shift_source_file>){
+ chomp;
+ my @cur_fields = split(/ /,$_);
+ if($cur_fields[0] eq "peak_shift_estimate:"){
+ $peak_shift = $cur_fields[1];
+ }
+ }
+}
+
+print "Read the following parameters: \n\n";
+
+print "align_file: $QuEST_align_file\n";
+print "genome_table: $genome_table_fname\n";
+#print "contigs: @contigs\n";
+#print "reference_path: $reference_path\n";
+print "output_score_path: $output_score_path\n";
+print "peak_shift: $peak_shift\n"; # optional
+print "kde_bandwidth: $kde_bandwidth\n"; # optional
+print "calc_window: $calc_window\n"; # optional
+print "threads: $threads\n"; # optional
+#print "aligned_part_length: $aligned_part_length\n"; # optional
+
+#my $stall = <STDIN>;
+print "\n";
+
+my $optional_param_string = "";
+if($kde_bandwidth ne ""){
+ $optional_param_string = $optional_param_string . " kde_bandwidth=$kde_bandwidth";
+}
+if($peak_shift ne ""){
+ $optional_param_string = $optional_param_string . " peak_shift=$peak_shift";
+}
+if($threads ne ""){
+ $optional_param_string = $optional_param_string . " threads=$threads";
+}
+#if($aligned_part_length ne ""){
+# $optional_param_string = $optional_param_string . " aligned_part_length=$aligned_part_length";
+#}
+if($calc_window ne ""){
+ $optional_param_string = $optional_param_string . " calc_window=$calc_window";
+}
+
+
+## reading genome table
+
+if( ! -e $genome_table_fname ){
+ print "Error in generate profile master script: \n";
+ print "Failed to locate genome table $genome_table_fname. Aborting.\n";
+ exit(0);
+}
+
+
+open genome_table_file, "< $genome_table_fname" || die "$genome_table_fname: $\n";
+my @genome_table = <genome_table_file>;
+
+close genome_table_file;
+
+my @contig_names;
+my @contig_sizes;
+my $contig_counter = 0;
+
+for(my $i=0; $i < scalar(@genome_table); $i++){
+ my $cur_contig_entry = $genome_table[$i];
+ chomp($cur_contig_entry);
+ my @cur_contig_entry_fields = split(/ /, $cur_contig_entry);
+ if(scalar(@cur_contig_entry_fields) == 2){
+ $contig_names[$contig_counter] = $cur_contig_entry_fields[0];
+ $contig_sizes[$contig_counter] = $cur_contig_entry_fields[1];
+ $contig_counter++;
+ }
+}
+
+## /reading genome table
+
+
+
+for(my $i=0; $i<scalar(@contig_names); $i++){
+ my $cur_contig_name = $contig_names[$i];
+ my $cur_contig_size = $contig_sizes[$i];
+ my $system_command = $exec_path . " align_file=$QuEST_align_file contig_id=$cur_contig_name contig_size=$cur_contig_size output_score_file=$output_score_path/$cur_contig_name.score" . $optional_param_string;
+ print "$system_command\n";
+ my $error_code = system("$system_command\n");
+ if($error_code != 0){
+ print "run_generate_profile_with_param_file.pl: Caught an error message (code $error_code), passing error message to the top script.\n";
+ exit(3);
+ }
+ #print "error_code: $error_code\n";
+ #exit(0);
+
+}
+
+
--- /dev/null
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like.
+
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## the citation goes here
+
+## This program is a wrapper that runs peak_caller program
+## on the entire genome one contig at a time
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $usage = qq{
+ run_peak_caller_with_param_file.pl
+
+ This program is a wrapper that runs the peak_caller
+
+ -----------------------
+ mandatory parameters:
+ -p <params_file> a file containing parameters calibrate_peak_shift
+
+ -----------------------
+ optional parameters:
+ -e <exec_path> path to the calibrate_peak_shift executable
+ -h to display this help
+
+};
+
+if(scalar(@ARGV) == 0){
+ print $usage;
+ exit(0);
+}
+
+## mandatory argmuments
+my $params_fname = "";
+## /mandatory arguments
+
+## setting optional arguments
+use Cwd qw(realpath);
+my $fullpath = realpath($0);
+my @fullpath_fields = split(/\//,$fullpath);
+my $exec_path = "";
+
+for(my $i=0; $i<scalar(@fullpath_fields)-1; $i++){
+ if($fullpath_fields[$i] ne ""){
+ $exec_path = $exec_path."/".$fullpath_fields[$i];
+ }
+}
+
+$exec_path = $exec_path."/peak_caller";
+## /setting optional arguments
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+ my $this_arg = shift @ARGV;
+ if ( $this_arg eq '-h') {die "$usage\n";}
+ elsif ( $this_arg eq '-p') {$params_fname = shift @ARGV;}
+ elsif ( $this_arg eq '-e') {$exec_path = shift @ARGV;}
+ else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+my $die_pars_bad = "false"; ## die if parameters are bad
+my $bad_par = ""; ## store bad parameter here
+
+print "\n=====================\n\n";
+print "mandatory parameters: \n\n";
+print "params_file: $params_fname\n";
+print "exec_path: $exec_path\n";
+if( $params_fname eq ''){ $die_pars_bad = "true"; $bad_par." "."$params_fname"; }
+if( $exec_path eq ''){ $die_pars_bad = "true"; $bad_par." "."$exec_path"; }
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+ print "$usage";
+ print "Bad parameters: $bad_par\n";
+ exit(0);
+}
+## /reading arguments
+
+## top-level script
+open params_file, "< $params_fname" || die "$params_fname: $\n";
+my @params = <params_file>;
+close params_file;
+
+my $chip_profile_path = "** missing **";
+my $background_profile_path = "** missing **";
+my $output_fname = "** missing **";
+my $chip_threshold = "** missing **";
+my $background_threshold = "** missing **";
+my $rescue_ratio = "** missing **";
+my $genome_table_fname = "** missing **";
+my $ChIP_reads = "** missing **";
+my $background_reads = "** missing **";
+
+my $local_maximum_radius = ""; # optional
+my $dip_fraction = ""; # optional
+
+
+for(my $i=0; $i<scalar(@params); $i++){
+ my $cur_param = $params[$i];
+ chomp($cur_param);
+
+ my @cur_par_fields = split(/ /, $cur_param);
+ if(scalar(@cur_par_fields >= 2)){
+ my $cur_par_name = $cur_par_fields[0];
+ if ($cur_par_name eq "chip_profile_path"){
+ $chip_profile_path = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "background_profile_path"){
+ $background_profile_path = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "output_file"){
+ $output_fname = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "chip_threshold"){
+ $chip_threshold = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "background_threshold"){
+ $background_threshold = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "rescue_ratio"){
+ $rescue_ratio = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "genome_table"){
+ $genome_table_fname = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "local_maximum_radius"){
+ $local_maximum_radius = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "dip_fraction"){
+ $dip_fraction = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "ChIP_reads"){
+ $ChIP_reads = $cur_par_fields[2];
+ }
+ elsif ($cur_par_name eq "background_reads"){
+ $background_reads = $cur_par_fields[2];
+ }
+ else{
+ if($params[$i] ne ""){
+ print "Warning: unrecognized parameter: $cur_par_name";
+ }
+ }
+ }
+}
+
+print "Read the following parameters: \n\n";
+
+print "chip_profile_path: $chip_profile_path\n";
+print "background_profile_path: $background_profile_path\n";
+print "output_file: $output_fname\n";
+print "chip_threshold: $chip_threshold\n";
+print "background_threshold: $background_threshold\n";
+print "rescue_ratio: $rescue_ratio\n";
+print "genome_table_file: $genome_table_fname\n";
+print "ChIP_reads: $ChIP_reads\n";
+print "background_reads: $background_reads\n";
+print "local_maximum_radius: $local_maximum_radius\n"; # optional
+print "dip_fraction: $dip_fraction\n"; # optional
+
+print "\n";
+
+my $optional_param_string = "";
+if($local_maximum_radius ne ""){
+ $optional_param_string = $optional_param_string . " local_maximum_radius=$local_maximum_radius";
+}
+if($dip_fraction ne ""){
+ $optional_param_string = $optional_param_string . " dip_fraction=$dip_fraction";
+}
+
+my @files_to_trash;
+my $files_to_trash_counter = 0;
+
+my @peak_call_files;
+my $peak_call_files_counter = 0;
+
+
+## read genome table
+
+if( ! -e $genome_table_fname){
+ print "Error in run peak caller script: couldn't find genome table $genome_table_fname.\n";
+ print "Aborting.\n";
+ exit(0);
+}
+
+open genome_table_file, "< $genome_table_fname" || die "$genome_table_fname: $\n";
+my @genome_table = <genome_table_file>;
+
+my @contigs;
+my $contig_counter = 0;
+
+for(my $i=0; $i<scalar(@genome_table); $i++){
+ my $cur_genome_table_entry = $genome_table[$i];
+ chomp($cur_genome_table_entry);
+
+ my @cur_genome_table_entry_fields = split(/ /, $cur_genome_table_entry);
+ if( scalar(@cur_genome_table_entry_fields) == 2){
+ $contigs[$contig_counter] = $cur_genome_table_entry_fields[0];
+ $contig_counter++;
+ }
+}
+
+
+## /read genome table
+
+for(my $i=0; $i<scalar(@contigs); $i++){
+ my $cur_contig = $contigs[$i];
+ my $system_command =
+ $exec_path .
+ " score_file1=$chip_profile_path/$cur_contig.score" .
+ " score_file2=$background_profile_path/$cur_contig.score" .
+ " output_file=$chip_profile_path/$cur_contig.peak_calls.tmp" .
+ " chip_threshold=$chip_threshold" .
+ " background_threshold=$background_threshold" .
+ " rescue_ratio=$rescue_ratio" .
+ " contig_id=$cur_contig" .
+ $optional_param_string;
+ print "$system_command\n";
+
+ $files_to_trash[$files_to_trash_counter] = "$chip_profile_path/$cur_contig.peak_calls.tmp";
+ $files_to_trash_counter++;
+
+ $peak_call_files[$peak_call_files_counter] = "$chip_profile_path/$cur_contig.peak_calls.tmp";
+ $peak_call_files_counter++;
+ my $error_code = system("$system_command");
+ if($error_code != 0){
+ print "run_peak_caller_with_param_file.pl: Caught an error message (code $error_code), passing the error message to the top script.\n";
+ exit(2);
+ }
+}
+
+# merging
+my $cat_string = "";
+
+if(-e $output_fname){
+ unlink($output_fname);
+}
+#else{
+#system("touch $output_fname");
+#}
+#}
+
+my $extended_fname = $output_fname . ".extended";
+
+print "output files: ";
+print "$output_fname\n";
+print "extended_file: $extended_fname\n";
+
+
+if(-e $extended_fname){
+ unlink($extended_fname);
+}
+#else{
+system("touch $extended_fname");
+#}
+
+
+for(my $i=0; $i<scalar(@peak_call_files); $i++){
+ if($i!=0){
+ $cat_string = $cat_string . " ";
+ }
+ $cat_string = $cat_string . "$peak_call_files[$i]";
+}
+# /merging
+
+
+# output
+
+#if(-e $output_fname){
+# unlink($output_fname);
+#}
+
+
+open output_file, "> $output_fname" or die "can't open $output_fname: $!\n";
+
+
+print output_file "\#\# please cite: \n";
+print output_file "\#\# Valouev A, Johnson DS, Sundquist A, Medina C, Anton E, Batzoglou S,\n";
+print output_file "\#\# Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide\n";
+print output_file "\#\# Identification of Transcription Factor Binding Sites Based on ChIP-Seq\n";
+print output_file "\#\# Data. under review.\n\n";
+#print output_file "\#\# please cite:
+#system("echo \#\# Valouev A, Johnson DS, Sundquist A, Median C, Anton E, Batzoglou S, >> $extended_fname");
+#system("echo \#\# Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide >> $extended_fname");
+#system("echo \#\# Identification of Transcription Factor Binding Sites Based on ChIP-Seq >> $extended_fname");
+#system("echo \#\# Data. under review. >> $output_fname");
+
+close output_file;
+
+system("cat $cat_string >> $extended_fname");
+
+
+my $output_fname_tmp1 = $output_fname . ".tmp1";
+system("sort -nrk 5 $extended_fname | grep loc > $output_fname_tmp1");
+open output_file_tmp1, "< $output_fname_tmp1" or die "can't open $output_fname_tmp1: $!\n";
+
+my $output_fname_tmp2 = $output_fname . ".tmp2";
+if(-e $output_fname_tmp2){ unlink($output_fname_tmp2); }
+open output_file_tmp2, "> $output_fname_tmp2" or die "can't open $output_fname_tmp2: $!\n";
+while(<output_file_tmp1>){
+ chomp;
+ if(length($_) > 0){
+ if( substr($_,0,1) ne "#" ){
+ my @cur_entry_fields = split(/ /, $_);
+ if( scalar(@cur_entry_fields) >= 9){
+ my $cur_locmax = $cur_entry_fields[4];
+ my $cur_nulls = $cur_entry_fields[8];
+ my $cur_ef = "NA";
+ if( $cur_nulls != 0){
+ $cur_ef = ($cur_locmax / $ChIP_reads) / ($cur_nulls / $background_reads);
+ }
+ print output_file_tmp2 "$_ ef: $cur_ef\n";
+ }
+ }
+ else{
+ print output_file_tmp2 "$_\n";
+ }
+ }
+}
+close output_file_tmp1;
+close output_file_tmp2;
+
+system("cat $output_fname_tmp2 >> $output_fname");
+
+for(my $i=0; $i<scalar(@files_to_trash); $i++){
+ system("rm $files_to_trash[$i]");
+}
+
+system("rm $output_fname_tmp1");
+system("rm $output_fname_tmp2");
+# /output
+
+
+
--- /dev/null
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like.
+
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## the citation goes here
+
+
+## This program is a wrapper that runs quick_window_scan
+## on the entire genome one contig at a time
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $usage = qq{
+ run_quick_window_scan_with_param_file.pl
+
+ This program is a wrapper that runs the quick_window_scan
+ once contig at a time
+
+ -----------------------
+ mandatory parameters:
+ -p <params_file> a file containing parameters for quick_window_scan
+
+ -----------------------
+ optional parameters:
+ -e <exec_path> path to the quick window scan executable
+ -h to display this help
+
+};
+
+if(scalar(@ARGV) == 0){
+ print $usage;
+ exit(0);
+}
+
+## mandatory argmuments
+my $params_fname = "";
+## /mandatory arguments
+
+## setting optional arguments
+use Cwd qw(realpath);
+my $fullpath = realpath($0);
+my @fullpath_fields = split(/\//,$fullpath);
+my $exec_path = "";
+
+for(my $i=0; $i<scalar(@fullpath_fields)-1; $i++){
+ if($fullpath_fields[$i] ne ""){
+ $exec_path = $exec_path."/".$fullpath_fields[$i];
+ }
+}
+
+$exec_path = $exec_path."/quick_window_scan";
+## /setting optional arguments
+
+if(-e $exec_path){
+ ## executable exists do nothing
+}
+else{
+ print "Failed to locate executable $exec_path.\n";
+ print "Aborting quick_window_scan batch script.\n";
+ exit(0);
+}
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+ my $this_arg = shift @ARGV;
+ if ( $this_arg eq '-h') {die "$usage\n";}
+ elsif ( $this_arg eq '-p') {$params_fname = shift @ARGV;}
+ elsif ( $this_arg eq '-e') {$exec_path = shift @ARGV;}
+ else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+my $die_pars_bad = "false"; ## die if parameters are bad
+my $bad_par = ""; ## store bad parameter here
+
+print "\n=====================\n\n";
+print "mandatory parameters: \n\n";
+print "params_file: $params_fname\n";
+print "exec_path: $exec_path\n";
+if( $params_fname eq ''){ $die_pars_bad = "true"; $bad_par." "."$params_fname"; }
+if( $exec_path eq ''){ $die_pars_bad = "true"; $bad_par." "."$exec_path"; }
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+ print "$usage";
+ print "Bad parameters: $bad_par\n";
+ exit(0);
+}
+## /reading arguments
+
+## top-level script
+open params_file, "< $params_fname" || die "$params_fname: $\n";
+my @params = <params_file>;
+close params_file;
+
+my $QuEST_align_file = "";
+##my @contigs;
+##my $reference_path = "";
+my $genome_table_fname = "";
+my $output_file = "";
+my $calc_window = "";
+my $count_threshold = "";
+
+for(my $i=0; $i<scalar(@params); $i++){
+ my $cur_param = $params[$i];
+ chomp($cur_param);
+ my @cur_par_fields = split(/ /, $cur_param);
+ my $cur_par_name = $cur_par_fields[0];
+ if( $cur_par_name eq "align_file" ){
+ $QuEST_align_file = $cur_par_fields[2];
+ }
+# elsif($cur_par_name eq "contigs"){
+# @contigs = split(/\,/, $cur_par_fields[2]);
+# }
+# elsif($cur_par_name eq "reference_path"){
+# $reference_path = $cur_par_fields[2];
+# }
+ elsif( $cur_par_name eq "genome_table" ){
+ $genome_table_fname = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "output_file"){
+ $output_file = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "calc_window"){
+ $calc_window = $cur_par_fields[2];
+ }
+ elsif($cur_par_name eq "count_threshold"){
+ $count_threshold = $cur_par_fields[2];
+ }
+ else{
+ if($params[$i] ne ""){
+ print "Warning: unrecognized parameter: $cur_par_name";
+ }
+ }
+}
+
+print "Read the following parameters: \n\n";
+print "align_file: $QuEST_align_file\n";
+#print "contigs: @contigs\n";
+#print "reference_path: $reference_path\n";
+print "genome_table_file: $genome_table_fname\n";
+print "output_file: $output_file\n";
+print "calc_window: $calc_window\n";
+print "count_threshold: $count_threshold\n";
+
+my $optional_param_string = "";
+if($calc_window ne ""){
+ $optional_param_string = "calc_window=$calc_window";
+}
+if($count_threshold ne ""){
+ $optional_param_string = $optional_param_string . " count_threshold=$count_threshold";
+}
+
+if( ! -e $genome_table_fname){
+ print "Error in Quick Window Scan batch script:\n";
+ print "Failed to locate genome table $genome_table_fname.\n";
+ print "Aborting.\n";
+ exit(0);
+}
+
+open genome_table_file, "< $genome_table_fname" || die "$genome_table_fname: $\n";
+my @genome_table = <genome_table_file>;
+close genome_table_file;
+
+my @contig_names;
+my @contig_sizes;
+my $contig_counter = 0;
+
+for(my $i=0; $i<scalar(@genome_table); $i++){
+ my $cur_entry = $genome_table[$i];
+ chomp($cur_entry);
+
+ my @cur_entry_fields = split(/ /, $cur_entry);
+ if( scalar(@cur_entry_fields) == 2 ){
+ $contig_names[$contig_counter] = $cur_entry_fields[0];
+ $contig_sizes[$contig_counter] = $cur_entry_fields[1];
+ $contig_counter++;
+ }
+}
+
+for(my $i=0; $i<scalar(@contig_names); $i++){
+ my $cur_contig_name = $contig_names[$i];
+ my $cur_contig_size = $contig_sizes[$i];
+ my $system_command = "$exec_path contig_id=$cur_contig_name contig_size=$cur_contig_size align_file=$QuEST_align_file output_file=$output_file.$cur_contig_name $optional_param_string";
+ #print "$system_command\n";
+ my $error_code = system("$system_command");
+ if( $error_code != 0 ){
+ print "run_quick_window_scan_with_param_file.pl: Caught an error message (code $error_code). Passing the error message to the trop script.\n";
+ exit(3);
+ }
+}
+
+my $output_file_list = "";
+for(my $i=0; $i<scalar(@contig_names); $i++){
+ my $cur_contig = $contig_names[$i];
+ if($i>0){
+ $output_file_list = $output_file_list . " ";
+ }
+ $output_file_list = $output_file_list . "$output_file.$cur_contig"
+}
+
+unlink("$output_file");
+unlink("$output_file.sorted");
+system("cat $output_file_list > $output_file");
+system("sort -nr -k3 $output_file > $output_file.sorted");
+#system("sort -nr +2 $output_file > $output_file.sorted");
+system("rm $output_file_list");
+
+## /top-level script
--- /dev/null
+CC = g++
+#CCFLAGS = -Wall -ansi -pedantic -D DEBUG -O4 -funroll-loops
+OBJECTS := $(patsubst %.cpp, %.o,$(wildcard *.cpp))
+MISC_DIR = ./../misc
+INCLUDE := $(MISC_DIR)
+
+include ../config.mk
+
+all: $(OBJECTS)
+
+%.o : %.cpp
+ $(CC) -I$(INCLUDE) $(CCFLAGS) -c $<
+
+clean:
+ rm -rf *.o
--- /dev/null
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+#include <vector>
+#include <list>
+#include <math.h>
+
+#include <time.h>
+#include <assert.h>
+
+#include "utils.h"
+#include "seq_contig.h"
+//#include "array_2d.h"
+//#include "ticker.h"
+#include "params.h"
+
+using namespace std;
+
+#define BLACK_BACKGROUND
+
+
+#define read_size 25
+
+int el(const int* counts, unsigned int i, unsigned int j){
+ return counts[i*3+j];
+}
+
+double weight(int x){
+
+ if(-50<= x && x<=50) return x/15000.0+1.0/300.0;
+ if(50<x && x<=250) return -1*x/(30000) + 1.0/120.0;
+ return 0;
+
+}
+
+double weight_for(int x){
+ return weight(-x);
+}
+double weight_rev(int x){
+ return weight(x);
+}
+
+double indicator_for(int x){
+ if (x >= -300 && x <= 300) return 1;
+ else return 0;
+ //if ( x>= -250 && x <= 50 ) return 1;
+ //else return 0;
+}
+double indicator_rev(int x){
+ if (x >= -300 && x <= 300) return 1;
+ else return 0;
+
+ //if ( x>= -50 && x <= 250) return 1;
+ //else return 0;
+}
+double normal_weight(int x, double sigma){
+ double pi=3.14;
+ return exp(-x*x/(2*sigma*sigma))/(sqrt(2*pi)*sigma);
+}
+
+int __int_min(int i, int j){
+ if ( i < j ) return i;
+ return j;
+}
+
+int __int_max(int i, int j){
+ if ( i > j) return i;
+ return j;
+}
+
+double max2min(double x, double y){
+ if( x > y) return x/y;
+ else return y/x;
+}
+
+vector<double> non_normalized_kde(vector<int> counts_vec, double kde_bandwidth){
+ int win_size = (int) (kde_bandwidth*5);
+
+ vector<double> scores(counts_vec.size());
+
+ for(int i=0; i<(int)scores.size(); i++){
+ scores[i] = 0;
+ for(int j=__int_max(0,i-win_size); j<__int_min(i+win_size, counts_vec.size()); j++){
+ scores[i] += counts_vec[j] * indicator_for(j-i) * normal_weight(j-i, kde_bandwidth);
+ }
+ }
+ return scores;
+}
+
+double mean(const vector<double>& vec){
+ double res = 0;
+ for(unsigned int i=0; i<vec.size(); i++){
+ res += vec[i] / vec.size();
+ }
+ return res;
+}
+
+double stdev(const vector<double>& vec){
+ double cur_mean = mean(vec);
+ double var = 0;
+
+ for(unsigned int i=0; i<vec.size(); i++){
+ var += (vec[i]-cur_mean)*(vec[i]-cur_mean) / (vec.size() - 1);
+ }
+ return sqrt(var);
+}
+
+double median(const vector<double>& vec){
+ vector<double> tmp_vec = vec;
+ sort(tmp_vec.begin(), tmp_vec.end());
+ int tmp_vec_size = tmp_vec.size();
+ if(tmp_vec_size%2 == 1){
+ return tmp_vec[(tmp_vec.size() - 1 ) / 2] + 1;
+ }
+ else return 0.5 * (tmp_vec[tmp_vec.size()/2 - 1] + tmp_vec[tmp_vec.size()/2] );
+}
+
+string bool2string(bool x){
+ string res;
+ if(x== true) res = "yes";
+ else res = "no";
+ return res;
+}
+
+int main(int argc, char* argv[]){
+
+ bool with_background_estimate = false;
+
+ params pars(argc, argv);
+ pars.require("quick_scan_file","quick_scan_file",STRING_TYPE);
+ pars.require("align_chip_file","align_chip_file",STRING_TYPE);
+ pars.require("align_background_file","align_background_file",STRING_TYPE);
+ pars.require("output_file","output_file",STRING_TYPE);
+
+ if(with_background_estimate)
+ pars.require("peak_free_regions_file","file containing regions free of binding sites",STRING_TYPE);
+
+ pars.optional("kde_bandwidth","kde_bandwidth","30",DOUBLE_TYPE);
+ pars.optional("method","estimation method","mean",STRING_TYPE);
+ pars.optional("scan_window","scan_window","1000",INT_TYPE);
+ pars.optional("quick_scan_tag_count_threshold","quick_scan_tag_count_threshold","0",INT_TYPE);
+ //pars.optional("aligned_part_length","length of the aligned part of solexa read","25",INT_TYPE);
+ pars.optional("top_count","quick scan entries to consider","100",INT_TYPE);
+ pars.optional("chip_to_background_fold","chip peak to background value ratio fold","20",DOUBLE_TYPE);
+ pars.optional("best_to_second_best_peak_ratio","ratio threshold of best peak to second best peak",
+ "20",DOUBLE_TYPE);
+ pars.optional("chip_peak_threshold","chip_peak_threshold","1",DOUBLE_TYPE);
+ pars.optional("chip_for_rev_peak_max_min_ratio","threshold ratio between forward and reverse peaks",
+ "1.5",DOUBLE_TYPE);
+
+ if(!pars.enforce()){
+ pars.list_all_params();
+ exit(1);
+ }
+ pars.list_all_params();
+
+ string quick_scan_fname = pars.get_string_value("quick_scan_file");
+ string align_chip_fname = pars.get_string_value("align_chip_file");
+ string align_background_fname = pars.get_string_value("align_background_file");
+ string output_fname = pars.get_string_value("output_file");
+ double kde_bandwidth = pars.get_double_value("kde_bandwidth");
+ int scan_window = pars.get_int_value("scan_window");
+ int quick_scan_tag_count_threshold = pars.get_int_value("quick_scan_tag_count_threshold");
+ //int aligned_part_length = pars.get_int_value("aligned_part_length");
+ int top_count = pars.get_int_value("top_count");
+
+ string peak_free_regions_fname = "";
+ if(with_background_estimate) peak_free_regions_fname = pars.get_string_value("peak_free_regions_file");
+
+ double chip_to_mock_peak_ratio = pars.get_double_value("chip_to_background_fold");
+ double best_to_second_best_ratio = pars.get_double_value("best_to_second_best_peak_ratio");
+ double chip_peak_threshold = pars.get_double_value("chip_peak_threshold");
+ double chip_for_rev_peak_ratio = pars.get_double_value("chip_for_rev_peak_max_min_ratio");
+
+ string estimation_method = pars.get_string_value("method");
+
+ if(estimation_method != "median" && estimation_method != "mean"){
+ cout<<"You have specified bad estimation method: "<<estimation_method<<endl;
+ cout<<"I was expecting median or mean"<<endl;
+ exit(0);
+ }
+
+ string dummy_string;
+
+ int good_chip_reads = 0;
+ int good_mock_reads = 0;
+
+ char sep = ' ';
+ char sep1 = ' ';
+ //char sep2 = ':';
+ char sep3 = char(9);
+
+ ifstream peak_free_regions_str;
+ vector<string> peak_free_regions_contig_ids;
+ vector<int> peak_free_regions_start_coord;
+ vector<int> peak_free_regions_end_coord;
+
+ int peak_free_chip_reads = 0;
+ int peak_free_mock_reads = 0;
+
+ if(with_background_estimate){
+ peak_free_regions_str.open(peak_free_regions_fname.c_str());
+ if(!peak_free_regions_str.good()){
+ cerr<<"bad file name: "<<peak_free_regions_fname<<endl;
+ exit(1);
+ }
+
+ while(peak_free_regions_str.good()){
+ getline(peak_free_regions_str, dummy_string);
+ if(peak_free_regions_str.good() && dummy_string[0] != '#'){
+ vector<string> cur_region_entries = split(dummy_string, sep3);
+ if(cur_region_entries.size()>= 3){
+ peak_free_regions_contig_ids.push_back(cur_region_entries[0]);
+ peak_free_regions_start_coord.push_back(atoi(cur_region_entries[1].c_str()));
+ peak_free_regions_end_coord.push_back(atoi(cur_region_entries[2].c_str()));
+ }
+ }
+ }
+
+ peak_free_regions_str.close();
+ cout<<"read: "<<peak_free_regions_contig_ids.size()<<" peak free regions"<<endl;
+ // exit(1);
+ }
+
+ ifstream quick_scan_str(quick_scan_fname.c_str());
+ if(!quick_scan_str.good()){
+ cerr<<"bad file name: "<<quick_scan_fname<<endl;
+ exit(1);
+ }
+
+ // do: reading quick scan peaks
+
+ vector<string> quick_scan_contig_ids;
+ vector<int> quick_scan_coords;
+ vector<int> quick_scan_tag_counts;
+
+ int peak_count = 0;
+
+ while(quick_scan_str.good() && peak_count < top_count){
+ getline(quick_scan_str, dummy_string);
+ if(quick_scan_str.good()){
+ vector<string> cur_pile_entries = split(dummy_string, sep);
+ if(cur_pile_entries.size() >= 3){ if(atoi(cur_pile_entries[2].c_str()) > quick_scan_tag_count_threshold){
+ quick_scan_contig_ids.push_back(cur_pile_entries[0]);
+ quick_scan_coords.push_back(atoi(cur_pile_entries[1].c_str()));
+ quick_scan_tag_counts.push_back(atoi(cur_pile_entries[2].c_str()));
+ peak_count++;
+ }
+ }
+ }
+ }
+
+ quick_scan_str.close();
+
+ // done: reading quick scan peaks
+
+ cout<<"read "<<quick_scan_contig_ids.size()<<" quick_scan peaks"<<endl;
+
+ ifstream chip_str(align_chip_fname.c_str());
+ if(!chip_str.good()){
+ cerr<<"bad file name: "<<align_chip_fname<<endl;
+ exit(1);
+ }
+
+ ifstream mock_str(align_background_fname.c_str());
+ if(!mock_str.good()){
+ cerr<<"bad file name: "<<align_background_fname<<endl;
+ exit(1);
+ }
+
+ ofstream out_str(output_fname.c_str());
+ if(!out_str.good()){
+ cerr<<"bad file name: "<<output_fname<<endl;
+ exit(1);
+ }
+
+ // do: some necessary inits
+
+ cout<<"Initializing arrays..."<<endl;
+
+ int region_size = scan_window;
+
+ vector<int> dummy_int_vec(region_size);
+
+ vector< vector <int> > chip_counts_for;
+ vector< vector <int> > chip_counts_rev;
+ vector< vector <int> > mock_counts_for;
+ vector< vector <int> > mock_counts_rev;
+
+ for(int i=0; i<region_size; i++){
+ dummy_int_vec[i] = 0;
+ }
+
+ for(unsigned int i=0; i<quick_scan_contig_ids.size(); i++){
+ chip_counts_for.push_back(dummy_int_vec);
+ chip_counts_rev.push_back(dummy_int_vec);
+ mock_counts_for.push_back(dummy_int_vec);
+ mock_counts_rev.push_back(dummy_int_vec);
+ }
+ //cout<<"done..."<<endl;
+ // done: some necessary inits
+
+ unsigned int chip_reads_for = 0;
+ unsigned int chip_reads_rev = 0;
+ unsigned int mock_reads_for = 0;
+ unsigned int mock_reads_rev = 0;
+
+ // do: load chip reads
+
+ cout<<"Loading ChIP data..."<<endl;
+ while(chip_str.good()){
+ getline(chip_str, dummy_string);
+ if(chip_str.good()){
+ if(dummy_string.length() > 0){
+ if( dummy_string[0] != '#'){
+
+ vector<string> cur_hit_entry = split(dummy_string, sep1);
+ if(cur_hit_entry.size() == 3){
+ good_chip_reads++;
+ //vector<string> cur_coord = split(cur_hit_entry[3], sep2);
+ string cur_read_contig_name = cur_hit_entry[0];
+ int cur_read_5p_coord = atoi(cur_hit_entry[1].c_str());
+ string cur_read_orient = cur_hit_entry[2];
+
+ int start_coord = cur_read_5p_coord;
+
+ //if(cur_hit_entry[4] == "F"){
+ // start_coord = atoi(cur_coord[1].c_str()) - 1;
+ // }
+ //else{
+ //assert(cur_hit_entry[4] == "R");
+ //start_coord = atoi(cur_coord[1].c_str()) + aligned_part_length - 1;
+ //}
+ //}
+ //cur_contig_id = cur_coord[0];
+
+ for(unsigned int i=0; i<quick_scan_contig_ids.size(); i++){
+ string cur_quick_scan_contig_id = quick_scan_contig_ids[i];
+ int cur_quick_scan_coord = quick_scan_coords[i];
+ //int cur_quick_scan_tag_count = quick_scan_tag_counts[i];
+
+ int cur_region_start_coord = __int_max(0, cur_quick_scan_coord - scan_window/2);
+ int cur_region_end_coord = cur_quick_scan_coord + scan_window/2;
+
+ if( start_coord >= cur_region_start_coord && start_coord <= cur_region_end_coord &&
+ cur_quick_scan_contig_id == cur_read_contig_name ){
+
+ if(cur_read_orient == "+"){
+ chip_counts_for[i][start_coord - cur_region_start_coord]++; chip_reads_for++;}
+ else{ chip_counts_rev[i][start_coord - cur_region_start_coord]++; chip_reads_rev++;}
+ }
+ }
+
+ if(with_background_estimate){
+ for(unsigned int i=0; i<peak_free_regions_contig_ids.size(); i++){
+ if(start_coord >= peak_free_regions_start_coord[i] &&
+ start_coord <= peak_free_regions_end_coord[i] &&
+ cur_read_contig_name == peak_free_regions_contig_ids[i]) peak_free_chip_reads++;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // done: load chip reads
+
+ // do: load mock reads
+ cout<<"Loading RX-noIP data..."<<endl;
+ while(mock_str.good()){
+ getline(mock_str, dummy_string);
+ if(mock_str.good() && dummy_string[0] != '#'){
+
+ vector<string> cur_hit_entry = split(dummy_string, sep1);
+ if(cur_hit_entry.size() == 3){ //>= 5){
+ good_mock_reads++;
+ //vector<string> cur_coord = split(cur_hit_entry[3], sep2);
+
+ string cur_read_contig_name = cur_hit_entry[0];
+ int cur_read_5p_coord = atoi(cur_hit_entry[1].c_str());
+ string cur_read_orient = cur_hit_entry[2];
+
+ int start_coord = cur_read_5p_coord;
+ //string cur_contig_id;
+
+ //if(cur_hit_entry[4] == "F"){
+ // start_coord = atoi(cur_coord[1].c_str()) - 1;
+ // }
+ //else{
+ // assert(cur_hit_entry[4] == "R");
+ // start_coord = atoi(cur_coord[1].c_str()) + aligned_part_length - 1;
+ //}
+ string cur_contig_id = cur_read_contig_name; //cur_coord[0];
+
+ for(unsigned int i=0; i<quick_scan_contig_ids.size(); i++){
+ string cur_quick_scan_contig_id = quick_scan_contig_ids[i];
+ int cur_quick_scan_coord = quick_scan_coords[i];
+ //int cur_quick_scan_tag_count = quick_scan_tag_counts[i];
+
+ int cur_region_start_coord = __int_max(0, cur_quick_scan_coord - scan_window/2);
+ int cur_region_end_coord = cur_quick_scan_coord + scan_window/2;
+
+ if( start_coord >= cur_region_start_coord && start_coord <= cur_region_end_coord &&
+ cur_quick_scan_contig_id == cur_contig_id){
+ if(cur_read_orient == "+"){
+ mock_counts_for[i][start_coord - cur_region_start_coord]++; mock_reads_for++;}
+ else{ mock_counts_rev[i][start_coord - cur_region_start_coord]++; mock_reads_rev++;
+ }
+ }
+ }
+ if(with_background_estimate){
+ for(unsigned int i=0; i<peak_free_regions_contig_ids.size(); i++){
+ if(start_coord >= peak_free_regions_start_coord[i] &&
+ start_coord <= peak_free_regions_end_coord[i] &&
+ cur_contig_id == peak_free_regions_contig_ids[i]) peak_free_mock_reads++;
+ }
+ }
+ }
+ }
+ }
+
+ chip_str.close();
+ mock_str.close();
+
+ // done: load mock reads
+ double chip_to_mock_adj_peak_ratio = chip_to_mock_peak_ratio;
+ if(with_background_estimate){
+ cout<<"peak_free_chip_reads:"<<peak_free_chip_reads<<endl;
+ cout<<"peak_free_mock_reads:"<<peak_free_mock_reads<<endl;
+
+ double chip_background_fraction = ( ((double)peak_free_chip_reads) / ((double)good_chip_reads)) /
+ (((double)peak_free_mock_reads) / ((double)good_mock_reads));
+ chip_to_mock_adj_peak_ratio = chip_to_mock_peak_ratio * ((double)good_chip_reads) /
+ ((double)good_mock_reads);
+
+ cout<<"chip background fraction: "<<chip_background_fraction<<endl;
+ }
+
+ // do: calculating densities
+
+ cout<<"calculating densities..."<<endl;
+
+ unsigned int qualified_peaks = 0;
+ vector<double> peak_shifts;
+
+ for(unsigned int i=0; i<quick_scan_contig_ids.size(); i++){
+ //printf("processing region: %d \r", i);
+ vector<double> chip_scores_for = non_normalized_kde(chip_counts_for[i], kde_bandwidth);
+ vector<double> chip_scores_rev = non_normalized_kde(chip_counts_rev[i], kde_bandwidth);
+ vector<double> mock_scores_for = non_normalized_kde(mock_counts_for[i], kde_bandwidth);
+ vector<double> mock_scores_rev = non_normalized_kde(mock_counts_rev[i], kde_bandwidth);
+
+ // do: find a maximum
+
+ bool chip_for_peak_good = false;
+ int chip_for_peak_coord = -1;
+ double chip_for_peak = 0;
+ double chip_for_enrich_ratio = 0;
+
+ vector<double> chip_for_peaks;
+
+ int chip_for_best_peak_coord = -1;
+
+ for(unsigned int j=1; j<chip_scores_for.size()-1; j++){
+ if(chip_scores_for[j] > chip_scores_for[j-1] &&
+ chip_scores_for[j] > chip_scores_for[j+1] ){
+ chip_for_peaks.push_back(chip_scores_for[j]);
+ }
+ }
+
+ //cout<<"found: "<<chip_for_peaks.size()<<" lmax peaks"<<endl;
+
+ sort(chip_for_peaks.begin(), chip_for_peaks.end());
+
+ if(chip_for_peaks.size() == 1 && chip_for_peaks[0] >= chip_peak_threshold)
+ chip_for_peak_good = true;
+
+ if(chip_for_peaks.size() > 1){
+ if(chip_for_peaks[chip_for_peaks.size()-1] / chip_for_peaks[chip_for_peaks.size()-2] >=
+ best_to_second_best_ratio) chip_for_peak_good = true;
+ }
+
+ if(chip_for_peak_good){
+ for(unsigned int j=0; j<chip_scores_for.size(); j++){
+ if(chip_scores_for[j] == chip_for_peaks[chip_for_peaks.size()-1])
+ chip_for_best_peak_coord = j;
+ }
+ assert(chip_for_best_peak_coord >= 0 && chip_for_best_peak_coord < (int)chip_scores_for.size());
+ }
+
+ if(chip_for_peak_good){
+ chip_for_peak_coord = chip_for_best_peak_coord;
+ chip_for_peak = chip_scores_for[chip_for_best_peak_coord];
+ chip_for_enrich_ratio =
+ chip_scores_for[chip_for_best_peak_coord] / mock_scores_for[chip_for_best_peak_coord];
+ //cout<<"peak_for good"<<endl;
+ //cout<<"chip_for_coord: "<<chip_for_best_peak_coord;
+ //cout<<" s: "<<chip_scores_for[chip_for_best_peak_coord]<<endl;
+ }
+ //else cout<<"peak_for bad"<<endl;
+
+ bool chip_rev_peak_good = false;
+ int chip_rev_peak_coord = -1;
+ double chip_rev_peak = 0;
+ double chip_rev_enrich_ratio = 0;
+
+ vector<double> chip_rev_peaks;
+
+ int chip_rev_best_peak_coord = -1;
+ for(unsigned int j=1; j<chip_scores_rev.size()-1; j++){
+ if(chip_scores_rev[j] > chip_scores_rev[j-1] &&
+ chip_scores_rev[j] > chip_scores_rev[j+1] ){
+ chip_rev_peaks.push_back(chip_scores_rev[j]);
+ }
+ }
+
+ //cout<<"found: "<<chip_rev_peaks.size()<<" lmax peaks"<<endl;
+
+ sort(chip_rev_peaks.begin(), chip_rev_peaks.end());
+
+ if(chip_rev_peaks.size() == 1 && chip_rev_peaks[0] >= chip_peak_threshold)
+ chip_rev_peak_good = true;
+
+ if(chip_rev_peaks.size() > 1){
+ if(chip_rev_peaks[chip_rev_peaks.size()-1] / chip_rev_peaks[chip_rev_peaks.size()-2] >=
+ best_to_second_best_ratio) chip_rev_peak_good = true;
+ }
+
+ if(chip_rev_peak_good){
+ for(unsigned int j=0; j<chip_scores_rev.size(); j++){
+ if(chip_scores_rev[j] == chip_rev_peaks[chip_rev_peaks.size()-1])
+ chip_rev_best_peak_coord = j;
+ }
+ assert(chip_rev_best_peak_coord >= 0 && chip_rev_best_peak_coord < (int)chip_scores_rev.size());
+ }
+
+ if(chip_rev_peak_good){
+ chip_rev_peak_coord = chip_rev_best_peak_coord;
+ chip_rev_peak = chip_scores_rev[chip_rev_best_peak_coord];
+ chip_rev_enrich_ratio =
+ chip_scores_rev[chip_rev_best_peak_coord] / mock_scores_rev[chip_rev_best_peak_coord];
+ //cout<<"peak_rev good"<<endl;
+ //cout<<"chip_rev_coord: "<<chip_rev_best_peak_coord;
+ //cout<<" s: "<<chip_scores_rev[chip_rev_best_peak_coord]<<endl;
+ }
+ //else cout<<"peak_for bad"<<endl;
+
+ if(chip_for_peak_good && chip_rev_peak_good &&
+ chip_for_enrich_ratio >= chip_to_mock_adj_peak_ratio &&
+ chip_rev_enrich_ratio >= chip_to_mock_adj_peak_ratio &&
+ max2min(chip_for_peak, chip_rev_peak) <= chip_for_rev_peak_ratio){
+ double ps = 0.5 * ( (double) (chip_rev_peak_coord - chip_for_peak_coord ) );
+ cout<<"i: "<<i<<" "<<quick_scan_contig_ids[i]<<" "<<quick_scan_coords[i]<<" +: "<<chip_for_peak<<" -: "<<chip_rev_peak<<" ps: "<<ps<<endl;
+ peak_shifts.push_back(ps);
+ qualified_peaks++;
+ }
+ else{
+ cout<<"i: "<<i<<" "<<quick_scan_contig_ids[i]<<" disqualified";
+ cout<<" chip_for_peak_good: "<<bool2string(chip_for_peak_good);
+ cout<<" chip_rev_peak_good: "<<bool2string(chip_rev_peak_good);
+ cout<<endl;
+ }
+
+ //exit(1);
+ }
+
+ cout<<"done"<<endl;
+
+ cout<<"median: "<<median(peak_shifts)<<endl;
+ cout<<"mean: "<<mean(peak_shifts)<<" stdev: "<<stdev(peak_shifts)<<endl;
+ cout<<"based on "<<qualified_peaks<<" qualified peaks"<<endl;
+ cout<<"estimation_method: "<<estimation_method<<endl;
+ // done: calculating densities
+
+ out_str<<"regions: "<<quick_scan_coords.size()<<endl;
+ out_str<<"used: "<<qualified_peaks<<endl;
+ if(estimation_method == "mean"){
+ out_str<<"peak_shift_estimate: "<<mean(peak_shifts)<<endl;
+ }
+ else{
+ out_str<<"peak_shift_estimate: "<<median(peak_shifts)<<endl;
+ }
+ out_str<<"estimation_method: "<<estimation_method;
+
+ out_str.close();
+ return 0;
+}
--- /dev/null
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+#include <vector>
+#include <list>
+#include <math.h>
+
+#include <assert.h>
+#include <time.h>
+#include <pthread.h>
+
+#include "utils.h"
+#include "seq_contig.h"
+#include "params.h"
+
+using namespace std;
+
+/*
+char random_char(){
+ string random_char_string = "aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ1234567890.";
+ int cur_rand_int = rand()%random_char_string.length();
+
+ return random_char_string[cur_rand_int];
+}
+
+string random_string(int string_length){
+ string result = "";
+ for(int i=0; i<string_length; i++){
+ result += random_char();
+ }
+ return result;
+}
+*/
+
+
+double normal_weight(double x, double sigma){
+ double pi=3.14;
+ return exp(-x*x/(2*sigma*sigma))/(sqrt(2.0*pi)*sigma);
+}
+
+
+struct pthread_pars{
+ //vector<unsigned int>* hit_counts_for_ptr;
+ //vector<unsigned int>* hit_counts_rev_ptr;
+ //vector<double>* score_ptr;
+
+ vector<int>* pos_hits;
+ vector<int>* neg_hits;
+ float* score_ptr;
+
+ //int pos_hits_size;
+ //int neg_hits_size;
+ int score_size;
+
+ double peak_shift;
+ double kde_bandwidth;
+
+ int calc_start_value;
+ int calc_end_value;
+
+ int thread_count;
+ unsigned int calc_window;
+};
+
+int int_max(int i, int j){
+ if ( i>j ) return i;
+ return j;
+}
+int int_min(int i, int j){
+ if (i<j) return i;
+ return j;
+}
+
+void* pscore(void* vptr){
+ pthread_pars* pars_ptr = (pthread_pars*)(vptr);
+
+ if ( !( pars_ptr->calc_start_value <= pars_ptr->calc_end_value )){
+ cerr<<"thread: "<<pars_ptr->thread_count<<endl;
+ cerr<<"start_count: "<<pars_ptr->calc_start_value<<endl;
+ cerr<<"end_count: "<<pars_ptr->calc_end_value<<endl;
+ assert(false);
+ }
+
+ assert( pars_ptr->calc_start_value >= 0);
+ int calc_window = pars_ptr->calc_window;
+ vector<int>* pos_hits = pars_ptr->pos_hits;
+ vector<int>* neg_hits = pars_ptr->neg_hits;
+ //int pos_hits_size = pars_ptr->pos_hits_size;
+ //int neg_hits_size = pars_ptr->neg_hits_size;
+ float* score = pars_ptr->score_ptr;
+ int score_size = pars_ptr->score_size;
+ int calc_start_value = pars_ptr->calc_start_value;
+ int calc_end_value = pars_ptr->calc_end_value;
+
+
+ //vector<unsigned int>* hit_counts_for_ptr = pars_ptr->hit_counts_for_ptr;
+ //vector<unsigned int>* hit_counts_rev_ptr = pars_ptr->hit_counts_rev_ptr;
+ double peak_shift = pars_ptr->peak_shift;
+ double kde_bandwidth = pars_ptr->kde_bandwidth;
+
+ /*
+ for(int i=pars_ptr->calc_start_value; i<pars_ptr->calc_end_value; i++){
+ for(int j=i-calc_window/2; j<=i+((int)calc_window)/2; j++){
+ (*(pars_ptr->score_ptr))[i] +=
+ ((double) (*hit_counts_for_ptr)[j] ) * normal_weight(j-i+peak_shift, kde_bandwidth) +
+ ((double) (*hit_counts_rev_ptr)[j] ) * normal_weight(j-i-peak_shift, kde_bandwidth);
+ }
+ }
+ */
+
+ //locating boundaries in the hits list
+ bool pos_start_found = false;
+ bool pos_end_found = false;
+ bool pos_end_exists = false;
+
+ int pos_start = -1;
+ int pos_end = -1;
+
+ bool neg_start_found = false;
+ bool neg_end_found = false;
+ bool neg_end_exists = false;
+
+ int neg_start = -1;
+ int neg_end = -1;
+
+ // looking for the window of pos hits corresponding to this region
+ for(int i=(int)pos_hits->size()-1; i>=0; i--){ //can replace this with the binary search
+ if((*pos_hits)[i] >= calc_start_value - calc_window){
+ pos_start_found = true;
+ pos_end_exists = true;
+ pos_start = i;
+ }
+ if((*pos_hits)[i] >= calc_end_value + calc_window){
+ pos_end_found = true;
+ pos_end = i;
+ }
+ }
+
+ if(pos_end_found == false && pos_end_exists == true){
+ if(pos_hits->size() > 0){
+ pos_end_found = true;
+ pos_end = pos_hits->size();
+ }
+ }
+
+ // looking for the window of neg hits corresponding to this region
+ for(int i = (int)neg_hits->size()-1; i>=0; i--){ //can replace this with the binary search
+ if((*neg_hits)[i] >= calc_start_value - calc_window){
+ neg_start_found = true;
+ neg_end_exists = true;
+ neg_start = i;
+ }
+ if((*neg_hits)[i] >= calc_end_value + calc_window){
+ neg_end_found = true;
+ neg_end = i;
+ }
+ }
+
+ if(neg_end_found == false && neg_end_exists == true){
+ if(neg_hits->size() > 0){
+ neg_end_found = true;
+ neg_end = neg_hits->size();
+ }
+ }
+
+ //cout<<"pos_start: "<<pos_start<<" [ "<<calc_start_value<<" , "<<calc_end_value<<" ]"<<endl;
+ //cout<<"pos_end: "<<pos_end<<" [ "<<calc_start_value<<" , "<<calc_end_value<<" ]"<<endl;
+ //cout<<"neg_start: "<<neg_start<<" [ "<<calc_start_value<<" , "<<calc_end_value<<" ]"<<endl;
+ //cout<<"neg_end: "<<neg_end<<" [ "<<calc_start_value<<" , "<<calc_end_value<<" ]"<<endl;
+
+
+ if(pos_start_found && pos_end_found){
+ for(int i=pos_start; i<pos_end; i++){
+ int cur_pos_coord = (*pos_hits)[i];
+ for(int j=int_max(0, cur_pos_coord - calc_window/2 + (int)peak_shift);
+ j<=int_min(score_size, cur_pos_coord + calc_window/2 + (int)peak_shift); j++){
+ score[j] += normal_weight(j-cur_pos_coord-peak_shift, kde_bandwidth);
+ }
+ }
+ }
+
+ if(neg_start_found && neg_end_found){
+ for(int i=neg_start; i<neg_end; i++){
+ int cur_pos_coord = (*neg_hits)[i];
+ for(int j=int_max(0, cur_pos_coord - calc_window/2 - (int)peak_shift);
+ j<=int_min(score_size, cur_pos_coord + calc_window/2 - (int)peak_shift); j++){
+ score[j] += normal_weight(j-cur_pos_coord+peak_shift, kde_bandwidth);
+ }
+ }
+ }
+
+ /*
+ for(int i=pars_ptr->calc_start_value; i<pars_ptr->calc_end_value; i++){
+ for(int j=i-calc_window/2; j<=i+((int)calc_window)/2; j++){
+ (*(pars_ptr->score_ptr))[i] +=
+ ((double) (*hit_counts_for_ptr)[j] ) * normal_weight(j-i+peak_shift, kde_bandwidth) +
+ ((double) (*hit_counts_rev_ptr)[j] ) * normal_weight(j-i-peak_shift, kde_bandwidth);
+ }
+ }
+ */
+
+ return 0;
+}
+
+
+int main(int argc, char* argv[]){
+
+ params pars(argc, argv);
+ pars.require("align_file","align_file",STRING_TYPE);
+ pars.require("contig_id","contig_id",STRING_TYPE);
+ pars.require("contig_size","contig_size",INT_TYPE);
+ pars.require("output_score_file","output_score_file",STRING_TYPE);
+
+ pars.optional("peak_shift","peak_shift","50",INT_TYPE);
+ pars.optional("calc_window","calc_window","300",INT_TYPE);
+ pars.optional("kde_bandwidth","kde_bandwidth","30",DOUBLE_TYPE);
+ pars.optional("threads","threads","7",INT_TYPE);
+
+ if(!pars.enforce()){
+ exit(1);
+ }
+
+ cout<<endl;
+ pars.list_all_params();
+
+ string align_fname = pars.get_string_value("align_file");
+ string contig_id = pars.get_string_value("contig_id");
+ int contig_size = pars.get_int_value("contig_size");
+
+ if(contig_size <= 0){
+ cout<<"Error: wrong contig size: "<<contig_size<<endl;
+ exit(0);
+ }
+
+ string output_fname = pars.get_string_value("output_score_file");
+
+ int peak_shift = pars.get_int_value("peak_shift");
+ int calc_window = pars.get_int_value("calc_window");
+ double kde_bandwidth = pars.get_double_value("kde_bandwidth");
+ int thread_num = pars.get_int_value("threads");
+
+ cout<<endl<<"Will be using the following:"<<endl<<endl;
+ cout<<"peak_shift: "<<peak_shift<<endl;
+ cout<<"kde_bandwidth: "<<kde_bandwidth<<endl;
+ cout<<"calc_window: "<<calc_window<<endl;
+
+ cout<<"align_file: "<<align_fname<<endl;
+
+ cout<<endl;
+
+ ifstream hit_str(align_fname.c_str());
+ if(!hit_str.good()){
+ cerr<<"bad file name: "<<align_fname<<endl;
+ exit(1);
+ }
+
+ cout<<"processing contig: "<<contig_id<<" [ "<<contig_size<<" bps ]"<<endl;
+
+
+ remove(output_fname.c_str());
+ ofstream out_str(output_fname.c_str());
+ if(!out_str.good()){
+ cerr<<"bad file name: "<<output_fname<<endl;
+ exit(1);
+ }
+
+ /*
+ cout<<"allocating memory..."<<endl;
+ vector<unsigned int> hit_counts_for(contig_size);
+ vector<unsigned int> hit_counts_rev(contig_size);
+ vector<double> score(contig_size);
+
+ for(int i=0; i<contig_size; i++){
+ hit_counts_for[i] = 0;
+ hit_counts_rev[i] = 0;
+ score[i] = 0;
+ }
+ */
+
+ cout<<"reading align_file..."<<endl;
+
+ int entries_pos = 0;
+ int entries_neg = 0;
+ string dummy_string;
+
+ char sep1 = ' ';
+
+ while(hit_str.good()){
+ getline(hit_str, dummy_string);
+ if(hit_str.good()){
+ if(dummy_string.length() > 0){
+ if( dummy_string[0] != '#'){
+
+ vector<string> cur_hit_entry_fields = split(dummy_string, sep1);
+ if(cur_hit_entry_fields.size() == 3){
+ string cur_hit_contig_name = cur_hit_entry_fields[0];
+ int cur_hit_5p_coord = atoi(cur_hit_entry_fields[1].c_str());
+ string cur_hit_orient = cur_hit_entry_fields[2];
+
+ if(cur_hit_orient != "+" && cur_hit_orient != "-"){
+ cout<<"Warning: expected +/- for orientation of read but found "<<cur_hit_orient;
+ cout<<". Skipping."<<endl;
+ }
+ else{
+
+ if(cur_hit_contig_name == contig_id){
+ if(cur_hit_5p_coord < 0 || cur_hit_5p_coord >= contig_size){
+ cout<<"Warning read coordinate "<<cur_hit_5p_coord<<" is out of boundaries [ 0,";
+ cout<<contig_size<<" ]. Skipping."<<endl;
+ }
+ else{
+
+ //entries++;
+
+ //int start_coord = cur_hit_5p_coord;
+
+ if(cur_hit_orient == "+"){
+ entries_pos++;
+ //hit_counts_for[start_coord]++;
+ }
+ else{ // "-"
+ entries_neg++;
+ //hit_counts_rev[start_coord]++;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ hit_str.close();
+
+ cout<<"read: "<<entries_pos<<" + entries "<<endl;
+ cout<<"read: "<<entries_neg<<" - entries "<<endl;
+
+ //int* pos_hits = new int [entries_pos];
+ //int* neg_hits = new int [entries_neg];
+
+ vector<int> pos_hits(entries_pos);
+ vector<int> neg_hits(entries_neg);
+
+ ifstream hit_str_pass2(align_fname.c_str());
+ if(!hit_str_pass2.good()){
+ cerr<<"Failed to read the file "<<align_fname<<endl;
+ exit(1);
+ }
+
+
+ int entries_pos_pass2 = 0;
+ int entries_neg_pass2 = 0;
+
+ while(hit_str_pass2.good()){
+ getline(hit_str_pass2, dummy_string);
+ if(hit_str_pass2.good()){
+ if(dummy_string.length() > 0){
+ if( dummy_string[0] != '#'){
+
+ vector<string> cur_hit_entry_fields = split(dummy_string, sep1);
+ if(cur_hit_entry_fields.size() == 3){
+ string cur_hit_contig_name = cur_hit_entry_fields[0];
+ int cur_hit_5p_coord = atoi(cur_hit_entry_fields[1].c_str());
+ string cur_hit_orient = cur_hit_entry_fields[2];
+
+ if(cur_hit_orient != "+" && cur_hit_orient != "-"){
+ cout<<"Warning: expected +/- for orientation of read but found "<<cur_hit_orient;
+ cout<<". Skipping."<<endl;
+ }
+ else{
+ if(cur_hit_contig_name == contig_id){
+ if(cur_hit_5p_coord < 0 || cur_hit_5p_coord >= contig_size){
+ cout<<"Warning read coordinate "<<cur_hit_5p_coord<<" is out of boundaries [ 0,";
+ cout<<contig_size<<" ]. Skipping."<<endl;
+ }
+ else{
+ int start_coord = cur_hit_5p_coord;
+
+ if(cur_hit_orient == "+"){
+ if(entries_pos_pass2 >= entries_pos){
+ cout<<"reading out of bounds. Aborting."<<endl;
+ exit(1);
+ }
+ pos_hits[entries_pos_pass2] = start_coord;
+ entries_pos_pass2++;
+ //hit_counts_for[start_coord]++;
+ }
+ else{ // "-"
+ if(entries_neg_pass2 >= entries_neg){
+ cout<<"reading out of bounds. Aborting."<<endl;
+ exit(1);
+ }
+ neg_hits[entries_neg_pass2] = start_coord;
+ entries_neg_pass2++;
+ //hit_counts_rev[start_coord]++;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ hit_str_pass2.close();
+
+ //cout<<"read: "<<entries_pos_pass2<<" positive reads"<<endl;
+ //cout<<"read: "<<entries_neg_pass2<<" negative reads"<<endl;
+
+ cout<<"sorting hits"<<endl;
+ sort(pos_hits.begin(), pos_hits.end());
+ sort(neg_hits.begin(), neg_hits.end());
+
+ //cout<<"enter something "<<endl;
+ //int blah;
+ //cin>>blah;
+ //cout<<"you've entered "<<blah<<endl;
+
+ cout<<"allocating memory for density profile CDP"<<endl;
+ float *scores = new float[contig_size];
+ cout<<"zeroing the score array"<<endl;
+ for(int i=0; i<contig_size; i++){
+ scores[i] = 0;
+ }
+
+ //int blah;
+ //cin>>blah;
+ //cout<<"you've entered "<<blah<<endl;
+
+
+ cout<<"calculating scores"<<endl;
+
+ cout<<"using "<<thread_num<<" parallel threads"<<endl;
+
+ pthread_t* threads = new pthread_t[thread_num];
+ pthread_pars* pars_t = new pthread_pars[thread_num];
+
+ int set_size = ( ( contig_size - calc_window ) - ( calc_window ) );
+ int remainder = set_size % thread_num;
+ int portion = (set_size - remainder) / thread_num;
+
+ int last_end = calc_window;
+
+ for(int i=0; i<thread_num; i++){
+
+ int cur_start_value = last_end;
+ int cur_end_value;
+
+ if( i < remainder ) cur_end_value = cur_start_value + portion + 1;
+ else cur_end_value = cur_start_value + portion;
+
+ assert( cur_end_value > cur_start_value);
+ assert( cur_start_value >= 0);
+
+ last_end = cur_end_value;
+
+ //pars_t[i].hit_counts_for_ptr = &hit_counts_for;
+ //pars_t[i].hit_counts_rev_ptr = &hit_counts_rev;
+ pars_t[i].pos_hits = &pos_hits;
+ pars_t[i].neg_hits = &neg_hits;
+
+ //pars_t[i].pos_hits_size = pos_hits.size();//entries_pos;
+ //pars_t[i].neg_hits_size = neg_hits.size(); //entries_neg;
+
+ pars_t[i].score_ptr = scores;
+ pars_t[i].score_size = contig_size;
+
+ pars_t[i].kde_bandwidth = kde_bandwidth;
+ pars_t[i].peak_shift = peak_shift;
+ pars_t[i].calc_start_value = cur_start_value;
+ pars_t[i].calc_end_value = cur_end_value;
+ pars_t[i].thread_count = i;
+ pars_t[i].calc_window = calc_window;
+
+ if( i == thread_num - 1){
+ if( cur_end_value != (int)contig_size - calc_window ){
+ cerr<<"failed to match the end. expected: "<<contig_size - calc_window;
+ cerr<<" but obtained: "<<cur_end_value<<endl;
+ assert(false);
+ }
+ }
+ }
+
+ for(int i=0; i<thread_num; i++){
+ pthread_create( threads+i, NULL, pscore, (void*)(pars_t+i) );
+ }
+
+ for(int i=0; i<thread_num; i++){
+ pthread_join( *(threads+i), NULL);
+ }
+
+
+ cout<<"saving the score file"<<endl;
+
+ int entries = entries_pos + entries_neg;
+
+ out_str.write((char*)(&contig_size), sizeof(contig_size));
+ out_str.write((char*)(&entries), sizeof(entries));
+ out_str.write((char*)(&scores[0]), sizeof(scores[0])*contig_size);
+
+ out_str.close();
+
+ delete[] threads;
+ delete[] pars_t;
+ delete[] scores;
+
+ // delete threads;
+ // delete pars_t;
+ // delete scores;
+
+ return 0;
+
+}
--- /dev/null
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+#include <vector>
+#include <list>
+//#include <algorithm>
+#include <math.h>
+#include <assert.h>
+#include <time.h>
+
+using namespace std;
+
+#define BLACK_BACKGROUND
+
+//#include "ticker.h"
+//#include "seq_contig.h"
+#include "params.h"
+
+#define read_size 25
+
+int main(int argc, char* argv[]){
+
+ params pars(argc, argv);
+ pars.require("score_file1","score_file1",STRING_TYPE);
+ pars.require("score_file2","score_file2",STRING_TYPE);
+ pars.require("output_file","output_file",STRING_TYPE);
+ pars.require("chip_threshold","chip_threshold",STRING_TYPE);
+ pars.optional("peak_caller_method","maximum/local_maximum","local_maximum",STRING_TYPE);
+ pars.optional("local_maximum_radius","local_maximum_radius","10",INT_TYPE);
+ pars.require("contig_id","contig_id",STRING_TYPE);
+ pars.require("background_threshold","background_threshold",DOUBLE_TYPE);
+ pars.optional("dip_fraction","dip_fraction","0.1",DOUBLE_TYPE);// how much of the lowest
+ //pars.optional("above_threshold_mult","above_threshold_mult","30.0",DOUBLE_TYPE);
+ pars.require("rescue_ratio","rescue_ratio",DOUBLE_TYPE);
+ //peak the graph should dip before going for another peak
+
+ if(!pars.enforce()) exit(1);
+
+ string scores_fname1 = pars.get_string_value("score_file1");
+ string scores_fname2 = pars.get_string_value("score_file2");
+ string output_fname = pars.get_string_value("output_file");
+ double score_threshold = pars.get_double_value("chip_threshold");
+ string peak_caller_method = pars.get_string_value("peak_caller_method");
+ string contig_id = pars.get_string_value("contig_id");
+ int local_maximum_radius = pars.get_int_value("local_maximum_radius");
+ double background_threshold = pars.get_double_value("background_threshold");
+ double dip_fraction = pars.get_double_value("dip_fraction");
+ double above_threshold_mult = pars.get_double_value("rescue_ratio");
+
+ if(peak_caller_method == "maximum" || peak_caller_method == "local_maximum"){}
+ else{
+ cerr<<"bad peak_caller_method: "<<peak_caller_method<<endl;
+ exit(1);
+ }
+ //string contig_id(argv[2]);
+ //cout<<"processing contig: "<<contig_id<<endl;
+
+ //string ref_str_fname(argv[1]);
+ //ifstream ref_str(ref_str_fname.c_str());
+ //if(!ref_str.good()){
+ // cerr<<"bad file: "<<ref_str_fname<<endl;
+ // exit(1);
+ // }
+ cout<<endl<<"contig: "<<contig_id<<endl;
+ cout<<"score_file1: "<<scores_fname1<<endl;
+ cout<<"score_file2: "<<scores_fname2<<endl;
+ cout<<"score_threshold: "<<score_threshold<<endl;
+ cout<<"background_threshold: "<<background_threshold<<endl;
+
+
+ ifstream scores_str1(scores_fname1.c_str());
+ if(!scores_str1.good()){
+ cerr<<"bad file: "<<scores_fname1<<endl;
+ exit(1);
+ }
+
+ ifstream scores_str2(scores_fname2.c_str());
+ if(!scores_str2.good()){
+ cerr<<"bad file: "<<scores_fname2<<endl;
+ exit(1);
+ }
+
+ /*
+ seq_contig ref_contig;
+ int load_contig_msg = ref_contig.load_contig_from_file(contig_id, ref_str);
+ if(load_contig_msg != SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE){
+ cerr<<"failed to load contig [ "<<contig_id<<" ] from file "<<ref_str_fname<<endl;
+ exit(1);
+ }
+ else{
+ cerr<<"loaded "<<ref_contig.seq.length()<<" letters"<<endl;
+ }
+
+ ref_str.close();
+ */
+
+ cout<<"loading the scores"<<endl;
+
+ unsigned int _size1, _size2;
+ int entries1, entries2;
+
+ scores_str1.read((char*) (&_size1), sizeof(_size1));
+ scores_str2.read((char*) (&_size2), sizeof(_size2));
+
+ cout<<"_size1: "<<_size1<<endl;
+ cout<<"_size2: "<<_size2<<endl;
+ assert(_size1 == _size2);
+
+ scores_str1.read((char*) (&entries1), sizeof(entries1));
+ scores_str2.read((char*) (&entries2), sizeof(entries2));
+
+ cout<<"the scores file contains "<<_size1<<" entries"<<endl;
+
+ cout<<"score file 1: "<<entries1<<" tags."<<endl;
+ cout<<"score file 2: "<<entries2<<" tags."<<endl;
+
+ vector<float> scores1(_size1);
+ vector<float> scores2(_size2);
+
+ cout<<"loading the score values..."<<endl;
+ scores_str1.read((char*) &(scores1[0]), sizeof(scores1[0])*_size1);
+ cout<<"file1 loaded."<<endl;
+ scores_str2.read((char*) &(scores2[0]), sizeof(scores2[0])*_size2);
+ cout<<"file2 loaded."<<endl;
+
+ //for(int i=0; i<_size; i++){
+ //if(scores[i] > 20) { cout<<"i: "<<i<<" s: "<<scores[i]<<endl;/* exit(1);*/}
+ //}
+ //cout<<"done reading! "<<scores[100]<<" "<<scores[101]<<endl;
+
+ assert(_size1 = _size2);
+ unsigned int _size = _size1;
+
+ //assert(_size == ref_contig.seq.length());
+
+ //string out_fname(argv[5]);
+
+ remove(output_fname.c_str());
+ ofstream out_str(output_fname.c_str());
+ if(!out_str.good()){
+ cerr<<"bad file name: "<<output_fname<<endl;
+ exit(1);
+ }
+
+ //double thresh = 20;//30;//50;
+
+ cout<<"score_threshold: "<<score_threshold<<endl;
+ cout<<"background_threshold: "<<background_threshold<<endl;
+ cout<<"dip_fraction: "<<dip_fraction<<endl;
+ cout<<"above_threshold_mult: "<<above_threshold_mult<<endl;
+
+ cout<<"analyzing..."<<endl;
+ unsigned int peak_calls = 0;
+ unsigned int last_start_pos = 0;
+ //double max_value = -1;
+ //unsigned int max_pos = 0;
+ double last_score = -1;
+ for(unsigned int i=0; i<_size; i++){
+ double cur_score = scores1[i];
+
+ if(i>0){
+ if(cur_score >= score_threshold){
+ if(last_score < score_threshold){
+ last_start_pos = i;
+ //max_value = cur_score;
+ //max_pos =i;
+ }
+ //else{
+ //if(cur_score > max_value){
+ //max_value = cur_score; max_pos = i;
+ //}
+ //}
+ }
+ else{
+ if(last_score >= score_threshold){
+ //resolve peaks
+ //if(peak_caller_method == "maximum"){
+ unsigned int region_start = last_start_pos;
+ unsigned int region_end = i;
+
+ double max_score = -1;
+ unsigned int max_pos = 0;
+ for(unsigned int j=region_start; j<region_end; j++){
+ if(j==region_start){ max_score = scores1[j]; max_pos = j; }
+ else{
+ if( scores1[j] > max_score ){ max_score = scores1[j]; max_pos = j; }
+ }
+ }
+ //if(scores2[max_pos] < background_threshold ){
+ {
+ if(peak_caller_method == "maximum"){
+ if(scores2[max_pos] < background_threshold ||
+ (scores2[max_pos] >= background_threshold &&
+ scores1[max_pos] >= above_threshold_mult * scores2[max_pos] ) ){
+ out_str<<contig_id<<" w: "<<i-1-last_start_pos<<" max: "<<max_score<<" at: "<<max_pos<<" null_s: "<<scores2[max_pos]<<endl;
+ peak_calls++;
+ }
+ }
+ if(peak_caller_method == "local_maximum"){
+ unsigned int region_start = last_start_pos;
+ unsigned int region_end = i;
+ vector<unsigned int> candidates;
+ vector<unsigned int> local_max_peaks;
+ vector<unsigned int> past_filter_peaks;
+ for(unsigned int j=region_start+2; j<region_end; j++){
+ if(scores1[j-2] <= scores1[j-1] && scores1[j-1] >= scores1[j])
+ candidates.push_back(j-1);
+ }
+ for(unsigned int j=0; j<candidates.size(); j++){
+ bool local_max = true;
+ unsigned int cur_max_pos = candidates[j];
+ for(unsigned int k= cur_max_pos - local_maximum_radius;
+ k <= cur_max_pos + local_maximum_radius; k++){
+ if( k!=cur_max_pos){ if(scores1[k] > scores1[cur_max_pos]) local_max = false; }
+ }
+
+ if(local_max) local_max_peaks.push_back(cur_max_pos);
+ }
+ switch(local_max_peaks.size()){
+ //if(local_max_peaks.size() == 1)
+ case 0:{ break; }
+ case 1:{ past_filter_peaks.push_back(local_max_peaks[0]); break;}
+ default:{
+ //else{
+ vector<int> local_max_peaks_mask(local_max_peaks.size());
+ for(unsigned int k=0; k<local_max_peaks_mask.size(); k++) local_max_peaks_mask[k] = 0;
+ bool max_pos_found = false;
+ unsigned int max_ind = 1000;
+ for(unsigned int k=0; k<local_max_peaks.size(); k++){
+ if(local_max_peaks[k] == max_pos){
+ max_pos_found = true;
+ max_ind = k;
+ }
+ }
+ if(!max_pos_found){
+ cerr<<"bad region: "<<endl;
+ cerr<<"[ "<<last_start_pos<<","<<i<<" ]"<<endl;
+ cerr<<"score["<<last_start_pos<<"] = "<<scores1[last_start_pos]<<endl;
+ assert(max_pos_found);
+ }
+ local_max_peaks_mask[max_ind] = 1; //selected;
+ unsigned int last_good_ind = max_ind;
+ for(unsigned int k=max_ind+1; k<local_max_peaks.size(); k++){
+ unsigned int last_good_pos = local_max_peaks[last_good_ind];
+ unsigned int cur_pos = local_max_peaks[k];
+ unsigned int smaller_peak, larger_peak;
+ if(scores1[last_good_pos] > scores1[cur_pos]){
+ smaller_peak = cur_pos; larger_peak = last_good_pos;
+ }
+ else{
+ smaller_peak = last_good_pos; larger_peak = cur_pos;
+ }
+ double dip_value = (1-dip_fraction)*scores1[smaller_peak];
+ bool dipped = false;
+ for(unsigned int m=last_good_pos; m<=cur_pos; m++){
+ if(scores1[m] <= dip_value) dipped = true;
+ }
+ if(dipped){
+ local_max_peaks_mask[k] = 1;
+ last_good_ind = k;
+ }
+ }
+
+ last_good_ind = max_ind;
+ for(int k=(int)max_ind-1; k>=0; k--){
+ unsigned int last_good_pos = local_max_peaks[last_good_ind];
+ unsigned int cur_pos = local_max_peaks[k];
+ unsigned int smaller_peak, larger_peak;
+ if(scores1[last_good_pos] > scores1[cur_pos]){
+ smaller_peak = cur_pos; larger_peak = last_good_pos;
+ }
+ else{
+ smaller_peak = last_good_pos; larger_peak = cur_pos;
+ }
+ double dip_value = (1-dip_fraction)*scores1[smaller_peak];
+ bool dipped = false;
+ assert(last_good_pos > cur_pos);
+ for(unsigned int m=last_good_pos; m>=cur_pos; m--){
+ if(scores1[m] <= dip_value) dipped = true;
+ }
+ if(dipped){
+ local_max_peaks_mask[k] = 1;
+ last_good_ind = k;
+ }
+ }
+
+ for(unsigned int k=0; k<local_max_peaks_mask.size(); k++){
+ if(local_max_peaks_mask[k] == 1) past_filter_peaks.push_back(local_max_peaks[k]);
+ }
+ break;
+ }
+ }
+ for(unsigned int k=0; k<past_filter_peaks.size(); k++){
+ unsigned int cur_peak_pos = past_filter_peaks[k];
+ if(scores2[max_pos] < background_threshold ||
+ (scores2[max_pos] >= background_threshold &&
+ scores1[max_pos] >= above_threshold_mult * scores2[max_pos] ) ){
+ //if(scores2[cur_peak_pos] < background_threshold){
+ //double locmax = scores1[cur_peak_pos];
+ //double nulls = scores2[cur_peak_pos];
+ //double ef = ( locmax / entries1 ) / ( nulls / entries2 ); //enrichment fold
+ out_str<<contig_id<<" w: "<<i-1-last_start_pos<<" locmax: "<<scores1[cur_peak_pos];
+ out_str<<" at: "<<cur_peak_pos<<" nulls: "<<scores2[cur_peak_pos];
+ //out_str<<" ef: "<<ef;
+ out_str<<endl;
+ peak_calls++;
+ }
+ }
+ out_str<<endl;
+ }
+ }
+ //out_str<<contig_id<<" w: "<<i-1-last_start_pos<<" max: "<<max_value<<" at: "<<max_pos<<" null_s: "<<scores2[max_pos]<<endl;
+ }
+ }
+ }
+ last_score = cur_score;
+ }
+
+ cout<<"called: "<<peak_calls<<" peaks"<<endl;
+ return 0;
+}
--- /dev/null
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+#include <vector>
+#include <list>
+#include <math.h>
+#include <assert.h>
+#include <time.h>
+
+#include "utils.h"
+#include "seq_contig.h"
+#include "params.h"
+
+using namespace std;
+
+#define BLACK_BACKGROUND
+
+
+//#define read_size 25
+
+int el(const int* counts, unsigned int i, unsigned int j){
+ return counts[i*3+j];
+}
+
+double weight(int x){
+
+ if(-50<= x && x<=50) return x/15000.0+1.0/300.0;
+ if(50<x && x<=250) return -1*x/(30000) + 1.0/120.0;
+ return 0;
+
+}
+
+double weight_for(int x){
+ return weight(-x);
+}
+double weight_rev(int x){
+ return weight(x);
+}
+
+double indicator_for(int x){
+ if (x >= -300 && x <= 300) return 1;
+ else return 0;
+ //if ( x>= -250 && x <= 50 ) return 1;
+ //else return 0;
+}
+double indicator_rev(int x){
+ if (x >= -300 && x <= 300) return 1;
+ else return 0;
+
+ //if ( x>= -50 && x <= 250) return 1;
+ //else return 0;
+}
+double normal_weight(int x, double sigma){
+ double pi=3.14;
+ return exp(-x*x/(2*sigma*sigma))/(sqrt(2*pi)*sigma);
+}
+
+int main(int argc, char* argv[]){
+
+ params pars(argc, argv);
+ pars.require("align_file","align_file",STRING_TYPE);
+ pars.require("contig_id","contig_id",STRING_TYPE);
+ pars.require("contig_size","contig_size",INT_TYPE);
+ //pars.require("ref_seq_file","ref_seq_file",STRING_TYPE);
+ pars.require("output_file","output_summary_file",STRING_TYPE);
+
+ pars.optional("calc_window","calc_window","300",INT_TYPE);
+ // pars.optional("kde_bandwidth","kde_bandwidth","30",DOUBLE_TYPE);
+ pars.optional("count_threshold","count_threshold","600",INT_TYPE);
+
+ if(!pars.enforce()){
+ pars.list_all_params();
+ exit(1);
+ }
+
+ pars.list_all_params();
+ string align_fname = pars.get_string_value("align_file");
+ string contig_id = pars.get_string_value("contig_id");
+ int contig_size = pars.get_int_value("contig_size");
+ //string ref_seq_fname = pars.get_string_value("ref_seq_file");
+ string output_fname = pars.get_string_value("output_file");
+
+ int calc_window = pars.get_int_value("calc_window");
+ int count_threshold = pars.get_int_value("count_threshold");
+
+ cout<<"align file : "<<align_fname<<endl;
+
+ //ifstream ref_str(ref_seq_fname.c_str());
+ //if(!ref_str.good()){
+ // cerr<<"bad file: "<<ref_seq_fname<<endl;
+ // exit(1);
+ //}
+
+ if(contig_size <= 0){
+ cout<<"Bad size of the contig: "<<contig_size<<endl;
+ cout<<"Aborting."<<endl;
+ exit(0);
+ }
+
+ ifstream hit_str(align_fname.c_str());
+ if(!hit_str.good()){
+ cerr<<"bad file name: "<<align_fname<<endl;
+ exit(1);
+ }
+
+ cout<<"processing contig: "<<contig_id<<endl;
+
+ remove(output_fname.c_str());
+ ofstream out_str(output_fname.c_str());//, ofstream::binary);
+ if(!out_str.good()){
+ cerr<<"bad file name: "<<output_fname<<endl;
+ exit(1);
+ }
+
+ //seq_contig ref_contig;
+ //int load_contig_msg = ref_contig.load_contig_from_file(contig_id, ref_str);
+ //if(load_contig_msg != SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE){
+ // cerr<<"failed to load contig [ "<<contig_id<<" ] from file "<<ref_seq_fname<<endl;
+ // exit(1);
+ // }
+ //else{
+ // cerr<<"loaded "<<ref_contig.seq.length()/1000000<<" M bps"<<endl;
+ //}
+ //unsigned int contig_size = ref_contig.seq.length();
+ //
+ //ref_str.close();
+
+
+ cout<<"allocating memory..."<<endl;
+ vector<unsigned int> hit_counts_for(contig_size);
+ vector<unsigned int> hit_counts_rev(contig_size);
+
+ for(int i=0; i<contig_size; i++){
+ hit_counts_for[i] = 0;
+ hit_counts_rev[i] = 0;
+ //score[i] = 0;
+ }
+
+ cout<<"reading align_file..."<<endl;
+
+ unsigned int entries = 0;
+ string dummy_string;
+
+ char sep1 = ' ';
+ //char sep2 = ':';
+ while(hit_str.good()){
+ getline(hit_str, dummy_string);
+ if(hit_str.good()){
+ if(dummy_string.length() > 0){
+ if(dummy_string[0] != '#'){
+
+ vector<string> cur_hit_entry = split(dummy_string, sep1);
+ //char dummy_char;
+ //cin>>dummy_char;
+
+ if(cur_hit_entry.size() == 3){ //> 4){
+
+ //vector<string> cur_coord = split(cur_hit_entry[3], sep2);
+ //assert(cur_coord.size() > 0);
+
+ string cur_read_contig_name = cur_hit_entry[0];
+ //cout<<cur_read_contig_name<<endl;
+ int cur_read_5p_coord = atoi(cur_hit_entry[1].c_str());
+ string cur_read_orient = cur_hit_entry[2];
+
+ if(cur_read_contig_name == contig_id){
+ //if(false){
+ entries++;
+ int start_coord;
+
+ //assert(cur_read_orient == "+" || cur_read_orient == "-");
+ if(cur_read_orient != "+" && cur_read_orient != "-"){
+ cout<<"Warning: in align file expected +/- for orientation but found: *"<<cur_read_orient<<"*"<<endl;
+ cout<<"Skipping this read."<<endl;
+ }
+ else{
+ start_coord = cur_read_5p_coord;
+ if(start_coord >= contig_size || start_coord <0){
+ cout<<"Warning: read coordinate "<<start_coord<<" is out of boundaries";
+ cout<<"[ 0, "<<contig_size<<" ]. Skipping this read."<<endl;
+ }
+ else{
+ if(cur_read_orient == "+"){ hit_counts_for[start_coord]++; }
+ else{ hit_counts_rev[start_coord]++; }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ hit_str.close();
+
+ cout<<"read: "<<entries<<" entries "<<endl;
+ cout<<"calculating scores"<<endl;
+
+ int cur_count = 0;
+ int cur_status = -1; //uninit
+ int cur_max = 0;
+ unsigned int cur_max_ind = 0;
+
+ for(unsigned int i=0; i<(unsigned int)calc_window; i++){
+ cur_count += hit_counts_for[i] + hit_counts_rev[i];
+ }
+
+ if(cur_count >= count_threshold){ //in the peak
+ cur_status = 1;
+ cur_max = cur_count;
+ cur_max_ind = calc_window/2-1;
+ }
+ else cur_status = 0; //waiting to get over the threshold
+
+ int peaks_called = 0;
+
+ for(int i=calc_window/2; i<(int) contig_size-(int)calc_window; i++){
+ //if(i%1000 == 0) print_tick_bar( ((double) i) / (0.01*((double) contig_size)));
+
+ cur_count += hit_counts_for[i + calc_window/2] + hit_counts_rev[i + calc_window/2]
+ - hit_counts_for[i-calc_window/2] - hit_counts_rev[i-calc_window/2];
+
+ switch (cur_status){
+ case 0: {
+ if (cur_count >= count_threshold){
+ cur_status = 1;
+ cur_max = cur_count;
+ cur_max_ind = i;
+ }
+ break;
+ }
+ case 1:{
+ if(cur_count < count_threshold){
+ out_str<<contig_id<<" "<<cur_max_ind<<" "<<cur_max<<endl;
+ cur_status = 0;
+ cur_max = 0;
+ peaks_called++;
+ }
+ else{
+ if(cur_count > cur_max){
+ cur_max = cur_count;
+ cur_max_ind = i;
+ }
+ }
+ break;
+ }
+ }
+ }
+
+ cout<<"identified "<<peaks_called<<" candidated regions"<<endl<<endl;
+
+ out_str.close();
+
+ return 0;
+
+}
--- /dev/null
+#!/usr/bin/env python
+# Time-stamp: <2008-05-21 18:19:59 Tao Liu>
+
+"""Module Description
+
+Copyright (c) 2008 Yong Zhang, Tao Liu <taoliu@jimmy.harvard.edu>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the Artistic License (see the file COPYING included
+with the distribution).
+
+@status: beta
+@version: $Revision$
+@author: Yong Zhang, Tao Liu
+@contact: taoliu@jimmy.harvard.edu
+"""
+
+# ------------------------------------
+# python modules
+# ------------------------------------
+
+import os
+import sys
+import logging
+import math
+from optparse import OptionParser
+# ------------------------------------
+# own python modules
+# ------------------------------------
+from MACS.gsl.cdf import binom_cdf_Q
+from MACS.TabIO import BEDParser, ELANDParser
+from MACS.PeakModel import PeakModel
+from MACS.PeakDetect import PeakDetect
+
+# ------------------------------------
+# Main function
+# ------------------------------------
+def main():
+ usage = "usage: %prog <-t tfile> [options]"
+ description = "MACS -- Model-based Analysis for ChIP-Sequencing"
+
+ # Parse options...
+ optparser = OptionParser(version="%prog 1.0 beta",description=description,usage=usage,add_help_option=False)
+ optparser.add_option("-h","--help",action="help",help="show this help message and exit.")
+ optparser.add_option("-t","--treatment",dest="tfile",type="string",
+ help="ChIP-seq treatment files. REQUIRED")
+ optparser.add_option("-c","--control",dest="cfile",type="string",
+ help="control files.")
+ optparser.add_option("--name",dest="name",type="string",
+ help="experiment name, which will be used to generate output file names. DEFAULT: \"NA\"",
+ default="NA")
+ optparser.add_option("--format",dest="format",type="string",
+ help="Format of tag file, \"ELAND\" or \"BED\". DEFAULT: \"BED\"",
+ default="bed")
+ optparser.add_option("--gsize",dest="gsize",type="int",default=2700000000,
+ help="genome size, default:2700000000")
+ optparser.add_option("--tsize",dest="tsize",type="int",default=25,
+ help="tag size. DEFAULT: 25")
+ optparser.add_option("--bw",dest="bw",type="int",default=300,
+ help="band width. DEFAULT: 300")
+ optparser.add_option("--pvalue",dest="pvalue",type="float",default=1e-5,
+ help="pvalue cutoff for peak detection. DEFAULT: 1e-5")
+ optparser.add_option("--mfold",dest="mfold",type="int",default=32,
+ help="select the regions with MFOLD high-confidence enrichment ratio against background to build model. DEFAULT:32")
+# optparser.add_option("--wig",dest="wig",action="store_true",
+# help="Whether or not to save the peak shape for every peak region in wiggle file",
+# default=False)
+ optparser.add_option("--verbose",dest="verbose",type="int",default=2,
+ help="set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
+ (options,args) = optparser.parse_args()
+
+ if not options.tfile:
+ optparser.print_help()
+ sys.exit(1)
+
+ # configure the logging instance
+ logging.basicConfig(level=(4-options.verbose)*10,
+ format='%(levelname)-5s @ %(asctime)s: %(message)s ',
+ datefmt='%a, %d %b %Y %H:%M:%S',
+ stream=sys.stderr,
+ filemode="w"
+ )
+
+ if not os.path.isfile (options.tfile):
+ logging.error("No such file: %s!" % options.tfile)
+ sys.exit(1)
+ if options.cfile and not os.path.isfile (options.cfile):
+ logging.error("No such file: %s!" % options.cfile)
+ sys.exit(1)
+
+ #0 output arguments
+ logging.info("#0 arguments list:")
+ logging.info("#0 name = %s" % (options.name))
+ logging.info("#0 ChIP-seq file = %s" % (options.tfile))
+ logging.info("#0 control file = %s" % (options.cfile))
+ logging.info("#0 effective genome size = %.2e" % (options.gsize))
+ logging.info("#0 tag size = %d" % (options.tsize))
+ logging.info("#0 band width = %d" % (options.bw))
+ logging.info("#0 model fold = %s" % (options.mfold))
+ #logging.info("#0 model peaks = %s" % (options.mpeaks))
+ logging.info("#0 pvalue cutoff = %.2e" % (options.pvalue))
+
+ log_pvalue = math.log(options.pvalue,10)*-10
+
+ #1 Read tag files
+ logging.info("#1 read tag files...")
+ format = options.format.upper()
+ (treat, control) = load_tag_files (options.tfile, options.cfile,format)
+ logging.debug("#1.3 calculate max duplicate tags in single position based on binomal distribution...")
+ max_dup_tags = cal_max_dup_tags(options.gsize,treat.total_w_duplicates)
+ logging.debug("#1.3 max_dup_tags = %d" % (max_dup_tags))
+ logging.debug("#1.3 unique tags in treatment: %d" % (treat.total))
+ logging.debug("#1.3 total tags in treatment: %d" % (treat.total_w_duplicates))
+ if control:
+ logging.debug("#1.3 unique tags in control: %d" % (control.total))
+ logging.debug("#1.3 total tags in control: %d" % (control.total_w_duplicates))
+ bg_redundant = bg_r (treat,max_dup_tags)
+ logging.info("#1.3 Background Redundant rate: %.2f" % (bg_redundant))
+ logging.info("#1.3 finished!")
+ #2 Build Model
+ logging.info("#2 Build Peak Model...")
+ peakmodel = PeakModel(treatment = treat,
+ gz = options.gsize, # mappable genome size
+ bw = options.bw,
+ fold = options.mfold,
+ max_pairnum = 1000,
+ ts = options.tsize,
+ bg = bg_redundant,
+ max_dup_tags = max_dup_tags
+ )
+ logging.info("#2 finished!")
+ logging.debug("#2 Summary Model:")
+ logging.debug("#2 min_tags: %d" % (peakmodel.min_tags))
+ logging.debug("#2 frag_length: %d" % (peakmodel.frag_length))
+ logging.debug("#2 scan_window: %d" % (peakmodel.scan_window))
+ logging.info("#2.2 Generate R script for model")
+ model2r_script(peakmodel,options.name)
+ #3 Call Peaks
+ logging.info("#3 Call peaks...")
+ peakdetect = PeakDetect(treat = treat,
+ control = control,
+ frag_length = peakmodel.frag_length,
+ scan_window = peakmodel.scan_window,
+ pvalue = log_pvalue,
+ gsize = options.gsize
+ )
+ peakdetect.call_peaks()
+ #4 output
+ #4.1 peaks in XLS
+ ofile_xls = options.name+"_peaks.xls"
+ logging.info("#4.1 Write output xls file... %s" % (ofile_xls))
+ ofhd_xls = open(ofile_xls,"w")
+ ofhd_xls.write("# This file is generated by MACS\n")
+ ofhd_xls.write("# name = %s\n" % (options.name))
+ ofhd_xls.write("# ChIP-seq file = %s\n" % (options.tfile))
+ ofhd_xls.write("# control file = %s\n" % (options.cfile))
+ ofhd_xls.write("# effective genome size = %.2e\n" % (options.gsize))
+ ofhd_xls.write("# tag size = %d\n" % (options.tsize))
+ ofhd_xls.write("# band width = %d\n" % (options.bw))
+ ofhd_xls.write("# model fold = %s\n" % (options.mfold))
+ ofhd_xls.write("# pvalue cutoff = %.2e\n" % (options.pvalue))
+ ofhd_xls.write("# frag_len = %d\n" % (peakdetect.frag_length))
+ ofhd_xls.write(peakdetect.toxls())
+ ofhd_xls.close()
+ #4.2 peaks in BED
+ ofile_bed = options.name+"_peaks.bed"
+ logging.info("#4.2 Write output bed file... %s" % (ofile_bed))
+ ofhd_bed = open(ofile_bed,"w")
+ ofhd_bed.write(peakdetect.tobed())
+ ofhd_bed.close()
+
+ #4.3 negative peaks in XLS
+ if options.cfile:
+ ofile_xls = options.name+"_negative_peaks.xls"
+ logging.info("#4.3 Write output xls file for negative peaks... %s" % (ofile_xls))
+ ofhd_xls = open(ofile_xls,"w")
+ ofhd_xls.write(peakdetect.neg_toxls())
+ ofhd_xls.close()
+
+ logging.info("#5 Done! Check the output files!\n")
+
+def bg_r (trackI, max_dup_tags):
+ total_tags = trackI.total_w_duplicates
+ total_duplicates = 0
+ chrs = trackI.get_chr_names()
+ chrs.sort()
+ for chrom in chrs:
+ countlist = trackI.get_comments_by_chr (chrom)
+ for c in countlist[0]:
+ total_duplicates += max(c-max_dup_tags,0)
+ for c in countlist[1]:
+ total_duplicates += max(c-max_dup_tags,0)
+ return float(total_duplicates)/(total_tags)
+
+
+def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
+ for i in range (1,1000):
+ if binom_cdf_Q(1.0/genome_size,tags_number,i) < p:
+ return i
+ raise Exception("LAMBDA > 1000!")
+
+def load_tag_files ( tfile, cfile, format ):
+ """From the options, load treatment tags and control tags (if available).
+
+ """
+ logging.info("#1.1 read treatment tags...")
+ treat = _merge_then_load(tfile,format)
+ treat.merge_overlap()
+ if cfile:
+ logging.info("#1.2 read input tags...")
+ control = _merge_then_load(cfile,format)
+ control.merge_overlap()
+ else:
+ control = None
+ return (treat, control)
+
+def _merge_then_load ( p,format ):
+ """Merge several tag files and load it to a single FWTrackI object.
+
+ """
+ #s = os.system("cat %s > macs_tmpfile.t" % p)
+ #if s == 0:
+ t = _load_tag_file(p,format)
+ #os.remove("macs_tmpfile.t")
+ #else:
+ # raise Exception("unable to cat files: %s!" % p)
+ return t
+
+def _load_tag_file ( f, format ):
+ """Load tag file into single FWTrackI object.
+
+ """
+ fhd = file(f,mode="r")
+ if format == "ELAND":
+ p = ELANDParser()
+ elif format == "BED":
+ p = BEDParser()
+ else:
+ raise Exception("Format \"%s\" cannot be recognized!" % (format))
+ trackI = p.build_fwtrack(fhd)
+ return trackI
+
+def model2r_script(model,name):
+ rfhd = open(name+"_model.r","w")
+ p = model.plus_line
+ m = model.minus_line
+ s = model.shifted_line
+ w = len(p)
+ norm_p = [0]*w
+ norm_m = [0]*w
+ norm_s = [0]*w
+ sum_p = sum(p)
+ sum_m = sum(m)
+ sum_s = sum(s)
+ for i in range(w):
+ norm_p[i] = float(p[i])*100/sum_p
+ norm_m[i] = float(m[i])*100/sum_m
+ norm_s[i] = float(s[i])*100/sum_s
+ rfhd.write("# R script for Peak Model\n")
+ rfhd.write("# -- generated by MACS\n")
+
+ rfhd.write("""p <- c(%s)
+m <- c(%s)
+s <- c(%s)
+x <- seq.int((length(p)-1)/2*-1,(length(p)-1)/2)
+pdf("%s_model.pdf",height=4,width=4)
+plot(x,p,type="l",col=c("red"),main="Peak Model",xlab="Distance to the middle",ylab="Percentage")
+lines(x,m,col=c("blue"))
+lines(x,s,col=c("black"))
+abline(v=median(x[p==max(p)]),lty=2,col=c("red"))
+abline(v=median(x[m==max(m)]),lty=2,col=c("blue"))
+abline(v=median(x[s==max(s)]),lty=2,col=c("black"))
+dev.off()
+""" % (','.join(map(str,norm_p)),','.join(map(str,norm_m)),','.join(map(str,norm_s)),name))
+ rfhd.close()
+
+if __name__ == '__main__':
+ try:
+ main()
+ except KeyboardInterrupt:
+ sys.stderr.write("User interrupt me! ;-) See you!\n")
+ sys.exit(0)
--- /dev/null
+chr2L 0 23011543
+chr2R 0 21146707
+chr3L 0 24543556
+chr3R 0 27905052
+chr4 0 1351856
+chrX 0 22422826
--- /dev/null
+chr1 0 247249718
+chr2 0 242951148
+chr3 0 199501826
+chr4 0 191273062
+chr5 0 180857865
+chr6 0 170899991
+chr7 0 158821423
+chr8 0 146274825
+chr9 0 140273251
+chr10 0 135374736
+chr11 0 134452383
+chr12 0 132349533
+chr13 0 114142979
+chr14 0 106368584
+chr15 0 100338914
+chr16 0 88827253
+chr17 0 78774741
+chr18 0 76117152
+chr19 0 63811650
+chr20 0 62435963
+chr21 0 46944322
+chr22 0 49691431
+chrX 0 154913753
+chrY 0 57772953
--- /dev/null
+chr1 0 197069961
+chr2 0 181976761
+chr3 0 159872111
+chr4 0 155029700
+chr5 0 152003062
+chr6 0 149525684
+chr7 0 145134093
+chr8 0 132085097
+chr9 0 124000668
+chr10 0 129959147
+chr11 0 121798631
+chr12 0 120463158
+chr13 0 120614377
+chr14 0 123978869
+chr15 0 103492576
+chr16 0 98252458
+chr17 0 95177419
+chr18 0 90736836
+chr19 0 61321189
+chrX 0 165556468
+chrY 0 16029403
--- /dev/null
+chr1 0 197195431
+chr2 0 181748086
+chr3 0 159599782
+chr4 0 155630119
+chr5 0 152537258
+chr6 0 149517036
+chr7 0 152524552
+chr8 0 131738870
+chr9 0 124076171
+chr10 0 129993254
+chr11 0 121843855
+chr12 0 121257529
+chr13 0 120284311
+chr14 0 125194863
+chr15 0 103494973
+chr16 0 98319149
+chr17 0 95272650
+chr18 0 90772030
+chr19 0 61342429
+chrX 0 166650295
+chrY 0 15902554
my %libs;
-my $BIOP = "$root_dir/bin/BioProspector/BioProspector.mac";
-my $QUESTDIR = "$root_dir/bin/QuEST/QuEST_ver_0.3";
-my $MACSDIR = "$root_dir/bin/MACS/bin";
-my $WINGPEAKSDIR = "$root_dir/bin/WingPeaks/bin";
-my $WINGPEAKSGENOMEDIR = "$root_dir/bin/WingPeaks/genome_cod";
-my $PROFILEDIR = "$root_dir/bin/";
+my $BIOP = "$root_dir/bin/BioProspector.mac";
+my $QUESTDIR = "$root_dir/bin/QuEST";
+my $MACSDIR = "$root_dir/bin";
+my $WINGPEAKSDIR = "$root_dir/bin";
+my $WINGPEAKSGENOMEDIR = "$root_dir/reference_data";
+my $PROFILEDIR = "$root_dir/bin";
my $GENOMEDIR = "/Volumes/Genomes";
-my $QPCRDIR = "$root_dir/bin/";
+my $QPCRDIR = "$root_dir/bin";
my $QPCRTESTDIR = "$root_dir/reference_data/qPCR_Tests";
my $QPCRBACKGROUND = "$root_dir/reference_data/GenericBackground";