Added auxillliary programs needed to run projects
authorTim Reddy Tim <treddy@hudsonalpha.org>
Tue, 19 Aug 2008 19:13:20 +0000 (19:13 +0000)
committerTim Reddy Tim <treddy@hudsonalpha.org>
Tue, 19 Aug 2008 19:13:20 +0000 (19:13 +0000)
39 files changed:
htswanalysis/bin/BioProspector.mac [new file with mode: 0755]
htswanalysis/bin/ChIPSeq_PeakCaller_ENCODE [new file with mode: 0755]
htswanalysis/bin/QuEST/Makefile [new file with mode: 0755]
htswanalysis/bin/QuEST/README.txt [new file with mode: 0644]
htswanalysis/bin/QuEST/calibrate_peak_shift [new file with mode: 0755]
htswanalysis/bin/QuEST/config.mk [new file with mode: 0644]
htswanalysis/bin/QuEST/configure.pl [new file with mode: 0755]
htswanalysis/bin/QuEST/extras/peakcalls2bed.pl [new file with mode: 0755]
htswanalysis/bin/QuEST/extras/rgb_colors.pm [new file with mode: 0644]
htswanalysis/bin/QuEST/generate_QuEST_parameters.pl [new file with mode: 0755]
htswanalysis/bin/QuEST/generate_peak_profile [new file with mode: 0755]
htswanalysis/bin/QuEST/misc/Makefile [new file with mode: 0755]
htswanalysis/bin/QuEST/misc/params.cpp [new file with mode: 0755]
htswanalysis/bin/QuEST/misc/params.h [new file with mode: 0755]
htswanalysis/bin/QuEST/misc/seq_contig.cpp [new file with mode: 0755]
htswanalysis/bin/QuEST/misc/seq_contig.h [new file with mode: 0755]
htswanalysis/bin/QuEST/misc/sequence_utility_functions.cpp [new file with mode: 0755]
htswanalysis/bin/QuEST/misc/sequence_utility_functions.h [new file with mode: 0755]
htswanalysis/bin/QuEST/misc/utils.cpp [new file with mode: 0755]
htswanalysis/bin/QuEST/misc/utils.h [new file with mode: 0755]
htswanalysis/bin/QuEST/peak_caller [new file with mode: 0755]
htswanalysis/bin/QuEST/quick_window_scan [new file with mode: 0755]
htswanalysis/bin/QuEST/run_QuEST_with_param_file.pl [new file with mode: 0755]
htswanalysis/bin/QuEST/run_calibrate_peak_shift_with_param_file.pl [new file with mode: 0755]
htswanalysis/bin/QuEST/run_generate_profile_with_param_file.pl [new file with mode: 0755]
htswanalysis/bin/QuEST/run_peak_caller_with_param_file.pl [new file with mode: 0755]
htswanalysis/bin/QuEST/run_quick_window_scan_with_param_file.pl [new file with mode: 0755]
htswanalysis/bin/QuEST/src/Makefile [new file with mode: 0755]
htswanalysis/bin/QuEST/src/calibrate_peak_shift.cpp [new file with mode: 0755]
htswanalysis/bin/QuEST/src/generate_peak_profile.cpp [new file with mode: 0755]
htswanalysis/bin/QuEST/src/peak_caller.cpp [new file with mode: 0755]
htswanalysis/bin/QuEST/src/quick_window_scan.cpp [new file with mode: 0755]
htswanalysis/bin/genomebg.mac [new file with mode: 0755]
htswanalysis/bin/macs [new file with mode: 0755]
htswanalysis/reference_data/dm3_chrlist.cod [new file with mode: 0644]
htswanalysis/reference_data/hg18_chrlist.cod [new file with mode: 0644]
htswanalysis/reference_data/mm8_chrlist.cod [new file with mode: 0644]
htswanalysis/reference_data/mm9_chrlist.cod [new file with mode: 0644]
htswanalysis/scripts/ConfigureTasks.pm

diff --git a/htswanalysis/bin/BioProspector.mac b/htswanalysis/bin/BioProspector.mac
new file mode 100755 (executable)
index 0000000..eeaed85
Binary files /dev/null and b/htswanalysis/bin/BioProspector.mac differ
diff --git a/htswanalysis/bin/ChIPSeq_PeakCaller_ENCODE b/htswanalysis/bin/ChIPSeq_PeakCaller_ENCODE
new file mode 100755 (executable)
index 0000000..a4ccffe
Binary files /dev/null and b/htswanalysis/bin/ChIPSeq_PeakCaller_ENCODE differ
diff --git a/htswanalysis/bin/QuEST/Makefile b/htswanalysis/bin/QuEST/Makefile
new file mode 100755 (executable)
index 0000000..860d5d7
--- /dev/null
@@ -0,0 +1,52 @@
+ROOT = .
+SRC_DIR = $(ROOT)/src/
+MISC_DIR = $(ROOT)/misc/
+BIN_DIR = $(ROOT)/bin/
+#../misc_files/
+CC = g++
+#CCFLAGS = -Wall -ansi -pedantic -D DEBUG -O4 -funroll-loops -pthread
+INCLUDE = $(MISC_DIR)
+COPY = cp
+
+include config.mk
+
+LOCAL_TARGETS =        generate_peak_profile peak_caller quick_window_scan calibrate_peak_shift
+
+LOCAL_EXECS = $(LOCAL_TARGETS)
+LOCAL_OBJECTS = $(LOCAL_TARGETS:=.o)
+OTHER_DEPENDENCIES = params utils seq_contig sequence_utility_functions
+OTHER_OBJECTS = $(OTHER_DEPENDENCIES:=.o)
+
+EXPANDED_OTHER_OBJECTS = $(addprefix $(MISC_DIR), $(OTHER_OBJECTS))
+
+default: all
+
+.PHONY: default all clean
+
+
+all: other_dependencies local_objects local_execs
+
+place:
+       mv $(LOCAL_TARGETS) $(BIN_DIR)
+
+local_execs: $(LOCAL_EXECS)
+
+local_objects:
+       cd $(SRC_DIR) && $(MAKE) $(LOCAL_OBJECTS)
+
+$(LOCAL_EXECS) : %: $(SRC_DIR)/%.o
+       echo $(INCLUDE)
+       $(CC) -I$(INCLUDE) $(CCFLAGS) $(SRC_DIR)$@.o $(EXPANDED_OTHER_OBJECTS) -o $@
+
+other_dependencies: other_objects
+
+other_objects:
+       cd $(MISC_DIR) && $(MAKE) $(OTHER_OBJECTS)
+
+clean:
+       rm -rf $(SRC_DIR)*.o
+       cd $(MISC_DIR) && $(MAKE) clean 
+
+all_clean:
+       rm -rf $(SRC_DIR)*.o
+       cd $(MISC_DIR) && $(MAKE) clean
diff --git a/htswanalysis/bin/QuEST/README.txt b/htswanalysis/bin/QuEST/README.txt
new file mode 100644 (file)
index 0000000..e03cd26
--- /dev/null
@@ -0,0 +1,179 @@
+current version: 0.3
+current development status: beta
+
+Disclosure:
+
+QuEST is in its beta state and is expected to run,
+but may contain minor bugs, glitches, and inconsistencies.
+If you discover any bugs or have any suggestions, 
+please contact Anton Valouev (valouev@stanford.edu).
+
+QuEST is a statistical package that is designed to analyze ChIP-Seq data
+and to produce positions of ChIP enrichment.
+
+QuEST is free of charge. You can use it for any purpose you like
+and modify/redistribute it in any way you like without permission of the
+authors.
+
+If you use QuEST or any deriviatives of its results, as well as
+the QuEST code in part or in whole for any purpose, please cite
+in your publications, patents and code:
+
+Valouev A, Johnson DS, Sundquist A, Medina C, Anton E, Batzoglou S,
+Myers RM, Sidow A. A Statistical Framework for Genome-Wide
+Identification of Transcription Factor Binding Sites Based on ChIP-Seq
+Data. 2008 (under review)
+
+===============================================================
+
+Target OS:
+
+QuEST was tested on Linux ES with kernel 2.6.9, and Mac OS 10.5
+with Perl and g++ compiler. You will also need a fair amount
+of disc space (~20 Gb for a run on a human) to store the score files.
+
+===============================================================
+
+What you will need to run QuEST: 
+
+1. ChIP data
+
+   Reads from the ChIP-Seq experiment. 
+   Supported formats: 
+            - Solexa align
+            - QuEST align
+
+   You will only need to specify one file.
+            
+2. RX_noIP data
+  
+   Reads from the RX_noIP experiment. This is also known as total chromatin or input.
+   Supported formats: 
+            - Solexa align
+            - QuEST align
+
+   You will only need to specify one file.
+
+3. Reference genome or genome table
+
+   For the reference genome, there should be one contig per .fa file.
+   Alternatively, you can provide a genome table containing contig names
+   and their sizes (see below). Supported formats: fasta; genome table.
+   Note that you need either the analysis path or the genome table. But not
+   both, or else the script will not run.
+
+==============================================================
+
+How to run QuEST:
+
+1. Configure compilation flags
+
+   This is done by changing into the QuEST dirstribution directory and
+   typing "./configure.pl".
+
+2. Compile QuEST:
+
+   cd into the QuEST distribution directory and type "make", no extra 
+   work is needed (you need to have g++ compiler).
+
+3. Configure QuEST to run with your data. This will generate all the necessary
+   configuration files/thresholds/etc:
+
+   command: generate_QuEST_parameters.pl
+   
+   flags:   
+           -solexa_align_ChIP <file containing ChIP reads in solexa align format (see below)>
+           -solexa_align_RX_noIP <file containing RX_noIP (aka input/total chromatin) reads
+            in solexa align format (see below)>
+           -rp <path to the reference genome containing .fa files>
+           -ap <analysis path where the QuEST results and all files will be located>
+
+   example:
+
+   $/home/agent_smith/QuEST/generate_QuEST_parameters.pl -solexa_align_ChIP /Volumes/Projects/GABP/GABP.align_25.hg18.txt -solexa_align_RX_noIP /Volumes/Projects/background/Jurkat_RX_noIP.align_25.hg18.txt -rp /Volumes/Projects/Genomes/Human/hg18/ -ap /Volumes/Projects/GABP/analysis/
+
+   flags:
+
+   to list flags call "generate_QuEST_parameters.pl"
+
+4. Run QuEST:
+
+   command: run_QuEST_with_param_file.pl
+
+   flags: 
+         -p <QuEST batch param file>   
+
+   example:
+
+   $/home/agent_smith/QuEST/run_QuEST_with_param_file.pl -p /Volumes/Projects/GABP/analysis/QuEST.batch.pars
+
+5. QuEST will output the following files of interest:
+
+  ChIP peak calls: peak_caller.ChIP.out
+
+  pseudo_ChIP peak calls: peak_caller.pseudo_ChIP.out  
+
+  QuEST summary: QuEST.out
+
+===================================================================
+
+File formats:
+
+1. Read position files:
+
+   - QuEST align file:
+   
+     These files contain five prime end coordinates and orientation
+     of reads (starting with 0) after mapping to the genome.
+   
+     <contig> <coordinate> <orientation>
+
+     chr2 13839273 -
+
+     forward/reverse orientation is indicated by having + or - in the last field
+
+   - Solexa align file:
+
+     These files are outputs of Solexa analysis pipeline and can be used
+     directly as inputs to QuEST software. The file should contain one
+     alignment per line according to the following format:
+
+     <read> <NA> <NA> <contig:coordinate> <orientation>
+     AAAACCACACAA.CACAACACCCAA 1250 1 chr11:56352492 F
+
+     the rest of the fields can be anything you want (NA means the program doesn't
+     care about the value of these fields).
+
+2. Reference genome:
+
+   - FASTA files: 
+
+     The program was tested with Human hg18 and D. melanogaster dm3 releases
+     downloaded from the UCSC Genome browser. Files are expected to contain
+     one contig per file. QuEST will convert the genome to a table file 
+     (see below) which you can use for further QuEST runs. The table will
+     be saved in the analysis path.
+
+  - Genome Table file:
+
+    This file contains the description of genome one contig per line using
+    the following format:
+    
+    <contig_name> <contig_size>
+    chr1 247249719
+
+3. QuEST peak calls:
+     
+   - peak_caller.ChIP.out; peak_caller.pseudo_ChIP.out
+
+     Files containg peak calls for ChIP and pseudo_ChIP analysis
+     using the following format:
+
+     <contig> w: <width of the region of enrichment> locmax: <ChIP enrichment value> at: <position relative to the contig position 0> nulls: <background enrichment value> ef: <enrichment fold normalized for the sequencing depth>
+
+     chr2L w: 754 locmax: 8.81752 at: 195765587 nulls: 0.627292 ef: 162.382
+     
+=============================================================
+
+Detailed description of individual QuEST modules and scripts will follow in
+later versions. Execute Perl scripts without parameters to list available options.
diff --git a/htswanalysis/bin/QuEST/calibrate_peak_shift b/htswanalysis/bin/QuEST/calibrate_peak_shift
new file mode 100755 (executable)
index 0000000..5034b25
Binary files /dev/null and b/htswanalysis/bin/QuEST/calibrate_peak_shift differ
diff --git a/htswanalysis/bin/QuEST/config.mk b/htswanalysis/bin/QuEST/config.mk
new file mode 100644 (file)
index 0000000..487e05d
--- /dev/null
@@ -0,0 +1 @@
+CCFLAGS = -Wall -ansi -pedantic -D DEBUG -O4 -funroll-loops -m64 -pthread
diff --git a/htswanalysis/bin/QuEST/configure.pl b/htswanalysis/bin/QuEST/configure.pl
new file mode 100755 (executable)
index 0000000..83f70af
--- /dev/null
@@ -0,0 +1,80 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $makefile_fname = "./Makefile";
+my $misc_path = "./misc/";
+my $misc_makefile_fname = $misc_path . "Makefile";
+my $config_out_fname = "./config.mk";
+
+my $uname_out = `uname -a`;
+
+my @uname_out_fields = split(/ /, $uname_out);
+
+my $OS = $uname_out_fields[0];
+my $Darwin_ver = "";
+if(scalar(@uname_out_fields) >= 3){
+    my @Darwin_ver_fields = split(/\./, $uname_out_fields[2]);
+    if(scalar(@Darwin_ver_fields) >= 3){
+       $Darwin_ver = $Darwin_ver_fields[0];
+    }
+}
+
+print "Testing OS. OS = $OS.\n";
+print "Testing compiler."; $|++;
+my $gpp_out = `which g++`;
+chomp($gpp_out);
+
+if( $gpp_out eq ""){
+    print "Error: g++ compiler not found.\n";
+}
+else{
+    print " Ok.\n";
+}
+
+#print "Testing pthread flag."; $|++;
+#my $tst_cpp_fname = "./test.cpp";
+#my $tst_exec = "./tst.out";
+#unlink ($tst_cpp_fname);
+#open test_cpp_file, "> $tst_cpp_fname" || die "$tst_cpp_fname: $!\n";
+#print test_cpp_file "\#include <iostream>\n";
+#print test_cpp_file "\#include <pthread.h>\n";
+#print test_cpp_file "using namespace std;\n";
+#print test_cpp_file "int main(){ cout<<\"pthreads work\"<<endl; return 0;}\n";
+
+#close test_cpp_file;
+#my @dummy = `g++ -pthread $tst_cpp_fname -o $tst_exec`;
+#my $tst_exec_out = `$tst_exec`;
+#chomp($tst_exec_out);
+
+#if($tst_exec_out ne "pthreads work"){
+#    print " pthreads failed.\n";
+#}
+#else{
+#    print " Ok.\n";
+#}
+#unlink($tst_exec_out);
+#unlink($tst_cpp_fname);
+
+
+if(-e $config_out_fname){
+    unlink($config_out_fname);
+}
+open config_out_file, "> $config_out_fname" || die "$config_out_fname: $!\n";
+
+my $CCFLAGS = "-Wall -ansi -pedantic -D DEBUG -O4 -funroll-loops"; # -pthread";
+
+if( $OS eq "Darwin" ){
+    if($Darwin_ver >= 9){
+       $CCFLAGS = $CCFLAGS . " -m64 -pthread";
+    }    
+}
+else{
+    $CCFLAGS = $CCFLAGS . " -pthread"
+}
+
+print config_out_file "CCFLAGS = $CCFLAGS\n";
+
+close config_out_file;
diff --git a/htswanalysis/bin/QuEST/extras/peakcalls2bed.pl b/htswanalysis/bin/QuEST/extras/peakcalls2bed.pl
new file mode 100755 (executable)
index 0000000..d35ba8e
--- /dev/null
@@ -0,0 +1,113 @@
+#!/usr/bin/perl
+
+## this program takes peak calls files and converts them
+## to bed file for UCSC browser where peak calls are
+## displayed as features
+
+use strict;
+use warnings;
+use diagnostics;
+
+#use lib '/home/shogo/tools/ucsc_tracks';
+use lib './ucsc_tracks';
+
+use rgb_colors;
+
+my $usage = qq{
+    peakcalls2bed
+
+    This program will take peak calls file and output
+    the reported peak calls as bed file that is usable
+    within UCSC genome browser.
+
+    -----------------------
+    mandatory parameters:
+    
+    -p peak_calls_file           file containeding peak calls
+    -o output_file               bed output file
+    
+    -----------------------
+    optional parameters:
+    
+    -w feature_width             feature width in the browser
+    -s feature_shift             feature shift in the browser
+    -n track_name                track name
+    -c feature_color             bed feature color
+
+};
+
+if(scalar(@ARGV) == 0){
+    print $usage;
+    exit(0);
+}
+
+my $peak_calls_fname = "";
+my $output_fname = "";
+
+## default arguments
+
+my $feature_width = 5;
+my $feature_shift = 0;
+my $track_name = "TF peak calls";
+#my $rgb_color_fname = "/home/shogo/dev/chip_seq/perl_code/rgb.colors";
+
+## parse command line arguments
+
+while(scalar(@ARGV) > 0){
+    my $this_arg = shift @ARGV;    
+    if ( $this_arg eq '-h') {die "$usage\n";}
+    if ( $this_arg eq '-p') {$peak_calls_fname = shift @ARGV;}
+    if ( $this_arg eq '-w') {$feature_width = shift @ARGV;}
+    if ( $this_arg eq '-s') {$feature_shift = shift @ARGV;}
+    if ( $this_arg eq '-o') {$output_fname = shift @ARGV;}
+    if ( $this_arg eq '-n') {$track_name = shift @ARGV;}
+}
+
+
+if ( $peak_calls_fname eq '' || $feature_width eq '' || $feature_shift eq '' || $output_fname eq '' ||
+     $track_name eq ''){
+    die "$usage\n";
+}
+
+my $track_color="";
+$track_color = main::get_random_color();
+
+print "track_color: $track_color\n";
+print "track_name: $track_name\n";
+
+open peak_calls_file, "< $peak_calls_fname" || die "$peak_calls_fname: $\n";
+
+unlink($output_fname);
+unlink($output_fname.".gz");
+open output_file, "> $output_fname" || die "$output_fname: $\n";
+
+print output_file "track name=\"$track_name\" description=\"$track_name\" visibility=2 itemRgb=\"On\"\n";
+
+while(<peak_calls_file>){
+    chomp;
+    if(length($_) > 0){
+       if(substr($_,0,1) ne "#"){
+           my @cur_entries = split(/ /, $_);
+           if(scalar(@cur_entries) > 5){
+               my $this_chr_id = $cur_entries[0];
+               my $this_coord = $cur_entries[6];
+               my $this_peak_value = $cur_entries[4];
+               my $browser_start_coord = $this_coord + 1 + $feature_shift - ($feature_width-1)/2;
+               my $browser_end_coord = $this_coord + 1 + $feature_shift + ($feature_width-1)/2;
+               
+               printf output_file "%s\t%d\t%d\t%.2f\t%.2f\t+\t%d\t%d\t%s\n", $this_chr_id, $browser_start_coord, 
+               $browser_end_coord, $this_peak_value, $this_peak_value, $browser_start_coord, 
+               $browser_end_coord, $track_color;
+               
+               #print output_file "$this_chr_id\t$browser_start_coord\t$browser_end_coord\t$this_peak_value\t0\t+";
+               #print output_file "\t$browser_start_coord\t$browser_end_coord\t$track_color\n";        
+               #print "$this_chr_id: $this_coord $this_peak_value\n";
+           }
+       }
+    }
+}
+
+close peak_calls_file;
+close output_file;
+
+system("gzip $output_fname");
diff --git a/htswanalysis/bin/QuEST/extras/rgb_colors.pm b/htswanalysis/bin/QuEST/extras/rgb_colors.pm
new file mode 100644 (file)
index 0000000..7669288
--- /dev/null
@@ -0,0 +1,170 @@
+use strict;
+use warnings;
+
+my @color_alias; 
+my @color_value;
+
+sub init_colors{
+    #push(@color_alias,"snow"); push(@color_value, "255,250,250");
+    #push(@color_alias,"GhostWhite"); push(@color_value, "248,248,255");
+    #push(@color_alias,"WhiteSmoke"); push(@color_value, "245,245,245");
+    #push(@color_alias,"gainsboro"); push(@color_value, "220,220,220");
+    #push(@color_alias,"FloralWhite"); push(@color_value, "255,250,240");
+    #push(@color_alias,"OldLace"); push(@color_value, "253,245,230");
+    #push(@color_alias,"linen"); push(@color_value, "250,240,230");
+    #push(@color_alias,"AntiqueWhite"); push(@color_value, "250,235,215");
+    #push(@color_alias,"PapayaWhip"); push(@color_value, "255,239,213");
+    #push(@color_alias,"BlanchedAlmond"); push(@color_value, "255,235,205");
+    #push(@color_alias,"bisque"); push(@color_value, "255,228,196");
+    #push(@color_alias,"PeachPuff"); push(@color_value, "255,218,185");
+    #push(@color_alias,"NavajoWhite"); push(@color_value, "255,222,173");
+    #push(@color_alias,"moccasin"); push(@color_value, "255,228,181");
+    #push(@color_alias,"cornsilk"); push(@color_value, "255,248,220");
+    #push(@color_alias,"LemonChiffon"); push(@color_value, "255,250,205");
+    #push(@color_alias,"seashell"); push(@color_value, "255,245,238");
+    #push(@color_alias,"honeydew"); push(@color_value, "240,255,240");
+    #push(@color_alias,"MintCream"); push(@color_value, "245,255,250");
+    #push(@color_alias,"azure"); push(@color_value, "240,255,255");
+    #push(@color_alias,"AliceBlue"); push(@color_value, "240,248,255");
+    #push(@color_alias,"lavender"); push(@color_value, "230,230,250");
+    #push(@color_alias,"LavenderBlush"); push(@color_value, "255,240,245");
+    #push(@color_alias,"MistyRose"); push(@color_value, "255,228,225");
+    #push(@color_alias,"white"); push(@color_value, "255,255,255");
+    #push(@color_alias,"black"); push(@color_value, "000,000,000");
+    push(@color_alias,"DarkSlateGray"); push(@color_value, "047,079,079");
+    push(@color_alias,"DarkSlateGrey"); push(@color_value, "047,079,079");
+    push(@color_alias,"DimGray"); push(@color_value, "105,105,105");
+    push(@color_alias,"DimGrey"); push(@color_value, "105,105,105");
+    push(@color_alias,"SlateGray"); push(@color_value, "112,128,144");
+    push(@color_alias,"SlateGrey"); push(@color_value, "112,128,144");
+    push(@color_alias,"LightSlateGray"); push(@color_value, "119,136,153");
+    push(@color_alias,"LightSlateGrey"); push(@color_value, "119,136,153");
+    push(@color_alias,"gray"); push(@color_value, "190,190,190");
+    push(@color_alias,"grey"); push(@color_value, "190,190,190");
+    #push(@color_alias,"LightGrey"); push(@color_value, "211,211,211");
+    #push(@color_alias,"LightGray"); push(@color_value, "211,211,211");
+    push(@color_alias,"MidnightBlue"); push(@color_value, "025,025,112");
+    push(@color_alias,"navy"); push(@color_value, "000,000,128");
+    push(@color_alias,"NavyBlue"); push(@color_value, "000,000,128");
+    push(@color_alias,"CornflowerBlue"); push(@color_value, "100,149,237");
+    push(@color_alias,"DarkSlateBlue"); push(@color_value, "072,061,139");
+    push(@color_alias,"SlateBlue"); push(@color_value, "106,090,205");
+    push(@color_alias,"MediumSlateBlue"); push(@color_value, "123,104,238");
+    push(@color_alias,"LightSlateBlue"); push(@color_value, "132,112,255");
+    push(@color_alias,"MediumBlue"); push(@color_value, "000,000,205");
+    push(@color_alias,"RoyalBlue"); push(@color_value, "065,105,225");
+    push(@color_alias,"blue"); push(@color_value, "000,000,255");
+    push(@color_alias,"DodgerBlue"); push(@color_value, "030,144,255");
+    push(@color_alias,"DeepSkyBlue"); push(@color_value, "000,191,255");
+    push(@color_alias,"SkyBlue"); push(@color_value, "135,206,235");
+    push(@color_alias,"LightSkyBlue"); push(@color_value, "135,206,250");
+    push(@color_alias,"SteelBlue"); push(@color_value, "070,130,180");
+    push(@color_alias,"LightSteelBlue"); push(@color_value, "176,196,222");
+    push(@color_alias,"LightBlue"); push(@color_value, "173,216,230");
+    push(@color_alias,"PowderBlue"); push(@color_value, "176,224,230");
+    push(@color_alias,"PaleTurquoise"); push(@color_value, "175,238,238");
+    push(@color_alias,"DarkTurquoise"); push(@color_value, "000,206,209");
+    push(@color_alias,"MediumTurquoise"); push(@color_value, "072,209,204");
+    push(@color_alias,"turquoise"); push(@color_value, "064,224,208");
+    push(@color_alias,"cyan"); push(@color_value, "000,255,255");
+    push(@color_alias,"LightCyan"); push(@color_value, "224,255,255");
+    push(@color_alias,"CadetBlue"); push(@color_value, "095,158,160");
+    push(@color_alias,"MediumAquamarine"); push(@color_value, "102,205,170");
+    push(@color_alias,"aquamarine"); push(@color_value, "127,255,212");
+    push(@color_alias,"DarkGreen"); push(@color_value, "000,100,000");
+    push(@color_alias,"DarkOliveGreen"); push(@color_value, "085,107,047");
+    push(@color_alias,"DarkSeaGreen"); push(@color_value, "143,188,143");
+    push(@color_alias,"SeaGreen"); push(@color_value, "046,139,087");
+    push(@color_alias,"MediumSeaGreen"); push(@color_value, "060,179,113");
+    push(@color_alias,"LightSeaGreen"); push(@color_value, "032,178,170");
+    push(@color_alias,"PaleGreen"); push(@color_value, "152,251,152");
+    push(@color_alias,"SpringGreen"); push(@color_value, "000,255,127");
+    push(@color_alias,"LawnGreen"); push(@color_value, "124,252,000");
+    push(@color_alias,"green"); push(@color_value, "000,255,000");
+    push(@color_alias,"chartreuse"); push(@color_value, "127,255,000");
+    push(@color_alias,"MediumSpringGreen"); push(@color_value, "000,250,154");
+    # push(@color_alias,"GreenYellow"); push(@color_value, "173,255,047");
+    push(@color_alias,"LimeGreen"); push(@color_value, "050,205,050");
+    push(@color_alias,"YellowGreen"); push(@color_value, "154,205,050");
+    push(@color_alias,"ForestGreen"); push(@color_value, "034,139,034");
+    push(@color_alias,"OliveDrab"); push(@color_value, "107,142,035");
+    push(@color_alias,"dark"); push(@color_value, "khaki");
+    push(@color_alias,"DarkKhaki"); push(@color_value, "189,183,107");
+    #push(@color_alias,"khaki"); push(@color_value, "240,230,140");
+    push(@color_alias,"PaleGoldenrod"); push(@color_value, "238,232,170");
+    push(@color_alias,"LightGoldenrodYellow"); push(@color_value, "250,250,210");
+    push(@color_alias,"LightYellow"); push(@color_value, "255,255,224");
+    push(@color_alias,"yellow"); push(@color_value, "255,255,000");
+    push(@color_alias,"gold"); push(@color_value, "255,215,000");
+    push(@color_alias,"LightGoldenrod"); push(@color_value, "238,221,130");
+    push(@color_alias,"goldenrod"); push(@color_value, "218,165,032");
+    push(@color_alias,"DarkGoldenrod"); push(@color_value, "184,134,011");
+    push(@color_alias,"RosyBrown"); push(@color_value, "188,143,143");
+    push(@color_alias,"IndianRed"); push(@color_value, "205,092,092");
+    push(@color_alias,"SaddleBrown"); push(@color_value, "139,069,019");
+    push(@color_alias,"sienna"); push(@color_value, "160,082,045");
+    push(@color_alias,"peru"); push(@color_value, "205,133,063");
+    push(@color_alias,"burlywood"); push(@color_value, "222,184,135");
+    #push(@color_alias,"beige"); push(@color_value, "245,245,220");
+    push(@color_alias,"wheat"); push(@color_value, "245,222,179");
+    push(@color_alias,"SandyBrown"); push(@color_value, "244,164,096");
+    push(@color_alias,"tan"); push(@color_value, "210,180,140");
+    push(@color_alias,"chocolate"); push(@color_value, "210,105,030");
+    push(@color_alias,"firebrick"); push(@color_value, "178,034,034");
+    push(@color_alias,"brown"); push(@color_value, "165,042,042");
+    push(@color_alias,"DarkSalmon"); push(@color_value, "233,150,122");
+    push(@color_alias,"salmon"); push(@color_value, "250,128,114");
+    push(@color_alias,"LightSalmon"); push(@color_value, "255,160,122");
+    push(@color_alias,"orange"); push(@color_value, "255,165,000");
+    push(@color_alias,"DarkOrange"); push(@color_value, "255,140,000");
+    push(@color_alias,"coral"); push(@color_value, "255,127,080");
+    push(@color_alias,"LightCoral"); push(@color_value, "240,128,128");
+    push(@color_alias,"tomato"); push(@color_value, "255,099,071");
+    push(@color_alias,"OrangeRed"); push(@color_value, "255,069,000");
+    push(@color_alias,"red"); push(@color_value, "255,000,000");
+    push(@color_alias,"HotPink"); push(@color_value, "255,105,180");
+    push(@color_alias,"DeepPink"); push(@color_value, "255,020,147");
+    push(@color_alias,"pink"); push(@color_value, "255,192,203");
+    push(@color_alias,"LightPink"); push(@color_value, "255,182,193");
+    push(@color_alias,"PaleVioletRed"); push(@color_value, "219,112,147");
+    push(@color_alias,"maroon"); push(@color_value, "176,048,096");
+    push(@color_alias,"MediumVioletRed"); push(@color_value, "199,021,133");
+    push(@color_alias,"VioletRed"); push(@color_value, "208,032,144");
+    push(@color_alias,"magenta"); push(@color_value, "255,000,255");
+    push(@color_alias,"violet"); push(@color_value, "238,130,238");
+    push(@color_alias,"plum"); push(@color_value, "221,160,221");
+    push(@color_alias,"orchid"); push(@color_value, "218,112,214");
+    push(@color_alias,"MediumOrchid"); push(@color_value, "186,085,211");
+    push(@color_alias,"DarkOrchid"); push(@color_value, "153,050,204");
+    push(@color_alias,"DarkViolet"); push(@color_value, "148,000,211");
+    push(@color_alias,"BlueViolet"); push(@color_value, "138,043,226");
+    #push(@color_alias,"purple"); push(@color_value, "160,032,240");
+    push(@color_alias,"MediumPurple"); push(@color_value, "147,112,219");
+    push(@color_alias,"thistle"); push(@color_value, "216,191,216");
+}
+
+sub get_color{
+    my $cur_color_alias = shift;
+    for(my $i=0; $i<scalar(@color_alias); $i++){
+       if($cur_color_alias eq $color_alias[$i]){
+           return $color_value[$i];
+       }
+    }
+    return "not found";
+}
+
+sub get_random_color{
+    use POSIX qw(ceil floor);
+    my $cur_rand_int = floor(rand() * (scalar(@color_value) ) );
+    return $color_value[$cur_rand_int];
+}
+
+BEGIN{
+    init_colors();
+}
+
+#my $cur_color = get_random_color("azure");
+#print "$cur_color\n";
+
+
+
diff --git a/htswanalysis/bin/QuEST/generate_QuEST_parameters.pl b/htswanalysis/bin/QuEST/generate_QuEST_parameters.pl
new file mode 100755 (executable)
index 0000000..67aca6c
--- /dev/null
@@ -0,0 +1,1788 @@
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like. 
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## Valouev A, Johnson DS, Sundquist A, Median C, Anton E, Batzoglou S,
+## Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide
+## Identification of Transcription Factor Binding Sites Based on ChIP-Seq
+## Data. under review.
+
+## This program generates parameters for individual QuEST modules
+
+use strict;
+use warnings;
+use diagnostics;
+
+use Cwd qw(realpath);
+
+my $usage = qq{
+    generate_QuEST_parameters.pl
+
+    --------------------------------
+
+    This program is a part of QuEST analytical pipeline. If you use    
+    this software by itself or as a part of another software or use 
+    results obtained using this software, 
+    for purposes of publication, patents, or development, you must cite the 
+    publication anouncing QuEST release:
+
+    Valouev A, Johnson DS, Sundquist A, Medina C, Anton E, Batzoglou S,
+    Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide
+    Identification of Transcription Factor Binding Sites Based on ChIP-Seq
+    Data. under review.
+
+    -------------------------------
+
+    This script generates parameters for individual QuEST modules
+    
+    -----------------------
+    mandatory arguments:
+    -----------------------
+
+    for the ChIP data use one of the following:
+
+    -QuEST_align_ChIP <QuEST_align_file>
+    -solexa_align_ChIP  <solexa_ChIP_align_file>      
+
+    ( solexa align file containing mapped ChIP reads )
+
+    - - - - - - - - - - - - 
+
+    for the background data (RX_noIP, input, total chromatin) use one of the following:
+
+    -QuEST_align_RX_noIP <QuEST_align_RX_noIP_file>
+    -solexa_align_RX_noIP  <solexa_RX_noIP_align_file>   
+
+    ( solexa align file containing mapped RX_noIP control experiment )
+                   ( reads (aka total chromatin) )
+
+    - - - - - - - - - - - 
+
+    for the reference genome use one of the following:
+
+    -rp <reference_path>              a path to the reference genome fasta files
+                                      formated in UCSC Genome Browser database fashion
+                                      (typical naming convention is chr1.fa chr2.fa ...
+                                     chrM.fa)
+
+    -gt <genome_table>                a genome table containing contig names followed by
+                                      their sizes                                    
+
+    - - - - - - - - - - -                                    
+
+    for the analysis:                                
+                                     
+    -ap <analysis_path>               specify a directory where all QuEST analysis results
+                                      will be reported
+
+    -----------------------
+    optional arguments:
+
+    -silent                          assume all default parameters and do not alert
+                                     to override any values (good for automation)
+
+};
+
+if(scalar(@ARGV) == 0){
+    print $usage;
+    exit(0);
+}
+
+## mandatory argmuments
+my $ChIP_align_fname =        "** missing **";
+my $ChIP_file_type =          "";
+
+my $RX_noIP_align_fname =     "** missing **";
+my $RX_noIP_file_type =       "";
+
+my $reference_path =                 "** missing **";
+my $genome_table_source_fname =      "** missing **";
+
+my $analysis_path =                  "** missing **";
+
+## /mandatory arguments
+
+## optional arguments
+
+## /optional arguments
+
+## flag options
+
+my $solexa_align_ChIP_flag =          "off";
+my $QuEST_align_ChIP_flag =           "off";
+my $solexa_align_RX_noIP_flag =       "off";
+my $QuEST_align_RX_noIP_flag =        "off";
+my $rp_flag =                         "off";
+my $gt_flag =                         "off";
+my $ap_flag =                         "off";
+my $silent_flag =                     "off";
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+    my $this_arg = shift @ARGV;    
+    if ( $this_arg eq '-h') {die "$usage\n";}
+    elsif ( $this_arg eq '-solexa_align_ChIP') {$ChIP_align_fname = shift @ARGV; $ChIP_file_type = "solexa_align"; $solexa_align_ChIP_flag = "on";}
+    elsif ( $this_arg eq '-QuEST_align_ChIP') {$ChIP_align_fname = shift @ARGV; $ChIP_file_type = "QuEST_align"; $QuEST_align_ChIP_flag = "on";}
+    elsif ( $this_arg eq '-solexa_align_RX_noIP') {$RX_noIP_align_fname = shift @ARGV; $RX_noIP_file_type = "solexa_align"; $solexa_align_RX_noIP_flag = "on";}
+    elsif ( $this_arg eq '-QuEST_align_RX_noIP') {$RX_noIP_align_fname = shift @ARGV; $RX_noIP_file_type = "QuEST_align"; $QuEST_align_RX_noIP_flag = "on";}
+    elsif ( $this_arg eq '-rp') {$reference_path = shift @ARGV; $rp_flag = "on";}
+    elsif ( $this_arg eq '-gt') { $genome_table_source_fname = shift @ARGV; $gt_flag = "on";}
+    elsif ( $this_arg eq '-ap') {$analysis_path = shift @ARGV; $ap_flag = "on";}
+    elsif ( $this_arg eq '-silent') {$silent_flag = "on";}
+    else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+if($rp_flag eq "on" && $gt_flag eq "on"){
+    print "You need to specify either the reference path (-rp) or the genome table (-gt), but not both. Aborting.\n";
+    exit(1);
+}
+
+if($solexa_align_ChIP_flag eq "on" && $QuEST_align_ChIP_flag eq "on"){
+    print "You need to specify either the solexa_align_ChIP file or QuEST_align_ChIP file but not both. Aborting.\n";
+    exit(1);
+}
+
+if($solexa_align_RX_noIP_flag eq "on" && $QuEST_align_RX_noIP_flag eq "on"){
+    print "You need to specify either the solexa_align_ChIP or QuEST_align_ChIP file, but not both. Aborting.\n";
+    exit(1);
+}
+my $die_pars_bad = "false";       ## die if parameters are bad
+my $bad_par = "";                 ## store bad parameter here
+
+my $cur_path = "";
+
+my $fullpath = realpath(".");
+
+my @fullpath_fields = split(/\//,$fullpath);
+for(my $i=0; $i<scalar(@fullpath_fields); $i++){
+    if($fullpath_fields[$i] ne ""){
+        $cur_path = $cur_path."/".$fullpath_fields[$i];
+    }
+}
+
+if( substr($analysis_path,0,2) eq "./" ){
+    $analysis_path = $cur_path . "/" . substr($analysis_path,2,length($analysis_path)-2);
+}
+
+print "\n=====================\n\n";
+print "parameters: \n\n";
+print "ChIP_align_file:               $ChIP_align_fname\n";
+print "ChIP_file_type:                $ChIP_file_type\n";
+print "\n";
+print "RX_noIP_align_file:            $RX_noIP_align_fname\n";
+print "RX_noIP_file_type:             $RX_noIP_file_type\n";
+print "\n";
+
+print "reference path:                $reference_path\n";
+print "genome_table:                  $genome_table_source_fname\n";
+
+print "\n";
+
+print "analysis_path:                 $analysis_path\n";
+
+print "\n";
+
+print "silent:                        $silent_flag\n";
+
+
+
+if( $analysis_path eq "** missing **"){ $die_pars_bad = "true"; $bad_par = $bad_par." "."analysis_path"; }
+
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+    print "$usage";
+    print "Bad parameters: $bad_par\n";
+    exit(0);
+}
+## /reading arguments
+
+if( $reference_path eq "** missing **" and $genome_table_source_fname eq "** missing **"){
+    print "You should provide either the path to the reference genome or \n";
+    print "the genome table containing contig names and their sizes. Aborting.\n";
+    exit(0);
+}
+
+if( $ChIP_file_type eq ""){
+    print "You must supply ChIP align file. Aborting.\n";
+    exit(0);
+}
+
+if( $RX_noIP_file_type eq ""){
+    print "You must supply RX_noIP align_file. Aborting.\n";
+    exit(0);
+}
+
+## print disclaimer
+
+print "\n";
+print "********* DISCLAIMER ************\n";
+print "\n";
+print "The recoomendation provided below are based on our\n";
+print "limited experience of analyzing ChIP-Seq data, and are\n";
+print "not guaranteed to yield optimal results. Therefore, all\n";
+print "recommendations should be treated as a good staring point\n";
+print "for setting QuEST parameters and should be refined if needed.\n";
+print "\n";
+print "This software is free for use and redistribution, however\n";
+print "any information, results, or software derived from from or using QuEST\n";
+print "should cite the following publication: \n";
+print "\n";
+
+print "   Valouev A, Johnson DS, Sundquist A, Medina C, Anton E, Batzoglou S,\n";
+print "   Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide\n";
+print "   Identification of Transcription Factor Binding Sites Based on ChIP-Seq\n";
+print "   Data. under review.\n\n";
+print "press Enter to continue.\n";
+
+my $dummy_kbhit;
+if($silent_flag ne "on"){
+    $dummy_kbhit = <STDIN>;
+}
+## /print disclaimer
+
+## setting up analysis directory
+if(-d $analysis_path){
+}
+else{
+    mkdir $analysis_path or die "Failed to create the analysis directory\n";
+}
+## /setting up analysis directory
+
+print "analysis directory.....................OK\n";
+
+## setting up analysis subdirectories
+
+print "analysis subdirectories................OK\n";
+## /setting up analysis subdirectories
+
+## reading reference files
+
+my @contig_names;
+my @contig_sizes;
+my $contig_counts = 0;
+
+if( $reference_path ne "** missing **" ){
+    opendir(REF_DIR, $reference_path) or die "can't open $reference_path: $!";
+    
+    my @fasta_fnames;
+    my $fasta_fnames_counter = 0;
+    while (defined(my $cur_fname = readdir(REF_DIR))){
+       if ( substr($cur_fname,length($cur_fname)-3,3) eq ".fa" ){
+           $fasta_fnames[$fasta_fnames_counter] = $cur_fname; 
+           $fasta_fnames_counter++;
+       }
+    }
+    close(REF_DIR);
+    print "\n";
+    print "The following fasta files were found in your reference path:\n\n";
+    for(my $i=0; $i<scalar(@fasta_fnames); $i++){
+       
+       print "$fasta_fnames[$i]\n";
+    }
+    print "\nEvaluating the size of the genome. This may take several minutes.\n";
+    
+    print "\nFound the following contigs: \n\n";
+    
+    
+    for(my $i=0; $i<scalar(@fasta_fnames); $i++){
+       my $cur_contig_fname = $reference_path . "/$fasta_fnames[$i]";
+       my $cur_contig_name = "";
+       my $cur_contig_size = 0;
+       
+       if(-e $cur_contig_fname){
+           open cur_contig_file, "< $cur_contig_fname";
+           while(<cur_contig_file>){
+               chomp;
+               if( substr($_,0,1) eq ">" ){
+                   if($cur_contig_name eq ""){
+                       # first contig, don't save anything
+                   }
+                   else{
+                       $contig_names[$contig_counts] = $cur_contig_name;
+                       $contig_sizes[$contig_counts] = $cur_contig_size;
+                       $contig_counts++;
+                       
+                       print "$cur_contig_name: $cur_contig_size bps\n";
+                   }
+                   $cur_contig_name = substr($_,1,length($_)-1);
+               }
+               elsif( substr($_,0,1) eq "#" ){
+                   # skip, comment
+               }
+               else{
+                   $cur_contig_size = $cur_contig_size + length($_);
+                   #increment the size
+               }
+           }
+           if($cur_contig_name ne ""){
+               $contig_names[$contig_counts] = $cur_contig_name;
+               $contig_sizes[$contig_counts] = $cur_contig_size;
+               $contig_counts++;
+               
+               print "$cur_contig_name: $cur_contig_size bps\n";
+           }
+           close cur_contig_file;
+       }    
+    }
+}
+else{
+    print "\nFound the following contigs in the genome table: \n\n";
+    if( ! -e $genome_table_source_fname ){
+       print "Failed to locate genome table: $genome_table_source_fname\n";
+       print "Aborting.\n";
+    }
+    
+    open genome_table_source_file, "< $genome_table_source_fname" || die "$genome_table_source_fname: $!\n";
+
+    my @gt = <genome_table_source_file>;
+
+    for(my $j=0; $j < scalar(@gt); $j++){
+       my $cur_gt_entry = $gt[$j];
+       chomp($cur_gt_entry);
+       if(substr($cur_gt_entry,0,1) ne "#"){
+           my @cur_gt_entry_fields = split(/ /, $cur_gt_entry);
+           if( scalar(@cur_gt_entry_fields) == 2){
+               $contig_names[$contig_counts] = $cur_gt_entry_fields[0];
+               $contig_sizes[$contig_counts] = $cur_gt_entry_fields[1];
+               $contig_counts++;
+
+               print "$cur_gt_entry_fields[0] $cur_gt_entry_fields[1]\n";
+           }
+       }
+    }
+
+    close genome_table_source_file;
+}
+
+if (scalar(@contig_names) == 0){
+    print "There were not contigs found. Please correct reference path or genome table.\n";
+    exit(1);
+}
+
+my $contigs_string = "";
+
+my $genome_size = 0;
+
+for(my $i=0; $i<scalar(@contig_names); $i++){
+    if($i!=0){ $contigs_string = $contigs_string . ","; }
+    $contigs_string = $contigs_string . "$contig_names[$i]";
+    $genome_size = $genome_size + $contig_sizes[$i];
+}
+
+printf "\nEstimated size of the genome is %.1f Kb ( %.2f Gbps ).\n", $genome_size/1000, $genome_size/1000000000;
+
+my $contig_table_fname = $analysis_path . "genome_table";
+
+if( -e $contig_table_fname ){
+    unlink($contig_table_fname);
+}
+
+open contig_table_file, "> $contig_table_fname";
+
+for(my $i=0; $i<scalar(@contig_sizes); $i++){
+    print contig_table_file "$contig_names[$i] $contig_sizes[$i]\n";
+}
+
+close contig_table_file;
+
+my $updated_ChIP_align_fname = $analysis_path . "/ChIP.align.txt";
+
+if(-e "$updated_ChIP_align_fname"){
+    my $answer = "";    
+    while( $answer ne "y" && $answer ne "n" ){
+       print "$updated_ChIP_align_fname already exists. \n";
+       while( $answer ne "y" && $answer ne "n" ){
+           print "Do you want to overwrite? (y/n): ";
+           if($silent_flag eq "on"){
+               $answer = "y";
+               print "y\n";
+           }
+           else{
+               $answer = <STDIN>;    
+           }
+           chomp($answer);
+           if( $answer eq "y"){
+               unlink($updated_ChIP_align_fname);
+           }
+       }
+    }
+}
+
+if( ! -e $ChIP_align_fname ){
+    print "ChIP_align_file: $ChIP_align_fname does not exsist. Aborting.\n";
+    exit(1);
+}
+
+my $ChIP_tags = 0;
+
+print "\nConverting ChIP reads file into the QuEST format.\n";
+print "If the counter below stays at zero, the program can't\n";
+print "match the ChIP alignments to the reference genome that you provided.\n";
+
+if($ChIP_file_type eq "solexa_align" || $ChIP_file_type eq "QuEST_align"){
+    open ChIP_align_file, "< $ChIP_align_fname" || die "Failed to open $ChIP_align_fname\n";
+    open updated_ChIP_align_file, "> $updated_ChIP_align_fname" || 
+    die "Failed to open $updated_ChIP_align_fname\n";
+    while(<ChIP_align_file>){
+       
+       if($ChIP_tags % 10000 == 0){
+           printf "\rfound %.2f M reads", $ChIP_tags / 1000000;
+           $|++; ## flush the buffer, that's all
+       }
+       
+       my $cur_contig = "";
+       my $cur_fivep_coord = "";
+       my $cur_QuEST_orient = "";
+       my $read_ok = "true";
+
+       chomp;  
+       my @cur_fields = split(/ /,$_); 
+       
+       
+       if( $ChIP_file_type eq "solexa_align"){     
+           if( scalar(@cur_fields) > 4 ){
+               my $cur_seq_length = length($cur_fields[0]);
+               my @coord_fields = split(/:/,$cur_fields[3]);
+               my $cur_orient = $cur_fields[4];
+               if( scalar(@coord_fields) == 2 && 
+                   ( $cur_orient eq "F" or $cur_orient eq "R") ){
+                   
+                   $cur_contig = $coord_fields[0];
+                   my $cur_coord = $coord_fields[1];
+                   
+                   if($cur_orient eq "F"){ 
+                       $cur_fivep_coord = $cur_coord - 1; 
+                       $cur_QuEST_orient = "+";
+                   }
+                   elsif($cur_orient eq "R"){ 
+                       $cur_fivep_coord = $cur_coord + $cur_seq_length - 1 - 1; 
+                       $cur_QuEST_orient = "-";
+                   } 
+               }               
+               else{ $read_ok = "false"; }
+           }
+           else{ $read_ok = "false"; }           
+       }
+       elsif( $ChIP_file_type eq "QuEST_align" ){
+           if( scalar(@cur_fields) == 3 ){
+               $cur_contig = $cur_fields[0];
+               $cur_fivep_coord = $cur_fields[1];
+               $cur_QuEST_orient = $cur_fields[2];
+               if($cur_QuEST_orient eq "+" || $cur_QuEST_orient eq "-"){
+                   # do nothing
+               }
+               else{ $read_ok = "false"; }
+           }
+       }    
+
+       if( $read_ok eq "true" ){
+           my $cur_contig_ind = -1;
+           
+           for(my $i=0; $i<scalar(@contig_names); $i++){
+               if( $contig_names[$i] eq $cur_contig ){ 
+                   $cur_contig_ind = $i; 
+               }
+           }
+           
+           if($cur_contig_ind != -1){
+               ## Solexa coords start with 1, ours with 0.
+               if( $cur_fivep_coord < 0 || 
+                   $cur_fivep_coord >= $contig_sizes[$cur_contig_ind] ){
+                   print "Warning: read start out of boundaries.\n";
+                   print "Expected [ 0,$contig_sizes[$cur_contig_ind] ] but found $cur_fivep_coord. Skipping.\n";
+                   $read_ok = "false";
+               }
+               else{
+                   print updated_ChIP_align_file "$cur_contig $cur_fivep_coord $cur_QuEST_orient\n";
+                   $ChIP_tags++;
+               }
+           }
+           else{
+               # couldn't locate the read
+           }
+       }
+    }
+
+    close ChIP_align_file;
+    close updated_ChIP_align_file;
+    printf "\n";
+}
+
+print "\n";
+## /creating ChIP file 
+
+## creating RX_noIP file
+
+my $updated_RX_noIP_align_fname = $analysis_path . "/RX_noIP.align.txt";
+
+if(-e "$updated_RX_noIP_align_fname"){
+    print "$updated_RX_noIP_align_fname already exists.\n";
+    my $answer = "";
+    while( $answer ne "y" && $answer ne "n" ){
+       print "Do you want to overwrite? (y/n): ";    
+       if($silent_flag eq "on"){
+           $answer = "y";
+           print "$answer\n";      
+       }
+       else{
+           $answer = <STDIN>;
+       }
+       chomp($answer);
+       if( $answer eq "y"){
+           unlink($updated_RX_noIP_align_fname);
+       }
+    }
+}
+
+my $RX_noIP_tags = 0;
+
+print "\nConverting RX_noIP reads file into the QuEST format.\n";
+print "If the counter below stays at zero, the program can't\n";
+print "match the RX_noIP alignments to the reference genome that you provided.\n";
+
+if( ! -e $RX_noIP_align_fname ){
+    print "RX_noIP_align_file $RX_noIP_align_fname does not exist. Aborting.\n";
+    exit(1);
+}
+
+if($RX_noIP_file_type eq "solexa_align" || $RX_noIP_file_type eq "QuEST_align"){
+    open RX_noIP_align_file, "< $RX_noIP_align_fname" || die "Failed to open $RX_noIP_align_fname\n";
+    open updated_RX_noIP_align_file, "> $updated_RX_noIP_align_fname" || 
+    die "Failed to open $updated_RX_noIP_align_fname\n";
+    while(<RX_noIP_align_file>){
+       
+       if($RX_noIP_tags % 10000 == 0){
+           printf "\rfound %.2f M reads", $RX_noIP_tags / 1000000;
+           $|++; ## flush the buffer, that's all
+       }
+       
+       my $cur_contig = "";
+       my $cur_fivep_coord = "";
+       my $cur_QuEST_orient = "";
+       my $read_ok = "true";
+
+       chomp;  
+       my @cur_fields = split(/ /,$_); 
+       
+       
+       if( $RX_noIP_file_type eq "solexa_align" ){         
+           if( scalar(@cur_fields) > 4 ){
+               my $cur_seq_length = length($cur_fields[0]);
+               my @coord_fields = split(/:/,$cur_fields[3]);
+               my $cur_orient = $cur_fields[4];
+               if( scalar(@coord_fields) == 2 && 
+                   ( $cur_orient eq "F" or $cur_orient eq "R") ){
+                   
+                   $cur_contig = $coord_fields[0];
+                   my $cur_coord = $coord_fields[1];
+                   
+                   if($cur_orient eq "F"){ 
+                       $cur_fivep_coord = $cur_coord - 1; 
+                       $cur_QuEST_orient = "+";
+                   }
+                   elsif($cur_orient eq "R"){ 
+                       $cur_fivep_coord = $cur_coord + $cur_seq_length - 1 - 1; 
+                       $cur_QuEST_orient = "-";
+                   } 
+               }               
+               else{ $read_ok = "false"; }
+           }
+           else{ $read_ok = "false"; }           
+       }
+       elsif( $RX_noIP_file_type eq "QuEST_align" ){
+           if( scalar(@cur_fields) == 3 ){
+               $cur_contig = $cur_fields[0];
+               $cur_fivep_coord = $cur_fields[1];
+               $cur_QuEST_orient = $cur_fields[2];
+               if($cur_QuEST_orient eq "+" || $cur_QuEST_orient eq "-"){
+                   # do nothing
+               }
+               else{ $read_ok = "false"; }
+           }
+       }    
+
+       if( $read_ok eq "true" ){
+           my $cur_contig_ind = -1;
+           
+           for(my $i=0; $i<scalar(@contig_names); $i++){
+               if( $contig_names[$i] eq $cur_contig ){ 
+                   $cur_contig_ind = $i; 
+               }
+           }
+           
+           if($cur_contig_ind != -1){
+               ## Solexa coords start with 1, ours with 0.
+               if( $cur_fivep_coord < 0 || 
+                   $cur_fivep_coord >= $contig_sizes[$cur_contig_ind] ){
+                   print "Warning: read start out of boundaries.\n";
+                   print "Expected [ 0,$contig_sizes[$cur_contig_ind] ] but found $cur_fivep_coord. Skipping.\n";
+                   $read_ok = "false";
+               }
+               else{
+                   print updated_RX_noIP_align_file "$cur_contig $cur_fivep_coord $cur_QuEST_orient\n";
+                   $RX_noIP_tags++;
+               }
+           }
+           else{
+               # couldn't locate the read
+           }
+       }
+    }
+
+    close RX_noIP_align_file;
+    close updated_RX_noIP_align_file;
+    printf "\n";
+}
+
+print "\n";
+
+## creating RX_noIP file
+
+## /creating RX_noIP file 
+
+## make assessment to the number of tags
+
+my $ChIP_recommended_tag_count = ( $genome_size / 3080436051 ) * 5000000;
+my $ChIP_excess_fold = $ChIP_tags / $ChIP_recommended_tag_count;
+
+my $RX_noIP_recommended_tag_count = ( $genome_size / 3080436051 ) * 7000000;
+my $RX_noIP_excess_fold = $RX_noIP_tags / $RX_noIP_recommended_tag_count;
+my $RX_noIP_excess_fold_with_FDR = ($RX_noIP_tags - $ChIP_tags ) / $RX_noIP_recommended_tag_count;
+
+printf "You seem to have at least %.2f fold the recommended amount of ChIP reads.\n", $ChIP_excess_fold;
+printf "You seem to have at least %.2f fold the recommended amount of RX_noIP reads without FDR estimate.\n", $RX_noIP_excess_fold;
+printf "You seem to have at least %.2f fold the recommended amount of RX_noIP reads with FDR estimate\n.", $RX_noIP_excess_fold_with_FDR;
+print "\n";
+
+if($ChIP_excess_fold < 1){
+    print "\nWarning: you seem to not have sufficiently many ChIP sequencing reads.\n";
+    print "We recommend at least $ChIP_recommended_tag_count aligned reads,\n";
+    print "however you only provided $ChIP_tags. QuEST will still work, but your results\n";
+    print "may not be as accurate. \n";
+    my $answer = "";
+    while( $answer ne "y" && $answer ne "n" ){
+       print "Do you want to continue? (y/n): ";
+       if($silent_flag eq "on"){
+           $answer = "y";
+           print "$answer\n";
+       }
+       else{
+           $answer = <STDIN>;
+       }
+       chomp($answer);
+       if($answer eq "y"){
+           # do nothing
+       }
+       elsif($answer eq "n"){
+           print "You have aborted QuEST configuration.\n";
+           exit(0);
+       }
+    }
+}
+else{
+    printf "You seem to have at least %.2f fold the recommended amount of ChIP sequence reads.\n", $ChIP_excess_fold;
+}
+
+if($RX_noIP_excess_fold < 1){
+
+    print "\nWarning: you we seem to not have sufficiently many RX_noIP sequencing reads.\n";
+    printf "We recommend at least %.0f aligned reads,\n", $RX_noIP_recommended_tag_count;
+    print "however you only provided $RX_noIP_tags. QuEST will still work, but your results\n";    
+    print "may not be as accurate. \n";
+    
+    my $answer = "";
+    while( $answer ne "y" && $answer ne "n" ){
+       print "Do you want to continue? (y/n): ";
+       if($silent_flag eq "on"){
+           $answer = "y";
+           print "$answer\n";
+       }
+       else{
+           $answer = <STDIN>;
+       }       
+       chomp($answer);
+       if($answer eq "y"){
+           # do nothing
+       }
+       elsif($answer eq "n"){
+           print "You have aborted QuEST configuration.\n";
+           exit(0);
+       }
+    }
+}
+else{
+    if( $RX_noIP_excess_fold_with_FDR >=1 ){
+       print "You have enough RX_noIP reads to run the analysis with FDR.\n";
+    }
+    else{
+       print "You don't have enought RX_noIP reads to run the analysis with FDR.\n";
+    }
+}
+
+my $analysis_with_FDR = "not defined";
+my $background_tags = -1;
+my $pseudo_ChIP_tags = -1;
+
+
+if($RX_noIP_tags < $ChIP_tags){
+    print "Since the number of Rx_noIP tags is less than ChIP tags, there is no\n";
+    print "possibility to provide an FDR estimate. \n";
+    print "QuEST will be configured without FDR estimation.\n";
+    $analysis_with_FDR = "false";
+    $background_tags = $RX_noIP_tags;
+}
+else{
+    my $answer = "";
+    while( $answer ne "y" && $answer ne "n" ){ 
+       print "\nDo you want to allocate Rx_noIP reads for FDR estimation? (y/n): ";
+       if($silent_flag eq "on"){
+           $answer = "y";
+           print "$answer\n";
+       }
+       else{
+           $answer = <STDIN>;
+       }
+       chomp($answer);
+       
+       if($answer eq "y"){
+           $analysis_with_FDR = "true";
+           $pseudo_ChIP_tags = $ChIP_tags;
+           $background_tags = $RX_noIP_tags - $pseudo_ChIP_tags;
+
+           print "\nWe recommend using $background_tags for the background tags\n";
+           print "and $pseudo_ChIP_tags for the pseudo_ChIP tags. \n";
+
+           my $cur_answer = "";
+           while( $cur_answer ne "y" && $cur_answer ne "n" ){
+               print "Do you agree? (y/n): ";
+               if($silent_flag eq "on"){
+                   $cur_answer = "y";
+                   print "$cur_answer\n";
+               }
+               else{
+                   $cur_answer = <STDIN>;
+               }
+               chomp($cur_answer);
+               
+               if($cur_answer eq "y"){
+                   print "\nAllocating reads. \n\n";
+               }
+               elsif($cur_answer eq "n"){
+                   print "\nPlease specify the number of background and pseudo_ChIP tags.\n";
+                   print "These two numbers should add to the total number of RX_noIP tags ($RX_noIP_tags)\n";
+                   print "and we recommend using the same number of tags for pseudoChIP as you have for\n";
+                   print "your ChIP (ChIP tags)\n";
+                   
+                   print "Please enter the number of pseudo_ChIP tags between 0 and $RX_noIP_tags (recommended $ChIP_tags): ";
+                   my $tags_entered = <STDIN>;
+                   chomp($tags_entered);
+                   if($tags_entered <= 0 or $tags_entered > $RX_noIP_tags){
+                       print "The number that you have specified is not in the correct range. Aborting\n";
+                       exit(0);
+                   }
+                   
+                   $pseudo_ChIP_tags = $tags_entered;
+                   my $RX_noIP_tags_remaining = $RX_noIP_tags - $pseudo_ChIP_tags;
+                   print "\nPlease enter the number of background tags between 0 and $RX_noIP_tags_remaining (recommended $RX_noIP_tags_remaining): ";
+                   $tags_entered = <STDIN>;
+                   
+                   chomp($tags_entered);
+                   if($tags_entered <= 0 or $tags_entered > $RX_noIP_tags_remaining){
+                       print "The number you have specified is not in the correct range. Aborting\n";
+                   }
+                   $background_tags = $tags_entered;
+               }
+           }       
+       }
+       elsif($answer eq "n"){
+           print "QuEST will be configured without FDR estimation.\n";
+           $analysis_with_FDR = "false";
+           $background_tags = $RX_noIP_tags;
+       }
+    }
+}
+
+## /make assessment to the number of tags
+
+## allocating RX_noIP_tags
+my $background_RX_noIP_fname = "";
+my $pseudo_ChIP_RX_noIP_fname = "";
+
+if($analysis_with_FDR eq "true"){
+    $background_RX_noIP_fname = $analysis_path . "/background_RX_noIP.align.txt";
+    $pseudo_ChIP_RX_noIP_fname = $analysis_path . "/pseudo_ChIP_RX_noIP.align.txt";
+    
+    if(-e $background_RX_noIP_fname){
+       my $answer = "";
+       while( $answer ne "y" && $answer ne "n" ){
+           print "File $background_RX_noIP_fname already exists. Do you want to overwrite it? (y/n): ";
+           $answer = <STDIN>;
+           chomp($answer);
+           if($answer eq "y"){
+               unlink($background_RX_noIP_fname);
+           }
+           elsif($answer eq "n"){
+               print "Aborting QuEST configuration.\n";
+               exit(0);
+           }
+       }
+    }
+
+    if(-e $pseudo_ChIP_RX_noIP_fname){
+       my $answer = "";
+       while( $answer ne "y" && $answer ne "n" ){
+           print "File $pseudo_ChIP_RX_noIP_fname already exists. Do you want to overwrite it? (y/n): ";
+           $answer = <STDIN>;
+           chomp($answer);
+           if($answer eq "y"){
+               unlink($pseudo_ChIP_RX_noIP_fname);         
+           }
+           elsif($answer eq "n"){
+               print "Aborting QuEST configuration. \n";
+               exit(0);
+           }
+       }
+    }
+    
+    open background_RX_noIP_file, "> $background_RX_noIP_fname" || 
+       die "Failed to open $background_RX_noIP_fname\n";
+    open pseudo_ChIP_RX_noIP_file, "> $pseudo_ChIP_RX_noIP_fname" || 
+       die "Failed to open $pseudo_ChIP_RX_noIP_fname\n";
+    open updated_RX_noIP_align_file, "< $updated_RX_noIP_align_fname" || 
+       die "Failed to open $updated_RX_noIP_align_fname\n";
+
+    my $cur_line_counter = 0;
+    while(<updated_RX_noIP_align_file>){
+       chomp;
+       if($cur_line_counter < $background_tags){
+           print background_RX_noIP_file "$_\n";           
+       }
+       else{
+           if($cur_line_counter < $background_tags + $pseudo_ChIP_tags){
+               print pseudo_ChIP_RX_noIP_file "$_\n";
+           }
+       }
+       $cur_line_counter++;
+    }
+    close background_RX_noIP_file;
+    close pseudo_ChIP_RX_noIP_file;
+    close updated_RX_noIP_align_file;
+}
+else{
+    $background_tags = $RX_noIP_tags;
+    $background_RX_noIP_fname = $updated_RX_noIP_align_fname;
+}
+## /allocating RX_noIP_tags
+
+## calculating cutoffs/thresholds
+
+print "\nWe will now walk through threshold setup for different software modules.\n";
+
+## calculating Quick Window Scan thresholds
+
+print "\n";
+print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
+print "~~~~~~~~~~~~~~ Quick Window Scan Module ~~~~~~~~~~~~~~~~~~~~~\n";
+
+my $quick_window_scan_calc_window = -1;
+my $quick_window_scan_count_threshold = -1;
+my $recommended_quick_window_scan_calc_window = 300;
+my $recommended_quick_window_scan_count_threshold = 
+    50 * ( $ChIP_tags / 10000000 ) * ( $recommended_quick_window_scan_calc_window / 300 ) *
+    ( 3080436051 / $genome_size );
+
+print "\n";
+print "Quick window scan is the module that identifies a training\n";
+print "set of potential peaks for estimating peak shift parameter.\n";
+print "To do so, it needs the size of the window to count the tags in\n";
+print "and a count threshold to identify positive regions where that\n";
+print "threshold is exceeded.\n\n";
+print "Based on the size of the genome and the number of ChIP tags, we\n";
+print "recommend using $recommended_quick_window_scan_calc_window for the window size and ";
+printf "%.0f for the the ", $recommended_quick_window_scan_count_threshold;
+print "tag count threshold.\n";
+
+my $answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    printf "Do you agree? (y/n): ";
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if($answer eq "y"){
+       $quick_window_scan_calc_window = $recommended_quick_window_scan_calc_window;
+       $quick_window_scan_count_threshold = $recommended_quick_window_scan_count_threshold;    
+    }
+    elsif($answer eq "n"){
+       print "Please enter the the size of the count window (recommended $recommended_quick_window_scan_calc_window): ";
+       my $cur_answer1 = <STDIN>;
+       chomp($cur_answer1);
+       if($cur_answer1 >= 0){
+           $quick_window_scan_calc_window = $cur_answer1;
+       }
+       else{
+           print "The value cannot be negative. Aborting.\n";
+           exit(0);
+       }
+       
+       print "Please enter the count threshold for the number of tags: ";
+       $cur_answer1 = <STDIN>;
+       chomp($cur_answer1);
+
+       if($cur_answer1 >= 0){
+           $quick_window_scan_count_threshold = $cur_answer1;  
+       }
+    }
+}
+
+print "\n";
+print "Specified parameters:\n\n";
+print "quick_window_scan_calc_window:         $quick_window_scan_calc_window\n";
+print "quick_window_scan_count_threshold:     $quick_window_scan_count_threshold\n";
+
+print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
+
+## /calculating Quick Window Scan thresholds
+
+## calculating Peak Shift Calibrator thresholds
+
+print "\n";
+print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
+print "~~~~~~~~~~~~~ Peak Shift Calibrator Module ~~~~~~~~~~~~~~~~~~\n";
+
+my $recommended_KDE_bandwidth = 30;
+my $recommended_scan_window = 1000;
+my $recommended_quick_scan_tag_count_threshold = 0;
+my $recommended_top_count = 200;
+my $recommended_ChIP_to_background_fold = 10;
+my $recommended_best_to_second_best_peak_ratio = 10;
+my $recommended_ChIP_threshold_for_PSC = 0.3 * ( $ChIP_tags / 7000000 ) * ( 3080436051 / $genome_size );
+my $recommended_ChIP_for_rev_peak_max_min_ratio = 1.5;
+
+my $KDE_bandwidth = -1;
+my $scan_window = -1;
+my $quick_scan_tag_count_threshold = 0;
+my $top_count = -1;
+my $ChIP_to_background_fold = -1;
+my $best_to_second_best_peak_ratio = -1;
+my $ChIP_threshold_for_PSC = -1;
+my $ChIP_for_rev_peak_max_min_ratio = -1;
+my $peak_shift_estimation_method = "NA";
+
+print "\nCalibrate Peak Shift Module estimates the value of peak shift based on the\n";
+print "the list of the regions provided by the Quick Window Scan Module.\n\n";
+
+
+print "Kernel Density Estimation requires a smoothing parameter called KDE bandwidth.\n";
+print "For sequence-specific TFs we recommend using $recommended_KDE_bandwidth bps for the KDE bandwidth.\n";
+print "If your factor is  not sequence specific or library has a large range of inserts,\n";
+print "consider increasing the value of KDE bandwidth to achieve more smoothing.\n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if( $answer eq "y" ){
+       $KDE_bandwidth = $recommended_KDE_bandwidth;    
+    }
+    elsif( $answer eq "n" ){
+       print "Please specify the value of KDE bandwidth: ";
+       my $cur_answer = <STDIN>;
+       chomp($cur_answer);
+       if( $cur_answer > 0 ){
+           $KDE_bandwidth = $cur_answer;
+       }
+       else{ print "You should specify a positive number. Aborting.\n"; exit(0); }    
+    }
+}
+
+$answer = "";
+
+print "\nScan window defines a region within which the peak is expected to be found located.\n";
+print "We recommend using sufficiently large window size such as $recommended_scan_window.\n";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if( $answer eq "y" ){
+       $scan_window = $recommended_scan_window;
+    }
+    elsif( $answer eq "n" ){
+       print "Please specify the value of scan window: ";
+       my $cur_answer = <STDIN>;
+       chomp($cur_answer);
+       if( $cur_answer > 0 ){
+           $scan_window = $cur_answer;
+       }
+       else{ print "You should specify a positive number. Aborting.\n"; exit(0); }    
+    }
+}
+
+print "\nYou can limit the regions by the number of tags found by the \n";
+print "Quick Window Scan Module. We suggest not doing that. \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";      
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if( $answer eq "y" ){
+       $quick_scan_tag_count_threshold = $recommended_quick_scan_tag_count_threshold;
+    }
+    elsif( $answer eq "n" ){
+       print "Please specify the value of quick scan tag count threshold: ";
+       my $cur_answer = <STDIN>;
+       chomp($cur_answer);
+       if( $cur_answer >= 0 ){
+           $quick_scan_tag_count_threshold = $cur_answer;
+       }
+       else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }    
+    }
+}
+
+
+print "\nYou can limit the total number of regions to consider to save time. We\n";
+print "recommend using at least $recommended_top_count regions. \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if( $answer eq "y" ){
+       $top_count = $recommended_top_count;
+    }
+    elsif( $answer eq "n" ){
+       print "Please specify the number of regions to consider: ";
+       my $cur_answer = <STDIN>;
+       chomp($cur_answer);
+       if( $cur_answer >= 0 ){
+           $top_count = $cur_answer;
+       }
+       else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }    
+    }
+}
+
+print "\nFor each peak to be qualified it must be enriched over the background.\n";
+print "We recommend using $recommended_ChIP_to_background_fold fold enrichment to identify\n";
+print "positive regions. \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if( $answer eq "y" ){
+       $ChIP_to_background_fold = $recommended_ChIP_to_background_fold;
+    }
+    elsif( $answer eq "n" ){
+       print "Please specify the ChIP to background fold enrichment: ";
+       my $cur_answer = <STDIN>;
+       chomp($cur_answer);
+       if( $cur_answer >= 0 ){
+           $ChIP_to_background_fold = $cur_answer;
+       }
+       else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }    
+    }
+}
+
+print "\nMultiple peaks may exist within the same window. To assign them correctly,\n";
+print "we need to reject close value peaks on the same strand.\n";
+print "We suggest using the value of $recommended_best_to_second_best_peak_ratio ";
+print "to reject the second best peaks.\n";
+print "Higher value is more stringent, lower will pass more peaks.\n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree to this value? (y/n): ";
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if( $answer eq "y" ){
+       $best_to_second_best_peak_ratio = $recommended_best_to_second_best_peak_ratio;
+    }
+    elsif( $answer eq "n" ){
+       print "Please specify the best-to-second-best peak ratio: ";
+       my $cur_answer = <STDIN>;
+       chomp($cur_answer);
+       if( $cur_answer >= 0 ){
+           $best_to_second_best_peak_ratio = $cur_answer;
+       }
+       else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }    
+    
+    }
+}
+
+print "\nMatching the peaks on opposite strands requires their heights to be very close.\n";
+print "We suggest rejecting all opposite strand peaks for which the ratio between\n";
+print "the larger to the smaller peak is greater than $recommended_ChIP_for_rev_peak_max_min_ratio.\n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+        if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if( $answer eq "y" ){
+       $ChIP_for_rev_peak_max_min_ratio = $recommended_ChIP_for_rev_peak_max_min_ratio;
+    }
+    elsif( $answer eq "n" ){
+       print "Please specify the forward/reverse ratio of the larger to the smaller peak: ";
+       my $cur_answer = <STDIN>;
+       chomp($cur_answer);
+       if( $cur_answer >= 0 ){
+           $ChIP_for_rev_peak_max_min_ratio = $cur_answer;
+       }
+       else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }    
+    }
+}
+
+print "\nLow peaks must be rejected since they are unlikely to be true binding events.\n";
+print "According to the number of ChIP tags and the size of the genome, we suggest\n";
+printf "using value of %.2f for the ChIP threshold. \n", $recommended_ChIP_threshold_for_PSC;
+
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if( $answer eq "y" ){
+       $ChIP_threshold_for_PSC = $recommended_ChIP_threshold_for_PSC;
+    }
+    elsif( $answer eq "n" ){
+       print "Please specify the ChIP threshold: ";
+       my $cur_answer = <STDIN>;
+       chomp($cur_answer);
+       if( $cur_answer >= 0 ){
+           $ChIP_threshold_for_PSC = $cur_answer;
+       }
+       else{ print "You should specify a non-negative number. Aborting.\n"; exit(0); }    
+    }
+}
+
+print "\nYou can choose between the median and the mean based estimates for the peak shift.\n";
+print "Mean based estimates tend to be more accurate, however mode is more robust\n";
+print "to errors. We recommend using the mean-based estimate. \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if($answer eq "y"){
+       $peak_shift_estimation_method = "mean";
+    }
+    elsif($answer eq "n"){
+       print "Setting the estimation method to mean.\n";
+       $peak_shift_estimation_method = "median";
+    }
+}
+
+print "\n";
+print "Specified parameters for the Peak Shift Calibration Module:\n\n";
+
+print "KDE_bandwidth:                          $KDE_bandwidth\n";
+print "scan_window:                            $scan_window\n";
+print "quick_scan_tag_count_threshold:         $quick_scan_tag_count_threshold\n";
+print "top_count:                              $top_count\n";
+print "ChIP_to_background_fold:                $ChIP_to_background_fold\n";
+print "best_to_second_best_peak_ratio:         $best_to_second_best_peak_ratio\n";
+printf "ChIP_threshold:                         %.2f\n", $ChIP_threshold_for_PSC;
+print "ChIP_for_rev_peak_max_min_ratio:        $ChIP_for_rev_peak_max_min_ratio\n";
+print "peak_shift_estimation_method:           $peak_shift_estimation_method\n";
+
+## /calculating Peak Shift Calibrator thresholds
+
+
+print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
+
+## calculating Peak Caller thresholds
+
+print "\n";
+print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
+print "~~~~~~~~~~~~~~~~~~ Peak Caller Module ~~~~~~~~~~~~~~~~~~~~~~~\n";
+
+
+my ($ChIP_threshold, $background_threshold, $er, $rr, $dip_fraction);
+
+my $recommended_ChIP_threshold = 0.3 * ( 3080436051 / $genome_size ) * ( $ChIP_tags / 7000000 );
+my $recommended_enrichment_fold = 3.0;
+my $recommended_rescue_fold = 10;
+my $recommended_rr = $recommended_rescue_fold * ( $ChIP_tags / $background_tags );
+my $recommended_dip_fraction = 0.1;
+
+
+print "\n";
+print "\n";
+print "*** Important: this is where you dial in your detection stringency. ****\n";
+print "\n";
+
+print "Judging by the number of ChIP tags and size of the genome,\n";
+printf "we suggest using %.2f for the ChIP profile threshold.\n", $recommended_ChIP_threshold;
+print "Higher threshold will give more stringency). \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if($answer eq "y"){
+       # do nothing
+       $ChIP_threshold = $recommended_ChIP_threshold;
+    }
+    elsif($answer eq "n"){
+       print "Please specify the ChIP profile threshold (recommended $recommended_ChIP_threshold): ";
+       my $answer1 = <STDIN>;
+       chomp($answer1);        
+       $ChIP_threshold = $answer1;
+    }
+}
+
+my $recommended_background_threshold 
+    = $ChIP_threshold * ($background_tags / $ChIP_tags) / $recommended_enrichment_fold;
+
+
+print "\n";
+print "We recommend a value of $recommended_enrichment_fold for the enrichment fold.\n";
+print "This value is used to descriminate enrichment between ChIP and \n";
+print "background datasets at loci with low amount of background.\n";
+printf "With enrichment fold of %.2f, the corresponding background profile\n", $recommended_enrichment_fold;
+printf "threshold will be %.2f. \n", $recommended_background_threshold;
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    
+    print "Do you agree? (y/n): ";
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+    
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    if($answer eq "y"){
+       # do nothing
+       $er = $recommended_enrichment_fold;
+    $background_threshold = $recommended_background_threshold;
+    }
+    elsif($answer eq "n"){
+       print "Please specify the enrichment fold (recommended 3.0 and up): ";
+       my $cur_answer = <STDIN>;
+       chomp($cur_answer);
+       $background_threshold = $ChIP_threshold * ($background_tags / $ChIP_tags) / $cur_answer;
+       
+       print "According to your selection the background threshold was calculated to be $background_threshold\n";
+    }
+}
+
+print "\n";
+print "Rescue fold specifies when peaks int the high-background regions are called.\n";
+print "We recommend using a value of $recommended_rescue_fold for the the value\n";
+printf "of rescue_fold. This will correspond to the value of %.2f for the rescue ratio.\n", $recommended_rr;
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if($answer eq "y"){
+       $rr = $recommended_rr;
+    }
+    elsif($answer eq "n"){
+       print "Please specify the rescue fold (we recommend $recommended_rescue_fold): ";
+       my $cur_answer2 = <STDIN>;
+       chomp($cur_answer2);
+       $rr = $cur_answer2 * ($ChIP_tags) / ($background_tags);
+       print "The corresponding value of rescue ratio will be $rr\n";
+       print "\n";    
+    }
+}
+
+print "\nAdjacent peaks are distinguished if the profile dips between the two peaks\n";
+printf "by certain amount. We recommend using %.2f for the value of the dip fraction\n", $recommended_dip_fraction;
+print "between the two peaks. \n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+    chomp($answer);
+    
+    if($answer eq "y"){
+       $dip_fraction = $recommended_dip_fraction;
+    }
+    elsif($answer eq "n"){
+       print "Please specify an alternative dip fraction value: ";    
+       my $cur_answer2 = <STDIN>;
+       chomp($cur_answer2);
+       $dip_fraction = $cur_answer2;
+    }
+}
+
+print "\n";
+print "Specified parameters for the Peak Caller Module:\n\n";
+
+printf "ChIP_threshold:                      %.2f\n", $ChIP_threshold;
+printf "background_threshold:                %.2f\n", $background_threshold;
+printf "enrichment_ratio:                    %.2f\n", $er;
+printf "rescue_ratio:                        %.2f\n", $rr;
+printf "dip_fraction:                        %.2f\n", $dip_fraction; 
+
+print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
+
+## /calculating Peak Caller thresholds
+
+## system info
+
+my $processors = -1;
+my $recommended_threads = -1;
+my $threads = -1;
+
+if(-e "/proc/cpuinfo"){
+    $processors = `cat /proc/cpuinfo | grep processor | wc -l`;
+    chomp($processors);
+}
+elsif(-e "/usr/sbin/sysctl"){
+    $processors = `/usr/sbin/sysctl -n hw.ncpu`;
+}
+else{
+    print "I failed to determine the number of processors on your platform.\n";
+    print "Please specify it manually: ";
+    $answer = <STDIN>;
+    chomp($answer);
+    if($answer <= 0){
+       print "You need to specify a positive number. Aborting.\n";
+       exit(0);
+    }
+    $processors = $answer;
+}
+
+if($processors == 0){
+    print "I failed to determine the number of processors on your platform.\n";
+    print "Please specify it manually: ";
+    $answer = <STDIN>;
+    chomp($answer);
+    if($answer <= 0){
+       print "You need to specify a positive number. Aborting.\n";
+       exit(0);
+    }
+    $processors = $answer;
+}
+
+print "\nNumber of processors: $processors\n";
+
+if($processors == 1){
+    $recommended_threads = 1;
+}
+else{ $recommended_threads = $processors - 1; }
+
+print "\nThe Profiler Module supports thread-level parallelism. We suggest utilizing \n";
+print "$recommended_threads for the number of threads to distribute the computation over.\n";
+
+$answer = "";
+while( $answer ne "y" && $answer ne "n" ){
+    print "Do you agree? (y/n): ";
+    if($silent_flag eq "on"){
+       $answer = "y";
+       print "$answer\n";
+    }
+    else{
+       $answer = <STDIN>;
+    }
+
+    chomp($answer);
+    
+    if($answer eq "y"){
+       $threads = $recommended_threads;
+    }
+    elsif($answer eq "n"){
+       print "Please specify desired number of threads: ";
+       my $cur_answer2 = <STDIN>;
+       chomp($cur_answer2);
+       $threads = $cur_answer2;
+    }
+}
+
+## /system info
+
+## /calculating cutoffs/thresholds
+
+print "\n";
+
+## writing parameter files
+
+## writing Quick Window Scan parameters
+
+my $quick_window_scan_par_fname = $analysis_path . "/quick_window_scan.batch.pars";
+my $quick_window_scan_output_fname = $analysis_path . "/quick_window_scan.out";
+
+unlink($quick_window_scan_par_fname);
+
+open quick_window_scan_par_file, "> $quick_window_scan_par_fname" 
+    or die "Failed to open $quick_window_scan_par_fname\n";
+print quick_window_scan_par_file "genome_table = $contig_table_fname\n";
+print quick_window_scan_par_file "align_file = $updated_ChIP_align_fname\n";
+print quick_window_scan_par_file "output_file = $quick_window_scan_output_fname\n";
+print quick_window_scan_par_file "calc_window = $quick_window_scan_calc_window\n";
+printf quick_window_scan_par_file "count_threshold = %d", $quick_window_scan_count_threshold;
+
+close quick_window_scan_par_file;
+
+print "Saved Quick Window Scan batch parameter file.\n";
+
+## /writing Quick Window Scan parameters
+
+## writing Calibrate Peak Shift parameters
+
+my $calibrate_peak_shift_par_fname = $analysis_path . "/calibrate_peak_shift.batch.pars";
+my $calibrate_peak_shift_output_fname = $analysis_path . "/calibrate_peak_shift.out";
+
+open calibrate_peak_shift_par_file, "> $calibrate_peak_shift_par_fname"
+    or die "Failed to open $calibrate_peak_shift_par_fname\n";
+
+if($silent_flag eq "on"){
+    print calibrate_peak_shift_par_file "verbose = off\n";
+}
+else{
+    print calibrate_peak_shift_par_file "verbose = yes\n";
+}
+print calibrate_peak_shift_par_file "quick_scan_file = $quick_window_scan_output_fname.sorted\n";
+print calibrate_peak_shift_par_file "align_chip_file = $updated_ChIP_align_fname\n";
+print calibrate_peak_shift_par_file "align_background_file = $updated_RX_noIP_align_fname\n";
+print calibrate_peak_shift_par_file "output_file = $calibrate_peak_shift_output_fname\n";
+print calibrate_peak_shift_par_file "kde_bandwidth = $KDE_bandwidth\n";
+print calibrate_peak_shift_par_file "scan_window = $scan_window\n";
+print calibrate_peak_shift_par_file "quick_scan_tag_count_threshold = $quick_scan_tag_count_threshold\n";
+print calibrate_peak_shift_par_file "top_count = $top_count\n";
+print calibrate_peak_shift_par_file "chip_to_background_fold = $ChIP_to_background_fold\n";
+print calibrate_peak_shift_par_file "best_to_second_best_peak_ratio = $best_to_second_best_peak_ratio\n";
+print calibrate_peak_shift_par_file "chip_peak_threshold = $ChIP_threshold_for_PSC\n";
+print calibrate_peak_shift_par_file "chip_for_rev_peak_max_min_ratio = $ChIP_for_rev_peak_max_min_ratio\n";
+print calibrate_peak_shift_par_file "method = $peak_shift_estimation_method\n";
+
+
+close calibrate_peak_shift_par_file;
+
+print "Saved Calibrate Peak Shift batch parameter file.\n";
+
+## /writing Calibrate Peak Shift parameters
+
+## writing Generate Peak Profile parameters
+
+my $Profiler_calc_window = $KDE_bandwidth * 10;
+
+my $scores_path = $analysis_path . "/scores";
+
+if(-d $scores_path){
+    print "Warning: score directory $scores_path already exists\n";
+}
+else{
+    mkdir($scores_path);
+}
+
+my $ChIP_scores_path = $scores_path . "/ChIP/";
+my $pseudo_ChIP_scores_path = $scores_path . "/pseudo_ChIP/";
+my $background_scores_path = $scores_path . "/background/";
+
+if(-d $ChIP_scores_path){ 
+    print "Warning: ChIP scores directory $ChIP_scores_path already exists\n"; 
+    print "When the Profiler Module will run the files will be overwritten\n";
+}
+else{ mkdir($ChIP_scores_path); }
+
+
+if( $analysis_with_FDR eq "true"){
+    if(-d $pseudo_ChIP_scores_path){ 
+       print "Warning: pseudo_ChIP scores directory $pseudo_ChIP_scores_path already exists\n"; 
+       print "When the Profiler Module will run the files will be overwritten\n";
+    }
+    else{ mkdir($pseudo_ChIP_scores_path); }
+}
+
+if(-d $background_scores_path){ 
+    print "Warning: backgrounds scores directory $background_scores_path already exists\n"; 
+    print "When the Profiler Module will run the files will be overwritten\n";
+}
+else{ mkdir($background_scores_path); }
+
+my $ChIP_generate_profile_par_fname = $analysis_path . "/generate_profile.ChIP.batch.pars";
+my $ChIP_generate_profile_output_fname = $analysis_path . "/generate_profile.ChIP.out";
+
+my $pseudo_ChIP_generate_profile_par_fname = $analysis_path . "/generate_profile.pseudo_ChIP.batch.pars";
+my $pseudo_ChIP_generate_profile_output_fname = $analysis_path . "/generate_profile.pseudo_ChIP.out";
+
+my $background_generate_profile_par_fname = $analysis_path . "/generate_profile.background.batch.pars";
+my $background_generate_profile_output_fname = $analysis_path . "/generate_profile.background.out";
+
+## writing ChIP Profiler param file
+
+unlink($ChIP_generate_profile_par_fname);
+
+open ChIP_generate_profile_ChIP_par_file, "> $ChIP_generate_profile_par_fname"
+    or die "Failed to open $ChIP_generate_profile_par_fname";
+
+print ChIP_generate_profile_ChIP_par_file "genome_table = $contig_table_fname\n";
+print ChIP_generate_profile_ChIP_par_file "align_file = $updated_ChIP_align_fname\n";
+print ChIP_generate_profile_ChIP_par_file "output_score_path = $ChIP_scores_path\n";
+print ChIP_generate_profile_ChIP_par_file "peak_shift_source_file = $calibrate_peak_shift_output_fname\n";
+print ChIP_generate_profile_ChIP_par_file "kde_bandwidth = $KDE_bandwidth\n";
+print ChIP_generate_profile_ChIP_par_file "calc_window = $Profiler_calc_window\n";
+print ChIP_generate_profile_ChIP_par_file "threads = $threads\n";
+
+close ChIP_generate_profile_ChIP_par_file;
+
+print "Saved Profiler batch parameter file for the ChIP data\n";
+
+## /writing ChIP Profiler param file
+
+## writing background Profiler param file
+
+unlink($background_generate_profile_par_fname);
+
+open background_generate_profile_ChIP_par_file, "> $background_generate_profile_par_fname"
+    or die "Failed to open $background_generate_profile_par_fname";
+print background_generate_profile_ChIP_par_file "genome_table = $contig_table_fname\n";
+print background_generate_profile_ChIP_par_file "align_file = $background_RX_noIP_fname\n";
+print background_generate_profile_ChIP_par_file "output_score_path = $background_scores_path\n";
+print background_generate_profile_ChIP_par_file "peak_shift_source_file = $calibrate_peak_shift_output_fname\n";
+print background_generate_profile_ChIP_par_file "kde_bandwidth = $KDE_bandwidth\n";
+print background_generate_profile_ChIP_par_file "calc_window = $Profiler_calc_window\n";
+print background_generate_profile_ChIP_par_file "threads = $threads\n";
+
+close background_generate_profile_ChIP_par_file;
+
+print "Saved Profiler batch parameter file for the background data.\n";
+
+
+## /writing background Profiler param file
+
+## writing pseudo_ChIP Profiler param file
+
+if($analysis_with_FDR eq "true"){
+
+    unlink($pseudo_ChIP_generate_profile_par_fname);
+
+    open pseudo_ChIP_generate_profile_ChIP_par_file, "> $pseudo_ChIP_generate_profile_par_fname"
+       or die "Failed to open $pseudo_ChIP_generate_profile_par_fname";
+    print pseudo_ChIP_generate_profile_ChIP_par_file "genome_table = $contig_table_fname\n";
+    print pseudo_ChIP_generate_profile_ChIP_par_file "align_file = $pseudo_ChIP_RX_noIP_fname\n";
+    print pseudo_ChIP_generate_profile_ChIP_par_file "output_score_path = $pseudo_ChIP_scores_path\n";
+    print pseudo_ChIP_generate_profile_ChIP_par_file "peak_shift_source_file = $calibrate_peak_shift_output_fname\n";
+    print pseudo_ChIP_generate_profile_ChIP_par_file "kde_bandwidth = $KDE_bandwidth\n";
+    print pseudo_ChIP_generate_profile_ChIP_par_file "calc_window = $Profiler_calc_window\n";
+    print pseudo_ChIP_generate_profile_ChIP_par_file "threads = $threads\n";
+
+    close pseudo_ChIP_generate_profile_ChIP_par_file;
+
+    print "Saved Profiler batch parameter file for the pseudo_ChIP data\n";
+}
+
+## /writing pseudo_ChIP Profiler param file
+
+## writing ChIP Peak Caller param file
+
+my $ChIP_peak_caller_par_fname = $analysis_path . "/peak_caller.ChIP.batch.pars";
+my $ChIP_peak_calls_fname = $analysis_path . "/peak_caller.ChIP.out";
+
+my $pseudo_ChIP_peak_caller_par_fname = $analysis_path . "/peak_caller.pseudo_ChIP.batch.pars";
+my $pseudo_ChIP_peak_calls_fname = $analysis_path . "/peak_caller.pseudo_ChIP.out";
+
+unlink($ChIP_peak_caller_par_fname);
+
+open ChIP_peak_caller_par_file, "> $ChIP_peak_caller_par_fname"
+    or die "Failed to open $ChIP_peak_caller_par_fname\n";
+
+print ChIP_peak_caller_par_file "genome_table = $contig_table_fname\n";
+print ChIP_peak_caller_par_file "chip_profile_path = $ChIP_scores_path\n";
+print ChIP_peak_caller_par_file "background_profile_path = $background_scores_path\n";
+print ChIP_peak_caller_par_file "output_file = $ChIP_peak_calls_fname\n";
+print ChIP_peak_caller_par_file "chip_threshold = $ChIP_threshold\n";
+print ChIP_peak_caller_par_file "background_threshold = $background_threshold\n";
+print ChIP_peak_caller_par_file "rescue_ratio = $rr\n";
+print ChIP_peak_caller_par_file "local_maximum_radius = 10\n";
+print ChIP_peak_caller_par_file "dip_fraction = $dip_fraction\n";
+print ChIP_peak_caller_par_file "ChIP_reads = $ChIP_tags\n";
+print ChIP_peak_caller_par_file "background_reads = $background_tags\n";
+
+close ChIP_peak_caller_par_file;
+
+print "Saved Peak Caller batch parameter file for the ChIP data\n";
+## /writing ChIP Peak Caller param file
+
+## writing pseudo_ChIP Peak Caller param file
+if($analysis_with_FDR eq "true"){
+    unlink($pseudo_ChIP_peak_caller_par_fname);
+    
+    open pseudo_ChIP_peak_caller_par_file, "> $pseudo_ChIP_peak_caller_par_fname"
+       or die "Failed to open $pseudo_ChIP_peak_caller_par_fname\n";
+    print pseudo_ChIP_peak_caller_par_file "genome_table = $contig_table_fname\n";
+    print pseudo_ChIP_peak_caller_par_file "chip_profile_path = $pseudo_ChIP_scores_path\n";
+    print pseudo_ChIP_peak_caller_par_file "background_profile_path = $background_scores_path\n";
+    print pseudo_ChIP_peak_caller_par_file "output_file = $pseudo_ChIP_peak_calls_fname\n";
+    print pseudo_ChIP_peak_caller_par_file "chip_threshold = $ChIP_threshold\n";
+    print pseudo_ChIP_peak_caller_par_file "background_threshold = $background_threshold\n";
+    print pseudo_ChIP_peak_caller_par_file "rescue_ratio = $rr\n";
+    print pseudo_ChIP_peak_caller_par_file "local_maximum_radius = 10\n";
+    print pseudo_ChIP_peak_caller_par_file "dip_fraction = $dip_fraction\n";
+    print pseudo_ChIP_peak_caller_par_file "ChIP_reads = $pseudo_ChIP_tags\n";
+    print pseudo_ChIP_peak_caller_par_file "background_reads = $background_tags\n";
+    
+    close pseudo_ChIP_peak_caller_par_file;
+
+    print "Saved Peak Caller batch parameter file for the pseudo_ChIP data.\n";
+}
+
+## /writing pseudo_ChIP Peak Caller param file
+
+## writing a superscript param file
+
+my $QuEST_param_fname = $analysis_path . "/QuEST.batch.pars";
+my $QuEST_log_fname = $analysis_path . "/QuEST.log";
+my $QuEST_output_fname = $analysis_path . "/QuEST.out";
+
+if( -e $QuEST_param_fname ){ unlink($QuEST_param_fname); }
+
+open QuEST_param_file, "> $QuEST_param_fname" 
+    or die "Failed to open $QuEST_param_fname\n";
+
+print QuEST_param_file "analysis_with_FDR = $analysis_with_FDR\n";
+print QuEST_param_file "quick_window_scan_param_file = $quick_window_scan_par_fname\n";
+print QuEST_param_file "calibrate_peak_shift_param_file = $calibrate_peak_shift_par_fname\n";
+print QuEST_param_file "ChIP_generate_profile_param_file = $ChIP_generate_profile_par_fname\n";
+print QuEST_param_file "background_generate_profile_param_file = $background_generate_profile_par_fname\n";
+print QuEST_param_file "ChIP_peak_caller_param_file = $ChIP_peak_caller_par_fname\n";
+
+if($analysis_with_FDR eq "true"){
+    print QuEST_param_file "pseudo_ChIP_generate_profile_param_file = $pseudo_ChIP_generate_profile_par_fname\n";
+    print QuEST_param_file "pseudo_ChIP_peak_caller_param_file = $pseudo_ChIP_peak_caller_par_fname\n";
+}
+
+print QuEST_param_file "log_file = $QuEST_log_fname\n";
+print QuEST_param_file "output_file = $QuEST_output_fname\n";
+
+print QuEST_param_file "\n";
+print QuEST_param_file "# ** Pipeline steps, use \# to comment out **\n";
+print QuEST_param_file "\n";
+
+print QuEST_param_file "quick_window_scan: ChIP\n";
+print QuEST_param_file "calibrate_peak_shift: ChIP\n";
+print QuEST_param_file "generate_profile: ChIP\n";
+print QuEST_param_file "generate_profile: background\n";
+if($analysis_with_FDR eq "true"){
+    print QuEST_param_file "generate_profile: pseudo_ChIP\n";
+}
+print QuEST_param_file "peak_caller: ChIP vs background\n";
+if($analysis_with_FDR eq "true"){
+    print QuEST_param_file "peak_caller: pseudo_ChIP vs background\n";
+}
+    
+close QuEST_param_file;
+
+## /writing a superscript param file
+
+## /writing parameter files
diff --git a/htswanalysis/bin/QuEST/generate_peak_profile b/htswanalysis/bin/QuEST/generate_peak_profile
new file mode 100755 (executable)
index 0000000..14d1a41
Binary files /dev/null and b/htswanalysis/bin/QuEST/generate_peak_profile differ
diff --git a/htswanalysis/bin/QuEST/misc/Makefile b/htswanalysis/bin/QuEST/misc/Makefile
new file mode 100755 (executable)
index 0000000..74fd7fb
--- /dev/null
@@ -0,0 +1,52 @@
+CC = g++
+#CCFLAGS = -Wall -ansi -pedantic -D DEBUG -O4 -funroll-loops
+
+include ../config.mk
+
+TARGETS = seq_contig sequence_utility_functions ticker utils array_2d params
+SOURCES = $(TARGETS:=.cpp)
+HEADERS = $(TARGETS:=.h)
+OBJECTS = $(TARGETS:=.o)
+
+all: $(OBJECTS)
+
+%.o : %.cpp %.h 
+       $(CC) $(INCLUDE) $(CCFLAGS) -c $<
+
+clean: 
+       rm -rf *.o 
+
+#all: seq_contig.o contig_set.o sequence_utility_functions.o ticker.o utils.o params.o array_2d.o
+
+#%.o: 
+#%.o : %.cpp %.h 
+#      $(CC) $(INCLUDE) $(CCFLAGS) -c $<
+#-o $@.o
+#$(OBJ_DIR)/$@.o
+
+#$(TARGETS): $(SOURCES) $(HEADERS)
+#      $(CC) $(INCLUDE) $(CCFLAGS) -c $(SOURCES)
+
+#params.o: params.cpp params.h
+#      $(CC) $(INCLUDE) $(CCFLAGS) -c params.cpp
+#
+#seq_contig.o: seq_contig.cpp seq_contig.h
+#      $(CC) $(INCLUDE) $(CCFLAGS) -c seq_contig.cpp
+#
+#contig_set.o: contig_set.cpp contig_set.h
+#      $(CC) $(INCLUDE) $(CCFLAGS) -c contig_set.cpp 
+#
+#sequence_utility_functions.o: sequence_utility_functions.cpp sequence_utility_functions.h
+#      $(CC) $(INCLUDE) $(CCFLAGS) -c sequence_utility_functions.cpp
+#
+#ticker.o: ticker.cpp ticker.h
+#      $(CC) $(INCLUDE) $(CCFLAGS) -c ticker.cpp
+#
+#utils.o: utils.cpp utils.h
+#      $(CC) $(INCLUDE) $(CCFLAGS) -c utils.cpp
+#
+#array_2d.o: array_2d.cpp array_2d.h
+#      $(CC) $(INCLUDE) $(CCFLAGS) -c array_2d.cpp
+#
+#clean: 
+#      rm -rf *.o 
diff --git a/htswanalysis/bin/QuEST/misc/params.cpp b/htswanalysis/bin/QuEST/misc/params.cpp
new file mode 100755 (executable)
index 0000000..894027f
--- /dev/null
@@ -0,0 +1,280 @@
+#include <string>
+#include <iostream>
+#include <vector>
+
+#include "assert.h"
+
+using std::string;
+using std::vector;
+
+
+
+#include "params.h"
+
+namespace Error{
+  struct bad_argc{
+    int argc;
+    bad_argc(int _argc){ argc = _argc; }
+  };
+  struct bad_param{
+    const char* value;
+    bad_param(const char* _value){ value = _value; }
+  };
+  struct bad_param_type{
+    param_type bp_type;
+    bad_param_type(param_type _bp_type){ bp_type = _bp_type;}
+  };
+  struct param_missing{
+    const char* par_missing;
+    param_missing(const char* _par_missing){ par_missing = _par_missing; }
+  };
+}
+
+params::params(int _argc, char* _argv[]){
+  if(_argc < 1) throw Error::bad_argc(_argc);
+
+  par_names.resize(_argc);
+  par_values.resize(_argc);
+
+  par_names[0] = "executable";
+  par_values[0] = _argv[0];
+  
+  for(int i=1; i<_argc; i++){
+    string cur_par_name="";
+    string cur_par_value="";
+    try{ 
+      parse(_argv[i], cur_par_name, cur_par_value); 
+      par_names[i] = cur_par_name;
+      par_values[i] = cur_par_value;
+    }
+    catch(Error::bad_param bp){
+      std::cerr<<"Warning:\ncouldn't parse the parameter: "<<bp.value<<"\nskipping\n";
+      par_names[i] = "";
+      par_values[i] = "";
+    }    
+  }
+  //std::cout<<"this can print\n";
+}
+
+void params::parse(const char* this_par, string& cur_par_name, string& cur_par_value){
+  string dummy_string = this_par;
+  unsigned int eq_pos = 0;
+  bool eq_pos_found = false;
+
+  for(unsigned int i=0; i<dummy_string.length() && !eq_pos_found; i++){
+    if(dummy_string[i] == '='){
+      eq_pos_found = true;
+      eq_pos = i;
+    }
+  }
+  
+  if(!eq_pos_found || eq_pos == dummy_string.length()-1){
+    throw Error::bad_param(this_par);
+  }
+
+  cur_par_name = dummy_string.substr(0, eq_pos);
+  cur_par_value = dummy_string.substr(eq_pos+1, dummy_string.length()-eq_pos-1);
+
+}
+
+void params::require(const string& par_name, const string& par_legend, param_type par_type){
+  required_params.push_back(par_name);
+  required_params_legend.push_back(par_legend);
+  required_params_type.push_back(par_type);
+}
+
+void params::require(const char* this_par_name, const char* this_par_legend, 
+                    param_type this_par_type){
+  string this_par_name_str = this_par_name;
+  string this_par_legend_str = this_par_legend;
+  
+  this->require(this_par_name_str, this_par_legend_str, this_par_type);
+}
+
+void params::optional(const char* this_par_name, const char* this_par_legend, 
+                     const char* default_value, param_type par_type){
+  string _this_par_name = this_par_name;
+  string _this_par_legend = this_par_legend;
+  string _default_value = default_value;
+  
+  optional_params.push_back(_this_par_name);
+  optional_params_legend.push_back(_this_par_legend);
+  optional_params_type.push_back(par_type);
+  optional_params_default_value.push_back(default_value);
+
+  bool par_found = false;
+  unsigned int par_index = 0;
+  for(unsigned int i=0; i<par_names.size(); i++){
+    if(par_names.at(i) == _this_par_name){
+      par_found = true;
+      par_index = i;
+    }
+  }
+  if(!par_found){
+    par_names.push_back(_this_par_name);
+    par_values.push_back(default_value);  
+  }
+}
+
+string stringify_type(param_type par_type){
+  string dummy_string;
+  switch(par_type){
+  case CHAR_TYPE:
+    dummy_string = "CHAR_TYPE";    
+    break;
+    
+  case STRING_TYPE:
+    dummy_string = "STRING_TYPE"; 
+    break;
+
+  case INT_TYPE:
+    dummy_string = "INT_TYPE";    
+    break;
+
+  case DOUBLE_TYPE:
+    dummy_string = "DOUBLE_TYPE";    
+    break;
+  default: 
+    std::cerr<<"bad par_type: "<<par_type<<"\n";
+    throw Error::bad_param_type(par_type);
+  }
+  return dummy_string;
+}
+
+bool good_type(const string& cur_par, param_type req_type){
+  switch(req_type){
+  case CHAR_TYPE:
+    if(cur_par.length() == 1) return true; break;
+  case STRING_TYPE:
+    return true; break;
+  case INT_TYPE:
+    return true; break;
+  case DOUBLE_TYPE:
+    return true; break;
+  default:
+    std::cerr<<"bad param type: "<<req_type<<"\n";
+    throw Error::bad_param_type(req_type);
+    break;
+  }
+  return false;
+}
+
+bool params::enforce(){
+  bool all_enforced = true;
+  for(unsigned int i=0; i<required_params.size(); i++){
+    string cur_req_param = required_params.at(i);
+    string cur_par_legend = required_params_legend.at(i);
+    param_type cur_req_type = required_params_type.at(i);
+
+    //std::cout<<"enforcing "<<cur_req_param<<"\n";
+
+    bool param_found = false;
+    unsigned int param_pos = 0;
+
+    for(unsigned j=0; j<par_names.size() && !param_found; j++){
+      if(par_names.at(j) == cur_req_param){
+       param_found = true;
+       param_pos = j;
+      }
+    }
+    if(!param_found){
+      all_enforced = false;
+      std::cout<<"    Required:\t"<<cur_req_param<<" = <"<<cur_par_legend<<">\t"<<stringify_type(cur_req_type)<<" [ parameter missing ]\n";
+    }
+    else{ //type_check now
+      string cur_par_value = par_values.at(param_pos);
+      if(!good_type(cur_par_value, cur_req_type)){
+       all_enforced = false;
+       std::cout<<"    Required:\t"<<cur_req_param<<" = <"<<cur_par_legend<<">\t"<<stringify_type(cur_req_type)<<" [ unexpected type ]\n";
+      }
+    }
+  }
+  assert(optional_params.size() == optional_params_legend.size());
+  assert(optional_params.size() == optional_params_default_value.size());
+  assert(optional_params.size() == optional_params_type.size());
+
+  /*
+  for(unsigned int i=0; i<optional_params.size(); i++){
+    string cur_param = optional_params.at(i);
+    string cur_legend = optional_params_legend.at(i);
+    string cur_default = optional_params_default_value.at(i);
+    param_type cur_type = optional_params_type.at(i);
+    
+    bool param_found = false;
+    unsigned int param_pos = 0;
+    
+    for(unsigned j=0; j<par_names.size() && !param_found; j++){
+      if(par_names.at(j) == cur_param){
+       param_found = true;
+       param_pos = j;
+      }
+    }
+    
+    //std::cout<<"param_found: "<<param_found<<"\n";
+
+    if(!param_found){
+      std::cerr<<"    Optional:\t"<<cur_param<<" = <"<<cur_legend<<">\t"<<stringify_type(cur_type)<<" [ default: "<<cur_default<<" ]\n";
+    }
+    else{ //type_check now
+      string cur_par_value = par_values.at(param_pos);
+      if(!good_type(cur_par_value, cur_type)){
+       all_enforced = false;
+       std::cerr<<"    Optional:\t"<<cur_param<<" = <"<<cur_legend<<">\t"<<stringify_type(cur_type)<<" [ unexpected type ]\n";
+      }
+    }
+  }
+  */
+  
+  return all_enforced;
+}
+
+void params::list_all_params(){
+  assert(par_names.size() == par_values.size());
+  std::cout<<"Specified parameters:\n";
+  for(unsigned int i=0; i<par_names.size(); i++){
+    std::cout<<par_names.at(i)<<"\t"<<par_values.at(i)<<"\n";
+  }
+}
+
+int params::get_int_value(const char* par_name){
+  string dummy_string = par_name;
+  for(unsigned int i=0; i<par_names.size(); i++){
+    if(par_name == par_names.at(i)) return atoi(par_values.at(i).c_str());
+  }
+#ifdef DEBUG
+  std::cerr<<"bad par_name: "<<par_name<<"\n";
+#endif
+  throw Error::param_missing(par_name);
+}
+
+double params::get_double_value(const char* par_name){
+  string dummy_string = par_name;
+  for(unsigned int i=0; i<par_names.size(); i++){
+    if(par_name == par_names.at(i)) return atof(par_values.at(i).c_str());
+  }
+#ifdef DEBUG
+  std::cerr<<"bad par_name: "<<par_name<<"\n";
+#endif
+  throw Error::param_missing(par_name);
+}
+
+string params::get_string_value(const char* par_name){
+  string dummy_string = par_name;
+  for(unsigned int i=0; i<par_names.size(); i++){
+    if(par_name == par_names.at(i)) return par_values.at(i);
+  }
+#ifdef DEBUG
+  std::cerr<<"bad par_name: "<<par_name<<"\n";
+#endif
+  throw Error::param_missing(par_name);
+}
+char params::get_char_value(const char* par_name){
+  string dummy_string = par_name;
+  for(unsigned int i=0; i<par_names.size(); i++){
+    if(par_name == par_names.at(i)){
+      string cur_par_value = par_values.at(i);
+      return cur_par_value.at(0);
+    }
+  }
+  throw Error::param_missing(par_name);
+}
diff --git a/htswanalysis/bin/QuEST/misc/params.h b/htswanalysis/bin/QuEST/misc/params.h
new file mode 100755 (executable)
index 0000000..291502a
--- /dev/null
@@ -0,0 +1,57 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#include <vector>
+#include <string>
+
+#ifndef param_type
+#define param_type unsigned int
+#endif
+
+#ifndef PARAM_TYPES
+#define PARAM_TYPES
+
+#define CHAR_TYPE 0
+#define STRING_TYPE 10
+#define INT_TYPE 20
+#define DOUBLE_TYPE 30
+
+#endif
+
+using std::vector;
+using std::string;
+
+class params{
+ private:
+  //unsigned int argc;
+  //vector<string> argv;
+
+  vector<string> required_params;
+  vector<string> required_params_legend;
+  vector<param_type> required_params_type;
+
+  vector<string> optional_params;
+  vector<string> optional_params_legend;
+  vector<param_type> optional_params_type;
+  vector<string> optional_params_default_value;
+
+  vector<string> par_names;
+  vector<string> par_values;
+
+  void parse(const char* this_par, string& cur_par_name, string& cur_par_value);
+ public:
+  params(int _argc, char* _argv[]);
+  void require(const char* this_par_name, const char* this_par_legend, param_type this_par_type); //specify what sort of parameters are required
+  void require(const string& par_name, const string& par_legend, param_type par_type); //same
+  void optional(const char* this_par_name, const char* this_par_legend, const char* default_value, param_type par_type);
+
+  void list_all_params();
+  bool enforce(); //enforces the required parameters
+
+  string get_string_value(const char* par_name);
+  char get_char_value(const char* par_name);
+  int get_int_value(const char* par_name);
+  double get_double_value(const char* par_name);
+};
+
+#endif
diff --git a/htswanalysis/bin/QuEST/misc/seq_contig.cpp b/htswanalysis/bin/QuEST/misc/seq_contig.cpp
new file mode 100755 (executable)
index 0000000..8f99df3
--- /dev/null
@@ -0,0 +1,170 @@
+#include <iostream>
+#include <fstream>
+#include <string>
+
+#ifndef _ASSERT_H
+#include "assert.h"
+#endif
+
+using std::string;
+using std::cerr;
+using std::endl;
+
+#include "seq_contig.h"
+#include "sequence_utility_functions.h"
+
+#ifndef FAILED_TO_LOAD_CONTIG_FROM_FILE
+#define FAILED_TO_LOAD_CONTIG_FROM_FILE -1
+#endif
+
+#ifndef SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE
+#define SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE 1
+#endif
+
+namespace Error{
+  struct bad_ifstream_in_the_function{
+    const char* thrown_from;
+    bad_ifstream_in_the_function(const char* _thrown_from){ 
+      thrown_from = _thrown_from;
+    }
+  };
+}
+
+seq_contig::seq_contig(unsigned int _size){
+   seq.reserve(_size);
+}
+void seq_contig::read_contig(std::ifstream& o){
+  string dummy_str;
+  getline(o, dummy_str);
+  assert(fa_header(dummy_str));
+
+  seq = "";
+  header = dummy_str;
+  
+  bool continue_reading = true;
+  while(continue_reading){
+    getline(o, dummy_str);
+    if(o.good() && !fa_header(dummy_str)){
+      assert(valid_seq_string(dummy_str));
+      seq += dummy_str;
+    }
+    else{ continue_reading = false; }
+  }
+}
+
+void seq_contig::read_contig(const char* _fname){
+  std::ifstream _seq_fname(_fname);
+  if(!_seq_fname.good()){
+    cerr<<"Bad filename: "<<_fname<<endl;
+  }
+  read_contig(_seq_fname);
+}
+
+int seq_contig::load_contig_from_file(const string& contig_id, std::ifstream& o){
+  unsigned int contig_counter = 0;
+  if(!o.good()) throw Error::bad_ifstream_in_the_function("int seq_contig::load_contig_from_file");
+
+  std::ios::pos_type save_pos = o.tellg(); //save position to go back later
+  o.seekg(0, std::ios::beg);
+  
+  std::ios::pos_type contig_start_pos;
+  bool continue_search = true;
+  while(continue_search){
+    string dummy_string;
+    std::ios::pos_type current_pos = o.tellg();
+    getline(o, dummy_string);
+    if(o.good()){
+      if(dummy_string[0] == '>'){ //fasta header
+       //std::cerr<<"header found: "<<dummy_string<<"\n";
+       if(dummy_string.substr(1, dummy_string.length()-1) == contig_id){
+         continue_search = false;
+         contig_start_pos = current_pos;
+         contig_ind = contig_counter;
+       }
+       contig_counter++;
+      }
+    }
+    else{
+      continue_search = false;
+      o.clear();
+      o.seekg(save_pos);
+      return FAILED_TO_LOAD_CONTIG_FROM_FILE;
+    }
+  }
+  o.seekg(contig_start_pos);
+  read_contig(o);
+  o.seekg(save_pos);
+  return SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE;
+}
+
+seq_contig::seq_contig(std::ifstream& o){
+  read_contig(o);
+}
+
+seq_contig::seq_contig(const char* _fname){
+  read_contig(_fname);
+}
+
+
+void seq_contig::ag_recode(){
+  string nseq = seq.substr(0,seq.size()-1);
+  for(unsigned int i=0; i<seq.size()-1; i++){
+    char cur_code = 'n';
+    if(valid_nucl(seq[i]) && valid_nucl(seq[i]))
+      cur_code = ag_encode(seq[i],seq[i+1]);
+    nseq[i] = cur_code;
+  }
+  seq = nseq;
+}
+
+void seq_contig::save(std::ofstream& ostr){
+  unsigned int str_length = 50;
+  ostr<<header<<endl;
+  for(unsigned int i=0; i<(seq.size()-seq.size()%str_length)/str_length; i++){
+    int cur_start = i*str_length;
+    string cur_str = seq.substr(cur_start, str_length);
+    ostr<<cur_str<<endl;
+  }
+  if(seq.size()%str_length != 0){
+    string cur_str = seq.substr(seq.size()-seq.size()%str_length, seq.size()%str_length);
+    ostr<<cur_str<<endl;
+  }
+}
+
+void seq_contig::save(std::ofstream& ostr, int region_start, int region_end){
+  assert(region_start >= 0);
+  assert(region_start < region_end);
+  assert(region_end <= (int)(seq.length()));
+  int region_size = region_end - region_start;
+  unsigned int str_length = 50;
+  ostr<<header<<endl;
+  for(int i=0; i<(int)((region_size-region_size%str_length)/str_length); i++){
+    int cur_start = region_start+i*str_length;
+    string cur_str = seq.substr(cur_start, str_length);
+    ostr<<cur_str<<endl;
+  }
+  if(seq.size()%str_length != 0){
+    string cur_str = seq.substr(region_start + region_size - region_size%str_length,
+                               region_size%str_length);
+    ostr<<cur_str<<endl;
+  }
+}
+
+int seq_contig::size(){
+  return seq.length();
+}
+
+void seq_contig::validate(){
+  for(unsigned int i=0; i<seq.size(); i++){
+    assert(valid_letter(seq[i]));
+  }
+}
+
+char& seq_contig::operator[](unsigned int pos){
+#ifdef DEBUG
+  assert(pos < seq.size());
+#endif
+  return seq[pos];
+}
+
+
diff --git a/htswanalysis/bin/QuEST/misc/seq_contig.h b/htswanalysis/bin/QuEST/misc/seq_contig.h
new file mode 100755 (executable)
index 0000000..809ec5f
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef SEQ_CONTIG_H
+#define SEQ_CONTIG_H
+
+#define FAILED_TO_LOAD_CONTIG_FROM_FILE -1
+#define SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE 1
+
+
+class seq_contig{
+public:
+  std::string header;
+  std::string seq;
+  unsigned int contig_ind; //the index of contig in the file it's read from
+
+  char& operator[](unsigned int pos);
+
+  void read_contig(std::ifstream& o);
+  void read_contig(const char* fname);
+  
+  int size();
+  void validate();
+  seq_contig(){}
+  seq_contig(unsigned int _size);//{ seq.reserve(_size);}
+  seq_contig(std::string _header, std::string _seq){ header = _header; seq = _seq; validate();}
+  seq_contig(std::ifstream& o);
+  seq_contig(const char* fname);
+  
+  void ag_recode();
+  void save(std::ofstream& ostr);
+  void save(std::ofstream& ostr, int region_start, int region_end);
+  int load_contig_from_file(const std::string& contig_id, std::ifstream& o);
+  int get_contig_ind();//const std::string& contig_id, std::ifstream& o);
+};
+
+#endif
diff --git a/htswanalysis/bin/QuEST/misc/sequence_utility_functions.cpp b/htswanalysis/bin/QuEST/misc/sequence_utility_functions.cpp
new file mode 100755 (executable)
index 0000000..3a0b8eb
--- /dev/null
@@ -0,0 +1,389 @@
+#include <string>
+#include <iostream>
+
+#include <assert.h>
+
+using std::string;
+using std::cerr;
+using std::endl;
+
+#include "sequence_utility_functions.h"
+
+static unsigned int pow4[] =
+  {1,4,16,64,256,1024,4096,16384,65536,262144,1048576,
+   4194304,16777216,67108864,268435456,1073741824};
+/*
+namespace Error{
+  struct Bad_ag_encode{
+    char first_char, second_char;
+    Bad_ag_encode(char _first, char _second){
+      first_char = _first; second_char = _second;
+    }
+  };
+  struct Bad_nucleotide{
+    char c;
+    Bad_nucleotide(char _c){ c=_c;}
+  };
+  struct Bad_fasta_letter{
+    char c;
+    Bad_fasta_letter(char _c){c=_c;}
+  };
+  struct Failed_to_compute_seq_tuple_index{
+    string bad_tuple;
+    Failed_to_compute_seq_tuple_index(const string& _str){
+      bad_tuple = _str;
+    }
+  };
+}
+*/
+char complement(char c){
+  switch(c){
+  case 'a':{ return 't'; break;}
+  case 't':{ return 'a'; break;}
+  case 'g':{ return 'c'; break;}
+  case 'c':{ return 'g'; break;}
+  case 'A':{ return 'T'; break;}
+  case 'T':{ return 'A'; break;}
+  case 'G':{ return 'C'; break;}
+  case 'C':{ return 'G'; break;}
+  case 'n':{ return 'n'; break;}
+  case 'N':{ return 'N'; break;}
+  default:{ cerr<<"bad char in complement: "<<c<<endl;
+  assert(false); return 'Q'; break;}
+  }
+  //assert(valid_letter(c));                                                     
+  //char nc = capitalize(c);
+  /*
+  if(nc == 'N') return 'N';
+  if(nc == 'T') return 'A';
+  if(nc == 'A') return 'T';
+  if(nc == 'C') return 'G';
+  if(nc == 'G') return 'C';
+  assert(false);
+  */
+}
+
+
+bool small(char c){
+  if(c == 'a' || c == 't' || c == 'g' || c == 'c' || c == 'n') return true;
+  return false;
+}
+
+string capitalize(const string& _seq){
+  string res = _seq;
+  for(unsigned int i=0; i<_seq.length(); i++){
+    switch(_seq[i]){
+    case 'a':{res[i] = 'A'; break;}
+    case 'A':{break;}
+    case 't':{res[i] = 'T'; break;}
+    case 'T':{break;}
+    case 'g':{res[i] = 'G'; break;}
+    case 'G':{break;}
+    case 'c':{res[i] = 'C'; break;}
+    case 'C':{break;}      
+    case 'n':{res[i] = 'N'; break;}
+    case 'N':{break;}
+    default:{ throw Error::Bad_nucleotide(_seq[i]); break;}
+    }
+  }
+  return res;
+}
+char capitalize(char c){
+  if(c == 'n') return 'N';
+  if(c == 'a') return 'A';
+  if(c == 't') return 'T';
+  if(c == 'g') return 'G';
+  if(c == 'c') return 'C';
+
+  if(c == 'N') return 'N';
+  if(c == 'A') return 'A';
+  if(c == 'T') return 'T';
+  if(c == 'G') return 'G';
+  if(c == 'C') return 'C';
+
+  //cerr<<c;
+  assert(false);
+}
+
+string rev_compl(const string& _str){
+  //string dummy_str = _str;                                                                                                      
+  //reverse(dummy_str.begin(), dummy_str.end());                                                                                  
+  string res;
+  for(int i=_str.size()-1; i>=0; i--){
+    char cur_char = _str[i];
+    //cout<<cur_char;                                                                                                             
+    //assert(false);                                                                                                              
+    res.push_back(complement(cur_char));
+  }
+  return res;
+}
+
+bool valid_nucl(char c){
+  if(c == 'a' || c == 'A' || c == 't' || c == 'T' ||
+     c == 'g' || c == 'G' || c == 'c' || c == 'C')
+    return true;
+  else return false;
+}
+bool valid_col(char c){
+  if(c == '0' || c == '1' || c == '2' || c == '3')
+    return true;
+  else return false;
+}
+bool valid_letter(char c){
+  if(c == 'a' || c == 'A' || c == 't' || c == 'T' ||
+     c == 'g' || c == 'G' || c == 'c' || c == 'C' ||
+     c == 'n' || c == 'N' || c == 'm' || c == 'M' ||
+     c == 'r' || c == 'R')
+    return true;
+  else return false;
+}
+
+bool seq_missing(char c){
+  //if(c== 'n' || c== 'N' || c=='m' || c=='M'|| c) return true;  
+  if(!valid_nucl(c)) return true;
+  return false;
+}
+
+bool col_missing(char c){
+  //if(c== 'n' || c== 'N' || c=='m' || c=='M'|| c) return true;  
+  if(! (valid_col(c))) return true;
+  return false;
+}
+
+bool fa_header(string _str){
+  if(_str[0] == '>') return true;
+  return false;
+}
+string ag_recode(const string& str){
+  string nseq =str.substr(0,str.size()-1);
+  for(unsigned int i=0; i<str.size()-1; i++){
+    char cur_code = 'n';
+    if(valid_nucl(str[i]) && valid_nucl(str[i]))
+      cur_code = ag_encode(str[i],str[i+1]);
+    nseq[i] = cur_code;
+  }
+  return nseq;
+}
+bool valid_seq_string(string _str){
+  for(unsigned int i=0; i<_str.size(); i++){
+    if(!valid_letter(_str[i])){
+
+      cerr<<"warning: bad letter ["<<_str[i]<<"] code: "<<(int) _str[i]<<" at position "<<i<<endl;
+      cerr<<" in the expected sequence string "<<_str<<endl;
+      throw(Error::Bad_fasta_letter(_str[i]));
+      return false;
+    }
+  }
+  return true;
+}
+
+char ag_encode(char c1, char c2){
+  if(c1 == 'N' || c1 == 'n' || c2 == 'N' || c2 == 'n') return 'N';
+
+  if(((c1 == 'A' || c1 == 'a') && (c2 == 'A' || c2 == 'a')) ||
+     ((c1 == 'C' || c1 == 'c') && (c2 == 'C' || c2 == 'c')) ||
+     ((c1 == 'G' || c1 == 'g') && (c2 == 'G' || c2 == 'g')) ||
+     ((c1 == 'T' || c1 == 't') && (c2 == 'T' || c2 == 't'))) return '0';
+
+  if(((c1 == 'A' || c1 == 'a') && (c2 == 'C' || c2 == 'c')) ||
+     ((c1 == 'C' || c1 == 'c') && (c2 == 'A' || c2 == 'a')) ||
+     ((c1 == 'G' || c1 == 'g') && (c2 == 'T' || c2 == 't')) ||
+     ((c1 == 'T' || c1 == 't') && (c2 == 'G' || c2 == 'g'))) return '1';
+
+  if(((c1 == 'A' || c1 == 'a') && (c2 == 'G' || c2 == 'g')) ||
+     ((c1 == 'C' || c1 == 'c') && (c2 == 'T' || c2 == 't')) ||
+     ((c1 == 'G' || c1 == 'g') && (c2 == 'A' || c2 == 'a')) ||
+     ((c1 == 'T' || c1 == 't') && (c2 == 'C' || c2 == 'c'))) return '2';
+
+  if(((c1 == 'A' || c1 == 'a') && (c2 == 'T' || c2 == 't')) ||
+     ((c1 == 'C' || c1 == 'c') && (c2 == 'G' || c2 == 'g')) ||
+     ((c1 == 'G' || c1 == 'g') && (c2 == 'C' || c2 == 'c')) ||
+     ((c1 == 'T' || c1 == 't') && (c2 == 'A' || c2 == 'a'))) return '3';
+
+  throw Error::Bad_ag_encode(c1,c2);
+
+  //cerr<<c1<<c2<<endl;
+  //assert(false);
+  //return '?';
+}string ag_encode(const string& str, int start, int length){
+  assert(start >= 0 && (unsigned int) (start+length) <= str.length());
+  string res;
+  for(int i=start; i<start+length-1; i++){
+    try{ res += ag_encode(str[i], str[i+1]); }
+    catch( Error::Bad_ag_encode e ){
+      cerr<<"failed to encode ( "<<e.first_char<<","<<e.second_char<<" ) ";
+      cerr<<" at pos: "<<i<<endl;
+      //skip();
+      //assert(false);
+    }
+  }
+  return res;
+}
+
+string ag_encode(const string& str){
+  return ag_encode(str, 0, str.length());
+}
+
+unsigned short encode_nucl(char c){
+  switch(c){
+  case 'A':{ return 0; break;}
+  case 'a':{ return 0; break;}
+  case 'C':{ return 1; break;}
+  case 'c':{ return 1; break;}
+  case 'G':{ return 2; break;}
+  case 'g':{ return 2; break;}
+  case 'T':{ return 3; break;}
+  case 't':{ return 3; break;}
+  case 'N':{ return 0; break;}
+  case '.':{ return 0; break;}
+  case 'n':{ return 0; break;}
+  default:{ throw Error::Bad_nucleotide(c);}
+  }
+}
+unsigned short encode_nucl_complement(char c){
+  switch(c){
+  case 'A':{ return 3; break;}
+  case 'a':{ return 3; break;}
+  case 'C':{ return 2; break;}
+  case 'c':{ return 2; break;}
+  case 'G':{ return 1; break;}
+  case 'g':{ return 1; break;}
+  case 'T':{ return 0; break;}
+  case 't':{ return 0; break;}
+  case 'N':{ return 3; break;}
+  case 'n':{ return 3; break;}
+  case '.':{ return 3; break;}
+  default:{ throw Error::Bad_nucleotide(c);}
+  }
+}
+
+char decode_nucl(unsigned int code){
+  assert(code < 4);
+  switch(code){
+  case 0: return 'A';
+  case 1: return 'C';
+  case 2: return 'G';
+  case 3: return 'T';
+  }
+  return -1;
+}
+
+unsigned int seq_space_tuple_index(const string& str, unsigned int starts_at_pos, unsigned int tuple_size){
+  //  cerr<<"starts: "<<starts_at_pos<<" t_s: "<<tuple_size<<" l: "<<str.length()<<endl;
+  //a-0, c-1, g-2, t-3
+  assert(starts_at_pos >= 0);
+  assert(starts_at_pos + tuple_size <= (str.length()));
+
+  unsigned int res = 0;
+
+  for(unsigned int i = 0; i<tuple_size; i++){
+    try{
+      res += encode_nucl(str[starts_at_pos+i])*pow4[i];
+    }
+    catch(Error::Bad_nucleotide e){
+      cerr<<"seq_space_tuple_index: Error [ "<<e.c<<" ] in encode char at pos: "<<starts_at_pos<<" ";
+      for(unsigned int j=0; j<tuple_size; j++){
+       cerr<<str[starts_at_pos + j];
+        //cerr<<endl;
+       //throw Error::Failed_to_compute_seq_tuple_index(str.substr(starts_at_pos, tuple_size));
+      }
+      cerr<<endl;
+      throw Error::Failed_to_compute_seq_tuple_index(str.substr(starts_at_pos, tuple_size));
+    }
+  }
+
+  assert(res < pow4[tuple_size]);
+  return res;
+}
+
+unsigned int seq_space_tuple_index_rev_compl(const string& str, unsigned int starts_at_pos, unsigned int tuple_size){
+  //  cerr<<"starts: "<<starts_at_pos<<" t_s: "<<tuple_size<<" l: "<<str.length()<<endl;
+  assert(starts_at_pos >= 0);
+  assert(starts_at_pos < (str.length()));
+  assert(starts_at_pos - tuple_size + 1 >= 0);
+
+  unsigned int res = 0;
+
+  for(unsigned int i = 0; i<tuple_size; i++){
+    try{
+      //setbits(res, str[starts_at_pos-i],i);
+      res += /*ag_encode_char*/encode_nucl_complement(str[starts_at_pos-i])*pow4[i];//int_pow(4,i);            
+    }
+    catch(Error::Bad_nucleotide e){
+      cerr<<"seq_space_tuple_index_rev_compl: Error [ "<<e.c<<" ] in encode char at pos: "<<starts_at_pos<<" ";
+      for(unsigned int j=0; j<tuple_size; j++){
+       cerr<<str[starts_at_pos - j];
+      }
+      cerr<<endl;
+    }
+  }
+
+  assert(res < pow4[tuple_size]);
+  return res;
+} 
+
+unsigned int color_space_tuple_index(const string& str, int starts_at_pos, int tuple_size){
+  //  cerr<<"starts: "<<starts_at_pos<<" t_s: "<<tuple_size<<" l: "<<str.length()<<endl;
+  assert(starts_at_pos >= 0);
+  assert(starts_at_pos + tuple_size <= (int)(str.length()));
+
+  unsigned int res = 0;
+
+  for(int i = 0; i<tuple_size; i++){
+    try{
+      res += ag_encode_char(str[starts_at_pos+i])*pow4[i];//int_pow(4,i);                                         
+    }
+    catch(int a){
+      if(a == 20){
+        cerr<<"Error in encode char at pos: "<<starts_at_pos<<" ";
+        for(int j=0; j<tuple_size; j++){
+          cerr<<str[starts_at_pos + j];
+        }
+        cerr<<endl;
+       throw 21;
+      }
+    }
+  }
+
+  assert(res < pow4[tuple_size]);
+  return res;
+}
+
+int color_space_tuple_index_rev(const string& str, int starts_at_pos, int tuple_size){
+  //  cerr<<"starts: "<<starts_at_pos<<" t_s: "<<tuple_size<<" l: "<<str.length()<<endl;
+  assert(starts_at_pos >= 0);
+  assert(starts_at_pos < (int)(str.length()));
+  assert(starts_at_pos - tuple_size + 1 >= 0);
+
+  int res = 0;
+
+  for(int i = 0; i<tuple_size; i++){
+    try{
+      res += ag_encode_char(str[starts_at_pos-i])*pow4[i];//int_pow(4,i); 
+    }
+    catch(Error::Bad_ag_encode e){
+      cerr<<"Failed to color encode: "<<e.first_char<<" "<<e.second_char<<endl;
+      assert(false);
+    }
+  }
+  return res;
+}
+
+short ag_encode_char(char c){
+  switch(c){
+  case '0':{ return 0; break;}
+  case '1':{ return 1; break;}
+  case '2':{ return 2; break;}
+  case '3':{ return 3; break;}
+  default:{throw 20; return -1; break;assert(false); break;} //you shouldn't be here
+  }
+}
+
+
+string invert(const string& str){
+  string res;
+  for(int i=str.length()-1; i>=0; i--){
+    res += str[i];
+  }
+  return res;
+}
diff --git a/htswanalysis/bin/QuEST/misc/sequence_utility_functions.h b/htswanalysis/bin/QuEST/misc/sequence_utility_functions.h
new file mode 100755 (executable)
index 0000000..c7112a5
--- /dev/null
@@ -0,0 +1,58 @@
+#ifndef SEQUENCE_UTILITY_FUNCTIONS_H_
+#define SEQUENCE_UTILITY_FUNCTIONS_H_
+
+using std::string;
+
+bool fa_header(string _str);
+string ag_recode(const string& str);
+bool valid_seq_string(string _str);
+bool valid_nucl(char c);
+bool valid_col(char c);
+bool valid_letter(char c);
+bool seq_missing(char c);
+bool col_missing(char c);
+
+char ag_encode(char c1, char c2);
+string ag_encode(const string& str, int start, int length);
+string ag_encode(const string& str);
+short ag_encode_char(char c);
+
+unsigned short encode_nucl(char c);
+unsigned short encode_nucl_complement(char c);
+
+char decode_nucl(unsigned int code);
+
+unsigned int seq_space_tuple_index(const string& str, unsigned int starts_at_pos, unsigned int tuple_size);
+unsigned int seq_space_tuple_index_rev_compl(const string& str, unsigned int starts_at_pos, unsigned int tuple_size);
+unsigned int color_space_tuple_index(const string& str, int starts_at_pos, int tuple_size);
+int color_space_tuple_index_rev(const string& str, int starts_at_pos, int tuple_size);
+char capitalize(char c);
+string capitalize(const string& _seq);
+string rev_compl(const string& str);
+char complement(char c);
+string invert(const string& str);
+
+namespace Error{
+  struct Bad_ag_encode{
+    char first_char, second_char;
+    Bad_ag_encode(char _first, char _second){
+      first_char = _first; second_char = _second;
+    }
+  };
+  struct Bad_nucleotide{
+    char c;
+    Bad_nucleotide(char _c){ c=_c;}
+  };
+  struct Bad_fasta_letter{
+    char c;
+    Bad_fasta_letter(char _c){c=_c;}
+  };
+  struct Failed_to_compute_seq_tuple_index{
+    string bad_tuple;
+    Failed_to_compute_seq_tuple_index(const string& _str){
+      bad_tuple = _str;
+    }
+  };
+}
+
+#endif
diff --git a/htswanalysis/bin/QuEST/misc/utils.cpp b/htswanalysis/bin/QuEST/misc/utils.cpp
new file mode 100755 (executable)
index 0000000..459b26f
--- /dev/null
@@ -0,0 +1,102 @@
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+#include <assert.h>
+
+using std::vector;
+using std::string;
+using std::cerr;
+using std::endl;
+
+//#include "utils.h"
+/*
+std::vector<string> split(const string& inp_string, const string& sep){
+  std::vector<string> res;
+  if(inp_string.length() == 0)  return res;
+  if(inp_string.length() < sep.length()){
+    res.push_back(inp_string);
+    return res;
+  }
+  unsigned int last_i = 0;
+  for(unsigned int i=0; i<inp_string.length()-sep.length(); i++){
+    //std::cout<<"comp: "<<inp_string.substr(i,sep.length())<<" to "<<sep<<std::endl;
+    if(inp_string.substr(i,sep.length()) == sep){
+      if(i>last_i+1){ 
+       res.push_back(inp_string.substr(last_i, i-last_i));
+      }
+       last_i = i+1;
+       //}
+    }
+  }
+  if(last_i < inp_string.length()-1) 
+    res.push_back(inp_string.substr(last_i, inp_string.length()-last_i));
+
+  if(res.size() == 0 && inp_string[0] != sep[0]) res.push_back(inp_string);
+  return res;
+}
+*/
+std::vector<string> split(const string& inp_string, char sep){
+  std::vector<string> res;
+  if(inp_string.length() == 0)  return res;
+
+  unsigned int last_i = 0;
+  for(unsigned int i=0; i<inp_string.length()/*-1*/; i++){
+    //if(inp_string.substr(i,sep.length()) == sep){
+    if(inp_string[i] == sep){
+      if(i>last_i){ 
+       res.push_back(inp_string.substr(last_i, i-last_i));
+      }
+      last_i = i+1;
+      //}
+    }
+  }
+  if(last_i < inp_string.length()) 
+    res.push_back(inp_string.substr(last_i, inp_string.length()-last_i));
+
+  if(res.size() == 0 && inp_string[0] != sep) res.push_back(inp_string);
+  return res;
+}
+
+unsigned int get_string_count(std::ifstream& ifs){
+  using std::ios;
+
+  assert(ifs.good());
+  std::ios::pos_type cur_pos = ifs.tellg();
+  ifs.seekg(0, ios::beg);
+
+  string dummy_string;
+  unsigned int string_counter = 0;
+  while(ifs.good()){
+    getline(ifs,dummy_string);
+    if(ifs.good()) string_counter++;    
+  }
+
+  ifs.clear();
+  ifs.seekg(cur_pos);
+  return string_counter;
+}
+
+void chomp(string& str){
+  //erases bad characters
+  bool continue_chomp = true;
+
+  while(continue_chomp){
+    switch(str[str.length()-1]){
+    case char(13):{ str.erase(str.length()-1,1); break;}
+    default:{ continue_chomp = false; break; }
+    }
+  }  
+}
+
+
+std::ios::pos_type perc_file_length(std::ifstream& ifs){
+  //assert(ifs.good());
+  if(!ifs.good()){ cerr<<"bad file in perc_file_length"<<endl; std::terminate();}
+  std::ios::pos_type cur_pos = ifs.tellg();
+  ifs.seekg(0, std::ios::end);
+  std::ios::pos_type _length = ifs.tellg();
+  ifs.seekg(cur_pos);
+  return (_length - _length%100) / 100;
+}
diff --git a/htswanalysis/bin/QuEST/misc/utils.h b/htswanalysis/bin/QuEST/misc/utils.h
new file mode 100755 (executable)
index 0000000..2c01528
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+std::vector<std::string> split(const std::string& inp_string, const std::string& sep);
+std::vector<std::string> split(const std::string& inp_string, char sep);
+
+void chomp(std::string& str);
+std::ios::pos_type perc_file_length(std::ifstream& ifs);
+unsigned int get_string_count(std::ifstream& ifs);
+#endif
diff --git a/htswanalysis/bin/QuEST/peak_caller b/htswanalysis/bin/QuEST/peak_caller
new file mode 100755 (executable)
index 0000000..98174d8
Binary files /dev/null and b/htswanalysis/bin/QuEST/peak_caller differ
diff --git a/htswanalysis/bin/QuEST/quick_window_scan b/htswanalysis/bin/QuEST/quick_window_scan
new file mode 100755 (executable)
index 0000000..7e0dc68
Binary files /dev/null and b/htswanalysis/bin/QuEST/quick_window_scan differ
diff --git a/htswanalysis/bin/QuEST/run_QuEST_with_param_file.pl b/htswanalysis/bin/QuEST/run_QuEST_with_param_file.pl
new file mode 100755 (executable)
index 0000000..7ba8a64
--- /dev/null
@@ -0,0 +1,535 @@
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like. 
+## However, any redistribution of this software should
+## indicate that this code was derived from the QuEST
+## analytical software and a citation:
+
+## the citation goes here
+
+## Any publication that uses results from this code's
+## deriviative should cite this paper.
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## the citation goes here
+
+
+## This program is a wrapper that all modules of QuEST software pipeline
+## on the entire genome one contig at a time
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $usage = qq{
+    run_QuEST_with_param_file.pl
+       
+    This program is a wrapper that runs QuEST analytical pipeline
+    
+    -----------------------
+    mandatory parameters:
+    -p <params_file>               a file containing QuEST batch parameters    
+
+    -----------------------
+    optional parameters:
+
+    (Use directives below to override execution directives in QuEST batch parameter file)
+
+    -quick_window_scan                 supply to run quick window scan
+    -calibrate_peak_shift              supply to estimate peak shift
+    -generate_profile_ChIP             supply to generate ChIP profile
+    -generate_profile_background       supply to generate background profile
+    -generate_profile_pseudo_ChIP      supply to generate RX_noIP profile    
+    -peak_call_ChIP                    call peaks in ChIP data
+    -peak_call_pseudo_CHIP             call peaks in pseudo_ChIP_data 
+
+    -h                                  to display this help
+
+};
+
+if(scalar(@ARGV) == 0){
+    print $usage;
+    exit(0);
+}
+
+## mandatory argmuments
+my $params_fname = "";
+## /mandatory arguments
+
+## setting optional arguments
+use Cwd qw(realpath);
+my $fullpath = realpath($0);
+my @fullpath_fields = split(/\//,$fullpath);
+my $exec_path = "";
+
+for(my $i=0; $i<scalar(@fullpath_fields)-1; $i++){
+    if($fullpath_fields[$i] ne ""){
+       $exec_path = $exec_path."/".$fullpath_fields[$i];
+    }
+}
+
+
+my $override_QuEST_directives = "false";
+
+my $override_quick_window_scan = "false";
+my $override_calibrate_peak_shift = "false";
+my $override_generate_profile_ChIP = "false";
+my $override_generate_profile_background = "false";
+my $override_generate_profile_pseudo_ChIP = "false";
+my $override_peak_call_ChIP = "false";
+my $override_peak_call_pseudo_ChIP = "false";
+## /setting optional arguments
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+    my $this_arg = shift @ARGV;    
+    if ( $this_arg eq '-h') {die "$usage\n";}
+    elsif ( $this_arg eq '-p') {$params_fname = shift @ARGV;}
+    elsif ( $this_arg eq '-quick_window_scan') {$override_quick_window_scan = "true";}
+    elsif ( $this_arg eq '-calibrate_peak_shift') {$override_calibrate_peak_shift = "true";}
+    elsif ( $this_arg eq '-generate_profile_ChIP') {$override_generate_profile_ChIP = "true";}
+    elsif ( $this_arg eq '-generate_profile_background') {$override_generate_profile_background = "true";}
+    elsif ( $this_arg eq '-generate_profile_pseudo_ChIP') {$override_generate_profile_pseudo_ChIP = "true";}
+    elsif ( $this_arg eq '-peak_call_ChIP') {$override_peak_call_ChIP = "true";}
+    elsif ( $this_arg eq '-peak_call_pseudo_ChIP') {$override_peak_call_pseudo_ChIP = "true";}
+#    elsif ( $this_arg eq '-e') {$exec_path = shift @ARGV;}
+    else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+my $die_pars_bad = "false";       ## die if parameters are bad
+my $bad_par = "";                 ## store bad parameter here
+
+print "\n=====================\n\n";
+print "mandatory parameters: \n\n";
+print "params_file:      $params_fname\n";
+print "exec_path:        $exec_path\n";
+if( $params_fname eq ''){ $die_pars_bad = "true"; $bad_par." "."$params_fname"; }
+if( $exec_path eq ''){ $die_pars_bad = "true"; $bad_par." "."$exec_path"; }
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+    print "$usage";
+    print "Bad parameters: $bad_par\n";
+    exit(0);
+}
+## /reading arguments
+
+if($override_quick_window_scan eq "true" ||
+   $override_calibrate_peak_shift eq "true" ||
+   $override_generate_profile_ChIP eq "true" ||
+   $override_generate_profile_background eq "true" ||
+   $override_generate_profile_pseudo_ChIP eq "true" ||
+   $override_peak_call_ChIP eq "true" ||
+   $override_peak_call_pseudo_ChIP eq "true"){
+    $override_QuEST_directives = "true";
+}
+
+## top-level script
+if( ! -e $params_fname){
+    print "Failed to open $params_fname. Aborting.\n";
+    exit(1);
+}
+open params_file, "< $params_fname" || die "$params_fname: $!\n";
+my @params = <params_file>;
+close params_file;
+
+my $analysis_with_FDR = "";
+my $quick_window_scan_param_fname = "";
+my $calibrate_peak_shift_param_fname = "";
+
+my $ChIP_generate_profile_param_fname = "";
+my $background_generate_profile_param_fname = "";
+my $pseudo_ChIP_generate_profile_param_fname = "";
+
+my $ChIP_peak_caller_param_fname = "";
+my $pseudo_ChIP_peak_caller_param_fname = "";
+
+my $quick_window_scan_ChIP = "false";
+my $calibrate_peak_shift_ChIP = "false";
+my $generate_profile_ChIP = "false";
+my $generate_profile_background = "false";
+my $generate_profile_pseudo_ChIP = "false";
+my $peak_caller_ChIP_vs_background = "false";
+my $peak_caller_pseudo_ChIP_vs_background = "false";
+
+my $log_fname = "";
+my $output_fname = "";
+
+
+for(my $i=0; $i<scalar(@params); $i++){
+    my $cur_param = $params[$i];
+    chomp($cur_param);
+    
+    if( length($cur_param) > 0 ){
+       if( substr($cur_param,0,1) ne "#" ){
+           my @cur_par_fields = split(/ /, $cur_param);
+           
+           if(scalar(@cur_par_fields >= 2)){
+               my $cur_par_name = $cur_par_fields[0];       
+               if ($cur_par_name eq "analysis_with_FDR"){
+                   $analysis_with_FDR = $cur_par_fields[2];    
+               }
+               elsif( $cur_par_name eq "quick_window_scan:" ){
+                   if( $cur_par_fields[1] eq "ChIP" ){
+                       $quick_window_scan_ChIP = "true";
+                   }
+               }
+               elsif( $cur_par_name eq "calibrate_peak_shift:" ){
+                   if( $cur_par_fields[1] eq "ChIP" ){
+                       $calibrate_peak_shift_ChIP = "true";
+                   }
+               }
+               elsif( $cur_par_name eq "generate_profile:"){
+                   if( $cur_par_fields[1] eq "ChIP" ){
+                       $generate_profile_ChIP = "true";
+                   }
+                   elsif( $cur_par_fields[1] eq "background" ){
+                       $generate_profile_background = "true";
+                   }
+                   elsif( $cur_par_fields[1] eq "pseudo_ChIP" ){
+                       $generate_profile_pseudo_ChIP = "true";
+                   }
+               }
+               elsif( $cur_par_name  eq "peak_caller:" ){
+                   if( scalar(@cur_par_fields) == 4 ){
+                       if( $cur_par_fields[1] eq "ChIP" &&
+                           $cur_par_fields[2] eq "vs" &&
+                           $cur_par_fields[3] eq "background"){
+                           $peak_caller_ChIP_vs_background = "true";
+                       }
+                       elsif( $cur_par_fields[1] eq "pseudo_ChIP" &&
+                           $cur_par_fields[2] eq "vs" &&
+                           $cur_par_fields[3] eq "background"){
+                           $peak_caller_pseudo_ChIP_vs_background = "true";
+                       }
+                   }
+               }
+               
+               elsif ($cur_par_name eq "quick_window_scan_param_file"){
+                   $quick_window_scan_param_fname = $cur_par_fields[2];        
+               }
+               elsif ($cur_par_name eq "calibrate_peak_shift_param_file"){
+                   $calibrate_peak_shift_param_fname = $cur_par_fields[2];     
+               }
+               elsif ($cur_par_name eq "ChIP_generate_profile_param_file"){
+                   $ChIP_generate_profile_param_fname = $cur_par_fields[2];    
+               }
+               elsif ($cur_par_name eq "background_generate_profile_param_file"){
+                   $background_generate_profile_param_fname = $cur_par_fields[2];      
+               }
+               elsif ($cur_par_name eq "pseudo_ChIP_generate_profile_param_file"){
+                   $pseudo_ChIP_generate_profile_param_fname = $cur_par_fields[2];     
+               }
+               elsif ($cur_par_name eq "ChIP_peak_caller_param_file"){
+                   $ChIP_peak_caller_param_fname = $cur_par_fields[2]; 
+               }
+               elsif ($cur_par_name eq "pseudo_ChIP_peak_caller_param_file"){
+                   $pseudo_ChIP_peak_caller_param_fname = $cur_par_fields[2];  
+               }
+               elsif ($cur_par_name eq "log_file"){
+                   $log_fname = $cur_par_fields[2];    
+               }
+               elsif ($cur_par_name eq "output_file"){
+                   $output_fname = $cur_par_fields[2]; 
+               }
+               else{
+                   if($params[$i] ne ""){
+                       print "Warning: unrecognized parameter: $cur_par_name";
+                   }
+               }
+           }
+       }
+    }
+}
+
+if($override_QuEST_directives eq "true"){
+    if($override_quick_window_scan eq "true" ){
+       $quick_window_scan_ChIP = "true";
+    }
+    else{ $quick_window_scan_ChIP = "false"; }
+
+    if($override_calibrate_peak_shift eq "true" ){
+       $calibrate_peak_shift_ChIP = "true";
+    }
+    else{ $calibrate_peak_shift_ChIP = "false"; }
+
+    if($override_generate_profile_ChIP eq "true" ){
+       $generate_profile_ChIP = "true";
+    }
+    else{ $generate_profile_ChIP = "false"; }
+
+    if($override_generate_profile_background eq "true" ){
+       $generate_profile_background = "true";
+    }
+    else{ $generate_profile_background = "false"; }
+
+    if( $override_generate_profile_pseudo_ChIP eq "true" ){
+       $generate_profile_pseudo_ChIP = "true";
+    }
+    else{ $generate_profile_pseudo_ChIP = "false"; }
+
+    if( $override_peak_call_ChIP eq "true" ){
+       $peak_caller_ChIP_vs_background = "true";
+    }
+    else{ $peak_caller_ChIP_vs_background = "false"; }
+    
+    if( $override_peak_call_pseudo_ChIP eq "true" ){
+       $peak_caller_pseudo_ChIP_vs_background = "true";
+       $analysis_with_FDR = "true";
+    }
+    else{ $peak_caller_pseudo_ChIP_vs_background = "false"; }
+}
+
+print "override_QuEST_directives:               $override_QuEST_directives\n";
+print "quick_window_scan_ChIP:                  $quick_window_scan_ChIP\n";
+print "calibrate_peak_shift_ChIP:               $calibrate_peak_shift_ChIP\n";
+print "generate_profile_ChIP:                   $generate_profile_ChIP\n";
+print "generate_profile_pseudo_ChIP:            $generate_profile_pseudo_ChIP\n";
+print "generate_profile_background:             $generate_profile_background\n";
+print "peak_caller_ChIP_vs_background:          $peak_caller_ChIP_vs_background\n";
+print "peak_caller_pseudo_ChIP_vs_background:   $peak_caller_pseudo_ChIP_vs_background\n";
+
+
+
+print "Read the following parameters: \n\n";
+
+print "analysis_with_FDR:                        $analysis_with_FDR\n";
+print "\n";
+print "quick_window_scan_param_file:               $quick_window_scan_param_fname\n";
+print "calibrate_peak_shift_param_file:          $calibrate_peak_shift_param_fname\n";
+print "\n";
+print "ChIP_generate_profile_param_file:         $ChIP_generate_profile_param_fname\n";
+print "background_generate_profile_param_file:   $background_generate_profile_param_fname\n";
+if($analysis_with_FDR eq "yes"){
+    print "pseudo_ChIP_generate_profile_param_file:  $pseudo_ChIP_generate_profile_param_fname\n";
+}
+
+print "\n";
+print "ChIP_peak_caller_param_file:              $ChIP_peak_caller_param_fname\n";
+if($analysis_with_FDR eq "yes"){
+    print "pseudo_ChIP_peak_caller_param_file:       $pseudo_ChIP_peak_caller_param_fname\n";
+}
+
+print "\n";
+print "log_file:                                 $log_fname\n";
+print "output_file:                              $output_fname\n";
+
+## QuEST execution
+
+## some necessary checks/setups
+if(-e $log_fname){
+    unlink($log_fname);   
+}
+if(-e $output_fname){
+    unlink($output_fname);
+}
+## some necessary checks/setups
+
+my $quick_window_scan_batcher = $exec_path . "/run_quick_window_scan_with_param_file.pl";
+my $calibrate_peak_shift_batcher = $exec_path . "/run_calibrate_peak_shift_with_param_file.pl";
+my $generate_profile_batcher = $exec_path . "/run_generate_profile_with_param_file.pl";
+my $peak_caller_batcher = $exec_path . "/run_peak_caller_with_param_file.pl";
+
+if( $quick_window_scan_ChIP eq "true"){
+    my $error_code = 
+       system("$quick_window_scan_batcher -p $quick_window_scan_param_fname\n");
+    if($error_code != 0){
+       print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+       exit(4);
+    }
+}
+if( $calibrate_peak_shift_ChIP eq "true" ){
+    my $error_code = 
+       system("$calibrate_peak_shift_batcher -p $calibrate_peak_shift_param_fname\n");
+    if($error_code != 0){
+       print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+       exit(4);
+    }
+}
+if( $generate_profile_ChIP eq "true" ){
+    my $error_code = 
+       system("$generate_profile_batcher -p $ChIP_generate_profile_param_fname\n");
+    if($error_code != 0){
+       print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+       exit(4);
+    }
+}
+if( $generate_profile_background eq "true" ){
+    my $error_code = 
+       system("$generate_profile_batcher -p $background_generate_profile_param_fname\n");
+    if($error_code != 0){
+       print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+       exit(4);
+    }
+}
+if( $analysis_with_FDR eq "true" && $generate_profile_pseudo_ChIP eq "true"){
+    my $error_code = 
+       system("$generate_profile_batcher -p $pseudo_ChIP_generate_profile_param_fname\n");    
+    if($error_code != 0){
+       print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+       exit(4);
+    }
+}
+
+if( $peak_caller_ChIP_vs_background eq "true" ){
+    my $error_code = 
+       system("$peak_caller_batcher -p $ChIP_peak_caller_param_fname\n");
+    if($error_code != 0){
+       print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+       exit(4);
+    }
+}
+
+if( $analysis_with_FDR eq "true" && $peak_caller_pseudo_ChIP_vs_background eq "true" ){
+    my $error_code = 
+       system("$peak_caller_batcher -p $pseudo_ChIP_peak_caller_param_fname\n");
+    if($error_code != 0){
+       print "run_QuEST_with_param_file.pl: Caught an error message (code $error_code) when running generate_ChIP_profile. Aborting.\n";
+       exit(4);
+    }
+}
+
+
+
+open log_file, "> $log_fname" || die "$log_fname: $\n";
+open output_file, "> $output_fname" || die "$output_fname: $\n";
+
+# creating QuEST summaries
+
+my $ChIP_peak_calls_fname = "";
+my $pseudo_ChIP_peak_calls_fname = "";
+
+if(-e $ChIP_peak_caller_param_fname){
+    open ChIP_peak_caller_param_file, "< $ChIP_peak_caller_param_fname";
+    
+    while( <ChIP_peak_caller_param_file> ){
+       chomp;
+       my @cur_fields = split(/ /, $_);
+       if(scalar(@cur_fields) > 0){
+           if($cur_fields[0] eq "output_file"){
+               $ChIP_peak_calls_fname = $cur_fields[2];
+           }
+       }
+    }    
+    close ChIP_peak_caller_param_file;
+}
+else{
+    print "Error in run_QuEST_with_param_file.pl: Failed to locate ChIP_peak_caller_param_fname: $ChIP_peak_caller_param_fname";
+    print log_file"Error in run_QuEST_with_param_file.pl: Failed to locate ChIP_peak_caller_param_fname: $ChIP_peak_caller_param_fname";
+}
+
+
+if( $analysis_with_FDR eq "true"){
+    if(-e $pseudo_ChIP_peak_caller_param_fname){
+       open pseudo_ChIP_peak_caller_param_file, "< $pseudo_ChIP_peak_caller_param_fname";
+       
+       while( <pseudo_ChIP_peak_caller_param_file> ){
+           chomp;
+           my @cur_fields = split(/ /, $_);
+           if(scalar(@cur_fields) > 0){
+               if($cur_fields[0] eq "output_file"){
+                   $pseudo_ChIP_peak_calls_fname = $cur_fields[2];
+               }
+           }
+       }    
+       close pseudo_ChIP_peak_caller_param_file;
+    }
+    else{
+       print "Error in run_QuEST_with_param_file.pl: Failed to locate pseudo_ChIP_peak_caller_param_fname: $pseudo_ChIP_peak_caller_param_fname";
+       print log_file "Error in run_QuEST_with_param_file.pl: Failed to locate pseudo_ChIP_peak_caller_param_fname: $pseudo_ChIP_peak_caller_param_fname";
+    }
+}
+
+# counting ChIP peaks
+my $ChIP_peaks = -1;
+if( $peak_caller_ChIP_vs_background eq "true" ){
+    $ChIP_peaks = 0;
+    if(-e $ChIP_peak_calls_fname){
+       open ChIP_peak_calls_file, "< $ChIP_peak_calls_fname";
+       
+       while( <ChIP_peak_calls_file> ){
+           chomp;
+           my @cur_fields = split(/ /,$_);
+           if(scalar(@cur_fields) >= 4){
+               if($cur_fields[3] eq "locmax:"){
+                   $ChIP_peaks++;
+               }
+           }
+       }
+       close ChIP_peak_calls_file;
+    }
+    else{
+       print "Failed to locate ChIP_peak_calls_file: $ChIP_peak_calls_fname\n";
+       print log_file "Failed to locate ChIP_peak_calls_file: $ChIP_peak_calls_fname\n";
+    }
+}
+# /counting ChIP peaks
+
+# counting pseudo_ChIP peaks
+my $pseudo_ChIP_peaks = -1;
+if( $peak_caller_pseudo_ChIP_vs_background eq "true" ){
+    if( $analysis_with_FDR eq "true" ){
+       $pseudo_ChIP_peaks = 0;
+       if(-e $pseudo_ChIP_peak_calls_fname){
+           open pseudo_ChIP_peak_calls_file, "< $pseudo_ChIP_peak_calls_fname";
+           
+           while( <pseudo_ChIP_peak_calls_file> ){
+               chomp;
+               my @cur_fields = split(/ /,$_);
+               if(scalar(@cur_fields) >= 4){
+                   if($cur_fields[3] eq "locmax:"){
+                       $pseudo_ChIP_peaks++;
+                   }
+               }
+           }
+           close pseudo_ChIP_peak_calls_file;
+       }
+       else{
+           print "Failed to locate pseudo_ChIP_peak_calls_file: $pseudo_ChIP_peak_calls_fname\n";
+           print log_file "Failed to locate pseudo_ChIP_peak_calls_file: $pseudo_ChIP_peak_calls_fname\n";
+       }
+    }
+}
+# /counting pseudo_ChIP peaks
+
+# /creating QuEST summaries
+
+print output_file "\#\#   please cite: \n";
+print output_file "\#\#   Valouev A, Johnson DS, Sundquist A, Medina C, Anton E, Batzoglou S,\n";
+print output_file "\#\#   Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide\n";
+print output_file "\#\#   Identification of Transcription Factor Binding Sites Based on ChIP-Seq\n";
+print output_file "\#\#   Data. under review.\n\n";
+
+if($ChIP_peaks >= 0){
+    print output_file "ChIP peaks: $ChIP_peaks\n";
+}
+else{
+    print output_file "ChIP peaks: not available\n";
+}
+if( $analysis_with_FDR eq "true" ){
+    if($pseudo_ChIP_peaks >= 0){
+       print output_file "pseudo_ChIP peaks: $pseudo_ChIP_peaks\n";
+       if($ChIP_peaks != 0){
+           printf output_file "FDR estimate: %.2f percent\n", 100 * $pseudo_ChIP_peaks / $ChIP_peaks;
+       }
+       else{
+           print output_file "FDR estimate: NA\n";
+       }
+    }
+    else{
+       print "FDR estimate: not available\n";
+    }
+}
+
+close output_file;
+close log_file;
+
+## /QuEST execution
+
diff --git a/htswanalysis/bin/QuEST/run_calibrate_peak_shift_with_param_file.pl b/htswanalysis/bin/QuEST/run_calibrate_peak_shift_with_param_file.pl
new file mode 100755 (executable)
index 0000000..0d443b1
--- /dev/null
@@ -0,0 +1,359 @@
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like. 
+
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## the citation goes here
+
+## This program is a wrapper that runs calibrate_peak_shift
+## on the entire genome one contig at a time
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $usage = qq{
+    run_calibrate_peak_shift_with_param_file.pl
+       
+    This program is a wrapper that runs the calibrate_peak_shift
+    
+    -----------------------
+    mandatory parameters:
+    -p <params_file>               a file containing parameters calibrate_peak_shift
+
+    -----------------------
+    optional parameters:
+    -e <exec_path>                 path to the calibrate_peak_shift executable
+    -h                             to display this help
+
+};
+
+if(scalar(@ARGV) == 0){
+    print $usage;
+    exit(0);
+}
+
+## mandatory argmuments
+my $params_fname = "";
+## /mandatory arguments
+
+## setting optional arguments
+use Cwd qw(realpath);
+my $fullpath = realpath($0);
+my @fullpath_fields = split(/\//,$fullpath);
+my $exec_path = "";
+
+for(my $i=0; $i<scalar(@fullpath_fields)-1; $i++){
+    if($fullpath_fields[$i] ne ""){
+       $exec_path = $exec_path."/".$fullpath_fields[$i];
+    }
+}
+
+$exec_path = $exec_path."/calibrate_peak_shift";
+## /setting optional arguments
+
+if( ! -e $exec_path ){
+    print "Error in calibrate peak shift master script.\n";
+    print "Couldn't locate executable $exec_path. Aborting.\n";
+    exit(0);
+}
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+    my $this_arg = shift @ARGV;    
+    if ( $this_arg eq '-h') {die "$usage\n";}
+    elsif ( $this_arg eq '-p') {$params_fname = shift @ARGV;}
+    elsif ( $this_arg eq '-e') {$exec_path = shift @ARGV;}
+    else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+my $die_pars_bad = "false";       ## die if parameters are bad
+my $bad_par = "";                 ## store bad parameter here
+
+print "\n=====================\n\n";
+print "mandatory parameters: \n\n";
+print "params_file:      $params_fname\n";
+print "exec_path:        $exec_path\n";
+if( $params_fname eq ''){ $die_pars_bad = "true"; $bad_par." "."$params_fname"; }
+if( $exec_path eq ''){ $die_pars_bad = "true"; $bad_par." "."$exec_path"; }
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+    print "$usage";
+    print "Bad parameters: $bad_par\n";
+    exit(0);
+}
+## /reading arguments
+
+## top-level script
+open params_file, "< $params_fname" || die "$params_fname: $\n";
+my @params = <params_file>;
+close params_file;
+
+my $silent_flag = "off";
+my $quick_scan_file =                    "** missing **";
+my $QuEST_align_chip_file =              "** missing **";
+my $QuEST_align_background_file =        "** missing **";
+my $output_fname =                        "** missing **";
+
+my $kde_bandwidth = "";                     # optional
+my $scan_window = "";                       # optional
+my $quick_scan_tag_count_threshold = "";    # optional
+#my $aligned_part_length = "";               # optional
+my $top_count = "";                         # optional
+my $chip_to_background_fold = "";           # optional
+my $best_to_second_best_peak_ratio = "";    # optional
+my $chip_peak_threshold = "";               # optional
+my $chip_for_rev_peak_max_min_ratio = "";   # optional
+my $estimation_method = "";
+
+my $verbose = "yes";
+
+for(my $i=0; $i<scalar(@params); $i++){
+    my $cur_param = $params[$i];
+    chomp($cur_param);
+    
+    my @cur_par_fields = split(/ /, $cur_param);
+    if(scalar(@cur_par_fields >= 2)){
+       my $cur_par_name = $cur_par_fields[0];  
+       if ($cur_par_name eq "quick_scan_file"){
+           $quick_scan_file = $cur_par_fields[2];      
+       }
+       elsif($cur_par_name eq "align_chip_file"){
+           $QuEST_align_chip_file = $cur_par_fields[2];        
+       }
+       elsif($cur_par_name eq "align_background_file"){
+           $QuEST_align_background_file = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "output_file"){
+           $output_fname = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "kde_bandwidth"){
+           $kde_bandwidth = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "scan_window"){
+           $scan_window = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "quick_scan_tag_count_threshold"){
+           $quick_scan_tag_count_threshold = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "top_count"){
+           $top_count = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "chip_to_background_fold"){
+           $chip_to_background_fold = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "best_to_second_best_peak_ratio"){
+           $best_to_second_best_peak_ratio = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "chip_peak_threshold"){
+           $chip_peak_threshold = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "chip_for_rev_peak_max_min_ratio"){
+           $chip_for_rev_peak_max_min_ratio = $cur_par_fields[2];
+       }
+       elsif ($cur_par_name eq "method"){
+           $estimation_method = $cur_par_fields[2];    
+       }
+       elsif ($cur_par_name eq "verbose"){
+           $verbose = $cur_par_fields[2];      
+       }
+       else{
+           if($params[$i] ne ""){
+               print "Warning: unrecognized parameter: $cur_par_name";
+           }
+       }
+    }
+}
+
+print "Read the following parameters: \n\n";
+
+print "quick_scan_file:                   $quick_scan_file\n";
+print "QuEST_align_chip_file:             $QuEST_align_chip_file\n";
+print "QuEST_align_background_file:       $QuEST_align_background_file\n";
+print "output_file:                       $output_fname\n";
+print "kde_bandwidth:                     $kde_bandwidth\n";                      # optional
+print "scan_window:                       $scan_window\n";                        # optional
+print "quick_scan_tag_count_threshold:    $quick_scan_tag_count_threshold\n";     # optional
+print "top_count:                         $top_count\n";                          # optional
+print "chip_to_background_fold:           $chip_to_background_fold\n";            # optional
+print "best_to_second_best_peak_ratio:    $best_to_second_best_peak_ratio\n";     # optional
+print "chip_peak_threshold:               $chip_peak_threshold\n";                # optional
+print "chip_for_rev_peak_max_min_ratio:   $chip_for_rev_peak_max_min_ratio\n";    # optional
+print "verbose:                           $verbose\n";
+print "\n";
+
+my $peak_shift_recommended_max = 100;
+my $peak_shift_recommended_value = 50;
+my $peak_shift_recommended_min = 35;
+my $recommended_used_regions = 10;
+my $recommended_used_fraction = 0.33;
+
+my $output_fname_tmp = $output_fname . ".tmp";
+
+my $optional_param_string = "";
+if($kde_bandwidth ne ""){
+    $optional_param_string = $optional_param_string . " kde_bandwidth=$kde_bandwidth";
+}
+if($scan_window ne ""){
+    $optional_param_string = $optional_param_string . " scan_window=$scan_window";
+}
+if($quick_scan_tag_count_threshold ne ""){
+    $optional_param_string = $optional_param_string . " quick_scan_tag_count_threshold=$quick_scan_tag_count_threshold";
+}
+#if($aligned_part_length ne ""){
+#    $optional_param_string = $optional_param_string . " aligned_part_length=$aligned_part_length";
+#}
+if($top_count ne ""){
+    $optional_param_string = $optional_param_string . " top_count=$top_count";
+}
+if($chip_to_background_fold ne ""){
+    $optional_param_string = $optional_param_string . " chip_to_background_fold=$chip_to_background_fold";
+}
+if($best_to_second_best_peak_ratio ne ""){
+    $optional_param_string = $optional_param_string . " best_to_second_best_peak_ratio=$best_to_second_best_peak_ratio";
+}
+if($chip_peak_threshold ne ""){
+    $optional_param_string = $optional_param_string . " chip_peak_threshold=$chip_peak_threshold";
+}
+if($chip_for_rev_peak_max_min_ratio ne ""){
+    $optional_param_string = $optional_param_string . " chip_for_rev_peak_max_min_ratio=$chip_for_rev_peak_max_min_ratio";
+}
+if($estimation_method ne ""){
+    $optional_param_string = $optional_param_string . " method=$estimation_method";
+}
+
+my $mandatory_param_string = "quick_scan_file=$quick_scan_file align_chip_file=$QuEST_align_chip_file align_background_file=$QuEST_align_background_file output_file=$output_fname_tmp";
+
+my $system_command = $exec_path . " " . $mandatory_param_string . $optional_param_string;
+
+#print "$system_command\n";
+my $error_code = system("$system_command\n");
+
+if($error_code != 0){
+    print "run_calibrate_peak_shift_with_param_file.pl: Caught an error message (code $error_code), passing the error message to the top script.\n";
+    exit(2);
+}
+
+if(!-e $output_fname_tmp){
+    print "Error in run_calibrate_peak_shift_with_param_file.pl: Failed to locate $output_fname_tmp. Aborting.\n";
+    exit(1);
+}
+
+open output_file_tmp, "< $output_fname_tmp";
+
+my $regions = -1;
+my $used_regions = -1;
+my $peak_shift_estimate = -1;
+my $estimation_method_detected = "";
+
+while( <output_file_tmp> ){
+    chomp;
+    if(length($_) > 0){
+       if( substr($_,0,1) ne "#" ){
+           my @cur_entry_fields = split(/ /, $_);
+           if( scalar(@cur_entry_fields) >= 2){
+               if($cur_entry_fields[0] eq "regions:"){
+                   $regions = $cur_entry_fields[1];
+               }
+               elsif($cur_entry_fields[0] eq "used:"){
+                   $used_regions = $cur_entry_fields[1];
+               }
+               elsif($cur_entry_fields[0] eq "peak_shift_estimate:"){
+                   $peak_shift_estimate = $cur_entry_fields[1];
+               }
+               elsif($cur_entry_fields[0] eq "estimation_method:"){
+                   $estimation_method_detected = $cur_entry_fields[1];
+               }
+           }
+       }
+    }
+}
+
+close output_file_tmp;
+
+if( $verbose eq "yes" ){
+    my $suggest_override = "false";
+    if( $used_regions <= $recommended_used_fraction * $regions ){
+       printf "\nQuEST has determined that the fraction of used regions ( %.2f ) for peak\n", 
+       $used_regions / $regions;
+       print "estimate is low. It does not mean that the estimate is bad, but\n";
+       print "the peak shift estimate may not be very accurate.\n";
+       $suggest_override = "true";
+    }
+    if( $used_regions <= $recommended_used_regions ){
+       print "\nQuEST has determined that the number of used regions ( $used_regions ) for peak\n";
+       print "estimate is low. It does not mean that the estimate is bad, but\n";
+       print "the peak shift estimate may not be very accurate.\n";
+       $suggest_override = "true";
+    }
+    if( $peak_shift_estimate < $peak_shift_recommended_min ){
+       printf "\nQuEST has determined that the peak shift estimate ( %.0f )\n", $peak_shift_estimate;
+       print "is too low. This may be normal in some cases, but also may be indicative\n";
+       print "of read mapping artifacts into repetetive regions. You might consider\n";
+       print "revising the list of candidate regions to exclude the ones falling into the\n";
+       print "regions annotated as repeats or try using median-based estimate for the peak shift\n";
+       print "to achieve a more robust estimate.\n";
+       $suggest_override = "true";
+    }
+       
+    if( $suggest_override eq "true" ){
+       my $cur_answer = "";
+       while($cur_answer ne "y" && $cur_answer ne "n" ){
+           print "Do you want to override the value of peak shift? (y/n): ";
+           $cur_answer = <STDIN>;          
+           chomp($cur_answer);
+       }
+       if( $cur_answer eq "y"){
+           $estimation_method_detected = "user_input";
+           print "Please enter the value of peak shift (recommended $peak_shift_recommended_value): ";
+           $cur_answer = <STDIN>;
+           chomp($cur_answer);
+           $peak_shift_estimate = $cur_answer;
+       }       
+    }
+
+    if( $peak_shift_estimate > $peak_shift_recommended_max ){
+       printf "\nQuEST has determined that the peak shift estimate ( %.0f )\n", $peak_shift_estimate;
+       print "is too high. This may be normal in some cases especially \n";
+       print "if the insert size in the library is large, however \n";
+       print "typically values around $peak_shift_recommended_value are expected.\n";
+       print "regions annotated as repeats or try using median-based estimate for the peak shift.\n";
+       my $cur_answer = "";
+       while($cur_answer ne "y" && $cur_answer ne "n" ){
+           print "Do you want to override the value of peak shift? (y/n): ";
+           $cur_answer = <STDIN>;
+           chomp($cur_answer);
+       }
+       if( $cur_answer eq "y"){
+           $estimation_method_detected = "user_input";
+           print "Please enter the value of peak shift (recommended $peak_shift_recommended_value): ";
+           $cur_answer = <STDIN>;
+           chomp($cur_answer);
+           $peak_shift_estimate = $cur_answer;
+       }       
+    }
+}
+
+if(-e $output_fname){
+    unlink($output_fname);
+}
+
+if(-e $output_fname_tmp){
+    unlink($output_fname_tmp);
+}
+
+open output_file, "> $output_fname";
+print output_file "regions: $regions\n";
+print output_file "used: $used_regions\n";
+print output_file "peak_shift_estimate: $peak_shift_estimate\n";
+print output_file "estimation_method: $estimation_method_detected\n";
+close output_file;
diff --git a/htswanalysis/bin/QuEST/run_generate_profile_with_param_file.pl b/htswanalysis/bin/QuEST/run_generate_profile_with_param_file.pl
new file mode 100755 (executable)
index 0000000..c88cbde
--- /dev/null
@@ -0,0 +1,260 @@
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like. 
+
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## the citation goes here
+
+## This program is a wrapper that runs generate_peak_profile program
+## on the entire genome one contig at a time
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $usage = qq{
+    run_generate_peak_profile_with_param_file.pl
+       
+    This program is a wrapper that runs the generate_peak_profile
+    
+    -----------------------
+    mandatory parameters:
+    -p <params_file>               a file containing parameters calibrate_peak_shift
+
+    -----------------------
+    optional parameters:
+    -e <exec_path>                 path to the calibrate_peak_shift executable
+    -h                             to display this help
+
+};
+
+if(scalar(@ARGV) == 0){
+    print $usage;
+    exit(0);
+}
+
+## mandatory argmuments
+my $params_fname = "";
+## /mandatory arguments
+
+## setting optional arguments
+use Cwd qw(realpath);
+my $fullpath = realpath($0);
+my @fullpath_fields = split(/\//,$fullpath);
+my $exec_path = "";
+
+for(my $i=0; $i<scalar(@fullpath_fields)-1; $i++){
+    if($fullpath_fields[$i] ne ""){
+       $exec_path = $exec_path."/".$fullpath_fields[$i];
+    }
+}
+
+$exec_path = $exec_path."/generate_peak_profile";
+
+if( ! -e $exec_path ){
+    print "Error in generate profile batch script:\n";
+    print "Failed to locate executable $exec_path.\n";
+    print "Aborting.\n";
+    exit(0);
+}
+
+## /setting optional arguments
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+    my $this_arg = shift @ARGV;    
+    if ( $this_arg eq '-h') {die "$usage\n";}
+    elsif ( $this_arg eq '-p') {$params_fname = shift @ARGV;}
+    elsif ( $this_arg eq '-e') {$exec_path = shift @ARGV;}
+    else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+my $die_pars_bad = "false";       ## die if parameters are bad
+my $bad_par = "";                 ## store bad parameter here
+
+print "\n=====================\n\n";
+print "mandatory parameters: \n\n";
+print "params_file:      $params_fname\n";
+print "exec_path:        $exec_path\n";
+if( $params_fname eq ''){ $die_pars_bad = "true"; $bad_par." "."$params_fname"; }
+if( $exec_path eq ''){ $die_pars_bad = "true"; $bad_par." "."$exec_path"; }
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+    print "$usage";
+    print "Bad parameters: $bad_par\n";
+    exit(0);
+}
+## /reading arguments
+
+## top-level script
+open params_file, "< $params_fname" || die "$params_fname: $\n";
+my @params = <params_file>;
+close params_file;
+
+my $QuEST_align_file =     "** missing **";
+# my @contigs;
+# my $reference_path =                "** missing **";
+my $genome_table_fname =   "** missing **";
+my $output_score_path =    "** missing **";
+
+my $peak_shift = "";          # optional
+my $peak_shift_source_fname = "";   # optional
+my $kde_bandwidth = "";       # optional
+my $calc_window = "";         # optional
+my $threads = "";             # optional
+#my $aligned_part_length = ""; # optional
+
+for(my $i=0; $i<scalar(@params); $i++){
+    my $cur_param = $params[$i];
+    chomp($cur_param);
+    
+    my @cur_par_fields = split(/ /, $cur_param);
+    if(scalar(@cur_par_fields >= 2)){
+       my $cur_par_name = $cur_par_fields[0];
+       if ($cur_par_name eq "align_file"){
+           $QuEST_align_file = $cur_par_fields[2];     
+       }
+       #elsif($cur_par_name eq "contigs"){
+       #    @contigs = split(/\,/, $cur_par_fields[2]);        
+       #}
+       #elsif($cur_par_name eq "reference_path"){
+       #    $reference_path = $cur_par_fields[2];
+       #}
+       elsif( $cur_par_name eq "genome_table" ){
+           $genome_table_fname= $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "output_score_path"){
+           $output_score_path = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "peak_shift"){
+           $peak_shift = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "peak_shift_source_file"){
+           $peak_shift_source_fname = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "kde_bandwidth"){
+           $kde_bandwidth = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "calc_window"){
+           $calc_window = $cur_par_fields[2];
+       }
+       elsif($cur_par_name eq "threads"){
+           $threads = $cur_par_fields[2];
+       }
+       #elsif($cur_par_name eq "aligned_part_length"){
+       #    $aligned_part_length = $cur_par_fields[2];
+       #}
+       else{
+           if($params[$i] ne ""){
+               print "Warning: unrecognized parameter: $cur_par_name";
+           }
+       }
+    }
+}
+
+if($peak_shift eq "" and $peak_shift_source_fname ne ""){
+    print "Extracting peak shift value from the $peak_shift_source_fname file\n";
+    open peak_shift_source_file, "< $peak_shift_source_fname" || 
+       die "Failed to open $peak_shift_source_fname\n";
+
+    while(<peak_shift_source_file>){
+       chomp;
+       my @cur_fields = split(/ /,$_);
+       if($cur_fields[0] eq "peak_shift_estimate:"){
+           $peak_shift = $cur_fields[1];
+       }
+    }
+}
+
+print "Read the following parameters: \n\n";
+
+print "align_file:                        $QuEST_align_file\n";
+print "genome_table:                      $genome_table_fname\n";
+#print "contigs:                           @contigs\n";
+#print "reference_path:                    $reference_path\n";
+print "output_score_path:                 $output_score_path\n";
+print "peak_shift:                        $peak_shift\n";                         # optional
+print "kde_bandwidth:                     $kde_bandwidth\n";                      # optional
+print "calc_window:                       $calc_window\n";                        # optional
+print "threads:                           $threads\n";                            # optional
+#print "aligned_part_length:               $aligned_part_length\n";                # optional
+
+#my $stall = <STDIN>;
+print "\n";
+
+my $optional_param_string = "";
+if($kde_bandwidth ne ""){
+    $optional_param_string = $optional_param_string . " kde_bandwidth=$kde_bandwidth";
+}
+if($peak_shift ne ""){
+    $optional_param_string = $optional_param_string . " peak_shift=$peak_shift";
+}
+if($threads ne ""){
+    $optional_param_string = $optional_param_string . " threads=$threads";
+}
+#if($aligned_part_length ne ""){
+#    $optional_param_string = $optional_param_string . " aligned_part_length=$aligned_part_length";
+#}
+if($calc_window ne ""){
+    $optional_param_string = $optional_param_string . " calc_window=$calc_window";
+}
+
+
+## reading genome table
+
+if( ! -e $genome_table_fname ){
+    print "Error in generate profile master script: \n";
+    print "Failed to locate genome table $genome_table_fname. Aborting.\n";
+    exit(0);
+}
+
+
+open genome_table_file, "< $genome_table_fname" || die "$genome_table_fname: $\n";
+my @genome_table = <genome_table_file>;
+
+close genome_table_file;
+
+my @contig_names;
+my @contig_sizes;
+my $contig_counter = 0;
+
+for(my $i=0; $i < scalar(@genome_table); $i++){
+    my $cur_contig_entry = $genome_table[$i];
+    chomp($cur_contig_entry);
+    my @cur_contig_entry_fields = split(/ /, $cur_contig_entry);
+    if(scalar(@cur_contig_entry_fields) == 2){
+       $contig_names[$contig_counter] = $cur_contig_entry_fields[0];
+       $contig_sizes[$contig_counter] = $cur_contig_entry_fields[1];
+       $contig_counter++;
+    }
+}
+
+## /reading genome table
+
+
+
+for(my $i=0; $i<scalar(@contig_names); $i++){
+    my $cur_contig_name = $contig_names[$i];
+    my $cur_contig_size = $contig_sizes[$i];
+    my $system_command = $exec_path . " align_file=$QuEST_align_file contig_id=$cur_contig_name contig_size=$cur_contig_size output_score_file=$output_score_path/$cur_contig_name.score" . $optional_param_string;
+    print "$system_command\n";
+    my $error_code = system("$system_command\n");
+    if($error_code != 0){
+       print "run_generate_profile_with_param_file.pl: Caught an error message (code $error_code), passing error message to the top script.\n";
+       exit(3);
+    }
+    #print "error_code: $error_code\n";
+    #exit(0);
+
+}
+
+
diff --git a/htswanalysis/bin/QuEST/run_peak_caller_with_param_file.pl b/htswanalysis/bin/QuEST/run_peak_caller_with_param_file.pl
new file mode 100755 (executable)
index 0000000..5736b57
--- /dev/null
@@ -0,0 +1,344 @@
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like. 
+
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## the citation goes here
+
+## This program is a wrapper that runs peak_caller program
+## on the entire genome one contig at a time
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $usage = qq{
+    run_peak_caller_with_param_file.pl
+       
+    This program is a wrapper that runs the peak_caller
+    
+    -----------------------
+    mandatory parameters:
+    -p <params_file>               a file containing parameters calibrate_peak_shift
+
+    -----------------------
+    optional parameters:
+    -e <exec_path>                 path to the calibrate_peak_shift executable
+    -h                             to display this help
+
+};
+
+if(scalar(@ARGV) == 0){
+    print $usage;
+    exit(0);
+}
+
+## mandatory argmuments
+my $params_fname = "";
+## /mandatory arguments
+
+## setting optional arguments
+use Cwd qw(realpath);
+my $fullpath = realpath($0);
+my @fullpath_fields = split(/\//,$fullpath);
+my $exec_path = "";
+
+for(my $i=0; $i<scalar(@fullpath_fields)-1; $i++){
+    if($fullpath_fields[$i] ne ""){
+       $exec_path = $exec_path."/".$fullpath_fields[$i];
+    }
+}
+
+$exec_path = $exec_path."/peak_caller";
+## /setting optional arguments
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+    my $this_arg = shift @ARGV;    
+    if ( $this_arg eq '-h') {die "$usage\n";}
+    elsif ( $this_arg eq '-p') {$params_fname = shift @ARGV;}
+    elsif ( $this_arg eq '-e') {$exec_path = shift @ARGV;}
+    else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+my $die_pars_bad = "false";       ## die if parameters are bad
+my $bad_par = "";                 ## store bad parameter here
+
+print "\n=====================\n\n";
+print "mandatory parameters: \n\n";
+print "params_file:      $params_fname\n";
+print "exec_path:        $exec_path\n";
+if( $params_fname eq ''){ $die_pars_bad = "true"; $bad_par." "."$params_fname"; }
+if( $exec_path eq ''){ $die_pars_bad = "true"; $bad_par." "."$exec_path"; }
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+    print "$usage";
+    print "Bad parameters: $bad_par\n";
+    exit(0);
+}
+## /reading arguments
+
+## top-level script
+open params_file, "< $params_fname" || die "$params_fname: $\n";
+my @params = <params_file>;
+close params_file;
+
+my $chip_profile_path =             "** missing **";
+my $background_profile_path =       "** missing **";
+my $output_fname =                   "** missing **";
+my $chip_threshold =                "** missing **";
+my $background_threshold =          "** missing **";
+my $rescue_ratio =                  "** missing **";
+my $genome_table_fname =             "** missing **";
+my $ChIP_reads = "** missing **";
+my $background_reads = "** missing **";
+
+my $local_maximum_radius = "";      # optional
+my $dip_fraction = "";              # optional
+
+
+for(my $i=0; $i<scalar(@params); $i++){
+    my $cur_param = $params[$i];
+    chomp($cur_param);
+    
+    my @cur_par_fields = split(/ /, $cur_param);
+    if(scalar(@cur_par_fields >= 2)){
+       my $cur_par_name = $cur_par_fields[0];
+       if ($cur_par_name eq "chip_profile_path"){
+           $chip_profile_path = $cur_par_fields[2];    
+       }
+       elsif ($cur_par_name eq "background_profile_path"){
+           $background_profile_path = $cur_par_fields[2];      
+       }
+       elsif ($cur_par_name eq "output_file"){
+           $output_fname = $cur_par_fields[2]; 
+       }
+       elsif ($cur_par_name eq "chip_threshold"){
+           $chip_threshold = $cur_par_fields[2];       
+       }
+       elsif ($cur_par_name eq "background_threshold"){
+           $background_threshold = $cur_par_fields[2]; 
+       }
+       elsif ($cur_par_name eq "rescue_ratio"){
+           $rescue_ratio = $cur_par_fields[2]; 
+       }
+       elsif($cur_par_name eq "genome_table"){
+           $genome_table_fname = $cur_par_fields[2];
+       }
+       elsif ($cur_par_name eq "local_maximum_radius"){
+           $local_maximum_radius = $cur_par_fields[2]; 
+       }
+       elsif ($cur_par_name eq "dip_fraction"){
+           $dip_fraction = $cur_par_fields[2]; 
+       }
+       elsif ($cur_par_name eq "ChIP_reads"){
+           $ChIP_reads = $cur_par_fields[2];   
+       }
+       elsif ($cur_par_name eq "background_reads"){
+           $background_reads = $cur_par_fields[2];     
+       }
+       else{
+           if($params[$i] ne ""){
+               print "Warning: unrecognized parameter: $cur_par_name";
+           }
+       }
+    }
+}
+
+print "Read the following parameters: \n\n";
+
+print "chip_profile_path:                 $chip_profile_path\n";
+print "background_profile_path:           $background_profile_path\n";
+print "output_file:                       $output_fname\n";
+print "chip_threshold:                    $chip_threshold\n";
+print "background_threshold:              $background_threshold\n";
+print "rescue_ratio:                      $rescue_ratio\n";
+print "genome_table_file:                 $genome_table_fname\n";
+print "ChIP_reads:                        $ChIP_reads\n";
+print "background_reads:                  $background_reads\n";
+print "local_maximum_radius:              $local_maximum_radius\n";      # optional
+print "dip_fraction:                      $dip_fraction\n";              # optional
+
+print "\n";
+
+my $optional_param_string = "";
+if($local_maximum_radius ne ""){
+    $optional_param_string = $optional_param_string . " local_maximum_radius=$local_maximum_radius";
+}
+if($dip_fraction ne ""){
+    $optional_param_string = $optional_param_string . " dip_fraction=$dip_fraction";
+}
+
+my @files_to_trash;
+my $files_to_trash_counter = 0;
+
+my @peak_call_files;
+my $peak_call_files_counter = 0;
+
+
+## read genome table
+
+if( ! -e $genome_table_fname){
+    print "Error in run peak caller script: couldn't find genome table $genome_table_fname.\n";
+    print "Aborting.\n";
+    exit(0);
+}
+
+open genome_table_file, "< $genome_table_fname" || die "$genome_table_fname: $\n";
+my @genome_table = <genome_table_file>;
+
+my @contigs;
+my $contig_counter = 0;
+
+for(my $i=0; $i<scalar(@genome_table); $i++){
+    my $cur_genome_table_entry = $genome_table[$i];
+    chomp($cur_genome_table_entry);
+
+    my @cur_genome_table_entry_fields = split(/ /, $cur_genome_table_entry);
+    if( scalar(@cur_genome_table_entry_fields) == 2){
+       $contigs[$contig_counter] = $cur_genome_table_entry_fields[0];
+       $contig_counter++;
+    }
+}
+
+
+## /read genome table
+
+for(my $i=0; $i<scalar(@contigs); $i++){
+    my $cur_contig = $contigs[$i];
+    my $system_command = 
+       $exec_path . 
+       " score_file1=$chip_profile_path/$cur_contig.score" . 
+       " score_file2=$background_profile_path/$cur_contig.score" .
+       " output_file=$chip_profile_path/$cur_contig.peak_calls.tmp" .
+       " chip_threshold=$chip_threshold" .
+       " background_threshold=$background_threshold" .
+       " rescue_ratio=$rescue_ratio" .
+       " contig_id=$cur_contig" .
+       $optional_param_string;
+    print "$system_command\n";
+    
+    $files_to_trash[$files_to_trash_counter] = "$chip_profile_path/$cur_contig.peak_calls.tmp";    
+    $files_to_trash_counter++;
+
+    $peak_call_files[$peak_call_files_counter] = "$chip_profile_path/$cur_contig.peak_calls.tmp";
+    $peak_call_files_counter++;
+    my $error_code = system("$system_command");
+    if($error_code != 0){
+       print "run_peak_caller_with_param_file.pl: Caught an error message (code $error_code), passing the error message to the top script.\n";
+       exit(2);
+    }
+}
+
+# merging
+my $cat_string = "";
+
+if(-e $output_fname){
+    unlink($output_fname);
+}
+#else{
+#system("touch $output_fname");
+#}
+#}
+
+my $extended_fname = $output_fname . ".extended";
+
+print "output files: ";
+print "$output_fname\n";
+print "extended_file: $extended_fname\n";
+
+
+if(-e $extended_fname){
+    unlink($extended_fname);
+}
+#else{
+system("touch $extended_fname");
+#}
+
+
+for(my $i=0; $i<scalar(@peak_call_files); $i++){
+    if($i!=0){
+       $cat_string = $cat_string . " ";
+    }
+    $cat_string = $cat_string . "$peak_call_files[$i]";
+}
+# /merging
+
+
+# output
+
+#if(-e $output_fname){
+#    unlink($output_fname);
+#}
+
+
+open output_file, "> $output_fname" or die "can't open $output_fname: $!\n";
+
+
+print output_file "\#\#   please cite: \n";
+print output_file "\#\#   Valouev A, Johnson DS, Sundquist A, Medina C, Anton E, Batzoglou S,\n";
+print output_file "\#\#   Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide\n";
+print output_file "\#\#   Identification of Transcription Factor Binding Sites Based on ChIP-Seq\n";
+print output_file "\#\#   Data. under review.\n\n";
+#print output_file "\#\#   please cite:
+#system("echo \#\#   Valouev A, Johnson DS, Sundquist A, Median C, Anton E, Batzoglou S, >> $extended_fname");
+#system("echo \#\#   Myers RM, Sidow A (2008). A Statistical Framework for Genome-Wide >> $extended_fname");
+#system("echo \#\#   Identification of Transcription Factor Binding Sites Based on ChIP-Seq >> $extended_fname");
+#system("echo \#\#   Data. under review. >> $output_fname");
+
+close output_file;
+
+system("cat $cat_string >> $extended_fname");
+
+
+my $output_fname_tmp1 = $output_fname . ".tmp1";
+system("sort -nrk 5 $extended_fname | grep loc > $output_fname_tmp1");
+open output_file_tmp1, "< $output_fname_tmp1" or die "can't open $output_fname_tmp1: $!\n";
+
+my $output_fname_tmp2 = $output_fname . ".tmp2";
+if(-e $output_fname_tmp2){ unlink($output_fname_tmp2); }
+open output_file_tmp2, "> $output_fname_tmp2" or die "can't open $output_fname_tmp2: $!\n";
+while(<output_file_tmp1>){
+    chomp;
+    if(length($_) > 0){
+       if( substr($_,0,1) ne "#" ){
+           my @cur_entry_fields = split(/ /, $_);
+           if( scalar(@cur_entry_fields) >= 9){
+               my $cur_locmax = $cur_entry_fields[4];
+               my $cur_nulls = $cur_entry_fields[8];
+               my $cur_ef = "NA";
+               if( $cur_nulls != 0){
+                   $cur_ef = ($cur_locmax / $ChIP_reads) / ($cur_nulls / $background_reads);
+               }
+               print output_file_tmp2 "$_ ef: $cur_ef\n";
+           }       
+       }
+       else{
+           print output_file_tmp2 "$_\n";
+       }
+    }
+}
+close output_file_tmp1;
+close output_file_tmp2;
+
+system("cat $output_fname_tmp2 >> $output_fname");
+
+for(my $i=0; $i<scalar(@files_to_trash); $i++){
+    system("rm $files_to_trash[$i]");
+}
+
+system("rm $output_fname_tmp1");
+system("rm $output_fname_tmp2");
+# /output
+
+
+
diff --git a/htswanalysis/bin/QuEST/run_quick_window_scan_with_param_file.pl b/htswanalysis/bin/QuEST/run_quick_window_scan_with_param_file.pl
new file mode 100755 (executable)
index 0000000..e40fbe9
--- /dev/null
@@ -0,0 +1,219 @@
+#!/usr/bin/perl
+
+## This program is distributed under the free software
+## licensing model. You have a right to use, modify and
+## and redistribute this software in any way you like. 
+
+
+## This program is implemented by Anton Valouev, Ph.D.
+## as a part of QuEST analytical pipeline. If you use
+## this software as a part of another software or for publication
+## purposes, you must cite the publication anouncing
+## QuEST release:
+
+## the citation goes here
+
+
+## This program is a wrapper that runs quick_window_scan
+## on the entire genome one contig at a time
+
+use strict;
+use warnings;
+use diagnostics;
+
+my $usage = qq{
+    run_quick_window_scan_with_param_file.pl
+       
+    This program is a wrapper that runs the quick_window_scan
+    once contig at a time
+    
+    -----------------------
+    mandatory parameters:
+    -p <params_file>               a file containing parameters for quick_window_scan
+
+    -----------------------
+    optional parameters:
+    -e <exec_path>                 path to the quick window scan executable
+    -h                             to display this help
+
+};
+
+if(scalar(@ARGV) == 0){
+    print $usage;
+    exit(0);
+}
+
+## mandatory argmuments
+my $params_fname = "";
+## /mandatory arguments
+
+## setting optional arguments
+use Cwd qw(realpath);
+my $fullpath = realpath($0);
+my @fullpath_fields = split(/\//,$fullpath);
+my $exec_path = "";
+
+for(my $i=0; $i<scalar(@fullpath_fields)-1; $i++){
+    if($fullpath_fields[$i] ne ""){
+       $exec_path = $exec_path."/".$fullpath_fields[$i];
+    }
+}
+
+$exec_path = $exec_path."/quick_window_scan";
+## /setting optional arguments
+
+if(-e $exec_path){
+    ## executable exists do nothing
+}
+else{
+    print "Failed to locate executable $exec_path.\n";
+    print "Aborting quick_window_scan batch script.\n";
+    exit(0);
+}
+
+## reading arguments
+while(scalar(@ARGV) > 0){
+    my $this_arg = shift @ARGV;    
+    if ( $this_arg eq '-h') {die "$usage\n";}
+    elsif ( $this_arg eq '-p') {$params_fname = shift @ARGV;}
+    elsif ( $this_arg eq '-e') {$exec_path = shift @ARGV;}
+    else{ print "Warning: unknown parameter: $this_arg\n"; }
+}
+
+my $die_pars_bad = "false";       ## die if parameters are bad
+my $bad_par = "";                 ## store bad parameter here
+
+print "\n=====================\n\n";
+print "mandatory parameters: \n\n";
+print "params_file:      $params_fname\n";
+print "exec_path:        $exec_path\n";
+if( $params_fname eq ''){ $die_pars_bad = "true"; $bad_par." "."$params_fname"; }
+if( $exec_path eq ''){ $die_pars_bad = "true"; $bad_par." "."$exec_path"; }
+print "\n=====================\n\n";
+
+if( $die_pars_bad eq "true"){
+    print "$usage";
+    print "Bad parameters: $bad_par\n";
+    exit(0);
+}
+## /reading arguments
+
+## top-level script
+open params_file, "< $params_fname" || die "$params_fname: $\n";
+my @params = <params_file>;
+close params_file;
+
+my $QuEST_align_file = "";
+##my @contigs;
+##my $reference_path = "";
+my $genome_table_fname = "";
+my $output_file = "";
+my $calc_window = "";
+my $count_threshold = "";
+
+for(my $i=0; $i<scalar(@params); $i++){
+    my $cur_param = $params[$i];
+    chomp($cur_param);
+    my @cur_par_fields = split(/ /, $cur_param);
+    my $cur_par_name = $cur_par_fields[0];
+    if( $cur_par_name eq "align_file" ){
+       $QuEST_align_file = $cur_par_fields[2]; 
+    }
+#    elsif($cur_par_name eq "contigs"){
+#      @contigs = split(/\,/, $cur_par_fields[2]);     
+#    }
+#    elsif($cur_par_name eq "reference_path"){
+#      $reference_path = $cur_par_fields[2];   
+#    }
+    elsif( $cur_par_name eq "genome_table" ){
+       $genome_table_fname = $cur_par_fields[2];
+    }
+    elsif($cur_par_name eq "output_file"){
+       $output_file  = $cur_par_fields[2];
+    }
+    elsif($cur_par_name eq "calc_window"){
+       $calc_window = $cur_par_fields[2];
+    }
+    elsif($cur_par_name eq "count_threshold"){
+       $count_threshold = $cur_par_fields[2];
+    }
+    else{
+       if($params[$i] ne ""){
+           print "Warning: unrecognized parameter: $cur_par_name";
+       }
+    }
+}
+
+print "Read the following parameters: \n\n";
+print "align_file:            $QuEST_align_file\n";
+#print "contigs:               @contigs\n";
+#print "reference_path:        $reference_path\n";
+print "genome_table_file:     $genome_table_fname\n";
+print "output_file:           $output_file\n";
+print "calc_window:           $calc_window\n";
+print "count_threshold:       $count_threshold\n";
+
+my $optional_param_string = "";
+if($calc_window ne ""){
+    $optional_param_string = "calc_window=$calc_window";
+}
+if($count_threshold ne ""){
+    $optional_param_string = $optional_param_string . " count_threshold=$count_threshold";
+}
+
+if( ! -e $genome_table_fname){
+    print "Error in Quick Window Scan batch script:\n";
+    print "Failed to locate genome table $genome_table_fname.\n";
+    print "Aborting.\n";
+    exit(0);
+}
+
+open genome_table_file, "< $genome_table_fname" || die "$genome_table_fname: $\n";
+my @genome_table = <genome_table_file>;
+close genome_table_file;
+
+my @contig_names;
+my @contig_sizes;
+my $contig_counter = 0;
+
+for(my $i=0; $i<scalar(@genome_table); $i++){
+    my $cur_entry = $genome_table[$i];
+    chomp($cur_entry);
+
+    my @cur_entry_fields = split(/ /, $cur_entry);
+    if( scalar(@cur_entry_fields) == 2 ){
+       $contig_names[$contig_counter] = $cur_entry_fields[0];
+       $contig_sizes[$contig_counter] = $cur_entry_fields[1];
+       $contig_counter++;
+    }
+}
+
+for(my $i=0; $i<scalar(@contig_names); $i++){
+    my $cur_contig_name = $contig_names[$i];
+    my $cur_contig_size = $contig_sizes[$i];
+    my $system_command = "$exec_path contig_id=$cur_contig_name contig_size=$cur_contig_size align_file=$QuEST_align_file output_file=$output_file.$cur_contig_name $optional_param_string";
+    #print "$system_command\n";
+    my $error_code = system("$system_command");
+    if( $error_code != 0 ){
+       print "run_quick_window_scan_with_param_file.pl: Caught an error message (code $error_code). Passing the error message to the trop script.\n";
+       exit(3);
+    }
+}
+
+my $output_file_list = "";
+for(my $i=0; $i<scalar(@contig_names); $i++){
+    my $cur_contig = $contig_names[$i];
+    if($i>0){
+       $output_file_list = $output_file_list . " ";
+    }
+    $output_file_list = $output_file_list . "$output_file.$cur_contig"
+}
+
+unlink("$output_file");
+unlink("$output_file.sorted");
+system("cat $output_file_list > $output_file");
+system("sort -nr -k3 $output_file > $output_file.sorted");
+#system("sort -nr +2 $output_file > $output_file.sorted");
+system("rm $output_file_list");
+
+## /top-level script
diff --git a/htswanalysis/bin/QuEST/src/Makefile b/htswanalysis/bin/QuEST/src/Makefile
new file mode 100755 (executable)
index 0000000..547e0b5
--- /dev/null
@@ -0,0 +1,15 @@
+CC = g++
+#CCFLAGS = -Wall -ansi -pedantic -D DEBUG -O4 -funroll-loops
+OBJECTS := $(patsubst %.cpp, %.o,$(wildcard *.cpp))
+MISC_DIR = ./../misc
+INCLUDE := $(MISC_DIR)
+
+include ../config.mk
+
+all: $(OBJECTS)
+
+%.o : %.cpp
+       $(CC) -I$(INCLUDE) $(CCFLAGS) -c $<
+
+clean: 
+       rm -rf *.o 
diff --git a/htswanalysis/bin/QuEST/src/calibrate_peak_shift.cpp b/htswanalysis/bin/QuEST/src/calibrate_peak_shift.cpp
new file mode 100755 (executable)
index 0000000..06baab7
--- /dev/null
@@ -0,0 +1,590 @@
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+#include <vector>
+#include <list>
+#include <math.h>
+
+#include <time.h>
+#include <assert.h>
+
+#include "utils.h"
+#include "seq_contig.h"
+//#include "array_2d.h"
+//#include "ticker.h"
+#include "params.h"
+
+using namespace std;
+
+#define BLACK_BACKGROUND
+
+
+#define read_size 25
+
+int el(const int* counts, unsigned int i, unsigned int j){
+  return counts[i*3+j];
+}
+
+double weight(int x){
+  
+  if(-50<= x && x<=50) return x/15000.0+1.0/300.0;
+  if(50<x && x<=250) return -1*x/(30000) + 1.0/120.0;
+  return 0;
+  
+}
+
+double weight_for(int x){
+  return weight(-x);
+}
+double weight_rev(int x){
+  return weight(x);
+}
+
+double indicator_for(int x){
+  if (x >= -300 && x <= 300) return 1;
+  else return 0;
+  //if ( x>= -250 && x <= 50 ) return 1;
+  //else return 0;
+}
+double indicator_rev(int x){
+  if (x >= -300 && x <= 300) return 1;
+  else return 0;
+
+  //if ( x>= -50 && x <= 250) return 1;
+  //else return 0;
+}
+double normal_weight(int x, double sigma){
+  double pi=3.14;
+  return exp(-x*x/(2*sigma*sigma))/(sqrt(2*pi)*sigma);
+}
+
+int __int_min(int i, int j){
+  if ( i < j ) return i;
+  return j;
+}
+
+int __int_max(int i, int j){
+  if ( i > j) return i;
+  return j;
+}
+
+double max2min(double x, double y){
+  if( x > y) return x/y;
+  else return y/x;
+}
+
+vector<double> non_normalized_kde(vector<int> counts_vec, double kde_bandwidth){
+  int win_size = (int) (kde_bandwidth*5); 
+  
+  vector<double> scores(counts_vec.size());
+  
+  for(int i=0; i<(int)scores.size(); i++){
+    scores[i] = 0;
+    for(int j=__int_max(0,i-win_size); j<__int_min(i+win_size, counts_vec.size()); j++){
+      scores[i] += counts_vec[j] * indicator_for(j-i) * normal_weight(j-i, kde_bandwidth);
+    }
+  }
+  return scores;
+}
+
+double mean(const vector<double>& vec){
+  double res = 0;
+  for(unsigned int i=0; i<vec.size(); i++){
+    res += vec[i] / vec.size();
+  }
+  return res;
+}
+
+double stdev(const vector<double>& vec){
+  double cur_mean = mean(vec);
+  double var = 0;
+  
+  for(unsigned int i=0; i<vec.size(); i++){
+    var += (vec[i]-cur_mean)*(vec[i]-cur_mean) / (vec.size() - 1);
+  }
+  return sqrt(var);
+}
+
+double median(const vector<double>& vec){
+  vector<double> tmp_vec = vec;
+  sort(tmp_vec.begin(), tmp_vec.end());
+  int tmp_vec_size = tmp_vec.size();
+  if(tmp_vec_size%2 == 1){
+    return tmp_vec[(tmp_vec.size() - 1 ) / 2] + 1;
+  }
+  else return 0.5 * (tmp_vec[tmp_vec.size()/2 - 1] + tmp_vec[tmp_vec.size()/2] );
+}
+
+string bool2string(bool x){
+  string res;
+  if(x== true) res = "yes";
+  else res = "no";
+  return res;
+}
+
+int main(int argc, char* argv[]){
+
+  bool with_background_estimate = false;
+
+  params pars(argc, argv);
+  pars.require("quick_scan_file","quick_scan_file",STRING_TYPE);
+  pars.require("align_chip_file","align_chip_file",STRING_TYPE);
+  pars.require("align_background_file","align_background_file",STRING_TYPE);  
+  pars.require("output_file","output_file",STRING_TYPE);
+
+  if(with_background_estimate)
+    pars.require("peak_free_regions_file","file containing regions free of binding sites",STRING_TYPE);
+
+  pars.optional("kde_bandwidth","kde_bandwidth","30",DOUBLE_TYPE);
+  pars.optional("method","estimation method","mean",STRING_TYPE);
+  pars.optional("scan_window","scan_window","1000",INT_TYPE);
+  pars.optional("quick_scan_tag_count_threshold","quick_scan_tag_count_threshold","0",INT_TYPE);
+  //pars.optional("aligned_part_length","length of the aligned part of solexa read","25",INT_TYPE);
+  pars.optional("top_count","quick scan entries to consider","100",INT_TYPE);
+  pars.optional("chip_to_background_fold","chip peak to background value ratio fold","20",DOUBLE_TYPE);
+  pars.optional("best_to_second_best_peak_ratio","ratio threshold of best peak to second best peak",
+               "20",DOUBLE_TYPE);
+  pars.optional("chip_peak_threshold","chip_peak_threshold","1",DOUBLE_TYPE);
+  pars.optional("chip_for_rev_peak_max_min_ratio","threshold ratio between forward and reverse peaks",
+               "1.5",DOUBLE_TYPE);
+
+  if(!pars.enforce()){
+    pars.list_all_params();
+    exit(1);
+  }
+  pars.list_all_params();
+
+  string quick_scan_fname = pars.get_string_value("quick_scan_file");
+  string align_chip_fname = pars.get_string_value("align_chip_file");
+  string align_background_fname = pars.get_string_value("align_background_file");
+  string output_fname = pars.get_string_value("output_file");  
+  double kde_bandwidth = pars.get_double_value("kde_bandwidth");
+  int scan_window = pars.get_int_value("scan_window");
+  int quick_scan_tag_count_threshold = pars.get_int_value("quick_scan_tag_count_threshold");
+  //int aligned_part_length = pars.get_int_value("aligned_part_length");
+  int top_count = pars.get_int_value("top_count");
+
+  string peak_free_regions_fname = "";
+  if(with_background_estimate) peak_free_regions_fname = pars.get_string_value("peak_free_regions_file");
+
+  double chip_to_mock_peak_ratio = pars.get_double_value("chip_to_background_fold");
+  double best_to_second_best_ratio = pars.get_double_value("best_to_second_best_peak_ratio");
+  double chip_peak_threshold = pars.get_double_value("chip_peak_threshold");
+  double chip_for_rev_peak_ratio = pars.get_double_value("chip_for_rev_peak_max_min_ratio");
+
+  string estimation_method = pars.get_string_value("method");
+
+  if(estimation_method != "median" && estimation_method != "mean"){
+    cout<<"You have specified bad estimation method: "<<estimation_method<<endl;
+    cout<<"I was expecting median or mean"<<endl;
+    exit(0);
+  }
+
+  string dummy_string;
+
+  int good_chip_reads = 0;
+  int good_mock_reads = 0;
+  
+  char sep = ' ';
+  char sep1 = ' ';
+  //char sep2 = ':';
+  char sep3 = char(9);
+
+  ifstream peak_free_regions_str;
+  vector<string> peak_free_regions_contig_ids;
+  vector<int> peak_free_regions_start_coord;
+  vector<int> peak_free_regions_end_coord;
+  
+  int peak_free_chip_reads = 0;
+  int peak_free_mock_reads = 0;
+
+  if(with_background_estimate){
+    peak_free_regions_str.open(peak_free_regions_fname.c_str());
+    if(!peak_free_regions_str.good()){
+       cerr<<"bad file name: "<<peak_free_regions_fname<<endl;
+       exit(1);
+    }
+    
+    while(peak_free_regions_str.good()){
+      getline(peak_free_regions_str, dummy_string);
+      if(peak_free_regions_str.good() && dummy_string[0] != '#'){
+       vector<string> cur_region_entries = split(dummy_string, sep3);    
+       if(cur_region_entries.size()>= 3){
+         peak_free_regions_contig_ids.push_back(cur_region_entries[0]);
+         peak_free_regions_start_coord.push_back(atoi(cur_region_entries[1].c_str()));
+         peak_free_regions_end_coord.push_back(atoi(cur_region_entries[2].c_str()));
+       }
+      }
+    }
+    
+    peak_free_regions_str.close();
+    cout<<"read: "<<peak_free_regions_contig_ids.size()<<" peak free regions"<<endl;
+    // exit(1);
+  }
+
+  ifstream quick_scan_str(quick_scan_fname.c_str());
+  if(!quick_scan_str.good()){
+    cerr<<"bad file name: "<<quick_scan_fname<<endl;
+    exit(1);
+  }
+  
+  // do: reading quick scan peaks
+  
+  vector<string> quick_scan_contig_ids;
+  vector<int> quick_scan_coords;
+  vector<int> quick_scan_tag_counts;
+
+  int peak_count = 0;
+
+  while(quick_scan_str.good() && peak_count < top_count){
+    getline(quick_scan_str, dummy_string);
+    if(quick_scan_str.good()){
+      vector<string> cur_pile_entries = split(dummy_string, sep);    
+      if(cur_pile_entries.size() >= 3){        if(atoi(cur_pile_entries[2].c_str()) > quick_scan_tag_count_threshold){
+         quick_scan_contig_ids.push_back(cur_pile_entries[0]);
+         quick_scan_coords.push_back(atoi(cur_pile_entries[1].c_str()));
+         quick_scan_tag_counts.push_back(atoi(cur_pile_entries[2].c_str()));
+         peak_count++;
+       }
+      }
+    }
+  }
+
+  quick_scan_str.close();
+
+  // done: reading quick scan peaks
+
+  cout<<"read "<<quick_scan_contig_ids.size()<<" quick_scan peaks"<<endl;
+
+  ifstream chip_str(align_chip_fname.c_str());
+  if(!chip_str.good()){
+    cerr<<"bad file name: "<<align_chip_fname<<endl;
+    exit(1);
+  }
+
+  ifstream mock_str(align_background_fname.c_str());
+  if(!mock_str.good()){
+    cerr<<"bad file name: "<<align_background_fname<<endl;
+    exit(1);
+  }
+
+  ofstream out_str(output_fname.c_str());
+  if(!out_str.good()){
+    cerr<<"bad file name: "<<output_fname<<endl;
+    exit(1);
+  }
+
+  // do: some necessary inits
+
+  cout<<"Initializing arrays..."<<endl;
+
+  int region_size = scan_window;
+  
+  vector<int> dummy_int_vec(region_size);
+
+  vector< vector <int> > chip_counts_for; 
+  vector< vector <int> > chip_counts_rev; 
+  vector< vector <int> > mock_counts_for; 
+  vector< vector <int> > mock_counts_rev; 
+
+  for(int i=0; i<region_size; i++){
+    dummy_int_vec[i] = 0;
+  }
+
+  for(unsigned int i=0; i<quick_scan_contig_ids.size(); i++){
+    chip_counts_for.push_back(dummy_int_vec);
+    chip_counts_rev.push_back(dummy_int_vec);
+    mock_counts_for.push_back(dummy_int_vec);
+    mock_counts_rev.push_back(dummy_int_vec);
+  }
+  //cout<<"done..."<<endl;
+  // done: some necessary inits
+
+  unsigned int chip_reads_for = 0;
+  unsigned int chip_reads_rev = 0;
+  unsigned int mock_reads_for = 0;
+  unsigned int mock_reads_rev = 0;
+  
+  // do: load chip reads
+  
+  cout<<"Loading ChIP data..."<<endl;
+  while(chip_str.good()){
+    getline(chip_str, dummy_string);
+    if(chip_str.good()){
+      if(dummy_string.length() > 0){
+       if( dummy_string[0] != '#'){
+         
+         vector<string> cur_hit_entry = split(dummy_string, sep1);
+         if(cur_hit_entry.size() == 3){
+           good_chip_reads++;
+           //vector<string> cur_coord = split(cur_hit_entry[3], sep2);
+           string cur_read_contig_name = cur_hit_entry[0];
+           int cur_read_5p_coord = atoi(cur_hit_entry[1].c_str());
+           string cur_read_orient = cur_hit_entry[2];      
+           
+           int start_coord = cur_read_5p_coord;
+           
+           //if(cur_hit_entry[4] == "F"){
+           //  start_coord = atoi(cur_coord[1].c_str()) - 1;     
+           // }
+           //else{       
+           //assert(cur_hit_entry[4] == "R");
+           //start_coord = atoi(cur_coord[1].c_str()) + aligned_part_length - 1; 
+           //}
+           //}
+           //cur_contig_id = cur_coord[0];
+           
+           for(unsigned int i=0; i<quick_scan_contig_ids.size(); i++){
+             string cur_quick_scan_contig_id = quick_scan_contig_ids[i];
+             int cur_quick_scan_coord = quick_scan_coords[i];
+             //int cur_quick_scan_tag_count = quick_scan_tag_counts[i];
+             
+             int cur_region_start_coord = __int_max(0, cur_quick_scan_coord - scan_window/2);
+             int cur_region_end_coord = cur_quick_scan_coord + scan_window/2;
+             
+             if( start_coord >= cur_region_start_coord && start_coord <= cur_region_end_coord &&
+                 cur_quick_scan_contig_id == cur_read_contig_name ){
+               
+               if(cur_read_orient == "+"){
+                 chip_counts_for[i][start_coord - cur_region_start_coord]++; chip_reads_for++;}
+               else{ chip_counts_rev[i][start_coord - cur_region_start_coord]++; chip_reads_rev++;}          
+             }
+           }
+           
+           if(with_background_estimate){
+             for(unsigned int i=0; i<peak_free_regions_contig_ids.size(); i++){
+               if(start_coord >= peak_free_regions_start_coord[i] &&
+                  start_coord <= peak_free_regions_end_coord[i] &&
+                  cur_read_contig_name == peak_free_regions_contig_ids[i]) peak_free_chip_reads++;
+             }
+           }
+         }
+       }
+      }
+    }
+  }
+
+  // done: load chip reads
+  
+  // do: load mock reads
+  cout<<"Loading RX-noIP data..."<<endl;
+  while(mock_str.good()){
+    getline(mock_str, dummy_string);
+    if(mock_str.good() && dummy_string[0] != '#'){
+      
+      vector<string> cur_hit_entry = split(dummy_string, sep1);
+      if(cur_hit_entry.size() == 3){ //>= 5){
+       good_mock_reads++;
+       //vector<string> cur_coord = split(cur_hit_entry[3], sep2);
+       
+       string cur_read_contig_name = cur_hit_entry[0];
+       int cur_read_5p_coord = atoi(cur_hit_entry[1].c_str());
+       string cur_read_orient = cur_hit_entry[2];
+
+       int start_coord = cur_read_5p_coord;          
+       //string cur_contig_id;
+       
+       //if(cur_hit_entry[4] == "F"){
+       //  start_coord = atoi(cur_coord[1].c_str()) - 1;         
+       //      }
+       //else{   
+       //  assert(cur_hit_entry[4] == "R");
+       //  start_coord = atoi(cur_coord[1].c_str()) + aligned_part_length - 1; 
+       //}
+       string cur_contig_id = cur_read_contig_name; //cur_coord[0];
+
+       for(unsigned int i=0; i<quick_scan_contig_ids.size(); i++){
+         string cur_quick_scan_contig_id = quick_scan_contig_ids[i];
+         int cur_quick_scan_coord = quick_scan_coords[i];
+         //int cur_quick_scan_tag_count = quick_scan_tag_counts[i];
+         
+         int cur_region_start_coord = __int_max(0, cur_quick_scan_coord - scan_window/2);
+         int cur_region_end_coord = cur_quick_scan_coord + scan_window/2;
+         
+         if( start_coord >= cur_region_start_coord && start_coord <= cur_region_end_coord &&
+             cur_quick_scan_contig_id == cur_contig_id){
+           if(cur_read_orient == "+"){
+             mock_counts_for[i][start_coord - cur_region_start_coord]++; mock_reads_for++;}
+           else{ mock_counts_rev[i][start_coord - cur_region_start_coord]++; mock_reads_rev++;
+           }
+         }
+       }
+       if(with_background_estimate){
+         for(unsigned int i=0; i<peak_free_regions_contig_ids.size(); i++){
+           if(start_coord >= peak_free_regions_start_coord[i] &&
+              start_coord <= peak_free_regions_end_coord[i] &&
+              cur_contig_id == peak_free_regions_contig_ids[i]) peak_free_mock_reads++;
+         }
+       }
+      }
+    }
+  }
+
+  chip_str.close();
+  mock_str.close();
+  
+  // done: load mock reads
+  double chip_to_mock_adj_peak_ratio = chip_to_mock_peak_ratio;  
+  if(with_background_estimate){
+    cout<<"peak_free_chip_reads:"<<peak_free_chip_reads<<endl;
+    cout<<"peak_free_mock_reads:"<<peak_free_mock_reads<<endl;
+
+    double chip_background_fraction = ( ((double)peak_free_chip_reads) / ((double)good_chip_reads)) /  
+      (((double)peak_free_mock_reads) / ((double)good_mock_reads));
+    chip_to_mock_adj_peak_ratio = chip_to_mock_peak_ratio * ((double)good_chip_reads) / 
+      ((double)good_mock_reads);
+    
+    cout<<"chip background fraction: "<<chip_background_fraction<<endl;
+  }
+
+  // do: calculating densities
+
+  cout<<"calculating densities..."<<endl;
+
+  unsigned int qualified_peaks = 0;
+  vector<double> peak_shifts;
+
+  for(unsigned int i=0; i<quick_scan_contig_ids.size(); i++){    
+    //printf("processing region: %d    \r", i);
+    vector<double> chip_scores_for = non_normalized_kde(chip_counts_for[i], kde_bandwidth);
+    vector<double> chip_scores_rev = non_normalized_kde(chip_counts_rev[i], kde_bandwidth);
+    vector<double> mock_scores_for = non_normalized_kde(mock_counts_for[i], kde_bandwidth);
+    vector<double> mock_scores_rev = non_normalized_kde(mock_counts_rev[i], kde_bandwidth);
+
+    // do: find a maximum
+
+    bool chip_for_peak_good = false;
+    int chip_for_peak_coord = -1;
+    double chip_for_peak = 0;
+    double chip_for_enrich_ratio = 0;
+    
+    vector<double> chip_for_peaks;
+
+    int chip_for_best_peak_coord = -1;
+    
+    for(unsigned int j=1; j<chip_scores_for.size()-1; j++){
+      if(chip_scores_for[j] > chip_scores_for[j-1] && 
+        chip_scores_for[j] > chip_scores_for[j+1] ){
+       chip_for_peaks.push_back(chip_scores_for[j]);
+      }
+    }
+
+    //cout<<"found: "<<chip_for_peaks.size()<<" lmax peaks"<<endl;
+
+    sort(chip_for_peaks.begin(), chip_for_peaks.end());
+    
+    if(chip_for_peaks.size() == 1 && chip_for_peaks[0] >= chip_peak_threshold)
+      chip_for_peak_good = true;
+    
+    if(chip_for_peaks.size() > 1){      
+      if(chip_for_peaks[chip_for_peaks.size()-1] / chip_for_peaks[chip_for_peaks.size()-2] >=
+        best_to_second_best_ratio) chip_for_peak_good = true;      
+    }
+    
+    if(chip_for_peak_good){
+      for(unsigned int j=0; j<chip_scores_for.size(); j++){
+       if(chip_scores_for[j] == chip_for_peaks[chip_for_peaks.size()-1])
+         chip_for_best_peak_coord = j;
+      }
+      assert(chip_for_best_peak_coord >= 0 && chip_for_best_peak_coord < (int)chip_scores_for.size());
+    }
+
+    if(chip_for_peak_good){
+      chip_for_peak_coord = chip_for_best_peak_coord;
+      chip_for_peak = chip_scores_for[chip_for_best_peak_coord];
+      chip_for_enrich_ratio = 
+       chip_scores_for[chip_for_best_peak_coord] / mock_scores_for[chip_for_best_peak_coord];
+      //cout<<"peak_for good"<<endl;
+      //cout<<"chip_for_coord: "<<chip_for_best_peak_coord;
+      //cout<<" s: "<<chip_scores_for[chip_for_best_peak_coord]<<endl;      
+    }
+    //else cout<<"peak_for bad"<<endl;
+    
+    bool chip_rev_peak_good = false;
+    int chip_rev_peak_coord = -1;
+    double chip_rev_peak = 0;
+    double chip_rev_enrich_ratio = 0;
+                               
+    vector<double> chip_rev_peaks;
+
+    int chip_rev_best_peak_coord = -1;
+        for(unsigned int j=1; j<chip_scores_rev.size()-1; j++){
+      if(chip_scores_rev[j] > chip_scores_rev[j-1] && 
+        chip_scores_rev[j] > chip_scores_rev[j+1] ){
+       chip_rev_peaks.push_back(chip_scores_rev[j]);
+      }
+    }
+
+    //cout<<"found: "<<chip_rev_peaks.size()<<" lmax peaks"<<endl;
+
+    sort(chip_rev_peaks.begin(), chip_rev_peaks.end());
+    
+    if(chip_rev_peaks.size() == 1 && chip_rev_peaks[0] >= chip_peak_threshold)
+      chip_rev_peak_good = true;
+    
+    if(chip_rev_peaks.size() > 1){      
+      if(chip_rev_peaks[chip_rev_peaks.size()-1] / chip_rev_peaks[chip_rev_peaks.size()-2] >=
+        best_to_second_best_ratio) chip_rev_peak_good = true;      
+    }
+    
+    if(chip_rev_peak_good){
+      for(unsigned int j=0; j<chip_scores_rev.size(); j++){
+       if(chip_scores_rev[j] == chip_rev_peaks[chip_rev_peaks.size()-1])
+         chip_rev_best_peak_coord = j;
+      }
+      assert(chip_rev_best_peak_coord >= 0 && chip_rev_best_peak_coord < (int)chip_scores_rev.size());
+    }
+
+    if(chip_rev_peak_good){
+      chip_rev_peak_coord = chip_rev_best_peak_coord;
+      chip_rev_peak = chip_scores_rev[chip_rev_best_peak_coord];
+      chip_rev_enrich_ratio = 
+       chip_scores_rev[chip_rev_best_peak_coord] / mock_scores_rev[chip_rev_best_peak_coord];
+      //cout<<"peak_rev good"<<endl;
+      //cout<<"chip_rev_coord: "<<chip_rev_best_peak_coord;
+      //cout<<" s: "<<chip_scores_rev[chip_rev_best_peak_coord]<<endl;
+    }
+    //else cout<<"peak_for bad"<<endl;
+    
+    if(chip_for_peak_good && chip_rev_peak_good &&
+       chip_for_enrich_ratio >= chip_to_mock_adj_peak_ratio &&
+       chip_rev_enrich_ratio >= chip_to_mock_adj_peak_ratio &&
+       max2min(chip_for_peak, chip_rev_peak) <= chip_for_rev_peak_ratio){
+      double ps = 0.5 * ( (double) (chip_rev_peak_coord - chip_for_peak_coord ) );
+      cout<<"i: "<<i<<" "<<quick_scan_contig_ids[i]<<" "<<quick_scan_coords[i]<<" +: "<<chip_for_peak<<" -: "<<chip_rev_peak<<" ps: "<<ps<<endl;
+      peak_shifts.push_back(ps);
+      qualified_peaks++;
+    }
+    else{
+      cout<<"i: "<<i<<" "<<quick_scan_contig_ids[i]<<" disqualified";
+      cout<<" chip_for_peak_good: "<<bool2string(chip_for_peak_good);
+      cout<<" chip_rev_peak_good: "<<bool2string(chip_rev_peak_good);
+      cout<<endl;
+    }
+
+    //exit(1);
+  }
+  
+  cout<<"done"<<endl;
+
+  cout<<"median: "<<median(peak_shifts)<<endl;
+  cout<<"mean: "<<mean(peak_shifts)<<" stdev: "<<stdev(peak_shifts)<<endl;
+  cout<<"based on "<<qualified_peaks<<" qualified peaks"<<endl;
+  cout<<"estimation_method: "<<estimation_method<<endl;
+  // done: calculating densities
+
+  out_str<<"regions: "<<quick_scan_coords.size()<<endl;
+  out_str<<"used: "<<qualified_peaks<<endl;
+  if(estimation_method == "mean"){
+    out_str<<"peak_shift_estimate: "<<mean(peak_shifts)<<endl;
+  }
+  else{
+    out_str<<"peak_shift_estimate: "<<median(peak_shifts)<<endl;
+  }
+  out_str<<"estimation_method: "<<estimation_method;
+  
+  out_str.close();
+  return 0;  
+}
diff --git a/htswanalysis/bin/QuEST/src/generate_peak_profile.cpp b/htswanalysis/bin/QuEST/src/generate_peak_profile.cpp
new file mode 100755 (executable)
index 0000000..b7f7e49
--- /dev/null
@@ -0,0 +1,517 @@
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+#include <vector>
+#include <list>
+#include <math.h>
+
+#include <assert.h>
+#include <time.h>
+#include <pthread.h>
+
+#include "utils.h"
+#include "seq_contig.h"
+#include "params.h"
+
+using namespace std;
+
+/*
+char random_char(){
+  string random_char_string = "aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ1234567890.";
+  int cur_rand_int = rand()%random_char_string.length();
+
+  return random_char_string[cur_rand_int];
+}
+
+string random_string(int string_length){
+  string result = "";
+  for(int i=0; i<string_length; i++){
+    result += random_char();
+  }
+  return result;
+}
+*/
+
+
+double normal_weight(double x, double sigma){
+  double pi=3.14;
+  return exp(-x*x/(2*sigma*sigma))/(sqrt(2.0*pi)*sigma);
+}
+
+
+struct pthread_pars{
+  //vector<unsigned int>* hit_counts_for_ptr;
+  //vector<unsigned int>* hit_counts_rev_ptr;
+  //vector<double>* score_ptr;
+
+  vector<int>* pos_hits;
+  vector<int>* neg_hits;
+  float* score_ptr;
+
+  //int pos_hits_size;
+  //int neg_hits_size;
+  int score_size;
+
+  double peak_shift;
+  double kde_bandwidth;
+
+  int calc_start_value;
+  int calc_end_value;
+
+  int thread_count;
+  unsigned int calc_window;
+};
+
+int int_max(int i, int j){
+  if ( i>j ) return i;
+  return j;
+}
+int int_min(int i, int j){
+  if (i<j) return i;
+  return j;
+}
+
+void* pscore(void* vptr){
+  pthread_pars* pars_ptr = (pthread_pars*)(vptr);
+
+  if ( !( pars_ptr->calc_start_value <=  pars_ptr->calc_end_value )){
+    cerr<<"thread: "<<pars_ptr->thread_count<<endl;
+    cerr<<"start_count: "<<pars_ptr->calc_start_value<<endl;
+    cerr<<"end_count: "<<pars_ptr->calc_end_value<<endl;
+    assert(false);
+  }
+
+  assert( pars_ptr->calc_start_value >= 0);
+  int calc_window = pars_ptr->calc_window;
+  vector<int>* pos_hits = pars_ptr->pos_hits;
+  vector<int>* neg_hits = pars_ptr->neg_hits;
+  //int pos_hits_size = pars_ptr->pos_hits_size;
+  //int neg_hits_size = pars_ptr->neg_hits_size;
+  float* score = pars_ptr->score_ptr;
+  int score_size = pars_ptr->score_size;
+  int calc_start_value = pars_ptr->calc_start_value;
+  int calc_end_value = pars_ptr->calc_end_value;
+  
+
+  //vector<unsigned int>* hit_counts_for_ptr = pars_ptr->hit_counts_for_ptr;
+  //vector<unsigned int>* hit_counts_rev_ptr = pars_ptr->hit_counts_rev_ptr;
+  double peak_shift = pars_ptr->peak_shift;
+  double kde_bandwidth = pars_ptr->kde_bandwidth;
+
+  /*
+  for(int i=pars_ptr->calc_start_value; i<pars_ptr->calc_end_value; i++){
+    for(int j=i-calc_window/2; j<=i+((int)calc_window)/2; j++){
+       (*(pars_ptr->score_ptr))[i] +=
+        ((double) (*hit_counts_for_ptr)[j] ) * normal_weight(j-i+peak_shift, kde_bandwidth) +
+        ((double) (*hit_counts_rev_ptr)[j] ) * normal_weight(j-i-peak_shift, kde_bandwidth);
+    }
+  }
+  */
+  
+  //locating boundaries in the hits list
+  bool pos_start_found = false;
+  bool pos_end_found = false;
+  bool pos_end_exists = false;
+
+  int pos_start = -1;
+  int pos_end = -1;
+
+  bool neg_start_found = false;
+  bool neg_end_found = false;
+  bool neg_end_exists = false;
+
+  int neg_start = -1;
+  int neg_end = -1;
+
+  // looking for the window of pos hits corresponding to this region 
+  for(int i=(int)pos_hits->size()-1; i>=0; i--){ //can replace this with the binary search
+    if((*pos_hits)[i] >= calc_start_value - calc_window){
+      pos_start_found = true;
+      pos_end_exists = true;
+      pos_start = i;
+    }
+    if((*pos_hits)[i] >= calc_end_value + calc_window){
+      pos_end_found = true;
+      pos_end = i;
+    }
+  }
+
+  if(pos_end_found == false && pos_end_exists == true){
+    if(pos_hits->size() > 0){
+      pos_end_found = true;
+      pos_end = pos_hits->size();
+    }
+  }
+
+  // looking for the window of neg hits corresponding to this region 
+  for(int i = (int)neg_hits->size()-1; i>=0; i--){ //can replace this with the binary search
+    if((*neg_hits)[i] >= calc_start_value - calc_window){
+      neg_start_found = true;
+      neg_end_exists = true;
+      neg_start = i;
+    }
+    if((*neg_hits)[i] >= calc_end_value + calc_window){
+      neg_end_found = true;
+      neg_end = i;
+    }
+  }
+  
+  if(neg_end_found == false && neg_end_exists == true){
+    if(neg_hits->size() > 0){
+      neg_end_found = true;
+      neg_end = neg_hits->size();
+    }
+  }
+
+  //cout<<"pos_start: "<<pos_start<<" [ "<<calc_start_value<<" , "<<calc_end_value<<" ]"<<endl;
+  //cout<<"pos_end: "<<pos_end<<" [ "<<calc_start_value<<" , "<<calc_end_value<<" ]"<<endl;
+  //cout<<"neg_start: "<<neg_start<<" [ "<<calc_start_value<<" , "<<calc_end_value<<" ]"<<endl;
+  //cout<<"neg_end: "<<neg_end<<" [ "<<calc_start_value<<" , "<<calc_end_value<<" ]"<<endl;
+  
+  
+  if(pos_start_found && pos_end_found){
+    for(int i=pos_start; i<pos_end; i++){
+      int cur_pos_coord = (*pos_hits)[i];
+      for(int j=int_max(0, cur_pos_coord - calc_window/2 + (int)peak_shift); 
+         j<=int_min(score_size, cur_pos_coord + calc_window/2 + (int)peak_shift); j++){
+       score[j] += normal_weight(j-cur_pos_coord-peak_shift, kde_bandwidth);
+      }
+    }
+  }
+
+  if(neg_start_found && neg_end_found){
+    for(int i=neg_start; i<neg_end; i++){
+      int cur_pos_coord = (*neg_hits)[i];
+      for(int j=int_max(0, cur_pos_coord - calc_window/2 - (int)peak_shift); 
+         j<=int_min(score_size, cur_pos_coord + calc_window/2 - (int)peak_shift); j++){
+       score[j] += normal_weight(j-cur_pos_coord+peak_shift, kde_bandwidth);
+      }
+    }
+  }
+
+  /*
+  for(int i=pars_ptr->calc_start_value; i<pars_ptr->calc_end_value; i++){
+    for(int j=i-calc_window/2; j<=i+((int)calc_window)/2; j++){
+       (*(pars_ptr->score_ptr))[i] +=
+        ((double) (*hit_counts_for_ptr)[j] ) * normal_weight(j-i+peak_shift, kde_bandwidth) +
+        ((double) (*hit_counts_rev_ptr)[j] ) * normal_weight(j-i-peak_shift, kde_bandwidth);
+    }
+  }
+  */
+  
+  return 0;
+}
+
+
+int main(int argc, char* argv[]){
+  
+  params pars(argc, argv);
+  pars.require("align_file","align_file",STRING_TYPE);
+  pars.require("contig_id","contig_id",STRING_TYPE);
+  pars.require("contig_size","contig_size",INT_TYPE);
+  pars.require("output_score_file","output_score_file",STRING_TYPE);
+
+  pars.optional("peak_shift","peak_shift","50",INT_TYPE);
+  pars.optional("calc_window","calc_window","300",INT_TYPE);
+  pars.optional("kde_bandwidth","kde_bandwidth","30",DOUBLE_TYPE);  
+  pars.optional("threads","threads","7",INT_TYPE);
+  
+  if(!pars.enforce()){
+    exit(1);
+  }
+
+  cout<<endl;
+  pars.list_all_params();
+
+  string align_fname = pars.get_string_value("align_file");
+  string contig_id = pars.get_string_value("contig_id");
+  int contig_size = pars.get_int_value("contig_size");
+
+  if(contig_size <= 0){
+    cout<<"Error: wrong contig size: "<<contig_size<<endl;
+    exit(0);
+  }
+
+  string output_fname = pars.get_string_value("output_score_file");
+
+  int peak_shift = pars.get_int_value("peak_shift");
+  int calc_window = pars.get_int_value("calc_window");
+  double kde_bandwidth = pars.get_double_value("kde_bandwidth");
+  int thread_num = pars.get_int_value("threads");
+
+  cout<<endl<<"Will be using the following:"<<endl<<endl;
+  cout<<"peak_shift: "<<peak_shift<<endl;
+  cout<<"kde_bandwidth: "<<kde_bandwidth<<endl;
+  cout<<"calc_window: "<<calc_window<<endl;
+
+  cout<<"align_file: "<<align_fname<<endl;
+
+  cout<<endl;
+
+  ifstream hit_str(align_fname.c_str());
+  if(!hit_str.good()){
+    cerr<<"bad file name: "<<align_fname<<endl;
+    exit(1);
+  }
+  
+  cout<<"processing contig: "<<contig_id<<" [ "<<contig_size<<" bps ]"<<endl;
+  
+
+  remove(output_fname.c_str());
+  ofstream out_str(output_fname.c_str());
+  if(!out_str.good()){
+    cerr<<"bad file name: "<<output_fname<<endl;
+    exit(1);
+  }
+
+  /*
+  cout<<"allocating memory..."<<endl;
+  vector<unsigned int> hit_counts_for(contig_size);
+  vector<unsigned int> hit_counts_rev(contig_size);
+  vector<double> score(contig_size);
+
+  for(int i=0; i<contig_size; i++){
+    hit_counts_for[i] = 0;
+    hit_counts_rev[i] = 0;
+    score[i] = 0;
+  }
+  */
+
+  cout<<"reading align_file..."<<endl;
+  
+  int entries_pos = 0;
+  int entries_neg = 0;
+  string dummy_string;
+
+  char sep1 = ' ';
+
+  while(hit_str.good()){
+    getline(hit_str, dummy_string);
+    if(hit_str.good()){
+      if(dummy_string.length() > 0){
+       if( dummy_string[0] != '#'){    
+         
+         vector<string> cur_hit_entry_fields = split(dummy_string, sep1);
+         if(cur_hit_entry_fields.size() == 3){
+           string cur_hit_contig_name = cur_hit_entry_fields[0];
+           int cur_hit_5p_coord = atoi(cur_hit_entry_fields[1].c_str());
+           string cur_hit_orient = cur_hit_entry_fields[2];
+
+           if(cur_hit_orient != "+" && cur_hit_orient != "-"){
+             cout<<"Warning: expected +/- for orientation of read but found "<<cur_hit_orient;
+             cout<<". Skipping."<<endl;
+           }
+           else{
+             
+             if(cur_hit_contig_name == contig_id){
+               if(cur_hit_5p_coord < 0 || cur_hit_5p_coord >= contig_size){
+                 cout<<"Warning read coordinate "<<cur_hit_5p_coord<<" is out of boundaries [ 0,";
+                 cout<<contig_size<<" ]. Skipping."<<endl;             
+               }
+               else{
+                 
+                 //entries++;
+                 
+                 //int start_coord = cur_hit_5p_coord;
+                                 
+                 if(cur_hit_orient == "+"){
+                   entries_pos++;
+                   //hit_counts_for[start_coord]++;
+                 }
+                 else{ // "-"
+                   entries_neg++;
+                   //hit_counts_rev[start_coord]++;
+                 }
+               }
+             }
+           }
+         }
+       }
+      }   
+    }
+  }
+  
+  hit_str.close();  
+
+  cout<<"read: "<<entries_pos<<" + entries "<<endl;
+  cout<<"read: "<<entries_neg<<" - entries "<<endl;
+  
+  //int* pos_hits = new int [entries_pos];
+  //int* neg_hits = new int [entries_neg];
+
+  vector<int> pos_hits(entries_pos);
+  vector<int> neg_hits(entries_neg);
+  
+  ifstream hit_str_pass2(align_fname.c_str());
+  if(!hit_str_pass2.good()){
+    cerr<<"Failed to read the file "<<align_fname<<endl;
+    exit(1);
+  }
+
+
+  int entries_pos_pass2 = 0;
+  int entries_neg_pass2 = 0;
+  
+  while(hit_str_pass2.good()){
+    getline(hit_str_pass2, dummy_string);
+    if(hit_str_pass2.good()){
+      if(dummy_string.length() > 0){
+       if( dummy_string[0] != '#'){    
+         
+         vector<string> cur_hit_entry_fields = split(dummy_string, sep1);
+         if(cur_hit_entry_fields.size() == 3){
+           string cur_hit_contig_name = cur_hit_entry_fields[0];
+           int cur_hit_5p_coord = atoi(cur_hit_entry_fields[1].c_str());
+           string cur_hit_orient = cur_hit_entry_fields[2];
+           
+           if(cur_hit_orient != "+" && cur_hit_orient != "-"){
+             cout<<"Warning: expected +/- for orientation of read but found "<<cur_hit_orient;
+             cout<<". Skipping."<<endl;
+           }
+           else{             
+             if(cur_hit_contig_name == contig_id){
+               if(cur_hit_5p_coord < 0 || cur_hit_5p_coord >= contig_size){
+                 cout<<"Warning read coordinate "<<cur_hit_5p_coord<<" is out of boundaries [ 0,";
+                 cout<<contig_size<<" ]. Skipping."<<endl;             
+               }
+               else{
+                 int start_coord = cur_hit_5p_coord;
+                                 
+                 if(cur_hit_orient == "+"){
+                   if(entries_pos_pass2 >= entries_pos){
+                     cout<<"reading out of bounds. Aborting."<<endl;
+                     exit(1);
+                   }
+                   pos_hits[entries_pos_pass2] = start_coord;
+                   entries_pos_pass2++;
+                   //hit_counts_for[start_coord]++;
+                 }
+                 else{ // "-"
+                   if(entries_neg_pass2 >= entries_neg){
+                     cout<<"reading out of bounds. Aborting."<<endl;
+                     exit(1);
+                   }
+                   neg_hits[entries_neg_pass2] = start_coord;
+                   entries_neg_pass2++;
+                   //hit_counts_rev[start_coord]++;
+                 }
+               }
+             }
+           }
+         }
+       }
+      }   
+    }
+  }
+  
+  hit_str_pass2.close();  
+
+  //cout<<"read: "<<entries_pos_pass2<<" positive reads"<<endl;
+  //cout<<"read: "<<entries_neg_pass2<<" negative reads"<<endl;
+
+  cout<<"sorting hits"<<endl;
+  sort(pos_hits.begin(), pos_hits.end());
+  sort(neg_hits.begin(), neg_hits.end());
+
+  //cout<<"enter something "<<endl;
+  //int blah;
+  //cin>>blah;
+  //cout<<"you've entered "<<blah<<endl;
+
+  cout<<"allocating memory for density profile CDP"<<endl;
+  float *scores = new float[contig_size];
+  cout<<"zeroing the score array"<<endl;
+  for(int i=0; i<contig_size; i++){
+    scores[i] = 0;
+  }
+
+  //int blah;
+  //cin>>blah;
+  //cout<<"you've entered "<<blah<<endl;
+
+
+  cout<<"calculating scores"<<endl;
+
+  cout<<"using "<<thread_num<<" parallel threads"<<endl;
+
+  pthread_t* threads = new pthread_t[thread_num];
+  pthread_pars* pars_t = new pthread_pars[thread_num];
+
+  int set_size = ( ( contig_size - calc_window ) - ( calc_window ) );
+  int remainder = set_size % thread_num;
+  int portion = (set_size - remainder) / thread_num;
+  
+  int last_end = calc_window;
+
+  for(int i=0; i<thread_num; i++){
+    
+    int cur_start_value = last_end;
+    int cur_end_value;
+    
+    if( i < remainder ) cur_end_value = cur_start_value + portion + 1;
+    else cur_end_value = cur_start_value + portion;
+
+    assert( cur_end_value > cur_start_value);
+    assert( cur_start_value >= 0);
+
+    last_end = cur_end_value;
+
+    //pars_t[i].hit_counts_for_ptr = &hit_counts_for;
+    //pars_t[i].hit_counts_rev_ptr = &hit_counts_rev;
+    pars_t[i].pos_hits = &pos_hits;
+    pars_t[i].neg_hits = &neg_hits;
+
+    //pars_t[i].pos_hits_size = pos_hits.size();//entries_pos;   
+    //pars_t[i].neg_hits_size = neg_hits.size(); //entries_neg;
+   
+    pars_t[i].score_ptr = scores;
+    pars_t[i].score_size = contig_size;
+
+    pars_t[i].kde_bandwidth = kde_bandwidth;
+    pars_t[i].peak_shift = peak_shift;    
+    pars_t[i].calc_start_value = cur_start_value;
+    pars_t[i].calc_end_value = cur_end_value;
+    pars_t[i].thread_count = i;
+    pars_t[i].calc_window = calc_window;
+
+    if( i == thread_num - 1){
+      if( cur_end_value != (int)contig_size - calc_window ){
+       cerr<<"failed to match the end. expected: "<<contig_size - calc_window;
+       cerr<<" but obtained: "<<cur_end_value<<endl;
+       assert(false);
+      }      
+    }
+  }
+
+  for(int i=0; i<thread_num; i++){
+    pthread_create( threads+i, NULL, pscore, (void*)(pars_t+i) );
+  }
+
+  for(int i=0; i<thread_num; i++){
+    pthread_join( *(threads+i), NULL);
+  }
+  
+
+  cout<<"saving the score file"<<endl;
+
+  int entries = entries_pos + entries_neg;
+
+  out_str.write((char*)(&contig_size), sizeof(contig_size));
+  out_str.write((char*)(&entries), sizeof(entries));
+  out_str.write((char*)(&scores[0]), sizeof(scores[0])*contig_size);
+  
+  out_str.close();
+
+ delete[] threads;
+ delete[] pars_t;
+ delete[] scores;
+
+ // delete threads;
+ // delete pars_t;
+ // delete scores;
+
+  return 0;
+
+}
diff --git a/htswanalysis/bin/QuEST/src/peak_caller.cpp b/htswanalysis/bin/QuEST/src/peak_caller.cpp
new file mode 100755 (executable)
index 0000000..c9f4689
--- /dev/null
@@ -0,0 +1,323 @@
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+#include <vector>
+#include <list>
+//#include <algorithm>
+#include <math.h>
+#include <assert.h>
+#include <time.h>
+
+using namespace std;
+
+#define BLACK_BACKGROUND
+
+//#include "ticker.h"
+//#include "seq_contig.h"
+#include "params.h"
+
+#define read_size 25
+
+int main(int argc, char* argv[]){
+  
+  params pars(argc, argv);
+  pars.require("score_file1","score_file1",STRING_TYPE);
+  pars.require("score_file2","score_file2",STRING_TYPE);
+  pars.require("output_file","output_file",STRING_TYPE);
+  pars.require("chip_threshold","chip_threshold",STRING_TYPE);
+  pars.optional("peak_caller_method","maximum/local_maximum","local_maximum",STRING_TYPE);
+  pars.optional("local_maximum_radius","local_maximum_radius","10",INT_TYPE);
+  pars.require("contig_id","contig_id",STRING_TYPE);
+  pars.require("background_threshold","background_threshold",DOUBLE_TYPE);
+  pars.optional("dip_fraction","dip_fraction","0.1",DOUBLE_TYPE);// how much of the lowest
+  //pars.optional("above_threshold_mult","above_threshold_mult","30.0",DOUBLE_TYPE);
+  pars.require("rescue_ratio","rescue_ratio",DOUBLE_TYPE);
+  //peak the graph should dip before going for another peak
+
+  if(!pars.enforce()) exit(1);
+
+  string scores_fname1 = pars.get_string_value("score_file1");
+  string scores_fname2 = pars.get_string_value("score_file2");
+  string output_fname = pars.get_string_value("output_file");
+  double score_threshold = pars.get_double_value("chip_threshold");
+  string peak_caller_method = pars.get_string_value("peak_caller_method");
+  string contig_id = pars.get_string_value("contig_id");
+  int local_maximum_radius = pars.get_int_value("local_maximum_radius");
+  double background_threshold = pars.get_double_value("background_threshold");
+  double dip_fraction = pars.get_double_value("dip_fraction");
+  double above_threshold_mult = pars.get_double_value("rescue_ratio");
+
+  if(peak_caller_method == "maximum" || peak_caller_method == "local_maximum"){}
+  else{
+    cerr<<"bad peak_caller_method: "<<peak_caller_method<<endl;
+    exit(1);
+  }
+  //string contig_id(argv[2]);
+  //cout<<"processing contig: "<<contig_id<<endl;
+
+  //string ref_str_fname(argv[1]);
+  //ifstream ref_str(ref_str_fname.c_str());
+  //if(!ref_str.good()){
+  //  cerr<<"bad file: "<<ref_str_fname<<endl;
+  //  exit(1);
+  // }
+  cout<<endl<<"contig: "<<contig_id<<endl;
+  cout<<"score_file1: "<<scores_fname1<<endl;
+  cout<<"score_file2: "<<scores_fname2<<endl;
+  cout<<"score_threshold: "<<score_threshold<<endl;
+  cout<<"background_threshold: "<<background_threshold<<endl;
+
+
+  ifstream scores_str1(scores_fname1.c_str());
+  if(!scores_str1.good()){
+    cerr<<"bad file: "<<scores_fname1<<endl;
+    exit(1);
+  }
+
+  ifstream scores_str2(scores_fname2.c_str());
+  if(!scores_str2.good()){
+    cerr<<"bad file: "<<scores_fname2<<endl;
+    exit(1);
+  }
+
+  /*
+  seq_contig ref_contig;
+  int load_contig_msg = ref_contig.load_contig_from_file(contig_id, ref_str);
+  if(load_contig_msg != SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE){
+    cerr<<"failed to load contig [ "<<contig_id<<" ] from file "<<ref_str_fname<<endl;
+    exit(1);
+  }
+  else{
+    cerr<<"loaded "<<ref_contig.seq.length()<<" letters"<<endl;
+  }
+
+  ref_str.close();
+  */
+
+  cout<<"loading the scores"<<endl;
+
+  unsigned int _size1, _size2;
+  int entries1, entries2;
+
+  scores_str1.read((char*) (&_size1), sizeof(_size1));
+  scores_str2.read((char*) (&_size2), sizeof(_size2));
+
+  cout<<"_size1: "<<_size1<<endl;
+  cout<<"_size2: "<<_size2<<endl;
+  assert(_size1 == _size2);
+
+  scores_str1.read((char*) (&entries1), sizeof(entries1));
+  scores_str2.read((char*) (&entries2), sizeof(entries2));
+
+  cout<<"the scores file contains "<<_size1<<" entries"<<endl;
+
+  cout<<"score file 1: "<<entries1<<" tags."<<endl;
+  cout<<"score file 2: "<<entries2<<" tags."<<endl;
+
+  vector<float> scores1(_size1);
+  vector<float> scores2(_size2);
+
+  cout<<"loading the score values..."<<endl;
+  scores_str1.read((char*) &(scores1[0]), sizeof(scores1[0])*_size1);
+  cout<<"file1 loaded."<<endl;
+  scores_str2.read((char*) &(scores2[0]), sizeof(scores2[0])*_size2);
+  cout<<"file2 loaded."<<endl;
+
+  //for(int i=0; i<_size; i++){
+    //if(scores[i] > 20) { cout<<"i: "<<i<<" s: "<<scores[i]<<endl;/* exit(1);*/}
+  //}
+  //cout<<"done reading! "<<scores[100]<<" "<<scores[101]<<endl;
+  
+  assert(_size1 = _size2);
+  unsigned int _size = _size1;
+
+  //assert(_size == ref_contig.seq.length());
+
+  //string out_fname(argv[5]);
+
+  remove(output_fname.c_str());
+  ofstream out_str(output_fname.c_str());
+  if(!out_str.good()){
+    cerr<<"bad file name: "<<output_fname<<endl;
+    exit(1);
+  }
+
+  //double thresh = 20;//30;//50;
+
+  cout<<"score_threshold: "<<score_threshold<<endl;
+  cout<<"background_threshold: "<<background_threshold<<endl;
+  cout<<"dip_fraction: "<<dip_fraction<<endl;
+  cout<<"above_threshold_mult: "<<above_threshold_mult<<endl;
+
+  cout<<"analyzing..."<<endl;
+  unsigned int peak_calls = 0;
+  unsigned int last_start_pos = 0;
+  //double max_value = -1;
+  //unsigned int max_pos = 0;
+  double last_score = -1;
+  for(unsigned int i=0; i<_size; i++){   
+    double cur_score = scores1[i];
+    
+    if(i>0){
+      if(cur_score >= score_threshold){
+       if(last_score < score_threshold){
+         last_start_pos = i;
+         //max_value = cur_score;
+         //max_pos =i;
+       }
+       //else{
+         //if(cur_score > max_value){
+           //max_value = cur_score; max_pos = i;
+         //}
+       //}
+      }
+      else{
+       if(last_score >= score_threshold){ 
+         //resolve peaks
+         //if(peak_caller_method == "maximum"){
+         unsigned int region_start = last_start_pos;
+         unsigned int region_end = i;
+         
+         double max_score = -1;
+         unsigned int max_pos = 0;      
+         for(unsigned int j=region_start; j<region_end; j++){
+           if(j==region_start){ max_score = scores1[j]; max_pos = j; }
+           else{
+             if( scores1[j] > max_score ){ max_score = scores1[j]; max_pos = j; }
+           }
+         }
+         //if(scores2[max_pos] < background_threshold ){
+         {
+           if(peak_caller_method == "maximum"){            
+             if(scores2[max_pos] < background_threshold ||
+                (scores2[max_pos] >= background_threshold && 
+                 scores1[max_pos] >= above_threshold_mult * scores2[max_pos] ) ){
+               out_str<<contig_id<<" w: "<<i-1-last_start_pos<<" max: "<<max_score<<" at: "<<max_pos<<" null_s: "<<scores2[max_pos]<<endl;
+               peak_calls++;
+             }
+           }
+           if(peak_caller_method == "local_maximum"){
+             unsigned int region_start = last_start_pos;
+             unsigned int region_end = i;
+             vector<unsigned int> candidates;
+             vector<unsigned int> local_max_peaks;
+             vector<unsigned int> past_filter_peaks;
+             for(unsigned int j=region_start+2; j<region_end; j++){
+               if(scores1[j-2] <= scores1[j-1] && scores1[j-1] >= scores1[j]) 
+                 candidates.push_back(j-1);
+             }
+             for(unsigned int j=0; j<candidates.size(); j++){
+               bool local_max = true;
+               unsigned int cur_max_pos = candidates[j];
+               for(unsigned int k= cur_max_pos - local_maximum_radius; 
+                   k <= cur_max_pos + local_maximum_radius; k++){
+                 if( k!=cur_max_pos){ if(scores1[k] > scores1[cur_max_pos]) local_max = false; }       
+               }
+               
+               if(local_max) local_max_peaks.push_back(cur_max_pos);             
+             }
+             switch(local_max_peaks.size()){
+               //if(local_max_peaks.size() == 1) 
+             case 0:{ break; }
+             case 1:{ past_filter_peaks.push_back(local_max_peaks[0]); break;}
+             default:{
+               //else{
+               vector<int> local_max_peaks_mask(local_max_peaks.size());
+               for(unsigned int k=0; k<local_max_peaks_mask.size(); k++) local_max_peaks_mask[k] = 0;
+               bool max_pos_found = false;
+               unsigned int max_ind = 1000;
+               for(unsigned int k=0; k<local_max_peaks.size(); k++){
+                 if(local_max_peaks[k] == max_pos){
+                   max_pos_found = true;
+                   max_ind = k;
+                 }
+               }
+               if(!max_pos_found){
+                 cerr<<"bad region: "<<endl;
+                 cerr<<"[ "<<last_start_pos<<","<<i<<" ]"<<endl;
+                 cerr<<"score["<<last_start_pos<<"] = "<<scores1[last_start_pos]<<endl;
+                 assert(max_pos_found);
+               }
+               local_max_peaks_mask[max_ind] = 1; //selected;
+               unsigned int last_good_ind = max_ind;
+               for(unsigned int k=max_ind+1; k<local_max_peaks.size(); k++){
+                 unsigned int last_good_pos = local_max_peaks[last_good_ind];
+                 unsigned int cur_pos = local_max_peaks[k];
+                 unsigned int smaller_peak, larger_peak;
+                 if(scores1[last_good_pos] > scores1[cur_pos]){ 
+                   smaller_peak = cur_pos; larger_peak = last_good_pos;
+                 }
+                 else{
+                   smaller_peak = last_good_pos; larger_peak = cur_pos;
+                 }
+                 double dip_value = (1-dip_fraction)*scores1[smaller_peak];
+                 bool dipped = false;
+                 for(unsigned int m=last_good_pos; m<=cur_pos; m++){
+                   if(scores1[m] <= dip_value) dipped = true;
+                 }
+                 if(dipped){
+                   local_max_peaks_mask[k] = 1;
+                   last_good_ind = k;
+                 }
+               }
+
+               last_good_ind = max_ind;
+               for(int k=(int)max_ind-1; k>=0; k--){
+                 unsigned int last_good_pos = local_max_peaks[last_good_ind];
+                 unsigned int cur_pos = local_max_peaks[k];
+                 unsigned int smaller_peak, larger_peak;
+                 if(scores1[last_good_pos] > scores1[cur_pos]){ 
+                   smaller_peak = cur_pos; larger_peak = last_good_pos;
+                 }
+                 else{
+                   smaller_peak = last_good_pos; larger_peak = cur_pos;
+                 }
+                 double dip_value = (1-dip_fraction)*scores1[smaller_peak];
+                 bool dipped = false;
+                 assert(last_good_pos > cur_pos);
+                 for(unsigned int m=last_good_pos; m>=cur_pos; m--){
+                   if(scores1[m] <= dip_value) dipped = true;
+                 }
+                 if(dipped){
+                   local_max_peaks_mask[k] = 1;
+                   last_good_ind = k;
+                 }
+               }
+
+               for(unsigned int k=0; k<local_max_peaks_mask.size(); k++){
+                 if(local_max_peaks_mask[k] == 1) past_filter_peaks.push_back(local_max_peaks[k]);
+               }
+               break;
+             }
+             }
+             for(unsigned int k=0; k<past_filter_peaks.size(); k++){           
+               unsigned int cur_peak_pos = past_filter_peaks[k];
+               if(scores2[max_pos] < background_threshold ||
+                  (scores2[max_pos] >= background_threshold && 
+                   scores1[max_pos] >= above_threshold_mult * scores2[max_pos] ) ){
+                 //if(scores2[cur_peak_pos] < background_threshold){
+                 //double locmax = scores1[cur_peak_pos];
+                 //double nulls = scores2[cur_peak_pos]; 
+                 //double ef = ( locmax / entries1 ) / ( nulls / entries2 );  //enrichment fold
+                 out_str<<contig_id<<" w: "<<i-1-last_start_pos<<" locmax: "<<scores1[cur_peak_pos];
+                 out_str<<" at: "<<cur_peak_pos<<" nulls: "<<scores2[cur_peak_pos];
+                 //out_str<<" ef: "<<ef;
+                 out_str<<endl;
+                 peak_calls++;
+               }
+             }     
+             out_str<<endl;
+           }
+         }
+         //out_str<<contig_id<<" w: "<<i-1-last_start_pos<<" max: "<<max_value<<" at: "<<max_pos<<" null_s: "<<scores2[max_pos]<<endl;
+       }
+      }
+    }
+    last_score = cur_score;
+  }
+    
+  cout<<"called: "<<peak_calls<<" peaks"<<endl;
+  return 0;
+}
diff --git a/htswanalysis/bin/QuEST/src/quick_window_scan.cpp b/htswanalysis/bin/QuEST/src/quick_window_scan.cpp
new file mode 100755 (executable)
index 0000000..312fdaf
--- /dev/null
@@ -0,0 +1,258 @@
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+#include <vector>
+#include <list>
+#include <math.h>
+#include <assert.h>
+#include <time.h>
+
+#include "utils.h"
+#include "seq_contig.h"
+#include "params.h"
+
+using namespace std;
+
+#define BLACK_BACKGROUND
+
+
+//#define read_size 25
+
+int el(const int* counts, unsigned int i, unsigned int j){
+  return counts[i*3+j];
+}
+
+double weight(int x){
+  
+  if(-50<= x && x<=50) return x/15000.0+1.0/300.0;
+  if(50<x && x<=250) return -1*x/(30000) + 1.0/120.0;
+  return 0;
+  
+}
+
+double weight_for(int x){
+  return weight(-x);
+}
+double weight_rev(int x){
+  return weight(x);
+}
+
+double indicator_for(int x){
+  if (x >= -300 && x <= 300) return 1;
+  else return 0;
+  //if ( x>= -250 && x <= 50 ) return 1;
+  //else return 0;
+}
+double indicator_rev(int x){
+  if (x >= -300 && x <= 300) return 1;
+  else return 0;
+
+  //if ( x>= -50 && x <= 250) return 1;
+  //else return 0;
+}
+double normal_weight(int x, double sigma){
+  double pi=3.14;
+  return exp(-x*x/(2*sigma*sigma))/(sqrt(2*pi)*sigma);
+}
+
+int main(int argc, char* argv[]){
+
+  params pars(argc, argv);
+  pars.require("align_file","align_file",STRING_TYPE);
+  pars.require("contig_id","contig_id",STRING_TYPE);
+  pars.require("contig_size","contig_size",INT_TYPE);
+  //pars.require("ref_seq_file","ref_seq_file",STRING_TYPE);
+  pars.require("output_file","output_summary_file",STRING_TYPE);
+  pars.optional("calc_window","calc_window","300",INT_TYPE);
+  //  pars.optional("kde_bandwidth","kde_bandwidth","30",DOUBLE_TYPE);  
+  pars.optional("count_threshold","count_threshold","600",INT_TYPE);
+
+  if(!pars.enforce()){
+    pars.list_all_params();
+    exit(1);
+  }
+
+  pars.list_all_params();
+  string align_fname = pars.get_string_value("align_file");
+  string contig_id = pars.get_string_value("contig_id");
+  int contig_size = pars.get_int_value("contig_size");
+  //string ref_seq_fname = pars.get_string_value("ref_seq_file");
+  string output_fname = pars.get_string_value("output_file");
+
+  int calc_window = pars.get_int_value("calc_window");
+  int count_threshold = pars.get_int_value("count_threshold");
+
+  cout<<"align file : "<<align_fname<<endl;
+
+  //ifstream ref_str(ref_seq_fname.c_str());
+  //if(!ref_str.good()){
+  //  cerr<<"bad file: "<<ref_seq_fname<<endl;
+  //  exit(1);
+  //}
+
+  if(contig_size <= 0){
+    cout<<"Bad size of the contig: "<<contig_size<<endl;
+    cout<<"Aborting."<<endl;
+    exit(0);
+  }
+
+  ifstream hit_str(align_fname.c_str());
+  if(!hit_str.good()){
+    cerr<<"bad file name: "<<align_fname<<endl;
+    exit(1);
+  }
+  
+  cout<<"processing contig: "<<contig_id<<endl;
+
+  remove(output_fname.c_str());
+  ofstream out_str(output_fname.c_str());//, ofstream::binary);
+  if(!out_str.good()){
+    cerr<<"bad file name: "<<output_fname<<endl;
+    exit(1);
+  }
+
+  //seq_contig ref_contig;
+  //int load_contig_msg = ref_contig.load_contig_from_file(contig_id, ref_str);
+  //if(load_contig_msg != SUCCEEDED_TO_LOAD_CONTIG_FROM_FILE){
+  //  cerr<<"failed to load contig [ "<<contig_id<<" ] from file "<<ref_seq_fname<<endl;
+  //  exit(1);
+  // }
+  //else{
+  //  cerr<<"loaded "<<ref_contig.seq.length()/1000000<<" M bps"<<endl;
+  //}
+  //unsigned int contig_size = ref_contig.seq.length();
+  //
+  //ref_str.close();
+
+
+  cout<<"allocating memory..."<<endl;
+  vector<unsigned int> hit_counts_for(contig_size);
+  vector<unsigned int> hit_counts_rev(contig_size);
+
+  for(int i=0; i<contig_size; i++){
+    hit_counts_for[i] = 0;
+    hit_counts_rev[i] = 0;
+    //score[i] = 0;
+  }
+
+  cout<<"reading align_file..."<<endl;
+  
+  unsigned int entries = 0;
+  string dummy_string;
+
+  char sep1 = ' ';
+  //char sep2 = ':';
+  while(hit_str.good()){
+    getline(hit_str, dummy_string);
+    if(hit_str.good()){
+      if(dummy_string.length() > 0){
+       if(dummy_string[0] != '#'){
+         
+         vector<string> cur_hit_entry = split(dummy_string, sep1);
+         //char dummy_char;
+         //cin>>dummy_char;
+         
+         if(cur_hit_entry.size() == 3){ //> 4){        
+           
+           //vector<string> cur_coord = split(cur_hit_entry[3], sep2);
+           //assert(cur_coord.size() > 0);
+           
+           string cur_read_contig_name = cur_hit_entry[0];
+           //cout<<cur_read_contig_name<<endl;
+           int cur_read_5p_coord = atoi(cur_hit_entry[1].c_str());
+           string cur_read_orient = cur_hit_entry[2];
+           
+           if(cur_read_contig_name == contig_id){
+             //if(false){
+             entries++;
+             int start_coord;
+             
+             //assert(cur_read_orient == "+" || cur_read_orient == "-");
+             if(cur_read_orient != "+" && cur_read_orient != "-"){
+               cout<<"Warning: in align file expected +/- for orientation but found: *"<<cur_read_orient<<"*"<<endl;
+               cout<<"Skipping this read."<<endl;
+             }
+             else{
+               start_coord = cur_read_5p_coord;
+               if(start_coord >= contig_size || start_coord <0){
+                 cout<<"Warning: read coordinate "<<start_coord<<" is out of boundaries";
+                 cout<<"[ 0, "<<contig_size<<" ]. Skipping this read."<<endl;
+               }
+               else{
+                 if(cur_read_orient == "+"){ hit_counts_for[start_coord]++; }
+                 else{ hit_counts_rev[start_coord]++; }
+               }
+             }
+           }
+         }
+       }
+      }
+    }
+  }
+  
+  hit_str.close();
+
+  cout<<"read: "<<entries<<" entries "<<endl;
+  cout<<"calculating scores"<<endl;
+
+  int cur_count = 0;
+  int cur_status = -1; //uninit
+  int cur_max = 0;
+  unsigned int cur_max_ind = 0;
+
+  for(unsigned int i=0; i<(unsigned int)calc_window; i++){
+    cur_count += hit_counts_for[i] + hit_counts_rev[i];
+  }
+
+  if(cur_count >= count_threshold){ //in the peak
+    cur_status = 1;
+    cur_max = cur_count;
+    cur_max_ind = calc_window/2-1;
+  }
+  else cur_status = 0; //waiting to get over the threshold
+  
+  int peaks_called = 0;
+  
+  for(int i=calc_window/2; i<(int) contig_size-(int)calc_window; i++){
+    //if(i%1000 == 0) print_tick_bar( ((double) i) / (0.01*((double) contig_size)));
+    
+     cur_count += hit_counts_for[i + calc_window/2] + hit_counts_rev[i + calc_window/2] 
+      - hit_counts_for[i-calc_window/2] - hit_counts_rev[i-calc_window/2];
+
+    switch (cur_status){
+    case 0: {
+      if (cur_count >= count_threshold){
+       cur_status = 1; 
+       cur_max = cur_count;
+       cur_max_ind = i;        
+      }
+      break;
+    }
+    case 1:{
+      if(cur_count < count_threshold){
+       out_str<<contig_id<<" "<<cur_max_ind<<" "<<cur_max<<endl;
+       cur_status = 0; 
+       cur_max = 0;
+       peaks_called++;
+      }
+      else{
+       if(cur_count > cur_max){
+         cur_max = cur_count;
+         cur_max_ind = i;
+       }
+      }
+      break;
+    }
+    }
+  }
+
+  cout<<"identified "<<peaks_called<<" candidated regions"<<endl<<endl;
+  
+  out_str.close();
+  
+  return 0;
+
+}
diff --git a/htswanalysis/bin/genomebg.mac b/htswanalysis/bin/genomebg.mac
new file mode 100755 (executable)
index 0000000..fe56922
Binary files /dev/null and b/htswanalysis/bin/genomebg.mac differ
diff --git a/htswanalysis/bin/macs b/htswanalysis/bin/macs
new file mode 100755 (executable)
index 0000000..bb2669a
--- /dev/null
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+# Time-stamp: <2008-05-21 18:19:59 Tao Liu>
+
+"""Module Description
+
+Copyright (c) 2008 Yong Zhang, Tao Liu <taoliu@jimmy.harvard.edu>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the Artistic License (see the file COPYING included
+with the distribution).
+
+@status:  beta
+@version: $Revision$
+@author:  Yong Zhang, Tao Liu
+@contact: taoliu@jimmy.harvard.edu
+"""
+
+# ------------------------------------
+# python modules
+# ------------------------------------
+
+import os
+import sys
+import logging
+import math
+from optparse import OptionParser
+# ------------------------------------
+# own python modules
+# ------------------------------------
+from MACS.gsl.cdf import binom_cdf_Q
+from MACS.TabIO import BEDParser, ELANDParser
+from MACS.PeakModel import PeakModel
+from MACS.PeakDetect import PeakDetect
+
+# ------------------------------------
+# Main function
+# ------------------------------------
+def main():
+    usage = "usage: %prog <-t tfile> [options]"
+    description = "MACS -- Model-based Analysis for ChIP-Sequencing"
+
+    # Parse options...
+    optparser = OptionParser(version="%prog 1.0 beta",description=description,usage=usage,add_help_option=False)
+    optparser.add_option("-h","--help",action="help",help="show this help message and exit.")
+    optparser.add_option("-t","--treatment",dest="tfile",type="string",
+                         help="ChIP-seq treatment files. REQUIRED")
+    optparser.add_option("-c","--control",dest="cfile",type="string",
+                         help="control files.")
+    optparser.add_option("--name",dest="name",type="string",
+                         help="experiment name, which will be used to generate output file names. DEFAULT: \"NA\"",
+                         default="NA")
+    optparser.add_option("--format",dest="format",type="string",
+                         help="Format of tag file, \"ELAND\" or \"BED\". DEFAULT: \"BED\"",
+                         default="bed")
+    optparser.add_option("--gsize",dest="gsize",type="int",default=2700000000,
+                         help="genome size, default:2700000000")
+    optparser.add_option("--tsize",dest="tsize",type="int",default=25,
+                         help="tag size. DEFAULT: 25")
+    optparser.add_option("--bw",dest="bw",type="int",default=300,
+                         help="band width. DEFAULT: 300")
+    optparser.add_option("--pvalue",dest="pvalue",type="float",default=1e-5,
+                         help="pvalue cutoff for peak detection. DEFAULT: 1e-5")
+    optparser.add_option("--mfold",dest="mfold",type="int",default=32,
+                         help="select the regions with MFOLD high-confidence enrichment ratio against background to build model. DEFAULT:32")
+#     optparser.add_option("--wig",dest="wig",action="store_true",
+#                          help="Whether or not to save the peak shape for every peak region in wiggle file",
+#                          default=False)
+    optparser.add_option("--verbose",dest="verbose",type="int",default=2,
+                         help="set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
+    (options,args) = optparser.parse_args()
+
+    if not options.tfile:
+        optparser.print_help()
+        sys.exit(1)
+
+    # configure the logging instance
+    logging.basicConfig(level=(4-options.verbose)*10,
+                        format='%(levelname)-5s @ %(asctime)s: %(message)s ',
+                        datefmt='%a, %d %b %Y %H:%M:%S',
+                        stream=sys.stderr,
+                        filemode="w"
+                        )
+
+    if not os.path.isfile (options.tfile):
+        logging.error("No such file: %s!" % options.tfile)
+        sys.exit(1)
+    if options.cfile and not os.path.isfile (options.cfile):
+        logging.error("No such file: %s!" % options.cfile)
+        sys.exit(1)
+
+    #0 output arguments
+    logging.info("#0 arguments list:")
+    logging.info("#0 name = %s" % (options.name))
+    logging.info("#0 ChIP-seq file = %s" % (options.tfile))
+    logging.info("#0 control file = %s" % (options.cfile))
+    logging.info("#0 effective genome size = %.2e" % (options.gsize))
+    logging.info("#0 tag size = %d" % (options.tsize))
+    logging.info("#0 band width = %d" % (options.bw))
+    logging.info("#0 model fold = %s" % (options.mfold))
+    #logging.info("#0 model peaks = %s" % (options.mpeaks))
+    logging.info("#0 pvalue cutoff = %.2e" % (options.pvalue))
+
+    log_pvalue = math.log(options.pvalue,10)*-10
+
+    #1 Read tag files
+    logging.info("#1 read tag files...")
+    format = options.format.upper()
+    (treat, control) = load_tag_files (options.tfile, options.cfile,format)
+    logging.debug("#1.3 calculate max duplicate tags in single position based on binomal distribution...")
+    max_dup_tags = cal_max_dup_tags(options.gsize,treat.total_w_duplicates)
+    logging.debug("#1.3  max_dup_tags = %d" % (max_dup_tags))
+    logging.debug("#1.3  unique tags in treatment: %d" % (treat.total))
+    logging.debug("#1.3  total tags in treatment: %d" % (treat.total_w_duplicates))
+    if control:
+        logging.debug("#1.3  unique tags in control: %d" % (control.total))
+        logging.debug("#1.3  total tags in control: %d" % (control.total_w_duplicates))
+    bg_redundant = bg_r (treat,max_dup_tags)
+    logging.info("#1.3  Background Redundant rate: %.2f" % (bg_redundant))
+    logging.info("#1.3 finished!")
+    #2 Build Model
+    logging.info("#2 Build Peak Model...")
+    peakmodel = PeakModel(treatment = treat,
+                          gz = options.gsize, # mappable genome size
+                          bw = options.bw,
+                          fold = options.mfold,
+                          max_pairnum = 1000,
+                          ts = options.tsize,
+                          bg = bg_redundant,
+                          max_dup_tags = max_dup_tags
+                          )
+    logging.info("#2 finished!")
+    logging.debug("#2  Summary Model:")
+    logging.debug("#2   min_tags: %d" % (peakmodel.min_tags))
+    logging.debug("#2   frag_length: %d" % (peakmodel.frag_length))
+    logging.debug("#2   scan_window: %d" % (peakmodel.scan_window))
+    logging.info("#2.2 Generate R script for model")
+    model2r_script(peakmodel,options.name)
+    #3 Call Peaks
+    logging.info("#3 Call peaks...")
+    peakdetect = PeakDetect(treat = treat,
+                            control = control,
+                            frag_length = peakmodel.frag_length,
+                            scan_window = peakmodel.scan_window,
+                            pvalue = log_pvalue,
+                            gsize = options.gsize
+                            )
+    peakdetect.call_peaks()
+    #4 output
+    #4.1 peaks in XLS
+    ofile_xls = options.name+"_peaks.xls"
+    logging.info("#4.1 Write output xls file... %s" % (ofile_xls))
+    ofhd_xls = open(ofile_xls,"w")
+    ofhd_xls.write("# This file is generated by MACS\n")
+    ofhd_xls.write("# name = %s\n" % (options.name))
+    ofhd_xls.write("# ChIP-seq file = %s\n" % (options.tfile))
+    ofhd_xls.write("# control file = %s\n" % (options.cfile))
+    ofhd_xls.write("# effective genome size = %.2e\n" % (options.gsize))
+    ofhd_xls.write("# tag size = %d\n" % (options.tsize))
+    ofhd_xls.write("# band width = %d\n" % (options.bw))
+    ofhd_xls.write("# model fold = %s\n" % (options.mfold))
+    ofhd_xls.write("# pvalue cutoff = %.2e\n" % (options.pvalue))
+    ofhd_xls.write("# frag_len = %d\n" % (peakdetect.frag_length))
+    ofhd_xls.write(peakdetect.toxls())
+    ofhd_xls.close()
+    #4.2 peaks in BED
+    ofile_bed = options.name+"_peaks.bed"
+    logging.info("#4.2 Write output bed file... %s" % (ofile_bed))
+    ofhd_bed = open(ofile_bed,"w")
+    ofhd_bed.write(peakdetect.tobed())
+    ofhd_bed.close()
+
+    #4.3 negative peaks in XLS
+    if options.cfile:
+        ofile_xls = options.name+"_negative_peaks.xls"
+        logging.info("#4.3 Write output xls file for negative peaks... %s" % (ofile_xls))
+        ofhd_xls = open(ofile_xls,"w")
+        ofhd_xls.write(peakdetect.neg_toxls())
+        ofhd_xls.close()
+
+    logging.info("#5 Done! Check the output files!\n")
+   
+def bg_r (trackI, max_dup_tags):
+    total_tags  = trackI.total_w_duplicates
+    total_duplicates = 0
+    chrs = trackI.get_chr_names()
+    chrs.sort()
+    for chrom in chrs:
+        countlist = trackI.get_comments_by_chr (chrom)
+        for c in countlist[0]:
+            total_duplicates += max(c-max_dup_tags,0) 
+        for c in countlist[1]:
+            total_duplicates += max(c-max_dup_tags,0)
+    return float(total_duplicates)/(total_tags)
+
+
+def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
+    for i in range (1,1000):
+        if binom_cdf_Q(1.0/genome_size,tags_number,i) < p:
+            return i
+    raise Exception("LAMBDA > 1000!")
+
+def load_tag_files ( tfile, cfile, format ):
+    """From the options, load treatment tags and control tags (if available).
+
+    """
+    logging.info("#1.1 read treatment tags...")
+    treat = _merge_then_load(tfile,format)
+    treat.merge_overlap()
+    if cfile:
+        logging.info("#1.2 read input tags...")
+        control = _merge_then_load(cfile,format)
+        control.merge_overlap()
+    else:
+        control = None
+    return (treat, control)
+
+def _merge_then_load ( p,format ):
+    """Merge several tag files and load it to a single FWTrackI object.
+
+    """
+    #s = os.system("cat %s > macs_tmpfile.t" % p)
+    #if s == 0:
+    t = _load_tag_file(p,format)
+    #os.remove("macs_tmpfile.t")
+    #else:
+    #    raise Exception("unable to cat files: %s!" % p)
+    return t
+
+def _load_tag_file ( f, format ):
+    """Load tag file into single FWTrackI object.
+
+    """
+    fhd = file(f,mode="r")
+    if format == "ELAND":
+        p = ELANDParser()
+    elif format == "BED":
+        p = BEDParser()
+    else:
+        raise Exception("Format \"%s\" cannot be recognized!" % (format))
+    trackI = p.build_fwtrack(fhd)
+    return trackI
+
+def model2r_script(model,name):
+    rfhd = open(name+"_model.r","w")
+    p = model.plus_line
+    m = model.minus_line
+    s = model.shifted_line
+    w = len(p)
+    norm_p = [0]*w
+    norm_m = [0]*w
+    norm_s = [0]*w
+    sum_p = sum(p)
+    sum_m = sum(m)
+    sum_s = sum(s)
+    for i in range(w):
+        norm_p[i] = float(p[i])*100/sum_p
+        norm_m[i] = float(m[i])*100/sum_m
+        norm_s[i] = float(s[i])*100/sum_s
+    rfhd.write("# R script for Peak Model\n")
+    rfhd.write("#  -- generated by MACS\n")
+
+    rfhd.write("""p <- c(%s)
+m <- c(%s)
+s <- c(%s)
+x <- seq.int((length(p)-1)/2*-1,(length(p)-1)/2)
+pdf("%s_model.pdf",height=4,width=4)
+plot(x,p,type="l",col=c("red"),main="Peak Model",xlab="Distance to the middle",ylab="Percentage")
+lines(x,m,col=c("blue"))
+lines(x,s,col=c("black"))
+abline(v=median(x[p==max(p)]),lty=2,col=c("red"))
+abline(v=median(x[m==max(m)]),lty=2,col=c("blue"))
+abline(v=median(x[s==max(s)]),lty=2,col=c("black"))
+dev.off()
+""" % (','.join(map(str,norm_p)),','.join(map(str,norm_m)),','.join(map(str,norm_s)),name))
+    rfhd.close()
+
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupt me! ;-) See you!\n")
+        sys.exit(0)
diff --git a/htswanalysis/reference_data/dm3_chrlist.cod b/htswanalysis/reference_data/dm3_chrlist.cod
new file mode 100644 (file)
index 0000000..9dc44f6
--- /dev/null
@@ -0,0 +1,6 @@
+chr2L  0       23011543
+chr2R  0       21146707
+chr3L  0       24543556
+chr3R  0       27905052
+chr4   0       1351856
+chrX   0       22422826
diff --git a/htswanalysis/reference_data/hg18_chrlist.cod b/htswanalysis/reference_data/hg18_chrlist.cod
new file mode 100644 (file)
index 0000000..3acca1a
--- /dev/null
@@ -0,0 +1,24 @@
+chr1   0       247249718
+chr2   0       242951148
+chr3   0       199501826
+chr4   0       191273062
+chr5   0       180857865
+chr6   0       170899991
+chr7   0       158821423
+chr8   0       146274825
+chr9   0       140273251
+chr10  0       135374736
+chr11  0       134452383
+chr12  0       132349533
+chr13  0       114142979
+chr14  0       106368584
+chr15  0       100338914
+chr16  0       88827253
+chr17  0       78774741
+chr18  0       76117152
+chr19  0       63811650
+chr20  0       62435963
+chr21  0       46944322
+chr22  0       49691431
+chrX   0       154913753
+chrY   0       57772953
diff --git a/htswanalysis/reference_data/mm8_chrlist.cod b/htswanalysis/reference_data/mm8_chrlist.cod
new file mode 100644 (file)
index 0000000..e921684
--- /dev/null
@@ -0,0 +1,21 @@
+chr1   0       197069961
+chr2   0       181976761
+chr3   0       159872111
+chr4   0       155029700
+chr5   0       152003062
+chr6   0       149525684
+chr7   0       145134093
+chr8   0       132085097
+chr9   0       124000668
+chr10  0       129959147
+chr11  0       121798631
+chr12  0       120463158
+chr13  0       120614377
+chr14  0       123978869
+chr15  0       103492576
+chr16  0       98252458
+chr17  0       95177419
+chr18  0       90736836
+chr19  0       61321189
+chrX   0       165556468
+chrY   0       16029403
diff --git a/htswanalysis/reference_data/mm9_chrlist.cod b/htswanalysis/reference_data/mm9_chrlist.cod
new file mode 100644 (file)
index 0000000..56c6c94
--- /dev/null
@@ -0,0 +1,21 @@
+chr1   0       197195431
+chr2   0       181748086
+chr3   0       159599782
+chr4   0       155630119
+chr5   0       152537258
+chr6   0       149517036
+chr7   0       152524552
+chr8   0       131738870
+chr9   0       124076171
+chr10  0       129993254
+chr11  0       121843855
+chr12  0       121257529
+chr13  0       120284311
+chr14  0       125194863
+chr15  0       103494973
+chr16  0       98319149
+chr17  0       95272650
+chr18  0       90772030
+chr19  0       61342429
+chrX   0       166650295
+chrY   0       15902554
index 3c6b9c51f832babdba5ff61221434448e657e0e2..92740e185a1ec52bd2b39f14adb427fc9b240f30 100755 (executable)
@@ -23,14 +23,14 @@ my $parm = shift;
 
 my %libs;
 
-my $BIOP = "$root_dir/bin/BioProspector/BioProspector.mac";
-my $QUESTDIR = "$root_dir/bin/QuEST/QuEST_ver_0.3";
-my $MACSDIR  = "$root_dir/bin/MACS/bin";
-my $WINGPEAKSDIR = "$root_dir/bin/WingPeaks/bin";
-my $WINGPEAKSGENOMEDIR = "$root_dir/bin/WingPeaks/genome_cod";
-my $PROFILEDIR = "$root_dir/bin/";
+my $BIOP = "$root_dir/bin/BioProspector.mac";
+my $QUESTDIR = "$root_dir/bin/QuEST";
+my $MACSDIR  = "$root_dir/bin";
+my $WINGPEAKSDIR = "$root_dir/bin";
+my $WINGPEAKSGENOMEDIR = "$root_dir/reference_data";
+my $PROFILEDIR = "$root_dir/bin";
 my $GENOMEDIR = "/Volumes/Genomes";
-my $QPCRDIR = "$root_dir/bin/";
+my $QPCRDIR = "$root_dir/bin";
 my $QPCRTESTDIR = "$root_dir/reference_data/qPCR_Tests";
 my $QPCRBACKGROUND = "$root_dir/reference_data/GenericBackground";