Removed from hard-coded directories. Starting to bring the QC into the sequence repos...
authorTim Reddy Tim <treddy@hudsonalpha.org>
Mon, 24 Nov 2008 20:06:07 +0000 (20:06 +0000)
committerTim Reddy Tim <treddy@hudsonalpha.org>
Mon, 24 Nov 2008 20:06:07 +0000 (20:06 +0000)
htswanalysis/scripts/ConfigureTasks.pm
htswanalysis/scripts/WriteQCSummary.pm [new file with mode: 0755]

index ab5f785376d0d75645d6846c35e8497de727fd6f..3aab29c0380bba6415dea5fa3dea1df258edd873 100755 (executable)
@@ -177,8 +177,8 @@ sub WriteQPCRTasks {
       my $testdir =  $QPCRTESTDIR;
       my $outfile = "$name.qPCR";
   
-      my $seqcheck = "if [ ! -e ~Data/Libraries/$lib.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for sequencing.\"; fi;"; 
-      my $cmd = "$outfile: ~Data/Libraries/$lib.txt\n\t$seqcheck\n\t$QPCRDIR/qPCR \$< $background $testdir > \$@\n";
+      my $seqcheck = "if [ ! -e $data_dir/Libraries/$lib.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for sequencing.\"; fi;"; 
+      my $cmd = "$outfile: $data_dir/Libraries/$lib.txt\n\t$seqcheck\n\t$QPCRDIR/qPCR \$< $background $testdir > \$@\n";
       writeTask($qpcr, "qPCR", $outfile, $cmd);
 
       $tasks .= $task." ";
@@ -201,12 +201,12 @@ sub WriteProfileTasks {
 
       my $outfile = "$lib.wig.gz $lib.profile.gif";  
 
-      my $seqcheck = "if [ ! -e ~Data/Libraries/$lib.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for sequencing.\"; fi;"; 
-      my $cmds .= "$lib.wig.gz: ~Data/Libraries/$lib.txt\n";
+      my $seqcheck = "if [ ! -e $data_dir/Libraries/$lib.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for sequencing.\"; fi;"; 
+      my $cmds .= "$lib.wig.gz: $data_dir/Libraries/$lib.txt\n";
       $cmds .= "\t$seqcheck\n";
-      $cmds .= "\t".$PROFILEDIR.'/profile_reads_wig ~Data/Libraries/'.$lib.'.txt "'.$name.'" "'.$name.'" | gzip > '.$lib.'.wig.gz';
+      $cmds .= "\t".$PROFILEDIR.'/profile_reads_wig '.$data_dir.'/Libraries/'.$lib.'.txt "'.$name.'" "'.$name.'" | gzip > '.$lib.'.wig.gz';
       $cmds .= "\n\n";
-      $cmds .= $lib.'.profile.gif: ~Data/Libraries/'.$lib.'.txt '.$root_dir.'/reference_data/'.$genome.'_tx_start_sites'."\n";
+      $cmds .= $lib.'.profile.gif: '.$data_dir.'/Libraries/'.$lib.'.txt '.$root_dir.'/reference_data/'.$genome.'_tx_start_sites'."\n";
       $cmds .= "\t".$PROFILEDIR.'/profile_reads_against_features $^ | '.$root_dir.'/scripts/profile_to_svg.pm | /opt/local/bin/convert - $@'."\n";
       $cmds .= "\n";
 
@@ -238,10 +238,10 @@ sub WriteCompareLibTasks {
     
       my $outfile = $name1."_".$name2.".compare "; 
 
-      my $seqcheck = "if [ ! -e ~Data/Libraries/$name1.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for $name1 sequencing.\"; fi;"; 
-      $seqcheck .= "\n\tif [ ! -e ~Data/Libraries/$name2.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for $name2 sequencing.\"; fi;"; 
+      my $seqcheck = "if [ ! -e $data_dir/Libraries/$name1.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for $name1 sequencing.\"; fi;"; 
+      $seqcheck .= "\n\tif [ ! -e $data_dir/Libraries/$name2.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for $name2 sequencing.\"; fi;"; 
 
-      my $cmd = "$outfile: ~Data/Libraries/$name1.txt ~Data/Libraries/$name2.txt\n\t$seqcheck\n\t$root_dir/bin/count_reads_in_peaks $tf $name1 $features ~Data/Libraries/$name1.txt $name2 $features ~Data/Libraries/$name2.txt > \$@\n";
+      my $cmd = "$outfile: $data_dir/Libraries/$name1.txt $data_dir/Libraries/$name2.txt\n\t$seqcheck\n\t$root_dir/bin/count_reads_in_peaks $tf $name1 $features $data_dir/Libraries/$name1.txt $name2 $features $data_dir/Libraries/$name2.txt > \$@\n";
   
       writeTask($cmp, "CompareLibraries", $outfile, $cmd);
       $tasks .= $task." ";
@@ -304,8 +304,8 @@ sub WritePeakCallingTasks {
       my $bg =     $peakcall->{Background}->{Library};
       my $genome = $peakcall->{Genome};
 
-      my $seqcheck = "if [ ! -e ~Data/Libraries/$signal.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for $signal sequencing.\"; fi;"; 
-      $seqcheck .= "\n\tif [ ! -e ~Data/Libraries/$bg.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for $bg sequencing.\"; fi;"; 
+      my $seqcheck = "if [ ! -e $data_dir/Libraries/$signal.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for $signal sequencing.\"; fi;"; 
+      $seqcheck .= "\n\tif [ ! -e $data_dir/Libraries/$bg.txt ]; then $root_dir/scripts/analys_track_main.py updsts $task \"Waiting for $bg sequencing.\"; fi;"; 
 
       $tasks .= $task." ";
      
@@ -316,11 +316,11 @@ sub WritePeakCallingTasks {
       if($caller eq "QuEST") {
         $outfile .= "peak_caller.ChIP.out peak_caller.ChIP.out.bedgraph peak_caller.ChIP.out.fasta";
 
-        $cmd   .= "peak_caller.ChIP.out: ~Data/Libraries/$signal.txt ~Data/Libraries/$bg.txt\n";
+        $cmd   .= "peak_caller.ChIP.out: $data_dir/Libraries/$signal.txt $data_dir/Libraries/$bg.txt\n";
         $cmd   .= "\t$seqcheck\n";
        $cmd   .= "\trm -f background_RX_noIP.align.txt\n";
         $cmd   .= "\trm -f pseudo_ChIP_RX_noIP.align.txt\n";
-        $cmd   .= "\t".$QUESTDIR.'/generate_QuEST_parameters.pl -rp '.$GENOMEDIR.'/QuEST_'.$genome.' -solexa_align_ChIP ~Data/Libraries/'.$signal.'.txt -solexa_align_RX_noIP ~Data/Libraries/'.$bg.'.txt -ap '.$data_dir.'/Tasks/'.$task.' -silent > '.$name.'.QuEST.log;'."\n";
+        $cmd   .= "\t".$QUESTDIR.'/generate_QuEST_parameters.pl -rp '.$GENOMEDIR.'/QuEST_'.$genome.' -solexa_align_ChIP $data_dir/Libraries/'.$signal.'.txt -solexa_align_RX_noIP $data_dir/Libraries/'.$bg.'.txt -ap '.$data_dir.'/Tasks/'.$task.' -silent > '.$name.'.QuEST.log;'."\n";
         $cmd   .= "\t".$QUESTDIR.'/run_QuEST_with_param_file.pl -p QuEST.batch.pars >> '.$name.'.QuEST.log;'."\n";
         $cmd   .= "\t".'rm -rf scores;'."\n\n"; 
 
@@ -336,8 +336,8 @@ sub WritePeakCallingTasks {
       } elsif($caller eq "WingPeaks") {  
         $outfile .= "$name.peaks $name.peaks.fasta ";
 
-        $cmd .= "$name.peaks: ~Data/Libraries/$signal.txt ~Data/Libraries/$bg.txt\n";
-        $cmd .= "\t".$WINGPEAKSDIR.'/ChIPSeq_PeakCaller_ENCODE -gn '.$WINGPEAKSGENOMEDIR.'/'.$genome.'_chrlist.cod -it '.$name.' -in 1 -if ~Data/Libraries/'.$signal.'.txt -ct Background -cn 1 -cf ~Data/Libraries/'.$bg.'.txt -ot '.$name.' > '.$name.'.log;'."\n";
+        $cmd .= "$name.peaks: $data_dir/Libraries/$signal.txt $data_dir/Libraries/$bg.txt\n";
+        $cmd .= "\t".$WINGPEAKSDIR.'/ChIPSeq_PeakCaller_ENCODE -gn '.$WINGPEAKSGENOMEDIR.'/'.$genome.'_chrlist.cod -it '.$name.' -in 1 -if '.$data_dir/Libraries/'.$signal.'.txt -ct Background -cn 1 -cf '.$data_dir.'/Libraries/'.$bg.'.txt -ot '.$name.' > '.$name.'.log;'."\n";
 
         $cmd .= "%.peaks.tab: %.peaks\n";
         $cmd .= "\t".'cat $< | awk \'{ print $$1"\t"$$2"\t"$$3"\t"$$4"\t"$$7}\' > $@'."\n\n";
@@ -347,20 +347,23 @@ sub WritePeakCallingTasks {
       } elsif($caller eq "MACS") {
         $outfile .= $name."_peaks.bed ".$name."_peaks.fasta ";
 
-        $cmd .= $name."_peaks.bed: ~Data/Libraries/$signal.txt ~Data/Libraries/$bg.txt\n";
+       $cmd .= "\n.PRECIOUS: $name_peaks.xls $name_peaks.bed $name_peaks.fasta\n\n";
+
+        $cmd .= "\n".$name."_peaks.xls: $data_dir/Libraries/$signal.txt $data_dir/Libraries/$bg.txt\n";
         $cmd .= "\t$seqcheck\n";
-        $cmd .= "\tcat ~Data/Libraries/$signal.txt | $root_dir/scripts/align_to_bed.pm > $signal.bed\n";
-        $cmd .= "\tcat ~Data/Libraries/$bg.txt | $root_dir/scripts/align_to_bed.pm > $bg.bed\n";
+        $cmd .= "\tcat $data_dir/Libraries/$signal.txt | $root_dir/scripts/align_to_bed.pm > $signal.bed\n";
+        $cmd .= "\tcat $data_dir/Libraries/$bg.txt | $root_dir/scripts/align_to_bed.pm > $bg.bed\n";
         $cmd .= "\t$MACSDIR/macs -t $signal.bed -c ./$bg.bed --name=$name --pvalue=1e-10 > $name.log 2> $name.err\n";
-        $cmd .= "\t".'echo "track name="'.$name.'" description="'.$name.'"GR_EtOH_Rep2_Peak_Calls" > header';
-        $cmd .= "\t".'cat header '.$name.'_peaks.bed > t; mv t '.$name.'_peaks.bed'."\n"; 
-        $cmd .= "\trm -f $signal.bed $bg.bed header\n";
+       $cmd .= "\trm -f $signal.bed $bg.bed\n";
        $cmd .= "\t".'exit `grep -c "^CRITICAL" '.$name.'.err`'."\n\n";
 
-        $cmd .= "%_peaks.tab: %_peaks.bed\n";
+        $cmd .= "\n".$name."_peaks.bed: ".$name."_peaks.xls\n";
+        $cmd .= "\t$root_dir/scripts/MACS_2_BED.sh $< $name > $@\n\n"
+
+        $cmd .= "\n".$name."_peaks.tab: ".$name."_peaks.bed\n";
         $cmd .= "\t".'cat $< | awk \'{print NR"\t"$$1"\t"$$2"\t"$$3"\t1"}\' > $@'."\n\n";
 
-        $cmd .= "%_peaks.fasta: %_peaks.tab\n";
+        $cmd .= "\n".$name."_peaks.fasta: ".$name."_peaks.tab\n";
         $cmd .= "\t".'cat $< | '.$root_dir.'/scripts/extract_peaks.pm '.$root_dir.'/reference_data/hg18_chrom_list.txt > $@'."\n\n";
       }
 
@@ -411,8 +414,8 @@ sub WriteComparePeakCallingTasks {
         #my $set2_feat = $caller2."_".$set2."/".$peak_file_2;
   #
         #my $cmd = "$outfile: $set1_feat $set2_feat\n";
-        #$cmd .= "\t~/EXPTRACK/QC/count_reads_in_peaks NA $set1-$caller1 $set1_feat ~Data/Libraries/".$xml->{PeakCalling}->[$index1]->{Signal}->{Library}.".txt ";
-        #$cmd .= "$set2-$caller2 $set2_feat ~Data/Libraries/".$xml->{PeakCalling}->[$index2]->{Signal}->{Library}.".txt > \$@\n";
+        #$cmd .= "\t~/EXPTRACK/QC/count_reads_in_peaks NA $set1-$caller1 $set1_feat $data_dir/Libraries/".$xml->{PeakCalling}->[$index1]->{Signal}->{Library}.".txt ";
+        #$cmd .= "$set2-$caller2 $set2_feat $data_dir/Libraries/".$xml->{PeakCalling}->[$index2]->{Signal}->{Library}.".txt > \$@\n";
   #
         #$file_list .= "$outfile ";
         #$cmds .= $cmd."\n";
diff --git a/htswanalysis/scripts/WriteQCSummary.pm b/htswanalysis/scripts/WriteQCSummary.pm
new file mode 100755 (executable)
index 0000000..aef28d9
--- /dev/null
@@ -0,0 +1,91 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+use XML::Simple;
+
+my $library_info = shift;
+my $xml = XMLin($library_info, ForceArray => 1);
+
+my %num_align;
+
+my $qpcr_filename = shift;
+open(QPCR,$qpcr_filename);
+
+my %qpcr_sum;
+
+while(<QPCR>) {
+  chomp;
+  my($lane,$factor,$enrich) = split(/ /,$_);
+  my $line2 = <QPCR>; chomp $line2;
+  my($lane2,$factor2,$enrich2) = split(/ /,$line2);
+   
+  next if(!defined($enrich));
+
+  my @a = split(/\//,$factor);
+  $factor = $a[scalar(@a)-1];
+  @a = split(/\//,$factor2);
+  $factor2 = $a[scalar(@a)-1];
+
+  my $bgcolor;
+  if($enrich < 2) { $bgcolor = "FF6600"; }
+  elsif($enrich< 5) { $bgcolor = "FFCC33"; }
+  elsif($enrich< 10) { $bgcolor = "00CCFF"; }
+  elsif($enrich eq "nan") { $bgcolor = "FFFFFF"; }
+  else { $bgcolor = "66FF66"; }
+
+  $lane =~ /^(.+?)\/(\d\d\d\d\d\d)_(.+?)_s(\d+)_(.+?)\.align_\d+\.(.+?)\.txt\.qPCR$/;
+  my($fc,$date,$fc2,$lanes,$name,$genome) = ($1,$2,$3,$4,$5,$6);
+
+  $qpcr_sum{$lanes}{'best'} = $factor;
+  $qpcr_sum{$lanes}{'enrich'} = $enrich;
+  $qpcr_sum{$lanes}{'bgcolor'} = $bgcolor;
+
+  $qpcr_sum{$lanes}{'best2'} = $factor2;
+  $qpcr_sum{$lanes}{'enrich2'} = $enrich2;
+  $qpcr_sum{$lanes}{'bgcolor2'} = $bgcolor;
+}
+
+for my $i (0..scalar(@{$xml->{Library}})-1) {
+  my $lib = $xml->{Library}->[$i]->{Name};
+  for my $t (0..scalar(@{$xml->{Library}->[$i]->{Track}})-1) {
+    my $N = scalar(@{$xml->{Library}->[$i]->{Track}});
+    my($date,$fc,$lane,$desc);
+    my $filename = $xml->{Library}->[$i]->{Track}->[$t]->{Filename};
+    $filename =~ /^(\d+)_(.+?)_s(\d+)_(.+?)_$lib.align/;
+    ($date,$fc,$lane,$desc) = ($1,$2,$3,$4);
+    my $num_reads = $xml->{Library}->[$i]->{Track}->[$t]->{Count};
+
+    my $bgcolor;
+    if($num_reads < 3000000) { $bgcolor = "FF3300"; }
+    elsif($num_reads < 5000000) { $bgcolor = "FFCC33"; }
+    elsif($num_reads < 10000000) { $bgcolor = "00CCFF"; }
+    else { $bgcolor = "66FF66"; }
+
+    $num_align{$lane}{'num'} = $num_reads;
+    $num_align{$lane}{'bgcolor'} = $bgcolor;
+  }
+}
+  
+print "<TABLE BORDER=1>";
+print "<TR><TD><EM>Lane\(s\)</EM></TD><TD><EM>Lib</EM></TD><TD><EM>Library Name</EM></TD><TD><EM>Aligned Reads</EM></TD><TD><EM>qPCR Factor</EM></TD><TD><EM>Fold enr.</EM></TD><TD><EM>Profile at TSS</EM></TD><TD>IVC Calls</TD></TR>\n";
+
+my @files = `ls -1 QC/*.align_??.*.txt.profile.gif`;
+for my $file (@files) {
+  $file =~ /(\d\d\d\d\d\d)_(.+?)_s(\d+)_(.+?)_([Ss][Ll]\d+)/;
+  my($date,$fc,$lanes,$libname,$lib) = ($1,$2,$3,$4,$5);
+  my $lane = substr($lanes,0,1);
+  print "<TR>\n";
+  print "<TD>$lanes</TD>";
+  print "<TD>$lib</TD>\n";
+  print "<TD>$libname</TD>\n";
+  printf "<TD BGCOLOR=#%s>%0.2fM</TD>\n",$num_align{$lanes}{'bgcolor'},$num_align{$lanes}{'num'}/1000000.0;
+  printf "<TD BGCOLOR=#%s>%s</TD><TD BGCOLOR=#%s>%0.2f<BR>%0.2f</TD>\n",$qpcr_sum{$lanes}{'bgcolor'},$qpcr_sum{$lanes}{'best'}."<BR>".$qpcr_sum{$lanes}{'best2'},$qpcr_sum{$lanes}{'bgcolor'},$qpcr_sum{$lanes}{'enrich'},$qpcr_sum{$lanes}{'enrich2'};
+  print "<TD><OBJECT DATA=\"",`basename $file`,"\" WIDTH=\"300\" HEIGHT=\"300\"></OBJECT></TD>";
+  print "<TD><IMG SRC=\"s_",$lane,"_percent_base.png\" WIDTH=\"300\" HEIGHT=\"300\"></TD>";
+  print "</TR>\n";
+}
+print "</TABLE>\n";
+
+print "</BODY></HTML>\n";