Moving QC functionalty into repository so we can run outside of the pipeline.
authorTim Reddy Tim <treddy@hudsonalpha.org>
Mon, 24 Nov 2008 22:37:37 +0000 (22:37 +0000)
committerTim Reddy Tim <treddy@hudsonalpha.org>
Mon, 24 Nov 2008 22:37:37 +0000 (22:37 +0000)
htswanalysis/conf/PercentBaseFormat.gnuplot [new file with mode: 0644]
htswanalysis/conf/TSSProfileFormat.gnuplot [new file with mode: 0644]
htswanalysis/scripts/CleanWig.pm [new file with mode: 0755]
htswanalysis/scripts/CollectLibraries.pm
htswanalysis/scripts/Flowcell_QC_Makefile [new file with mode: 0644]
htswanalysis/scripts/LibrariesMakefile
htswanalysis/scripts/MACS_2_BED.sh [new file with mode: 0755]
htswanalysis/scripts/QC_Summarize_qPCR.pm [new file with mode: 0755]
htswanalysis/scripts/WriteQCSummary.pm
htswanalysis/scripts/count_bases.pm [new file with mode: 0755]
htswanalysis/src/profile_reads_wig.cpp

diff --git a/htswanalysis/conf/PercentBaseFormat.gnuplot b/htswanalysis/conf/PercentBaseFormat.gnuplot
new file mode 100644 (file)
index 0000000..8ab16dc
--- /dev/null
@@ -0,0 +1,8 @@
+set terminal png
+set linestyle lw 3
+set linestyle ps 3
+set output "OUTNAME"
+set xlabel "cycle"
+set ylabel "percent"
+set yrange     [0:1]
+plot "FILENAME" using 1:2 title 'A' with linespoints, "FILENAME" using 1:3 title 'C' with linespoints, "FILENAME" using 1:4 title 'G' with linespoints, "FILENAME" using 1:5 title 'T' with linespoints
diff --git a/htswanalysis/conf/TSSProfileFormat.gnuplot b/htswanalysis/conf/TSSProfileFormat.gnuplot
new file mode 100644 (file)
index 0000000..79ad9f1
--- /dev/null
@@ -0,0 +1,10 @@
+set terminal png
+set linestyle lw 3
+set linestyle ps 3
+set xtics 250
+set output "OUTNAME"
+set size 0.6,0.6
+set xlabel "cycle"
+set ylabel "percent"
+set arrow 1 from first "0",graph 0.0 to first "0",graph 1.0 nohead front lw 3 linecolor rgb "#222222"
+plot "FILENAME" using 1:2 title '' smooth cspline with filledcurve y1=0;
diff --git a/htswanalysis/scripts/CleanWig.pm b/htswanalysis/scripts/CleanWig.pm
new file mode 100755 (executable)
index 0000000..d4aeb46
--- /dev/null
@@ -0,0 +1,14 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+my $on = 1;
+
+while(<>) {
+  if($_ =~ /chrom=(.*)$/) {
+    if($1 =~ /^chr/) { $on = 1; } else { $on = 0; }
+  }
+  
+  if($on) { print $_; }
+}
index 84cf5f0a85a47bec91462f6ede7b6a42a5f2128e..f3d6aed41f5b5db1382369ceaa7a4a595a00a886 100755 (executable)
@@ -3,13 +3,9 @@
 use strict;
 use warnings;
 
-my $data_dir = shift;
-
-my @libs = `ls -1d $data_dir/Flowcells/**/*.align*.txt`;
-
 my %libraries;
 
-for my $filename (@libs) {
+for my $filename (@ARGV) {
   chomp $filename;
 
   my $base_file = `basename $filename`;
diff --git a/htswanalysis/scripts/Flowcell_QC_Makefile b/htswanalysis/scripts/Flowcell_QC_Makefile
new file mode 100644 (file)
index 0000000..fa3fd8b
--- /dev/null
@@ -0,0 +1,48 @@
+EXPTRACK_DIR=$(HTSW_ANALYSIS_ROOT)
+DATA_DIR=/Users/Data
+
+FLOWCELL=$(shell pwd | awk -F/ '{print $$NF}')
+FILES=$(shell ls -1d *.align*.txt)
+QPCR_FILES=$(shell ls -1d *.align*.txt | sed -e s/txt/txt.qPCR/)
+COUNT_FILES=$(shell ls -1d *.align*.txt | sed -e s/txt/txt.count/)
+PROFILE_FILES=$(shell ls -1d *.align*.txt | sed -e s/txt/txt.profile/)
+PROFILE_IMAGES=$(shell ls -1d *.align*.txt | sed -e s/txt/txt.profile.png/)
+PERCENT_BASE_IMAGES=$(shell ls -1d *.pf.txt.gz | sed -e s/pf.txt.gz/percent_base.png/)
+
+all: $(QPCR_FILES) $(PROFILE_IMAGES) $(PERCENT_BASE_IMAGES) $(COUNT_FILES) $(FILES) $(FLOWCELL)_qPCR_summary.txt $(FLOWCELL)_qPCR_summary.html $(FLOWCELL)_LibraryInfo.xml $(FLOWCELL)_SequencingSummary.html $(FLOWCELL)_QC_Summary.html
+
+%.txt.count: %.txt
+       grep -v contam $< | awk '{if(NF > 3) {print $$1} }' | wc -l > $@;
+
+%.txt.qPCR: %.txt
+       echo $(EXPTRACK_DIR)/qPCR/qPCR $< $(EXPTRACK_DIR)/qPCR/GenericBackground $(EXPTRACK_DIR)/qPCR/Tests/ | sort -k 2 -g -r | sed -e "s#SolexaSoftware/qPCR/Tests//##"
+       $(EXPTRACK_DIR)/qPCR/qPCR $< $(EXPTRACK_DIR)/qPCR/GenericBackground $(EXPTRACK_DIR)/qPCR/Tests/ | sort -k 2 -g -r | sed -e "s#SolexaSoftware/qPCR/Tests//##" > $@
+
+%.txt.profile: %.txt
+       $(EXPTRACK_DIR)/bin/profile_reads_against_features $< $(EXPTRACK_DIR)/reference_data/`basename $< | awk -F\. '{ print $$3 }'`_tx_start_sites > $@
+
+%.txt.profile.png: %.txt.profile
+       cat $(EXPTRACK_DIR)/conf/TSSProfileFormat.gnuplot | sed -e s#FILENAME#$<#g -e s#OUTNAME#$@# | gnuplot;
+
+# new software to profile base composition .
+%.percent_base: %.pf.txt.gz
+       cat $< | gzip -d | $(EXPTRACK_DIR)/scripts/count_bases.pm > $@
+
+%.percent_base.png: %.percent_base
+       cat $(EXPTRACK_DIR)/conf/PercentBaseFormat.gnuplot | sed -e s#FILENAME#$<#g -e s#OUTNAME#$@# | gnuplot;
+
+$(FLOWCELL)_qPCR_summary.txt: $(QPCR_FILES)
+       rm -f $@;
+       for f in $^; do echo `echo $$f`       `cat $$f | head -n 1` >> $@; echo `echo $$f`       `cat $$f | head -n 2 | tail -n 1` >> $@; done;
+
+$(FLOWCELL)_qPCR_summary.html: $(FLOWCELL)_qPCR_summary.txt;
+       cat $< | $(EXPTRACK_DIR)/scripts/QC_Summarize_qPCR.pm > $@
+
+$(FLOWCELL)_LibraryInfo.xml: $(COUNT_FILES)
+       $(EXPTRACK_DIR)/scripts/CollectLibraries.pm `ls *.align*.txt` > $@
+
+$(FLOWCELL)_SequencingSummary.html: $(FLOWCELL)_LibraryInfo.xml
+       $(EXPTRACK_DIR)/scripts/SummarizeLibrary.pm $< > $@
+
+$(FLOWCELL)_QC_Summary.html: $(FLOWCELL)_SequencingSummary.html $(FLOWCELL)_qPCR_summary.html $(PROFILE_FILES)
+       $(EXPTRACK_DIR)/scripts/WriteQCSummary.pm $(FLOWCELL)_LibraryInfo.xml $(FLOWCELL)_qPCR_summary.txt > $@;
index 4a779dd1f3105e7c8bc797a17d06a588927289e8..0305e8fbb241a8e5d690ed28934e5d2f02c897ed 100644 (file)
@@ -54,7 +54,7 @@ $(DATA_DIR)/qPCR_summary.txt: $(QPCR_FILES)
        cat $@ | sort -k 2,1 -g -r > t && mv t $@;
 
 $(DATA_DIR)/LibraryInfo.xml: $(COUNT_FILES) $(CMPLX_FILES)
-       $(ROOT_DIR)/scripts/CollectLibraries.pm $(DATA_DIR) > $@;
+       $(ROOT_DIR)/scripts/CollectLibraries.pm `ls $(DATA_DIR)/Flowcells/**/*.align*.txt` > $@;
        $(ROOT_DIR)/scripts/RecompileLibraries.pm $@ $(DATA_DIR)
 
 $(DATA_DIR)/SequencingSummary.html: $(DATA_DIR)/LibraryInfo.xml
diff --git a/htswanalysis/scripts/MACS_2_BED.sh b/htswanalysis/scripts/MACS_2_BED.sh
new file mode 100755 (executable)
index 0000000..c9a3d6d
--- /dev/null
@@ -0,0 +1,3 @@
+#!/bin/bash
+echo track type=bedGraph name="$2" description="$2" visibility=full color=200,100,0 altColor=0,100,200 priority=20
+cat $1 | grep -v "#" | grep -v "^$" | grep -v "start" | awk '{print $1"\t"$2"\t"$3"\t"$7}'
diff --git a/htswanalysis/scripts/QC_Summarize_qPCR.pm b/htswanalysis/scripts/QC_Summarize_qPCR.pm
new file mode 100755 (executable)
index 0000000..abed3f1
--- /dev/null
@@ -0,0 +1,40 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+my $standalone = shift;
+
+if(defined($standalone)) { print "<HTML><HEAD><TITLE><em>in-silico</em> qPCR Summary</TITLE></HEAD><BODY>\n"; }
+print "<H2><EM>in-silico</EM> qPCR Summary</H2><TABLE BORDER=1><TR><TD><EM>Lane</EM></TD><TD><EM>Target</EM></TD><TD><EM>Enrichment</EM></TD></TR>\n";
+
+while(<>) {
+  chomp;
+  my($lane,$factor,$enrich) = split(/ /,$_);
+  my $line2 = <>; chomp $line2;
+  my($lane2,$factor2,$enrich2) = split(/ /,$line2);
+   
+  next if(!defined($enrich));
+
+  my @a = split(/\//,$factor);
+  $factor = $a[scalar(@a)-1];
+
+  my $bgcolor;
+  if($enrich < 2) { $bgcolor = "FF6600"; }
+  elsif($enrich< 5) { $bgcolor = "FFCC33"; }
+  elsif($enrich< 10) { $bgcolor = "00CCFF"; }
+  elsif($enrich eq "nan") { $bgcolor = "FFFFFF"; }
+  else { $bgcolor = "66FF66"; }
+
+  $lane =~ /^(\d\d\d\d\d\d)_(.+?)_s(\d+)_(.+?)\.align_\d+\.(.+?)\.txt\.qPCR$/;
+  my($date,$fc,$lanes,$name,$genome) = ($1,$2,$3,$4,$5,$6);
+  #print STDERR "$lane\n";
+  #$lane =~ /(\d\d\d\d\d\d)_(.+?)_/;
+  #my($fc,$date,$fc2,$lanes,$name,$genome) = ($1,$1,$1,$1,$1,$1);
+  
+  printf("<TR BGCOLOR=#%s><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%0.2f<BR>%0.2f</TD></TR>\n",$bgcolor,$date,$fc,$lanes,$name,$genome,$factor."<BR>".$factor2,$enrich,$enrich2);
+}
+
+print "</TABLE>\n";
+
+if(defined($standalone)) { print "</BODY></HTML>\n"; }
index aef28d97cba5f191a17dcf50b94ec93804dbf60e..9593c8940ba4e984fe08c7ca3fc33225fc543a03 100755 (executable)
@@ -35,8 +35,8 @@ while(<QPCR>) {
   elsif($enrich eq "nan") { $bgcolor = "FFFFFF"; }
   else { $bgcolor = "66FF66"; }
 
-  $lane =~ /^(.+?)\/(\d\d\d\d\d\d)_(.+?)_s(\d+)_(.+?)\.align_\d+\.(.+?)\.txt\.qPCR$/;
-  my($fc,$date,$fc2,$lanes,$name,$genome) = ($1,$2,$3,$4,$5,$6);
+  $lane =~ /^(\d\d\d\d\d\d)_(.+?)_s(\d+)_(.+?)\.align_\d+\.(.+?)\.txt\.qPCR$/;
+  my($date,$fc,$lanes,$name,$genome) = ($1,$2,$3,$4,$5,$6);
 
   $qpcr_sum{$lanes}{'best'} = $factor;
   $qpcr_sum{$lanes}{'enrich'} = $enrich;
@@ -71,7 +71,7 @@ for my $i (0..scalar(@{$xml->{Library}})-1) {
 print "<TABLE BORDER=1>";
 print "<TR><TD><EM>Lane\(s\)</EM></TD><TD><EM>Lib</EM></TD><TD><EM>Library Name</EM></TD><TD><EM>Aligned Reads</EM></TD><TD><EM>qPCR Factor</EM></TD><TD><EM>Fold enr.</EM></TD><TD><EM>Profile at TSS</EM></TD><TD>IVC Calls</TD></TR>\n";
 
-my @files = `ls -1 QC/*.align_??.*.txt.profile.gif`;
+my @files = `ls -1 *.align_??.*.txt.profile.png`;
 for my $file (@files) {
   $file =~ /(\d\d\d\d\d\d)_(.+?)_s(\d+)_(.+?)_([Ss][Ll]\d+)/;
   my($date,$fc,$lanes,$libname,$lib) = ($1,$2,$3,$4,$5);
@@ -83,7 +83,7 @@ for my $file (@files) {
   printf "<TD BGCOLOR=#%s>%0.2fM</TD>\n",$num_align{$lanes}{'bgcolor'},$num_align{$lanes}{'num'}/1000000.0;
   printf "<TD BGCOLOR=#%s>%s</TD><TD BGCOLOR=#%s>%0.2f<BR>%0.2f</TD>\n",$qpcr_sum{$lanes}{'bgcolor'},$qpcr_sum{$lanes}{'best'}."<BR>".$qpcr_sum{$lanes}{'best2'},$qpcr_sum{$lanes}{'bgcolor'},$qpcr_sum{$lanes}{'enrich'},$qpcr_sum{$lanes}{'enrich2'};
   print "<TD><OBJECT DATA=\"",`basename $file`,"\" WIDTH=\"300\" HEIGHT=\"300\"></OBJECT></TD>";
-  print "<TD><IMG SRC=\"s_",$lane,"_percent_base.png\" WIDTH=\"300\" HEIGHT=\"300\"></TD>";
+  print "<TD><IMG SRC=\"",$date,"_",$fc,"_s",$lanes,"_",$libname,"_",$lib,".percent_base.png\" WIDTH=\"300\" HEIGHT=\"300\"></TD>";
   print "</TR>\n";
 }
 print "</TABLE>\n";
diff --git a/htswanalysis/scripts/count_bases.pm b/htswanalysis/scripts/count_bases.pm
new file mode 100755 (executable)
index 0000000..672718b
--- /dev/null
@@ -0,0 +1,33 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+my %calls;
+my @a = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
+my @c = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
+my @g = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
+my @t = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
+
+$calls{'A'} = \@a;
+$calls{'C'} = \@c;
+$calls{'G'} = \@g;
+$calls{'T'} = \@t;
+
+my $count = 0;
+my $length;
+
+while( my $read = <>) {
+  chomp $read;
+  if(!defined($length)) { $length = length($read); }
+  if($read =~ /\./) { next; }
+  $count++;
+  for(0..$length-1) {
+    my $base = uc(substr($read,$_,1));
+    $calls{$base}[$_] += 1;
+  }
+}
+
+for(0..$length-1) {
+  print "$_\t",$calls{A}[$_]/$count,"\t",$calls{C}[$_]/$count,"\t",$calls{G}[$_]/$count,"\t",$calls{T}[$_]/$count,"\n";
+}
index 4e58b456187c72d69b149f8b5d3f0f553a135a45..e47d0f9a68bb7abb5a09cdae5ccf458de3516c25 100644 (file)
@@ -88,6 +88,7 @@ void read_align_file(char* filename, Reads& features) {
     vector<string> location; split(fields[3], location_delim, location);
     string chr = location[0];
     if(chr == "NA") { continue; }
+    if(chr.substr(0,3) != "chr") { continue; }
     int pos = atoi(location[1].c_str());
     bool strand = ((fields[4].c_str())[0] == 'F')?0:1;