--- /dev/null
+#!/usr/bin/perl
+
+use XML::Simple;
+use Cwd;
+use Config;
+
+use strict;
+use warnings;
+
+use threads;
+use threads::shared;
+use Thread::Queue;
+
+
+#
+# Number of threads to run simultaneously.
+#
+my $Nthread = 8;
+
+#
+# Read collection programs. May need to change these locations when moving to another
+# computer.
+#
+my $PIPELINE = '/Applications/SolexaPipeline/';
+
+`mkdir CollectReads`;
+my $LOGFILE = '>./CollectReads/CollectReads.log';
+open(LOGFILE,$LOGFILE);
+
+# Takes three args: 1) The directory of the reads,
+# 2) The lane filter
+# 3) The output file
+sub CollectAllReadsCommand {
+ my($dir,$lane,$output) = (shift,shift,shift,shift);
+ return qq{cat `ls $dir | grep "_seq" | grep $lane | awk '{print "$dir/"\$1}'` > $output};
+};
+
+sub CollectPFReadsCommand {
+ my($dir,$lane,$output) = (shift,shift,shift);
+ my $seq_file_mask='_[0-9][0-9][0-9][0-9]_seq.txt';
+ my $qhg_extension='_[0-9][0-9][0-9][0-9]_qhg.txt';
+ my $seq_suffix='_seq.txt';
+ my $qhg_suffix='_qhg.txt';
+ return qq{rm $output; for x in `ls $dir | grep "$seq_file_mask" | grep $lane`; do cat $dir/\${x:0:8}$seq_suffix | perl $PIPELINE/Gerald/qualityFilter.pl -stamp '(CHASTITY>=0.6)' $dir/\${x:0:8}$qhg_suffix 2> temp | awk '{print \$5}' | grep [A,T,G,C] >> $output 2> temp; done;};
+};
+
+sub CollectAlignReadsCommand {
+ my($dir,$lane,$PrimerSeq,$output,$pf_dir,$pf_output) = (shift,shift,shift,shift,shift);
+ my $PrimerSeqlen = length($PrimerSeq);
+ my $rc_PrimerSeq = reverse($PrimerSeq); $rc_PrimerSeq =~ y/acgtACGTnN/tgcaTGCAnN/;
+ my $file_extension='[0][0-9][0-9][0-9]_realign.txt';
+
+ my $check_cmd = qq{ls $dir | grep $file_extension | grep $lane | wc -l | awk '{print \$1}'};
+ my $check_res = `$check_cmd`;
+ chomp $check_res;
+ if($check_res eq "0") {
+ return qq{rm $output; echo "no alignment performed" >> $output};
+ } else {
+ return qq{rm $output; cat `ls $dir | grep $file_extension | grep $lane | awk '{print "$dir/"\$1}'` | awk '{first_char = substr(\$0,1,1); if(first_char != "#") print \$0; }' | awk '{if(NF > 3) { split(\$4,a,":"); if(\$5 == "F") { \$1="$PrimerSeq"\$1; \$4=a[1]":"a[2]-$PrimerSeqlen; \$6="$PrimerSeq"\$6 } else { \$1=\$1"$rc_PrimerSeq"; \$4=a[1]":"a[2]+PrimerSeqlen; \$6=\$6"$rc_PrimerSeq" } } print \$0 }' >> $output};
+ }
+}
+
+# Check for thread-enabled perl
+$Config{useithreads} or die "Recompile Perl with threads to run this program.";
+
+# Parameters from user
+my $dir = shift || "Data";
+my $ResultsXML = shift || "./LaneNames.xml";
+
+my $xsl = XML::Simple->new();
+my $ResultsDoc = $xsl->XMLin($ResultsXML);
+
+PrintBanner("Experiment information:");
+
+#~~~~~~~ Rami: change to extract date simply from the run folder string, instead of the tag attribute
+# my $RunDate = $ResultsDoc->{Date};
+my $Runfolder = getcwd();
+$Runfolder =~ s/^.*\///;
+my $RunDate = $Runfolder;
+$RunDate =~ s/_.*$//;
+
+my $Flowcell = $ResultsDoc->{Flowcell};
+print STDERR "Date: $RunDate\tFlowcell: $Flowcell\n";
+
+my %SolexaLibraries;
+for(0..scalar(@{$ResultsDoc->{Lane}})-1) {
+ my $LaneIndex = $ResultsDoc->{Lane}->[$_]->{Index};
+ my $LaneName = $ResultsDoc->{Lane}->[$_]->{Name};
+ my $LaneLib = $ResultsDoc->{Lane}->[$_]->{Library};
+ my $LaneGen = $ResultsDoc->{Lane}->[$_]->{Genome};
+ my $LanePrimerSeq = $ResultsDoc->{Lane}->[$_]->{PrimerSeq};
+
+ $SolexaLibraries{$LaneIndex}{Name} = $LaneName;
+ $SolexaLibraries{$LaneIndex}{Genome} = $LaneGen;
+ $SolexaLibraries{$LaneIndex}{Lib} = $LaneLib;
+ $SolexaLibraries{$LaneIndex}{PrimerSeq} = $LanePrimerSeq;
+}
+
+PrintBanner("Directory information:");
+
+my $cwd = getcwd();
+print STDERR "Current Directory: $cwd\n";
+my $datadir = $cwd."/".$dir;
+print STDERR "Data Directory: $datadir\n";
+my $firecrestdir = $datadir."/".FindDir($datadir,"Firecrest");
+print 'Firecrest Directory: ',$firecrestdir,"\n";
+my $bustarddir = $firecrestdir."/".FindDir($firecrestdir,"Bustard");
+print 'Bustard Directory: ',$bustarddir,"\n";
+my $geralddir = $bustarddir."/".FindDir($bustarddir,"GERALD");
+print 'Gerald Directory: ',$geralddir,"\n";
+
+PrintBanner("Collecting Reads");
+
+# Read to collect reads. Using template based on below example
+#
+#~/tools/collect_reads_from_pipeline/collect_pf_reads.sh
+#/Volumes/Andromeda/070919_SBSDRIVER_0003_FC11862/Data/C1-36_Firecrest1.8.28_24-09-2007_solexauser/Bustard1.8.28_24-09-2007_solexauser/
+#070919_FC11862_s345_d0_ES_C_msp_SL53.pf.txt
+#s_[3,4,5]
+#
+#(Program) (Bustard) (Output) (LaneSpec)
+
+my $CmdQueue = Thread::Queue->new;
+for (keys %SolexaLibraries) {
+ my $lane = $_;
+ my $lib = $SolexaLibraries{$_}{Lib};
+ my $name = $SolexaLibraries{$_}{Name};
+ my $genome = $SolexaLibraries{$_}{Genome};
+ my $PrimerSeq = $SolexaLibraries{$_}{PrimerSeq};
+ $name =~ s/ /_/g;
+
+ my $LaneString = "s$lane";
+ my $LaneFilter = "s_[$lane]";
+
+ my $outfile = './CollectReads/'.$RunDate.'_'.$Flowcell.'_'.$LaneString.'_'.$name.'_'.$lib;
+ #sanitize output file
+ $outfile =~ y/\(\) /___/;
+ $CmdQueue->enqueue(CollectAllReadsCommand($bustarddir,$LaneFilter,$outfile.'.all.txt'));
+ $CmdQueue->enqueue(CollectPFReadsCommand($bustarddir,$LaneFilter,$outfile.'.pf.txt'));
+ $CmdQueue->enqueue(CollectAlignReadsCommand($geralddir,$LaneFilter,$PrimerSeq,$outfile.'.align_25.'.$genome.'.txt',$bustarddir,$outfile.'.pf.txt'));
+}
+
+my $tot_commands : shared = $CmdQueue->pending();
+my $finished_commands : shared = 0;
+
+my @cmd_threads;
+for(0..$Nthread-1) {
+ #Use the undef to signal the end of data for each thread.
+ $CmdQueue->enqueue(undef);
+
+ $cmd_threads[$_] = threads->new(sub {
+ while (my $cmd = $CmdQueue->dequeue()) {
+ system($cmd);
+ print LOGFILE "Completed: $cmd\n";
+ $finished_commands++;
+ printf STDERR "%d%% complete (%d of %d).\n",int(100*$finished_commands/$tot_commands),$finished_commands,$tot_commands;
+ }
+ });
+}
+
+#
+# wait for the work to be done.
+#
+for(0..$Nthread-1) {
+ $cmd_threads[$_]->join();
+}
+
+close(LOGFILE);
+
+# THIS IS NECESSARY ONLY IF RAN SEPARATELY, OUTSIDE THE MAINSCRIPT
+# Rami
+#if(CheckFile($expTrackProgram)) {
+# system("python $expTrackProgram updsts $Flowcell $Runfolder 5");
+#}
+#else
+#{
+# print'(Could not find expTrackProgram $expTrackProgram, therefore skipping ExpTrack DB update.)\n';
+#}
+
+##Rami create OKfile
+my $OKfile = "ExpTrackLog/OK_CollectReads";
+open OKFILE, ">$OKfile";
+close OKFILE;
+#######
+
+print STDERR "\nDone!\n";
+
+
+sub FindDir {
+ my $dir = shift;
+ my $dirstring = shift;
+
+ opendir(DIR,$dir);
+ my @files = grep(/$dirstring/,readdir(DIR));
+ closedir(DIR);
+ my $founddir;
+ if(scalar(@files) == 0) {
+ print STDERR "Error: $dirstring directory not found in data directory \"$dir\". Make sure that the data directory is correct and that the $dirstring directory exists.\n";
+ exit(1);
+ } elsif(scalar(@files) > 1) {
+ print "Multiple $dirstring directories found.\n";
+ for(0..scalar(@files)-1) {
+ print $_+1,") $files[$_]\n";
+ }
+ print "Which one do you want to use? ";
+ my $choice = <>;
+ if(!defined($files[$choice-1])) {
+ print STDERR "Error: \"$choice\" is not an option. Restart program to try again.\n";
+ exit(1);
+ } else {
+ $founddir = $files[$choice-1];
+ }
+ } else {
+ $founddir = $files[0];
+ }
+ return $founddir;
+}
+
+sub CheckFile {
+ my $filename = shift;
+ my($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,$atime,$mtime,$ctime,$blksize,$blocks) = stat($filename);
+ # if the file does not exist
+ if(!defined($blocks)) {
+ return 0;
+ }
+ return 1;
+}
+
+sub PrintBanner{
+ my $text = shift;
+ print STDERR "\n";
+ print STDERR "******",'*' x length($text),"******\n";
+ print STDERR "***** $text *****\n";
+ print STDERR "******",'*' x length($text),"******\n";
+ print STDERR "\n";
+}