From 1a6839f0441d370403587a3b9ca32ae3c3c54d6a Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 29 Aug 2008 17:40:14 +0000 Subject: [PATCH] Merge in all of my testing code from trunk. Added a master runtest.sh script that deals with the annoying problem caused by nesting each package in its own source tree. I moved in my eland to bed file code (which might still need to be renamed). I resolved all the import errors in my illumina specific parsing code (thanks to the test code) --- htswdataprod/htswdataprod/illumina/bustard.py | 2 +- .../htswdataprod/illumina/firecrest.py | 2 +- htswdataprod/htswdataprod/illumina/gerald.py | 2 +- htswdataprod/htswdataprod/illumina/makebed.py | 142 +++ .../htswdataprod/{ => illumina}/runfolder.py | 16 +- .../htswdataprod/illumina/test/__init__.py | 0 .../illumina/test/test_genome_mapper.py | 33 + .../illumina/test/test_makebed.py | 51 + .../illumina/test/test_runfolder026.py | 601 ++++++++++ .../illumina/test/test_runfolder030.py | 1024 +++++++++++++++++ htswdataprod/scripts/eland_makebed | 106 ++ runtests.sh | 10 + 12 files changed, 1978 insertions(+), 11 deletions(-) create mode 100755 htswdataprod/htswdataprod/illumina/makebed.py rename htswdataprod/htswdataprod/{ => illumina}/runfolder.py (96%) create mode 100644 htswdataprod/htswdataprod/illumina/test/__init__.py create mode 100644 htswdataprod/htswdataprod/illumina/test/test_genome_mapper.py create mode 100644 htswdataprod/htswdataprod/illumina/test/test_makebed.py create mode 100644 htswdataprod/htswdataprod/illumina/test/test_runfolder026.py create mode 100644 htswdataprod/htswdataprod/illumina/test/test_runfolder030.py create mode 100755 htswdataprod/scripts/eland_makebed create mode 100755 runtests.sh diff --git a/htswdataprod/htswdataprod/illumina/bustard.py b/htswdataprod/htswdataprod/illumina/bustard.py index 422acbf..ab34226 100644 --- a/htswdataprod/htswdataprod/illumina/bustard.py +++ b/htswdataprod/htswdataprod/illumina/bustard.py @@ -6,7 +6,7 @@ import os import time import re -from htswdataprod.runfolder import \ +from htswdataprod.illumina.runfolder import \ ElementTree, \ VERSION_RE, \ EUROPEAN_STRPTIME diff --git a/htswdataprod/htswdataprod/illumina/firecrest.py b/htswdataprod/htswdataprod/illumina/firecrest.py index bab436b..6402c3a 100644 --- a/htswdataprod/htswdataprod/illumina/firecrest.py +++ b/htswdataprod/htswdataprod/illumina/firecrest.py @@ -12,7 +12,7 @@ import os import re import time -from htswdataprod.runfolder import \ +from htswdataprod.illumina.runfolder import \ ElementTree, \ VERSION_RE, \ EUROPEAN_STRPTIME diff --git a/htswdataprod/htswdataprod/illumina/gerald.py b/htswdataprod/htswdataprod/illumina/gerald.py index 4ed320e..5b1e382 100644 --- a/htswdataprod/htswdataprod/illumina/gerald.py +++ b/htswdataprod/htswdataprod/illumina/gerald.py @@ -9,7 +9,7 @@ import stat import time import types -from htswdataprod.runfolder import \ +from htswdataprod.illumina.runfolder import \ ElementTree, \ EUROPEAN_STRPTIME, \ LANES_PER_FLOWCELL, \ diff --git a/htswdataprod/htswdataprod/illumina/makebed.py b/htswdataprod/htswdataprod/illumina/makebed.py new file mode 100755 index 0000000..6f1511c --- /dev/null +++ b/htswdataprod/htswdataprod/illumina/makebed.py @@ -0,0 +1,142 @@ +""" +Utility functions to make bedfiles. +""" +import os +import re + +# map eland_result.txt sense +sense_map = { 'F': '+', 'R': '-'} +sense_color = { 'F': '0,0,255', 'R': '255,255,0' } + +def write_bed_header(outstream, name, description): + """ + Produce the headerline for a bedfile + """ + # provide default track names + if name is None: name = "track" + if description is None: description = "eland result file" + bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"' + bed_header += os.linesep + outstream.write(bed_header % (name, description)) + +def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'): + """ + read an eland result file from instream and write a bedfile to outstream + """ + # indexes into fields in eland_result.txt file + SEQ = 1 + CHR = 6 + START = 7 + SENSE = 8 + + write_bed_header(outstream, name, description) + + for line in instream: + fields = line.split() + # we need more than the CHR field, and it needs to match a chromosome + if len(fields) <= CHR or \ + (chromosome_prefix is not None and \ + fields[CHR][:3] != chromosome_prefix): + continue + start = fields[START] + stop = int(start) + len(fields[SEQ]) + chromosome, extension = fields[CHR].split('.') + assert extension == "fa" + outstream.write('%s %s %d read 0 %s - - %s%s' % ( + chromosome, + start, + stop, + sense_map[fields[SENSE]], + sense_color[fields[SENSE]], + os.linesep + )) + + +def make_bed_from_multi_eland_stream( + instream, + outstream, + name, + description, + chr_prefix='chr', + max_reads=255 + ): + """ + read a multi eland stream and write a bedfile + """ + write_bed_header(outstream, name, description) + parse_multi_eland(instream, outstream, chr_prefix, max_reads) + +def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255): + + loc_pattern = '(?P(?P[0-9]+)(?P[FR])(?P[0-9]+))' + other_pattern = '(?P[^:,]+)' + split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern)) + + for line in instream: + rec = line.split() + if len(rec) > 3: + # colony_id = rec[0] + seq = rec[1] + # number of matches for 0, 1, and 2 mismatches + # m0, m1, m2 = [int(x) for x in rec[2].split(':')] + compressed_reads = rec[3] + cur_chr = "" + reads = {0: [], 1: [], 2:[]} + + for token in split_re.finditer(compressed_reads): + if token.group('chr') is not None: + cur_chr = token.group('chr')[:-3] # strip off .fa + elif token.group('fullloc') is not None: + matches = int(token.group('count')) + # only emit a bed line if + # our current chromosome starts with chromosome pattern + if chr_prefix is None or cur_chr.startswith(chr_prefix): + start = int(token.group('start')) + stop = start + len(seq) + orientation = token.group('dir') + strand = sense_map[orientation] + color = sense_color[orientation] + # build up list of reads for this record + reads[matches].append((cur_chr, start, stop, strand, color)) + + # report up to our max_read threshold reporting the fewer-mismatch + # matches first + reported_reads = 0 + keys = [0,1,2] + for mismatch, read_list in ((k, reads[k]) for k in keys): + reported_reads += len(read_list) + if reported_reads <= max_reads: + for cur_chr, start, stop, strand, color in read_list: + reported_reads += 1 + outstream.write('%s %d %d read 0 %s - - %s%s' % ( + cur_chr, + start, + stop, + sense_map[orientation], + sense_color[orientation], + os.linesep + )) + +def make_description(database, flowcell_id, lane): + """ + compute a bedfile name and description from the fctracker database + """ + from gaworkflow.util.fctracker import fctracker + + fc = fctracker(database) + cells = fc._get_flowcells("where flowcell_id='%s'" % (flowcell_id)) + if len(cells) != 1: + raise RuntimeError("couldn't find flowcell id %s" % (flowcell_id)) + lane = int(lane) + if lane < 1 or lane > 8: + raise RuntimeError("flowcells only have lanes 1-8") + + name = "%s-%s" % (flowcell_id, lane) + + cell_id, cell = cells.items()[0] + assert cell_id == flowcell_id + + cell_library_id = cell['lane_%d_library_id' %(lane,)] + cell_library = cell['lane_%d_library' %(lane,)] + description = "%s-%s" % (cell_library['library_name'], cell_library_id) + return name, description diff --git a/htswdataprod/htswdataprod/runfolder.py b/htswdataprod/htswdataprod/illumina/runfolder.py similarity index 96% rename from htswdataprod/htswdataprod/runfolder.py rename to htswdataprod/htswdataprod/illumina/runfolder.py index 65f6191..b3a5da0 100644 --- a/htswdataprod/htswdataprod/runfolder.py +++ b/htswdataprod/htswdataprod/illumina/runfolder.py @@ -22,8 +22,8 @@ VERSION_RE = "([0-9\.]+)" USER_RE = "([a-zA-Z0-9]+)" LANES_PER_FLOWCELL = 8 -from gaworkflow.util.alphanum import alphanum -from gaworkflow.util.ethelp import indent, flatten +from htswcommon.util.alphanum import alphanum +from htswcommon.util.ethelp import indent, flatten class PipelineRun(object): @@ -86,9 +86,9 @@ class PipelineRun(object): def set_elements(self, tree): # this file gets imported by all the others, # so we need to hide the imports to avoid a cyclic imports - from gaworkflow.pipeline import firecrest - from gaworkflow.pipeline import bustard - from gaworkflow.pipeline import gerald + from htswdataprod.illumina import firecrest + from htswdataprod.illumina import bustard + from htswdataprod.illumina import gerald tag = tree.tag.lower() if tag != PipelineRun.PIPELINE_RUN.lower(): @@ -142,9 +142,9 @@ def get_runs(runfolder): generate two different PipelineRun objects, that differ in there gerald component. """ - from gaworkflow.pipeline import firecrest - from gaworkflow.pipeline import bustard - from gaworkflow.pipeline import gerald + from htswdataprod.illumina import firecrest + from htswdataprod.illumina import bustard + from htswdataprod.illumina import gerald datadir = os.path.join(runfolder, 'Data') diff --git a/htswdataprod/htswdataprod/illumina/test/__init__.py b/htswdataprod/htswdataprod/illumina/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/htswdataprod/htswdataprod/illumina/test/test_genome_mapper.py b/htswdataprod/htswdataprod/illumina/test/test_genome_mapper.py new file mode 100644 index 0000000..0ea01be --- /dev/null +++ b/htswdataprod/htswdataprod/illumina/test/test_genome_mapper.py @@ -0,0 +1,33 @@ +import unittest + +from StringIO import StringIO +from htswdataprod import genome_mapper + +class testGenomeMapper(unittest.TestCase): + def test_construct_mapper(self): + genomes = { + 'Arabidopsis thaliana': {'v01212004': '/arabidopsis'}, + 'Homo sapiens': {'hg18': '/hg18'}, + 'Mus musculus': {'mm8': '/mm8', + 'mm9': '/mm9', + 'mm10': '/mm10'}, + 'Phage': {'174': '/phi'}, + } + genome_map = genome_mapper.constructMapperDict(genomes) + + self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") + self.failUnlessEqual("%(Phage|174)s" % (genome_map), "/phi") + self.failUnlessEqual("%(Mus musculus)s" % (genome_map), "/mm10") + self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") + self.failUnlessEqual("%(Mus musculus|mm10)s" % (genome_map), "/mm10") + + self.failUnlessEqual(len(genome_map.keys()), 6) + self.failUnlessEqual(len(genome_map.values()), 6) + self.failUnlessEqual(len(genome_map.items()), 6) + + +def suite(): + return unittest.makeSuite(testGenomeMapper,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") diff --git a/htswdataprod/htswdataprod/illumina/test/test_makebed.py b/htswdataprod/htswdataprod/illumina/test/test_makebed.py new file mode 100644 index 0000000..08dcd51 --- /dev/null +++ b/htswdataprod/htswdataprod/illumina/test/test_makebed.py @@ -0,0 +1,51 @@ +import os +from StringIO import StringIO +import unittest + +from htswdataprod.illumina import makebed + +class testMakeBed(unittest.TestCase): + def test_multi_1_0_0_limit_1(self): + instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0') + out = StringIO() + + makebed.parse_multi_eland(instream, out, 'mm9_chr', 1) + self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n') + + def test_multi_1_0_0_limit_255(self): + instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0') + out = StringIO() + + makebed.parse_multi_eland(instream, out, 'mm9_chr', 255) + self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n') + + + def test_multi_2_0_0_limit_1(self): + instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0') + out = StringIO() + + makebed.parse_multi_eland(instream, out, 'mm9_chr', 1) + self.failUnlessEqual(out.len, 0) + + def test_multi_2_0_0_limit_255(self): + instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0') + out = StringIO() + + makebed.parse_multi_eland(instream, out, 'mm9_chr', 255) + self.failUnlessEqual(out.len, 98) + + def test_multi_0_2_0_limit_1(self): + instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:115:495 TCTCCCTGAAAAATANAAGTGNTGTTGGTGAG 0:2:1 mm9_chr14.fa:104434729F2,mm9_chr16.fa:63263818R1,mm9_chr2.fa:52265438R1') + out = StringIO() + + makebed.parse_multi_eland(instream, out, 'mm9_chr', 1) + print out.getvalue() + self.failUnlessEqual(out.len, 0) + +def suite(): + return unittest.makeSuite(testMakeBed, 'test') + +if __name__ == "__main__": + unittest.main(defaultTest='suite') + + diff --git a/htswdataprod/htswdataprod/illumina/test/test_runfolder026.py b/htswdataprod/htswdataprod/illumina/test/test_runfolder026.py new file mode 100644 index 0000000..8a6410b --- /dev/null +++ b/htswdataprod/htswdataprod/illumina/test/test_runfolder026.py @@ -0,0 +1,601 @@ +#!/usr/bin/env python + +from datetime import datetime, date +import os +import tempfile +import shutil +import unittest + +from htswdataprod.illumina import firecrest +from htswdataprod.illumina import bustard +from htswdataprod.illumina import gerald +from htswdataprod.illumina import runfolder +from htswdataprod.illumina.runfolder import ElementTree + + +def make_flowcell_id(runfolder_dir, flowcell_id=None): + if flowcell_id is None: + flowcell_id = '207BTAAXY' + + config = """ + + %s +""" % (flowcell_id,) + config_dir = os.path.join(runfolder_dir, 'Config') + + if not os.path.exists(config_dir): + os.mkdir(config_dir) + pathname = os.path.join(config_dir, 'FlowcellId.xml') + f = open(pathname,'w') + f.write(config) + f.close() + +def make_matrix(matrix_dir): + contents = """# Auto-generated frequency response matrix +> A +> C +> G +> T +0.77 0.15 -0.04 -0.04 +0.76 1.02 -0.05 -0.06 +-0.10 -0.10 1.17 -0.03 +-0.13 -0.12 0.80 1.27 +""" + s_matrix = os.path.join(matrix_dir, 's_matrix.txt') + f = open(s_matrix, 'w') + f.write(contents) + f.close() + +def make_phasing_params(bustard_dir): + for lane in range(1,9): + pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) + f = open(pathname, 'w') + f.write(""" + 0.009900 + 0.003500 + +""") + f.close() + +def make_gerald_config(gerald_dir): + config_xml = """ + + default + + + + + Need_to_specify_ELAND_genome_directory + 8 + + domain.com + diane + localhost:25 + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane + /home/diane/gec + 1 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes + Need_to_specify_genome_file_name + genome + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane + + _prb.txt + 12 + '((CHASTITY>=0.6))' + _qhg.txt + --symbolic + 32 + --scarf + _seq.txt + _sig2.txt + _sig.txt + @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp + s_[1-8]_[0-9][0-9][0-9][0-9] + s + Sat Apr 19 19:08:30 2008 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald + all + http://host.domain.com/yourshare/ + + + + eland + eland + eland + eland + eland + eland + eland + eland + + + /g/dm3 + /g/equcab1 + /g/equcab1 + /g/canfam2 + /g/hg18 + /g/hg18 + /g/hg18 + /g/hg18 + + + 32 + 32 + 32 + 32 + 32 + 32 + 32 + 32 + + + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + + + +""" + pathname = os.path.join(gerald_dir, 'config.xml') + f = open(pathname,'w') + f.write(config_xml) + f.close() + + +def make_summary_htm(gerald_dir): + summary_htm = """ + + + + +

080416_HWI-EAS229_0024_207BTAAXX Summary

+

Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229

+



Chip Summary

+ + + + +
MachineHWI-EAS229
Run Folder080416_HWI-EAS229_0024_207BTAAXX
Chip IDunknown
+



Lane Parameter Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LaneSample IDSample TargetSample TypeLengthFilterTiles
1unknowndm3ELAND32'((CHASTITY>=0.6))'Lane 1
2unknownequcab1ELAND32'((CHASTITY>=0.6))'Lane 2
3unknownequcab1ELAND32'((CHASTITY>=0.6))'Lane 3
4unknowncanfam2ELAND32'((CHASTITY>=0.6))'Lane 4
5unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 5
6unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 6
7unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 7
8unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 8
+



Lane Results Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lane Clusters Av 1st Cycle Int % intensity after 20 cycles % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
117421 +/- 21397230 +/- 80123.73 +/- 10.7913.00 +/- 22.9132.03 +/- 18.456703.57 +/- 3753.854.55 +/- 4.81
220311 +/- 24027660 +/- 67817.03 +/- 4.4040.74 +/- 30.3329.54 +/- 9.035184.02 +/- 1631.543.27 +/- 3.94
320193 +/- 23997700 +/- 79715.75 +/- 3.3056.56 +/- 17.1627.33 +/- 7.484803.49 +/- 1313.313.07 +/- 2.86
415537 +/- 25317620 +/- 139215.37 +/- 3.7963.05 +/- 18.3015.88 +/- 4.993162.13 +/- 962.593.11 +/- 2.22
532047 +/- 33568093 +/- 83123.79 +/- 6.1853.36 +/- 18.0648.04 +/- 13.779866.23 +/- 2877.302.26 +/- 1.16
632946 +/- 47538227 +/- 73624.07 +/- 4.6954.65 +/- 12.5750.98 +/- 10.5410468.86 +/- 2228.532.21 +/- 2.33
739504 +/- 41718401 +/- 78522.55 +/- 4.5645.22 +/- 10.3448.41 +/- 9.679829.40 +/- 1993.202.26 +/- 1.11
837998 +/- 37928443 +/- 121139.03 +/- 7.5242.16 +/- 12.3540.98 +/- 14.898128.87 +/- 3055.343.57 +/- 2.77
+ + +""" + pathname = os.path.join(gerald_dir, 'Summary.htm') + f = open(pathname, 'w') + f.write(summary_htm) + f.close() + +def make_eland_results(gerald_dir): + eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T +>HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T +""" + for i in range(1,9): + pathname = os.path.join(gerald_dir, + 's_%d_eland_result.txt' % (i,)) + f = open(pathname, 'w') + f.write(eland_result) + f.close() + +class RunfolderTests(unittest.TestCase): + """ + Test components of the runfolder processing code + which includes firecrest, bustard, and gerald + """ + def setUp(self): + # make a fake runfolder directory + self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') + + self.runfolder_dir = os.path.join(self.temp_dir, + '080102_HWI-EAS229_0010_207BTAAXX') + os.mkdir(self.runfolder_dir) + + self.data_dir = os.path.join(self.runfolder_dir, 'Data') + os.mkdir(self.data_dir) + + self.firecrest_dir = os.path.join(self.data_dir, + 'C1-33_Firecrest1.8.28_12-04-2008_diane' + ) + os.mkdir(self.firecrest_dir) + self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix') + os.mkdir(self.matrix_dir) + make_matrix(self.matrix_dir) + + self.bustard_dir = os.path.join(self.firecrest_dir, + 'Bustard1.8.28_12-04-2008_diane') + os.mkdir(self.bustard_dir) + make_phasing_params(self.bustard_dir) + + self.gerald_dir = os.path.join(self.bustard_dir, + 'GERALD_12-04-2008_diane') + os.mkdir(self.gerald_dir) + make_gerald_config(self.gerald_dir) + make_summary_htm(self.gerald_dir) + make_eland_results(self.gerald_dir) + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_firecrest(self): + """ + Construct a firecrest object + """ + f = firecrest.firecrest(self.firecrest_dir) + self.failUnlessEqual(f.version, '1.8.28') + self.failUnlessEqual(f.start, 1) + self.failUnlessEqual(f.stop, 33) + self.failUnlessEqual(f.user, 'diane') + self.failUnlessEqual(f.date, date(2008,4,12)) + + xml = f.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + + f2 = firecrest.Firecrest(xml=xml) + self.failUnlessEqual(f.version, f2.version) + self.failUnlessEqual(f.start, f2.start) + self.failUnlessEqual(f.stop, f2.stop) + self.failUnlessEqual(f.user, f2.user) + self.failUnlessEqual(f.date, f2.date) + + def test_bustard(self): + """ + construct a bustard object + """ + b = bustard.bustard(self.bustard_dir) + self.failUnlessEqual(b.version, '1.8.28') + self.failUnlessEqual(b.date, date(2008,4,12)) + self.failUnlessEqual(b.user, 'diane') + self.failUnlessEqual(len(b.phasing), 8) + self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) + + xml = b.get_elements() + b2 = bustard.Bustard(xml=xml) + self.failUnlessEqual(b.version, b2.version) + self.failUnlessEqual(b.date, b2.date ) + self.failUnlessEqual(b.user, b2.user) + self.failUnlessEqual(len(b.phasing), len(b2.phasing)) + for key in b.phasing.keys(): + self.failUnlessEqual(b.phasing[key].lane, + b2.phasing[key].lane) + self.failUnlessEqual(b.phasing[key].phasing, + b2.phasing[key].phasing) + self.failUnlessEqual(b.phasing[key].prephasing, + b2.phasing[key].prephasing) + + def test_gerald(self): + # need to update gerald and make tests for it + g = gerald.gerald(self.gerald_dir) + + self.failUnlessEqual(g.version, + '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') + self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) + self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) + self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) + + + # list of genomes, matches what was defined up in + # make_gerald_config. + # the first None is to offset the genomes list to be 1..9 + # instead of pythons default 0..8 + genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', + '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] + + # test lane specific parameters from gerald config file + for i in range(1,9): + cur_lane = g.lanes[str(i)] + self.failUnlessEqual(cur_lane.analysis, 'eland') + self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) + self.failUnlessEqual(cur_lane.read_length, '32') + self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) + + # test data extracted from summary file + clusters = [None, + (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531), + (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)] + + for i in range(1,9): + summary_lane = g.summary[str(i)] + self.failUnlessEqual(summary_lane.cluster, clusters[i]) + self.failUnlessEqual(summary_lane.lane, str(i)) + + xml = g.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + g2 = gerald.Gerald(xml=xml) + + # do it all again after extracting from the xml file + self.failUnlessEqual(g.version, g2.version) + self.failUnlessEqual(g.date, g2.date) + self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) + self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) + + # test lane specific parameters from gerald config file + for i in range(1,9): + g_lane = g.lanes[str(i)] + g2_lane = g2.lanes[str(i)] + self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) + self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) + self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) + self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) + + # test (some) summary elements + for i in range(1,9): + g_summary = g.summary[str(i)] + g2_summary = g2.summary[str(i)] + self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) + self.failUnlessEqual(g_summary.lane, g2_summary.lane) + + g_eland = g.eland_results + g2_eland = g2.eland_results + for lane in g_eland.keys(): + self.failUnlessEqual(g_eland[lane].reads, + g2_eland[lane].reads) + self.failUnlessEqual(len(g_eland[lane].mapped_reads), + len(g2_eland[lane].mapped_reads)) + for k in g_eland[lane].mapped_reads.keys(): + self.failUnlessEqual(g_eland[lane].mapped_reads[k], + g2_eland[lane].mapped_reads[k]) + + self.failUnlessEqual(len(g_eland[lane].match_codes), + len(g2_eland[lane].match_codes)) + for k in g_eland[lane].match_codes.keys(): + self.failUnlessEqual(g_eland[lane].match_codes[k], + g2_eland[lane].match_codes[k]) + + + def test_eland(self): + dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', + 'chr2L.fa': 'dm3/chr2L.fa', + 'Lambda.fa': 'Lambda.fa'} + genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, + '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } + eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) + + for i in range(1,9): + lane = eland[str(i)] + self.failUnlessEqual(lane.reads, 4) + self.failUnlessEqual(lane.sample_name, "s") + self.failUnlessEqual(lane.lane_id, unicode(i)) + self.failUnlessEqual(len(lane.mapped_reads), 3) + self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) + self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) + self.failUnlessEqual(lane.match_codes['U1'], 2) + self.failUnlessEqual(lane.match_codes['NM'], 1) + + xml = eland.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + e2 = gerald.ELAND(xml=xml) + + for i in range(1,9): + l1 = eland[str(i)] + l2 = e2[str(i)] + self.failUnlessEqual(l1.reads, l2.reads) + self.failUnlessEqual(l1.sample_name, l2.sample_name) + self.failUnlessEqual(l1.lane_id, l2.lane_id) + self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) + self.failUnlessEqual(len(l1.mapped_reads), 3) + for k in l1.mapped_reads.keys(): + self.failUnlessEqual(l1.mapped_reads[k], + l2.mapped_reads[k]) + + self.failUnlessEqual(len(l1.match_codes), 9) + self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) + for k in l1.match_codes.keys(): + self.failUnlessEqual(l1.match_codes[k], + l2.match_codes[k]) + + def test_runfolder(self): + runs = runfolder.get_runs(self.runfolder_dir) + + # do we get the flowcell id from the filename? + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml') + + # do we get the flowcell id from the FlowcellId.xml file + make_flowcell_id(self.runfolder_dir, '207BTAAXY') + runs = runfolder.get_runs(self.runfolder_dir) + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml') + + r1 = runs[0] + xml = r1.get_elements() + xml_str = ElementTree.tostring(xml) + + r2 = runfolder.PipelineRun(xml=xml) + self.failUnlessEqual(r1.name, r2.name) + self.failIfEqual(r2.firecrest, None) + self.failIfEqual(r2.bustard, None) + self.failIfEqual(r2.gerald, None) + + +def suite(): + return unittest.makeSuite(RunfolderTests,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") + diff --git a/htswdataprod/htswdataprod/illumina/test/test_runfolder030.py b/htswdataprod/htswdataprod/illumina/test/test_runfolder030.py new file mode 100644 index 0000000..bfe6257 --- /dev/null +++ b/htswdataprod/htswdataprod/illumina/test/test_runfolder030.py @@ -0,0 +1,1024 @@ +#!/usr/bin/env python + +from datetime import datetime, date +import os +import tempfile +import shutil +import unittest + +from htswdataprod.illumina import firecrest +from htswdataprod.illumina import bustard +from htswdataprod.illumina import gerald +from htswdataprod.illumina import runfolder +from htswdataprod.illumina.runfolder import ElementTree + + +def make_flowcell_id(runfolder_dir, flowcell_id=None): + if flowcell_id is None: + flowcell_id = '207BTAAXY' + + config = """ + + %s +""" % (flowcell_id,) + config_dir = os.path.join(runfolder_dir, 'Config') + + if not os.path.exists(config_dir): + os.mkdir(config_dir) + pathname = os.path.join(config_dir, 'FlowcellId.xml') + f = open(pathname,'w') + f.write(config) + f.close() + +def make_matrix(matrix_dir): + contents = """# Auto-generated frequency response matrix +> A +> C +> G +> T +0.77 0.15 -0.04 -0.04 +0.76 1.02 -0.05 -0.06 +-0.10 -0.10 1.17 -0.03 +-0.13 -0.12 0.80 1.27 +""" + s_matrix = os.path.join(matrix_dir, 's_matrix.txt') + f = open(s_matrix, 'w') + f.write(contents) + f.close() + +def make_phasing_params(bustard_dir): + for lane in range(1,9): + pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) + f = open(pathname, 'w') + f.write(""" + 0.009900 + 0.003500 + +""") + f.close() + +def make_gerald_config(gerald_dir): + config_xml = """ + + default + + + + + Need_to_specify_ELAND_genome_directory + 8 + + domain.com + diane + localhost:25 + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane + /home/diane/gec + 1 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes + Need_to_specify_genome_file_name + genome + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane + + _prb.txt + 12 + '((CHASTITY>=0.6))' + _qhg.txt + --symbolic + 32 + --scarf + _seq.txt + _sig2.txt + _sig.txt + @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp + s_[1-8]_[0-9][0-9][0-9][0-9] + s + Sat Apr 19 19:08:30 2008 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald + all + http://host.domain.com/yourshare/ + + + + eland + eland + eland + eland + eland + eland + eland + eland + + + /g/dm3 + /g/equcab1 + /g/equcab1 + /g/canfam2 + /g/hg18 + /g/hg18 + /g/hg18 + /g/hg18 + + + 32 + 32 + 32 + 32 + 32 + 32 + 32 + 32 + + + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + + + +""" + pathname = os.path.join(gerald_dir, 'config.xml') + f = open(pathname,'w') + f.write(config_xml) + f.close() + +def make_summary_htm(gerald_dir): + summary_htm=""" + + + + +

080627_HWI-EAS229_0036_3055HAXX Summary

+

Summary Information For Experiment 080627_HWI-EAS229_0036_3055HAXX on Machine HWI-EAS229

+



Chip Summary

+ + + + +
MachineHWI-EAS229
Run Folder080627_HWI-EAS229_0036_3055HAXX
Chip IDunknown
+



Chip Results Summary

+ + + + + + + + + + +
ClustersClusters (PF)Yield (kbases)
80933224435778031133022
+



Lane Parameter Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LaneSample IDSample TargetSample TypeLengthFilterNum TilesTiles
1unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 1
2unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 2
3unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 3
4unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 4
5unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 5
6unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 6
7unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 7
8unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 8
+



Lane Results Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lane InfoTile Mean +/- SD for Lane
Lane Lane Yield (kbases) Clusters (raw)Clusters (PF) 1st Cycle Int (PF) % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Alignment Score (PF) % Error Rate (PF)
115804696483 +/- 907460787 +/- 4240329 +/- 35101.88 +/- 6.0363.21 +/- 3.2970.33 +/- 0.249054.08 +/- 59.160.46 +/- 0.18
2156564133738 +/- 793860217 +/- 1926444 +/- 3992.62 +/- 7.5845.20 +/- 3.3151.98 +/- 0.746692.04 +/- 92.490.46 +/- 0.09
3185818152142 +/- 1000271468 +/- 2827366 +/- 3691.53 +/- 8.6647.19 +/- 3.8082.24 +/- 0.4410598.68 +/- 64.130.41 +/- 0.04
43495315784 +/- 216213443 +/- 1728328 +/- 4097.53 +/- 9.8785.29 +/- 1.9180.02 +/- 0.5310368.82 +/- 71.080.15 +/- 0.05
5167936119735 +/- 846564590 +/- 2529417 +/- 3788.69 +/- 14.7954.10 +/- 2.5976.95 +/- 0.329936.47 +/- 65.750.28 +/- 0.02
6173463152177 +/- 814666716 +/- 2493372 +/- 3987.06 +/- 9.8643.98 +/- 3.1278.80 +/- 0.4310162.28 +/- 49.650.38 +/- 0.03
714928784649 +/- 732557418 +/- 3617295 +/- 2889.40 +/- 8.2367.97 +/- 1.8233.38 +/- 0.254247.92 +/- 32.371.00 +/- 0.03
810695354622 +/- 481241136 +/- 3309284 +/- 3790.21 +/- 9.1075.39 +/- 2.2748.33 +/- 0.296169.21 +/- 169.500.86 +/- 1.22
Tile mean across chip
Av.1011665447235492.3660.2965.258403.690.50
+



Expanded Lane Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lane InfoPhasing InfoRaw Data (tile mean)Filtered Data (tile mean)
Lane Clusters (tile mean) (raw)% Phasing % Prephasing % Error Rate (raw) Equiv Perfect Clusters (raw) % retained Cycle 2-4 Av Int (PF) Cycle 2-10 Av % Loss (PF) Cycle 10-20 Av % Loss (PF) % Align (PF) % Error Rate (PF) Equiv Perfect Clusters (PF)
1964830.77000.31001.004967663.21317 +/- 320.13 +/- 0.44-1.14 +/- 0.3470.330.4641758
21337380.77000.31001.224046745.20415 +/- 330.29 +/- 0.40-0.79 +/- 0.3551.980.4630615
31521420.77000.31001.307858847.19344 +/- 260.68 +/- 0.51-0.77 +/- 0.4282.240.4157552
4157840.77000.31000.291109585.29306 +/- 340.20 +/- 0.69-1.28 +/- 0.6680.020.1510671
51197350.77000.31000.856033554.10380 +/- 320.34 +/- 0.49-1.55 +/- 4.6976.950.2849015
61521770.77000.31001.217090543.98333 +/- 270.57 +/- 0.50-0.91 +/- 0.3978.800.3851663
7846490.77000.31001.382106967.97272 +/- 201.15 +/- 0.52-0.84 +/- 0.5833.381.0018265
8546220.77000.31001.172133575.39262 +/- 311.10 +/- 0.59-1.01 +/- 0.4748.330.8619104
+

IVC Plots
+

IVC.htm +

+

All Intensity Plots
+

All.htm +

+

Error graphs:
+

Error.htm +

+Back to top +



Lane 1

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
10001114972326.4894.3957.4470.29038.60.44
+Back to top +



Lane 2

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
20001147793448.1283.6838.5753.76905.40.54
+Back to top +



Lane 3

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
30001167904374.0586.9140.3681.310465.00.47
+Back to top +



Lane 4

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
4000120308276.8592.8784.2680.410413.80.16
+Back to top +



Lane 5

+ + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
+Back to top +



Lane 6

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
60001166844348.1277.5938.1379.710264.40.44
+Back to top +



Lane 7

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
7000198913269.9086.6664.5533.24217.51.02
+Back to top +



Lane 8

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
8000164972243.6089.4073.1748.36182.80.71
+Back to top + + +""" + pathname = os.path.join(gerald_dir, 'Summary.htm') + f = open(pathname, 'w') + f.write(summary_htm) + f.close() + +def make_eland_results(gerald_dir): + eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T +>HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T +""" + for i in range(1,9): + pathname = os.path.join(gerald_dir, + 's_%d_eland_result.txt' % (i,)) + f = open(pathname, 'w') + f.write(eland_result) + f.close() + +def make_runfolder(obj=None): + """ + Make a fake runfolder, attach all the directories to obj if defined + """ + # make a fake runfolder directory + temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') + + runfolder_dir = os.path.join(temp_dir, + '080102_HWI-EAS229_0010_207BTAAXX') + os.mkdir(runfolder_dir) + + data_dir = os.path.join(runfolder_dir, 'Data') + os.mkdir(data_dir) + + firecrest_dir = os.path.join(data_dir, + 'C1-33_Firecrest1.8.28_12-04-2008_diane' + ) + os.mkdir(firecrest_dir) + matrix_dir = os.path.join(firecrest_dir, 'Matrix') + os.mkdir(matrix_dir) + make_matrix(matrix_dir) + + bustard_dir = os.path.join(firecrest_dir, + 'Bustard1.8.28_12-04-2008_diane') + os.mkdir(bustard_dir) + make_phasing_params(bustard_dir) + + gerald_dir = os.path.join(bustard_dir, + 'GERALD_12-04-2008_diane') + os.mkdir(gerald_dir) + make_gerald_config(gerald_dir) + make_summary_htm(gerald_dir) + make_eland_results(gerald_dir) + + if obj is not None: + obj.temp_dir = temp_dir + obj.runfolder_dir = runfolder_dir + obj.data_dir = data_dir + obj.firecrest_dir = firecrest_dir + obj.matrix_dir = matrix_dir + obj.bustard_dir = bustard_dir + obj.gerald_dir = gerald_dir + + +class RunfolderTests(unittest.TestCase): + """ + Test components of the runfolder processing code + which includes firecrest, bustard, and gerald + """ + def setUp(self): + # attaches all the directories to the object passed in + make_runfolder(self) + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_firecrest(self): + """ + Construct a firecrest object + """ + f = firecrest.firecrest(self.firecrest_dir) + self.failUnlessEqual(f.version, '1.8.28') + self.failUnlessEqual(f.start, 1) + self.failUnlessEqual(f.stop, 33) + self.failUnlessEqual(f.user, 'diane') + self.failUnlessEqual(f.date, date(2008,4,12)) + + xml = f.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + + f2 = firecrest.Firecrest(xml=xml) + self.failUnlessEqual(f.version, f2.version) + self.failUnlessEqual(f.start, f2.start) + self.failUnlessEqual(f.stop, f2.stop) + self.failUnlessEqual(f.user, f2.user) + self.failUnlessEqual(f.date, f2.date) + + def test_bustard(self): + """ + construct a bustard object + """ + b = bustard.bustard(self.bustard_dir) + self.failUnlessEqual(b.version, '1.8.28') + self.failUnlessEqual(b.date, date(2008,4,12)) + self.failUnlessEqual(b.user, 'diane') + self.failUnlessEqual(len(b.phasing), 8) + self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) + + xml = b.get_elements() + b2 = bustard.Bustard(xml=xml) + self.failUnlessEqual(b.version, b2.version) + self.failUnlessEqual(b.date, b2.date ) + self.failUnlessEqual(b.user, b2.user) + self.failUnlessEqual(len(b.phasing), len(b2.phasing)) + for key in b.phasing.keys(): + self.failUnlessEqual(b.phasing[key].lane, + b2.phasing[key].lane) + self.failUnlessEqual(b.phasing[key].phasing, + b2.phasing[key].phasing) + self.failUnlessEqual(b.phasing[key].prephasing, + b2.phasing[key].prephasing) + + def test_gerald(self): + # need to update gerald and make tests for it + g = gerald.gerald(self.gerald_dir) + + self.failUnlessEqual(g.version, + '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') + self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) + self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) + self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) + + + # list of genomes, matches what was defined up in + # make_gerald_config. + # the first None is to offset the genomes list to be 1..9 + # instead of pythons default 0..8 + genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', + '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] + + # test lane specific parameters from gerald config file + for i in range(1,9): + cur_lane = g.lanes[str(i)] + self.failUnlessEqual(cur_lane.analysis, 'eland') + self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) + self.failUnlessEqual(cur_lane.read_length, '32') + self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) + + # test data extracted from summary file + clusters = [None, + (96483, 9074), (133738, 7938), + (152142, 10002), (15784, 2162), + (119735, 8465), (152177, 8146), + (84649, 7325), (54622, 4812),] + + for i in range(1,9): + summary_lane = g.summary[str(i)] + self.failUnlessEqual(summary_lane.cluster, clusters[i]) + self.failUnlessEqual(summary_lane.lane, str(i)) + + xml = g.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + g2 = gerald.Gerald(xml=xml) + + # do it all again after extracting from the xml file + self.failUnlessEqual(g.version, g2.version) + self.failUnlessEqual(g.date, g2.date) + self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) + self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) + + # test lane specific parameters from gerald config file + for i in range(1,9): + g_lane = g.lanes[str(i)] + g2_lane = g2.lanes[str(i)] + self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) + self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) + self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) + self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) + + # test (some) summary elements + for i in range(1,9): + g_summary = g.summary[str(i)] + g2_summary = g2.summary[str(i)] + self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) + self.failUnlessEqual(g_summary.lane, g2_summary.lane) + + g_eland = g.eland_results + g2_eland = g2.eland_results + for lane in g_eland.keys(): + self.failUnlessEqual(g_eland[lane].reads, + g2_eland[lane].reads) + self.failUnlessEqual(len(g_eland[lane].mapped_reads), + len(g2_eland[lane].mapped_reads)) + for k in g_eland[lane].mapped_reads.keys(): + self.failUnlessEqual(g_eland[lane].mapped_reads[k], + g2_eland[lane].mapped_reads[k]) + + self.failUnlessEqual(len(g_eland[lane].match_codes), + len(g2_eland[lane].match_codes)) + for k in g_eland[lane].match_codes.keys(): + self.failUnlessEqual(g_eland[lane].match_codes[k], + g2_eland[lane].match_codes[k]) + + + def test_eland(self): + dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', + 'chr2L.fa': 'dm3/chr2L.fa', + 'Lambda.fa': 'Lambda.fa'} + genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, + '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } + eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) + + for i in range(1,9): + lane = eland[str(i)] + self.failUnlessEqual(lane.reads, 4) + self.failUnlessEqual(lane.sample_name, "s") + self.failUnlessEqual(lane.lane_id, unicode(i)) + self.failUnlessEqual(len(lane.mapped_reads), 3) + self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) + self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) + self.failUnlessEqual(lane.match_codes['U1'], 2) + self.failUnlessEqual(lane.match_codes['NM'], 1) + + xml = eland.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + e2 = gerald.ELAND(xml=xml) + + for i in range(1,9): + l1 = eland[str(i)] + l2 = e2[str(i)] + self.failUnlessEqual(l1.reads, l2.reads) + self.failUnlessEqual(l1.sample_name, l2.sample_name) + self.failUnlessEqual(l1.lane_id, l2.lane_id) + self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) + self.failUnlessEqual(len(l1.mapped_reads), 3) + for k in l1.mapped_reads.keys(): + self.failUnlessEqual(l1.mapped_reads[k], + l2.mapped_reads[k]) + + self.failUnlessEqual(len(l1.match_codes), 9) + self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) + for k in l1.match_codes.keys(): + self.failUnlessEqual(l1.match_codes[k], + l2.match_codes[k]) + + def test_runfolder(self): + runs = runfolder.get_runs(self.runfolder_dir) + + # do we get the flowcell id from the filename? + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml') + + # do we get the flowcell id from the FlowcellId.xml file + make_flowcell_id(self.runfolder_dir, '207BTAAXY') + runs = runfolder.get_runs(self.runfolder_dir) + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml') + + r1 = runs[0] + xml = r1.get_elements() + xml_str = ElementTree.tostring(xml) + + r2 = runfolder.PipelineRun(xml=xml) + self.failUnlessEqual(r1.name, r2.name) + self.failIfEqual(r2.firecrest, None) + self.failIfEqual(r2.bustard, None) + self.failIfEqual(r2.gerald, None) + + +def suite(): + return unittest.makeSuite(RunfolderTests,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") + diff --git a/htswdataprod/scripts/eland_makebed b/htswdataprod/scripts/eland_makebed new file mode 100755 index 0000000..a4a414b --- /dev/null +++ b/htswdataprod/scripts/eland_makebed @@ -0,0 +1,106 @@ +#!/usr/bin/python +import optparse +import sys +import os + +from gaworkflow.util.makebed import make_bed_from_eland_stream, make_bed_from_multi_eland_stream, make_description + +def make_parser(): + parser = optparse.OptionParser() + parser.add_option('-e', '--eland', dest='inname', + help='specify input eland filename') + parser.add_option('-b', '--bed', dest='outname', + help='specify output befilename') + parser.add_option('-n', '--name', dest='name', + help='specify the track (short) name.', + default=None) + parser.add_option('-d', '--description', dest='description', + help='specify the track description', + default=None) + parser.add_option('--chromosome', dest='prefix', + help='Set the chromosome prefix name. defaults to "chr"', + default='chr') + parser.add_option("--database", dest='database', + help="specify location of fctracker database", + default=None) + parser.add_option("--flowcell", dest='flowcell', + help="compute name and description from database using flowcell id", + default=None) + parser.add_option("--lane", dest='lane', + help='specify which lane to use when retrieving description from database', + default=None) + + multi = optparse.OptionGroup(parser, 'Multi-read ELAND support') + + multi.add_option('-m', '--multi', action='store_true', + help='Enable parsing multi-read eland files', + default=False) + multi.add_option('--reads', type='int', + help='limit reporting multi reads to this many reads' + '(most usefully --reads=1 will turn a multi-read ' + 'file into a single read file)', + default=255) + parser.add_option_group(multi) + + return parser + +def main(command_line=None): + if command_line is None: + command_line = sys.argv[1:] + + parser = make_parser() + (options, args) = parser.parse_args(command_line) + + if options.inname is None: + parser.error("Need eland input file name") + return 1 + + if options.inname == '-': + instream = sys.stdin + elif os.path.exists(options.inname): + instream = open(options.inname, 'r') + else: + parser.error('%s was not found' % (options.inname)) + return 1 + + if options.outname is None: + # if outname wasn't defined, and we're reading from stdout + if instream is sys.stdin: + # write to stdout + outstream = sys.stdout + else: + # if there's a name write to name.bde + options.outname = os.path.splitext(options.inname)[0]+'.bed' + print >>sys.stderr, "defaulting to outputname", options.outname + elif options.outname == '-': + outstream = sys.stdout + elif os.path.exists(options.outname): + parser.error("not overwriting %s" % (options.outname)) + return 1 + else: + outstream = open(options.outname, 'w') + + if options.flowcell is not None and options.lane is not None: + # get our name/description out of the database + name, description = make_description( + options.database, options.flowcell, options.lane + ) + else: + name = options.name + description = options.description + + if options.multi: + make_bed_from_multi_eland_stream(instream, outstream, + name, description, + options.prefix, + options.reads) + + else: + make_bed_from_eland_stream(instream, outstream, + name, description, + options.prefix) + return 0 + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) + diff --git a/runtests.sh b/runtests.sh new file mode 100755 index 0000000..3225cd7 --- /dev/null +++ b/runtests.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +if [ -z $HTSW_ROOT ]; then + HTSW_ROOT=$(pwd) +fi + +PYTHONPATH=${HTSW_ROOT}/htswcommon:${HTSW_ROOT}/htswdataprod:${HTSW_ROOT}/htswfrontend:$PYTHONPATH + +nosetests -w ${HTSW_ROOT}/htswcommon +nosetests -w ${HTSW_ROOT}/htswdataprod -- 2.30.2