X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneStartBins.py;fp=geneStartBins.py;h=3c5183730c288f9c61e29f41fb1df69c3e0f992f;hp=d929d41a89a2da712b1ee953770e0b01bb2f96bb;hb=4ad5495359e4322da39868020a7398676261679e;hpb=cfc5602b26323ad2365295145e3f6c622d912eb4 diff --git a/geneStartBins.py b/geneStartBins.py index d929d41..3c51837 100755 --- a/geneStartBins.py +++ b/geneStartBins.py @@ -1,8 +1,3 @@ -# -# geneStartBins.py -# ENRAGE -# - try: import psyco psyco.full() @@ -10,11 +5,10 @@ except: pass import sys -from commoncode import * -import ReadDataset +from commoncode import getReadSense, getGeneInfoDict, getHeaderComment +import pysam from cistematic.genomes import Genome - print "geneStartBins: version 2.1" if len(sys.argv) < 4: print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0] @@ -24,7 +18,6 @@ if len(sys.argv) < 4: genome = sys.argv[1] hitfile = sys.argv[2] outfilename = sys.argv[3] - standardMinDist = 3000 if '-max' in sys.argv: standardMinDist = int(sys.argv[sys.argv.index('-max') + 1]) @@ -42,23 +35,21 @@ if '-cache' in sys.argv: bins = 10 standardMinThresh = standardMinDist / bins - -hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache) -readlen = hitRDS.getReadSize() +bamfile = pysam.Samfile(hitfile, "rb") +readlen = getHeaderComment(bamfile.header, "ReadLength") normalizationFactor = 1.0 if normalize: - totalCount = len(hitRDS) + totalCount = getHeaderComment(bamfile.header, "Total") normalizationFactor = totalCount / 1000000. hg = Genome(genome) gidDict = {} geneinfoDict = getGeneInfoDict(genome, cache=True) featuresDict = hg.getallGeneFeatures() - outfile = open(outfilename,'w') - gidList = hg.allGIDs() gidList.sort() +chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"] for gid in gidList: symbol = 'LOC' + gid geneinfo = '' @@ -78,7 +69,7 @@ for gid in gidList: if (start, stop) not in newfeatureList: newfeatureList.append((start, stop)) - if chrom not in hitDict: + if chrom not in chromosomeList: continue newfeatureList.sort() @@ -109,15 +100,16 @@ for gid in gidList: gstart = newfeatureList[-1][1] - glen gstop = newfeatureList[-1][1] + glen + tagCount = 0 if glen < standardMinDist / 2: continue binList = [0] * bins - for read in hitDict[chrom]: - tagStart = read["start"] - gstart - sense = read["sense"] - weight = read["weight"] + for alignedread in bamfile.fetch(chrom): + tagStart = alignedread.pos - gstart + sense = getReadSense(alignedread) + weight = 1.0/alignedread.opt('NH') if tagStart >= 2 * glen: break