X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneStartBins.py;h=3c5183730c288f9c61e29f41fb1df69c3e0f992f;hp=cbb3c4a37d9d74cbbb356038c80912a6643200f1;hb=HEAD;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/geneStartBins.py b/geneStartBins.py index cbb3c4a..3c51837 100755 --- a/geneStartBins.py +++ b/geneStartBins.py @@ -1,21 +1,15 @@ -# -# geneStartBins.py -# ENRAGE -# - try: import psyco psyco.full() except: pass -# originally from version 1.3 of geneDownstreamBins.py -from commoncode import * -from cistematic.genomes import Genome -from cistematic.core.geneinfo import geneinfoDB import sys +from commoncode import getReadSense, getGeneInfoDict, getHeaderComment +import pysam +from cistematic.genomes import Genome -print '%s: version 2.0' % sys.argv[0] +print "geneStartBins: version 2.1" if len(sys.argv) < 4: print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0] print '\n\twhere regionSize is the optional maximum region in bp\n' @@ -24,7 +18,6 @@ if len(sys.argv) < 4: genome = sys.argv[1] hitfile = sys.argv[2] outfilename = sys.argv[3] - standardMinDist = 3000 if '-max' in sys.argv: standardMinDist = int(sys.argv[sys.argv.index('-max') + 1]) @@ -42,26 +35,21 @@ if '-cache' in sys.argv: bins = 10 standardMinThresh = standardMinDist / bins - -hitRDS = readDataset(hitfile, verbose = True, cache=doCache) -readlen = hitRDS.getReadSize() +bamfile = pysam.Samfile(hitfile, "rb") +readlen = getHeaderComment(bamfile.header, "ReadLength") normalizationFactor = 1.0 if normalize: - totalCount = len(hitRDS) + totalCount = getHeaderComment(bamfile.header, "Total") normalizationFactor = totalCount / 1000000. hg = Genome(genome) -idb = geneinfoDB(cache=True) - gidDict = {} -geneinfoDict = idb.getallGeneInfo(genome) +geneinfoDict = getGeneInfoDict(genome, cache=True) featuresDict = hg.getallGeneFeatures() - -#infile = open(infilename) outfile = open(outfilename,'w') - gidList = hg.allGIDs() gidList.sort() +chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"] for gid in gidList: symbol = 'LOC' + gid geneinfo = '' @@ -72,46 +60,59 @@ for gid in gidList: symbol = geneinfo[0][0] except: print geneinfo + newfeatureList = [] if len(featureList) == 0: continue + for (ftype, chrom, start, stop, fsense) in featureList: if (start, stop) not in newfeatureList: newfeatureList.append((start, stop)) - if chrom not in hitDict: + + if chrom not in chromosomeList: continue + newfeatureList.sort() if len(newfeatureList) < 1: - #print '%s %s %d' % (gid, symbol, -1) - #outfile.write('%s\t%s\t%d\n' % (gid, symbol, -1)) continue + glen = standardMinDist / 2 if fsense == 'F': nextGene = hg.leftGeneDistance((genome, gid), glen * 2) if nextGene < glen * 2: - glen = nextGene / 2 + glen = nextGene / 2 + if glen < 1: - glen = 1 + glen = 1 + gstart = newfeatureList[0][0] - glen if gstart < 0: - gstart = 0 + gstart = 0 + gstop = newfeatureList[0][0] + glen else: nextGene = hg.rightGeneDistance((genome, gid), glen * 2) if nextGene < glen * 2: glen = nextGene / 2 + if glen < 1: glen = 1 + gstart = newfeatureList[-1][1] - glen gstop = newfeatureList[-1][1] + glen + tagCount = 0 if glen < standardMinDist / 2: continue + binList = [0] * bins - for (tagStart, sense, weight) in hitDict[chrom]: - tagStart -= gstart + for alignedread in bamfile.fetch(chrom): + tagStart = alignedread.pos - gstart + sense = getReadSense(alignedread) + weight = 1.0/alignedread.opt('NH') if tagStart >= 2 * glen: break + if tagStart > 0: tagCount += weight if fsense == 'R': @@ -122,13 +123,16 @@ for gid in gidList: rdist = 2 * glen - tagStart binID = rdist / standardMinThresh binList[binID] += weight + if tagCount < 2: continue + print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList)) outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen)) for binAmount in binList: outfile.write('\t%d' % binAmount) + outfile.write('\n') -#infile.close() + outfile.close()