X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneStartBins.py;fp=geneStartBins.py;h=d929d41a89a2da712b1ee953770e0b01bb2f96bb;hp=cbb3c4a37d9d74cbbb356038c80912a6643200f1;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/geneStartBins.py b/geneStartBins.py index cbb3c4a..d929d41 100755 --- a/geneStartBins.py +++ b/geneStartBins.py @@ -9,13 +9,13 @@ try: except: pass -# originally from version 1.3 of geneDownstreamBins.py +import sys from commoncode import * +import ReadDataset from cistematic.genomes import Genome -from cistematic.core.geneinfo import geneinfoDB -import sys -print '%s: version 2.0' % sys.argv[0] + +print "geneStartBins: version 2.1" if len(sys.argv) < 4: print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0] print '\n\twhere regionSize is the optional maximum region in bp\n' @@ -43,7 +43,7 @@ if '-cache' in sys.argv: bins = 10 standardMinThresh = standardMinDist / bins -hitRDS = readDataset(hitfile, verbose = True, cache=doCache) +hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache) readlen = hitRDS.getReadSize() normalizationFactor = 1.0 if normalize: @@ -51,13 +51,10 @@ if normalize: normalizationFactor = totalCount / 1000000. hg = Genome(genome) -idb = geneinfoDB(cache=True) - gidDict = {} -geneinfoDict = idb.getallGeneInfo(genome) +geneinfoDict = getGeneInfoDict(genome, cache=True) featuresDict = hg.getallGeneFeatures() -#infile = open(infilename) outfile = open(outfilename,'w') gidList = hg.allGIDs() @@ -72,46 +69,58 @@ for gid in gidList: symbol = geneinfo[0][0] except: print geneinfo + newfeatureList = [] if len(featureList) == 0: continue + for (ftype, chrom, start, stop, fsense) in featureList: if (start, stop) not in newfeatureList: newfeatureList.append((start, stop)) + if chrom not in hitDict: continue + newfeatureList.sort() if len(newfeatureList) < 1: - #print '%s %s %d' % (gid, symbol, -1) - #outfile.write('%s\t%s\t%d\n' % (gid, symbol, -1)) continue + glen = standardMinDist / 2 if fsense == 'F': nextGene = hg.leftGeneDistance((genome, gid), glen * 2) if nextGene < glen * 2: - glen = nextGene / 2 + glen = nextGene / 2 + if glen < 1: - glen = 1 + glen = 1 + gstart = newfeatureList[0][0] - glen if gstart < 0: - gstart = 0 + gstart = 0 + gstop = newfeatureList[0][0] + glen else: nextGene = hg.rightGeneDistance((genome, gid), glen * 2) if nextGene < glen * 2: glen = nextGene / 2 + if glen < 1: glen = 1 + gstart = newfeatureList[-1][1] - glen gstop = newfeatureList[-1][1] + glen tagCount = 0 if glen < standardMinDist / 2: continue + binList = [0] * bins - for (tagStart, sense, weight) in hitDict[chrom]: - tagStart -= gstart + for read in hitDict[chrom]: + tagStart = read["start"] - gstart + sense = read["sense"] + weight = read["weight"] if tagStart >= 2 * glen: break + if tagStart > 0: tagCount += weight if fsense == 'R': @@ -122,13 +131,16 @@ for gid in gidList: rdist = 2 * glen - tagStart binID = rdist / standardMinThresh binList[binID] += weight + if tagCount < 2: continue + print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList)) outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen)) for binAmount in binList: outfile.write('\t%d' % binAmount) + outfile.write('\n') -#infile.close() + outfile.close()