13 from commoncode import *
15 from cistematic.genomes import Genome
18 print "geneStartBins: version 2.1"
20 print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
21 print '\n\twhere regionSize is the optional maximum region in bp\n'
26 outfilename = sys.argv[3]
28 standardMinDist = 3000
29 if '-max' in sys.argv:
30 standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
32 if '-raw' in sys.argv:
40 if '-cache' in sys.argv:
44 standardMinThresh = standardMinDist / bins
46 hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
47 readlen = hitRDS.getReadSize()
48 normalizationFactor = 1.0
50 totalCount = len(hitRDS)
51 normalizationFactor = totalCount / 1000000.
55 geneinfoDict = getGeneInfoDict(genome, cache=True)
56 featuresDict = hg.getallGeneFeatures()
58 outfile = open(outfilename,'w')
60 gidList = hg.allGIDs()
67 geneinfo = geneinfoDict[gid]
68 featureList = featuresDict[gid]
69 symbol = geneinfo[0][0]
74 if len(featureList) == 0:
77 for (ftype, chrom, start, stop, fsense) in featureList:
78 if (start, stop) not in newfeatureList:
79 newfeatureList.append((start, stop))
81 if chrom not in hitDict:
85 if len(newfeatureList) < 1:
88 glen = standardMinDist / 2
90 nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
91 if nextGene < glen * 2:
97 gstart = newfeatureList[0][0] - glen
101 gstop = newfeatureList[0][0] + glen
103 nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
104 if nextGene < glen * 2:
110 gstart = newfeatureList[-1][1] - glen
111 gstop = newfeatureList[-1][1] + glen
113 if glen < standardMinDist / 2:
117 for read in hitDict[chrom]:
118 tagStart = read["start"] - gstart
119 sense = read["sense"]
120 weight = read["weight"]
121 if tagStart >= 2 * glen:
127 # we are relying on python's integer division quirk
128 binID = tagStart / standardMinThresh
129 binList[binID] += weight
131 rdist = 2 * glen - tagStart
132 binID = rdist / standardMinThresh
133 binList[binID] += weight
138 print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
139 outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
140 for binAmount in binList:
141 outfile.write('\t%d' % binAmount)