import sys
import optparse
+import string
from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
import ReadDataset
from cistematic.genomes import Genome
return parser
+
def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
acceptfile=None):
+
+ hg = Genome(genome)
+ gidList = hg.allGIDs()
+ gidList.sort()
if acceptfile is None:
acceptDict = {}
else:
acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+ for chrom in acceptDict:
+ for region in acceptDict[chrom]:
+ if region.label not in gidList:
+ gidList.append(region.label)
+
+ if doFlank:
+ locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+ else:
+ locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
- hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+ hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
readlen = hitRDS.getReadSize()
normalizationFactor = 1.0
if normalizeBins:
totalCount = len(hitRDS)
normalizationFactor = totalCount / 1000000.
- hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
- hg = Genome(genome)
- geneinfoDict = getGeneInfoDict(genome, cache=doCache)
- if doFlank:
- locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
- else:
- locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
-
- gidList = hg.allGIDs()
- gidList.sort()
- for chrom in acceptDict:
- for region in acceptDict[chrom]:
- if region.label not in gidList:
- gidList.append(region.label)
-
(gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+ geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+ writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
- outfile = open(outfilename,'w')
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
+ outfile = open(outfilename, "w")
for gid in gidList:
- if 'FAR' not in gid:
- symbol = 'LOC' + gid
- geneinfo = ''
+ if "FAR" not in gid:
+ symbol = "LOC%s" % gid
+ geneinfo = ""
try:
geneinfo = geneinfoDict[gid]
symbol = geneinfo[0][0]
- except:
+ except KeyError:
pass
else:
symbol = gid
+
if gid in gidBins and gid in gidLen:
tagCount = 0.
for binAmount in gidBins[gid]:
tagCount += binAmount
- outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+ outputList = [gid, symbol, tagCount, gidLen[gid]]
for binAmount in gidBins[gid]:
if normalizeBins:
- if tagCount == 0:
- tagCount = 1
- outfile.write('\t%.1f' % (100. * binAmount / tagCount))
- else:
- outfile.write('\t%.1f' % binAmount)
- outfile.write('\n')
+ try:
+ normalizedValue = 100. * binAmount / tagCount
+ except ZeroDivisionError:
+ normalizedValue = 100. * binAmount
+
+ binAmount = normalizedValue
+
+ outputList.append("%.1f" % binAmount)
+
+ outLine = string.join(outputList, "\t")
+ outfile.write("%s\n" % outLine)
+
outfile.close()
if __name__ == "__main__":
- main(sys.argv)
\ No newline at end of file
+ main(sys.argv)