first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneLocusBins.py
index 6a1efc48e270d92a8453e8fcb6907db00185ed92..14cb2b5ee2b5617fba45f1a69ef94386784c0b11 100755 (executable)
@@ -12,6 +12,7 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
+import string
 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
 import ReadDataset
 from cistematic.genomes import Genome
 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
 import ReadDataset
 from cistematic.genomes import Genome
@@ -92,66 +93,77 @@ def getParser(usage):
 
     return parser
 
 
     return parser
 
+
 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                   acceptfile=None):
 
 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                   acceptfile=None):
 
+
+    hg = Genome(genome)
+    gidList = hg.allGIDs()
+    gidList.sort()
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+        for chrom in acceptDict:
+            for region in acceptDict[chrom]:
+                if region.label not in gidList:
+                    gidList.append(region.label)
+
+    if doFlank:
+        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+    else:
+        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
 
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
     readlen = hitRDS.getReadSize()
     normalizationFactor = 1.0
     if normalizeBins:
         totalCount = len(hitRDS)
         normalizationFactor = totalCount / 1000000.
 
     readlen = hitRDS.getReadSize()
     normalizationFactor = 1.0
     if normalizeBins:
         totalCount = len(hitRDS)
         normalizationFactor = totalCount / 1000000.
 
-    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
-    hg = Genome(genome)
-    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
-    if doFlank:
-        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
-    else:
-        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
-
-    gidList = hg.allGIDs()
-    gidList.sort()
-    for chrom in acceptDict:
-        for region in acceptDict[chrom]:
-            if region.label not in gidList:
-                gidList.append(region.label)
-
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+    writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
 
 
-    outfile = open(outfilename,'w')
 
 
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
+    outfile = open(outfilename, "w")
     for gid in gidList:
     for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
-            except:
+            except (KeyError, IndexError):
                 pass
         else:
             symbol = gid
                 pass
         else:
             symbol = gid
+
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
-        outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+        outputList = [gid, symbol, tagCount, gidLen[gid]]
         for binAmount in gidBins[gid]:
             if normalizeBins:
         for binAmount in gidBins[gid]:
             if normalizeBins:
-                if tagCount == 0:
-                    tagCount = 1
-                outfile.write('\t%.1f' % (100. * binAmount / tagCount))
-            else:
-                outfile.write('\t%.1f' % binAmount)
-        outfile.write('\n')
+                try:
+                    normalizedValue = 100. * binAmount / tagCount
+                except ZeroDivisionError:
+                    #TODO: this is the right way to refactor the original code, but I don't think this is the right answer
+                    normalizedValue = 100. * binAmount
+
+                binAmount = normalizedValue
+
+            outputList.append("%.1f" % binAmount)
+
+        outLine = string.join(outputList, "\t")
+        outfile.write("%s\n" % outLine)
+
     outfile.close()
 
 
     outfile.close()