Release version for Erange 4.0a
[erange.git] / geneLocusBins.py
index 6a1efc48e270d92a8453e8fcb6907db00185ed92..49745dc0c1361c2a8dbe4736c9c8630396fd93e4 100755 (executable)
@@ -12,6 +12,7 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
+import string
 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
 import ReadDataset
 from cistematic.genomes import Genome
 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
 import ReadDataset
 from cistematic.genomes import Genome
@@ -92,68 +93,78 @@ def getParser(usage):
 
     return parser
 
 
     return parser
 
+
 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                   acceptfile=None):
 
 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                   acceptfile=None):
 
+
+    hg = Genome(genome)
+    gidList = hg.allGIDs()
+    gidList.sort()
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+        for chrom in acceptDict:
+            for region in acceptDict[chrom]:
+                if region.label not in gidList:
+                    gidList.append(region.label)
+
+    if doFlank:
+        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+    else:
+        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
 
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
     readlen = hitRDS.getReadSize()
     normalizationFactor = 1.0
     if normalizeBins:
         totalCount = len(hitRDS)
         normalizationFactor = totalCount / 1000000.
 
     readlen = hitRDS.getReadSize()
     normalizationFactor = 1.0
     if normalizeBins:
         totalCount = len(hitRDS)
         normalizationFactor = totalCount / 1000000.
 
-    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
-    hg = Genome(genome)
-    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
-    if doFlank:
-        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
-    else:
-        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
-
-    gidList = hg.allGIDs()
-    gidList.sort()
-    for chrom in acceptDict:
-        for region in acceptDict[chrom]:
-            if region.label not in gidList:
-                gidList.append(region.label)
-
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+    writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
 
 
-    outfile = open(outfilename,'w')
 
 
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
+    outfile = open(outfilename, "w")
     for gid in gidList:
     for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
-            except:
+            except KeyError:
                 pass
         else:
             symbol = gid
                 pass
         else:
             symbol = gid
+
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
-        outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+        outputList = [gid, symbol, tagCount, gidLen[gid]]
         for binAmount in gidBins[gid]:
             if normalizeBins:
         for binAmount in gidBins[gid]:
             if normalizeBins:
-                if tagCount == 0:
-                    tagCount = 1
-                outfile.write('\t%.1f' % (100. * binAmount / tagCount))
-            else:
-                outfile.write('\t%.1f' % binAmount)
-        outfile.write('\n')
+                try:
+                    normalizedValue = 100. * binAmount / tagCount
+                except ZeroDivisionError:
+                    normalizedValue = 100. * binAmount
+
+                binAmount = normalizedValue
+
+            outputList.append("%.1f" % binAmount)
+
+        outLine = string.join(outputList, "\t")
+        outfile.write("%s\n" % outLine)
+
     outfile.close()
 
 
 if __name__ == "__main__":
     outfile.close()
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)