erange version 4.0a dev release

[erange.git] / geneStartBins.py
diff --git a/geneStartBins.py b/geneStartBins.py

index cbb3c4a37d9d74cbbb356038c80912a6643200f1..d929d41a89a2da712b1ee953770e0b01bb2f96bb 100755 (executable)
--- a/geneStartBins.py
+++ b/geneStartBins.py
@@ -9,13 +9,13 @@ try:
  except:
      pass
  
-# originally from version 1.3 of geneDownstreamBins.py
+import sys
  from commoncode import *
+import ReadDataset
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
-import sys
  
-print '%s: version 2.0' % sys.argv[0]
+
+print "geneStartBins: version 2.1"
  if len(sys.argv) < 4:
      print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
      print '\n\twhere regionSize is the optional maximum region in bp\n'
@@ -43,7 +43,7 @@ if '-cache' in sys.argv:
  bins = 10
  standardMinThresh = standardMinDist / bins
  
-hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
  readlen = hitRDS.getReadSize()
  normalizationFactor = 1.0
  if normalize:
@@ -51,13 +51,10 @@ if normalize:
      normalizationFactor = totalCount / 1000000.
  
  hg = Genome(genome)
-idb = geneinfoDB(cache=True)
-
  gidDict = {}
-geneinfoDict = idb.getallGeneInfo(genome)
+geneinfoDict = getGeneInfoDict(genome, cache=True)
  featuresDict = hg.getallGeneFeatures()
  
-#infile = open(infilename)
  outfile = open(outfilename,'w')
  
  gidList = hg.allGIDs()
@@ -72,46 +69,58 @@ for gid in gidList:
          symbol = geneinfo[0][0]
      except:
          print geneinfo
+
      newfeatureList = []
      if len(featureList) == 0:
          continue
+
      for (ftype, chrom, start, stop, fsense) in featureList:
          if (start, stop) not in newfeatureList:
              newfeatureList.append((start, stop))
+
      if chrom not in hitDict:
          continue
+
      newfeatureList.sort()
      if len(newfeatureList) < 1:
-        #print '%s %s %d' % (gid, symbol, -1)
-        #outfile.write('%s\t%s\t%d\n' % (gid, symbol, -1))
          continue
+
      glen = standardMinDist / 2
      if fsense == 'F':
          nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
          if nextGene < glen * 2:
-                glen = nextGene / 2
+            glen = nextGene / 2
+
          if glen < 1:
-                glen = 1
+            glen = 1
+
          gstart = newfeatureList[0][0] - glen
          if gstart < 0:
-                gstart = 0
+            gstart = 0
+
          gstop = newfeatureList[0][0]  + glen
      else:
          nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
          if nextGene < glen * 2:
              glen = nextGene / 2
+
          if glen < 1:
              glen = 1
+
          gstart = newfeatureList[-1][1] - glen
          gstop = newfeatureList[-1][1] + glen
      tagCount = 0
      if glen < standardMinDist / 2:
          continue
+
      binList = [0] * bins
-    for (tagStart, sense, weight) in hitDict[chrom]:
-        tagStart -= gstart 
+    for read in hitDict[chrom]:
+        tagStart = read["start"] - gstart
+        sense = read["sense"]
+        weight = read["weight"]
          if tagStart >= 2 * glen:
              break
+
          if tagStart > 0:
              tagCount += weight
              if fsense == 'R':
@@ -122,13 +131,16 @@ for gid in gidList:
                  rdist = 2 * glen - tagStart
                  binID = rdist / standardMinThresh 
                  binList[binID] += weight
+
      if tagCount < 2:
          continue
+
      print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
      outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
      for binAmount in binList:
          outfile.write('\t%d' % binAmount)
+
      outfile.write('\n')
-#infile.close()
+
  outfile.close()