geneDownstreamBins.py

   1 #
   2 #  geneDownstreamBins.py
   3 #  ENRAGE
   4 #
   5
   6 try:
   7     import psyco
   8     psyco.full()
   9 except:
  10     pass
  11
  12 # originally from version 1.3 of geneDnaDownstreamCounts.py
  13 import sys, optparse
  14 from commoncode import readDataset
  15 from cistematic.genomes import Genome
  16 from cistematic.core.geneinfo import geneinfoDB
  17
  18 print "%prog: version 2.0"
  19
  20 def main(argv=None):
  21     if not argv:
  22         argv = sys.argv
  23
  24     usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
  25
  26     parser = optparse.OptionParser(usage=usage)
  27     parser.add_option("--max", type="int", dest="standardMinDist",
  28                       help="maximum region in bp")
  29     parser.set_defaults(standardMinDist=3000)
  30     (options, args) = parser.parse_args(argv[1:])
  31
  32     if len(args) < 3:
  33         print usage
  34         sys.exit(1)
  35
  36     genome = args[0]
  37     hitfile =  args[1]
  38     outfilename = args[2]
  39
  40     geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
  41
  42
  43 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
  44     bins = 10
  45     standardMinThresh = standardMinDist / bins
  46
  47     hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
  48     normalizationFactor = 1.0
  49     if normalize:
  50         hitDictSize = len(hitRDS)
  51         normalizationFactor = hitDictSize / 1000000.
  52
  53     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  54
  55     hg = Genome(genome)
  56     idb = geneinfoDB(cache=True)
  57
  58     geneinfoDict = idb.getallGeneInfo(genome)
  59     featuresDict = hg.getallGeneFeatures()
  60
  61     outfile = open(outfilename, "w")
  62
  63     gidList = hg.allGIDs()
  64     gidList.sort()
  65     for gid in gidList:
  66         symbol = "LOC" + gid
  67         geneinfo = ""
  68         featureList = []
  69         try:
  70             geneinfo = geneinfoDict[gid]
  71             featureList = featuresDict[gid]
  72             symbol = geneinfo[0][0]
  73         except:
  74             print gid
  75
  76         if len(featureList) == 0:
  77             continue
  78
  79         newfeatureList = []
  80         for (ftype, chrom, start, stop, fsense) in featureList:
  81             if (start, stop) not in newfeatureList:
  82                 newfeatureList.append((start, stop))
  83
  84         if chrom not in hitDict:
  85             continue
  86
  87         newfeatureList.sort()
  88         if len(newfeatureList) < 1:
  89             continue
  90
  91         glen = standardMinDist
  92         if fsense == "F":
  93             nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
  94             if nextGene < glen * 2:
  95                 glen = nextGene / 2
  96
  97             if glen < 1:
  98                 glen = 1
  99
 100             gstart = newfeatureList[-1][1]
 101         else:
 102             nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
 103             if nextGene < glen * 2:
 104                 glen = nextGene / 2
 105
 106             if glen < 1:
 107                 glen = 1
 108
 109             gstart = newfeatureList[0][0] - glen
 110             if gstart < 0:
 111                 gstart = 0
 112
 113         tagCount = 0
 114         if glen < standardMinDist:
 115             continue
 116
 117         binList = [0.] * bins
 118         for (tagStart, sense, weight) in hitDict[chrom]:
 119             tagStart -= gstart
 120             if tagStart >= glen:
 121                 break
 122
 123             if tagStart > 0:
 124                 tagCount += weight
 125                 if fsense == "F":
 126                     # we are relying on python's integer division quirk
 127                     binID = tagStart / standardMinThresh
 128                     binList[binID] += weight
 129                 else:
 130                     rdist = glen - tagStart
 131                     binID = rdist / standardMinThresh
 132                     binList[binID] += weight
 133
 134         if tagCount < 2:
 135             continue
 136
 137         tagCount *= normalizationFactor
 138         print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
 139         outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
 140         for binAmount in binList:
 141             outfile.write("\t%.2f" % binAmount)
 142
 143         outfile.write("\n")
 144
 145     outfile.close()
 146
 147
 148 if __name__ == "__main__":
 149     main(sys.argv)