geneStartBins.py

   1 #
   2 #  geneStartBins.py
   3 #  ENRAGE
   4 #
   5
   6 try:
   7     import psyco
   8     psyco.full()
   9 except:
  10     pass
  11
  12 import sys
  13 from commoncode import *
  14 import ReadDataset
  15 from cistematic.genomes import Genome
  16
  17
  18 print "geneStartBins: version 2.1"
  19 if len(sys.argv) < 4:
  20     print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
  21     print '\n\twhere regionSize is the optional maximum region in bp\n'
  22     sys.exit(1)
  23
  24 genome = sys.argv[1]
  25 hitfile =  sys.argv[2]
  26 outfilename = sys.argv[3]
  27
  28 standardMinDist = 3000
  29 if '-max' in sys.argv:
  30     standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
  31
  32 if '-raw' in sys.argv:
  33     normalize = False
  34     normalizeBins = False
  35 else:
  36     normalize = True
  37     normalizeBins = True
  38
  39 doCache = False
  40 if '-cache' in sys.argv:
  41     doCache = True
  42
  43 bins = 10
  44 standardMinThresh = standardMinDist / bins
  45
  46 hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
  47 readlen = hitRDS.getReadSize()
  48 normalizationFactor = 1.0
  49 if normalize:
  50     totalCount = len(hitRDS)
  51     normalizationFactor = totalCount / 1000000.
  52
  53 hg = Genome(genome)
  54 gidDict = {}
  55 geneinfoDict = getGeneInfoDict(genome, cache=True)
  56 featuresDict = hg.getallGeneFeatures()
  57
  58 outfile = open(outfilename,'w')
  59
  60 gidList = hg.allGIDs()
  61 gidList.sort()
  62 for gid in gidList:
  63     symbol = 'LOC' + gid
  64     geneinfo = ''
  65     featureList = []
  66     try:
  67         geneinfo = geneinfoDict[gid]
  68         featureList = featuresDict[gid]
  69         symbol = geneinfo[0][0]
  70     except:
  71         print geneinfo
  72
  73     newfeatureList = []
  74     if len(featureList) == 0:
  75         continue
  76
  77     for (ftype, chrom, start, stop, fsense) in featureList:
  78         if (start, stop) not in newfeatureList:
  79             newfeatureList.append((start, stop))
  80
  81     if chrom not in hitDict:
  82         continue
  83
  84     newfeatureList.sort()
  85     if len(newfeatureList) < 1:
  86         continue
  87
  88     glen = standardMinDist / 2
  89     if fsense == 'F':
  90         nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
  91         if nextGene < glen * 2:
  92             glen = nextGene / 2
  93
  94         if glen < 1:
  95             glen = 1
  96
  97         gstart = newfeatureList[0][0] - glen
  98         if gstart < 0:
  99             gstart = 0
 100
 101         gstop = newfeatureList[0][0]  + glen
 102     else:
 103         nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
 104         if nextGene < glen * 2:
 105             glen = nextGene / 2
 106
 107         if glen < 1:
 108             glen = 1
 109
 110         gstart = newfeatureList[-1][1] - glen
 111         gstop = newfeatureList[-1][1] + glen
 112     tagCount = 0
 113     if glen < standardMinDist / 2:
 114         continue
 115
 116     binList = [0] * bins
 117     for read in hitDict[chrom]:
 118         tagStart = read["start"] - gstart
 119         sense = read["sense"]
 120         weight = read["weight"]
 121         if tagStart >= 2 * glen:
 122             break
 123
 124         if tagStart > 0:
 125             tagCount += weight
 126             if fsense == 'R':
 127                 # we are relying on python's integer division quirk
 128                 binID = tagStart / standardMinThresh
 129                 binList[binID] += weight
 130             else:
 131                 rdist = 2 * glen - tagStart
 132                 binID = rdist / standardMinThresh
 133                 binList[binID] += weight
 134
 135     if tagCount < 2:
 136         continue
 137
 138     print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
 139     outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
 140     for binAmount in binList:
 141         outfile.write('\t%d' % binAmount)
 142
 143     outfile.write('\n')
 144
 145 outfile.close()
 146