geneLocusPeaks.py

   1 #
   2 #  geneLocusPeaks.py
   3 #  ENRAGE
   4 #
   5
   6 try:
   7     import psyco
   8     psyco.full()
   9 except:
  10     pass
  11
  12 from commoncode import readDataset, getMergedRegions, findPeak, getLocusByChromDict
  13 from cistematic.genomes import Genome
  14 from cistematic.core.geneinfo import geneinfoDB
  15 import sys, optparse
  16
  17 print "%prog: version 2.0"
  18
  19 def main(argv=None):
  20     if not argv:
  21         argv = sys.argv
  22
  23     usage = "usage: python %prog genome rdsfile outfilename [--up upstream] [--down downstream] [--regions acceptfile] [--raw]"
  24
  25     parser = optparse.OptionParser(usage=usage)
  26     parser.add_option("--up", type="int", dest="upstream")
  27     parser.add_option("--down", type="int", dest="downstream")
  28     parser.add_option("--regions", dest="acceptfile")
  29     parser.add_option("--raw", action="store_false", dest="normalize")
  30     parser.add_option("--cache", action="store_true", dest="doCache")
  31     parser.set_defaults(upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False)
  32     (options, args) = parser.parse_args(argv[1:])
  33
  34     if len(args) < 3:
  35         print usage
  36         print "\twhere upstream and downstream are in bp and and optional"
  37         sys.exit(1)
  38
  39     genome = args[0]
  40     hitfile =  args[1]
  41     outfilename = args[2]
  42
  43     geneLocusPeaks(genome, hitfile, outfilename, options.upstream, options.downstream, options.acceptfile, options.normalize, options.doCache)
  44
  45
  46 def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False):
  47     acceptDict = {}
  48
  49     if acceptfile:
  50         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
  51
  52     print "upstream = %d downstream = %d" % (upstream, downstream)
  53
  54     hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
  55     readlen = hitRDS.getReadSize()
  56     normalizationFactor = 1.0
  57     if normalize:
  58         totalCount = len(hitRDS)
  59         normalizationFactor = totalCount / 1000000.
  60
  61     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  62
  63     hg = Genome(genome)
  64     idb = geneinfoDB(cache=True)
  65
  66     gidCount = {}
  67     gidPos = {}
  68     geneinfoDict = idb.getallGeneInfo(genome)
  69     locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS=True, additionalRegionsDict=acceptDict)
  70
  71     gidList = hg.allGIDs()
  72     gidList.sort()
  73     for chrom in acceptDict:
  74         for (label, start, stop, length) in acceptDict[chrom]:
  75             if label not in gidList:
  76                 gidList.append(label)
  77
  78     for gid in gidList:
  79         gidCount[gid] = 0
  80
  81     for chrom in hitDict:
  82         if chrom not in locusByChromDict:
  83             continue
  84
  85         print chrom
  86         for (start, stop, gid, glen) in locusByChromDict[chrom]:
  87             gidCount[gid] = 0.
  88             (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[chrom], start, glen, readlen)
  89             if len(topPos) > 0:
  90                 gidCount[gid] = smoothArray[topPos[0]]
  91                 gidPos[gid] = (chrom, start + topPos[0])
  92             else:
  93                 gidPos[gid] = (chrom, start)
  94
  95     outfile = open(outfilename, "w")
  96
  97     for gid in gidList:
  98         if "FAR" not in gid:
  99             symbol = "LOC" + gid
 100             geneinfo = ""
 101             try:
 102                 geneinfo = geneinfoDict[gid]
 103                 symbol = geneinfo[0][0]
 104             except:
 105                 pass
 106         else:
 107             symbol = gid
 108
 109         if gid in gidCount and gid in gidPos:
 110             (chrom, pos) = gidPos[gid]
 111             outfile.write("%s\t%s\tchr%s\t%d\t%.2f\n" % (gid, symbol, chrom, pos, gidCount[gid]))
 112
 113     outfile.close()
 114
 115
 116 if __name__ == "__main__":
 117     main(sys.argv)