geneLocusPeaks.py

   1 #
   2 #  geneLocusPeaks.py
   3 #  ENRAGE
   4 #
   5
   6 try:
   7     import psyco
   8     psyco.full()
   9 except:
  10     pass
  11
  12 import sys
  13 import optparse
  14 from commoncode import getMergedRegions, findPeak, getLocusByChromDict
  15 import ReadDataset
  16 from cistematic.genomes import Genome
  17 from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  18
  19
  20 print "geneLocusPeaks: version 2.1"
  21
  22 def main(argv=None):
  23     if not argv:
  24         argv = sys.argv
  25
  26     usage = "usage: python %prog genome rdsfile outfilename [--up upstream] [--down downstream] [--regions acceptfile] [--raw]"
  27
  28     parser = makeParser(usage)
  29     (options, args) = parser.parse_args(argv[1:])
  30
  31     if len(args) < 3:
  32         print usage
  33         print "\twhere upstream and downstream are in bp and and optional"
  34         sys.exit(1)
  35
  36     genome = args[0]
  37     hitfile =  args[1]
  38     outfilename = args[2]
  39
  40     geneLocusPeaks(genome, hitfile, outfilename, options.upstream, options.downstream, options.acceptfile, options.normalize, options.doCache)
  41
  42
  43 def makeParser(usage=""):
  44     parser = optparse.OptionParser(usage=usage)
  45     parser.add_option("--up", type="int", dest="upstream")
  46     parser.add_option("--down", type="int", dest="downstream")
  47     parser.add_option("--regions", dest="acceptfile")
  48     parser.add_option("--raw", action="store_false", dest="normalize")
  49     parser.add_option("--cache", action="store_true", dest="doCache")
  50
  51     configParser = getConfigParser()
  52     section = "geneLocusPeaks"
  53     upstream = getConfigIntOption(configParser, section, "upstream", 0)
  54     downstream = getConfigIntOption(configParser, section, "downstream", 0)
  55     acceptfile = getConfigOption(configParser, section, "acceptfile", "")
  56     normalize = getConfigBoolOption(configParser, section, "normalize", True)
  57     doCache = getConfigBoolOption(configParser, section, "doCache", False)
  58
  59     parser.set_defaults(upstream=upstream, downstream=downstream, acceptfile=acceptfile, normalize=normalize, doCache=doCache)
  60
  61     return parser
  62
  63
  64 def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False):
  65     acceptDict = {}
  66
  67     if acceptfile:
  68         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
  69
  70     print "upstream = %d downstream = %d" % (upstream, downstream)
  71
  72     hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
  73     readlen = hitRDS.getReadSize()
  74     normalizationFactor = 1.0
  75     if normalize:
  76         totalCount = len(hitRDS)
  77         normalizationFactor = totalCount / 1000000.
  78
  79     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  80
  81     hg = Genome(genome)
  82     gidCount = {}
  83     gidPos = {}
  84     geneinfoDict = getGeneInfoDict(genome, cache=True)
  85     locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS=True, additionalRegionsDict=acceptDict)
  86
  87     gidList = hg.allGIDs()
  88     gidList.sort()
  89     for chrom in acceptDict:
  90         for region in acceptDict[chrom]:
  91             if region.label not in gidList:
  92                 gidList.append(region.label)
  93
  94     for gid in gidList:
  95         gidCount[gid] = 0
  96
  97     for chrom in hitDict:
  98         if chrom not in locusByChromDict:
  99             continue
 100
 101         print chrom
 102         for (start, stop, gid, glen) in locusByChromDict[chrom]:
 103             gidCount[gid] = 0.
 104             peak = findPeak(hitDict[chrom], start, glen, readlen)
 105             if len(peak.topPos) > 0:
 106                 gidCount[gid] = peak.smoothArray[peak.topPos[0]]
 107                 gidPos[gid] = (chrom, start + peak.topPos[0])
 108             else:
 109                 gidPos[gid] = (chrom, start)
 110
 111     outfile = open(outfilename, "w")
 112
 113     for gid in gidList:
 114         if "FAR" not in gid:
 115             symbol = "LOC" + gid
 116             geneinfo = ""
 117             try:
 118                 geneinfo = geneinfoDict[gid]
 119                 symbol = geneinfo[0][0]
 120             except:
 121                 pass
 122         else:
 123             symbol = gid
 124
 125         if gid in gidCount and gid in gidPos:
 126             (chrom, pos) = gidPos[gid]
 127             outfile.write("%s\t%s\tchr%s\t%d\t%.2f\n" % (gid, symbol, chrom, pos, gidCount[gid]))
 128
 129     outfile.close()
 130
 131
 132 if __name__ == "__main__":
 133     main(sys.argv)