geneLocusCounts.py

   1 #
   2 #  geneLocusCounts.py
   3 #  ENRAGE
   4 #
   5 """  usage: python geneLocusCounts genome readDB outfilename [upstream] [downstream] [--noCDS] [--spanTSS] [--locusLength bplength] [--regions acceptfile] [--noUniqs] [--multi] [--splices]
   6             where upstream and downstream are in bp and and optional
   7             using noCDS requires either upstream or downstream (but not both)
   8             to be nonzero. Using -locuslength will report the first bplength
   9             or the last bplength of the gene region depending on whether it
  10             is positive or negative.
  11             will by default only count the uniq reads (use -noUniqs to turn off)
  12             but can also count multi and splice reads given the appropriate flags
  13 """
  14 try:
  15     import psyco
  16     psyco.full()
  17 except:
  18     pass
  19
  20 import sys
  21 import optparse
  22 from commoncode import getMergedRegions, getLocusByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
  23 import ReadDataset
  24 from cistematic.genomes import Genome
  25 from commoncode import getGeneInfoDict
  26
  27 print "geneLocusCounts: version 3.1"
  28
  29 def main(argv=None):
  30     if not argv:
  31         argv = sys.argv
  32
  33     usage = "usage: python %prog genome readDB outfilename [options]"
  34
  35     parser = getParser(usage)
  36     (options, args) = parser.parse_args(argv[1:])
  37
  38     if len(args) < 3:
  39         print __doc__
  40         sys.exit(1)
  41
  42     genome = args[0]
  43     hitfile =  args[1]
  44     outfilename = args[2]
  45
  46     upstream = 0
  47     downstream = 0
  48     try:
  49         upstream = int(args[3])
  50     except ValueError:
  51         pass
  52     except IndexError:
  53         pass
  54
  55     try:
  56         if "-" not in args[3]:
  57             downstream = int(args[4])
  58     except ValueError:
  59         pass
  60
  61     geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs,
  62                     options.doMulti, options.doSplices, options.useCDS, options.spanTSS,
  63                     options.bplength, options.acceptfile)
  64
  65
  66 def getParser(usage):
  67     parser = optparse.OptionParser(usage=usage)
  68     parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
  69                       help="do not count unique reads")
  70     parser.add_option("--multi", action="store_true", dest="doUniqs",
  71                       help="count multi reads")
  72     parser.add_option("--splices", action="store_true", dest="doUniqs",
  73                       help="count splice reads")
  74     parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
  75     parser.add_option("--regions", dest="acceptfile")
  76     parser.add_option("--noCDS", action="store_false", dest="useCDS")
  77     parser.add_option("--locusLength", type="int", dest="bplength",
  78                       help="number of bases to report")
  79
  80     configParser = getConfigParser()
  81     section = "geneLocusCounts"
  82     doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
  83     doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
  84     doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
  85     useCDS = getConfigBoolOption(configParser, section, "useCDS", True)
  86     spanTSS = getConfigBoolOption(configParser, section, "spanTSS", False)
  87     bplength = getConfigIntOption(configParser, section, "bplength", 0)
  88     acceptfile = getConfigOption(configParser, section, "acceptfile", "")
  89
  90     parser.set_defaults(doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices,
  91                         useCDS=useCDS, spanTSS=spanTSS, bplength=bplength,
  92                         acceptfile=acceptfile)
  93
  94     return parser
  95
  96
  97 def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0,
  98                     doUniqs=True, doMulti=False, doSplices=False, useCDS=True,
  99                     spanTSS=False, bplength=0, acceptfile=""):
 100
 101     print "returning only up to %d bp from gene locus" % bplength
 102     print "upstream = %d downstream = %d useCDS = %s spanTSS = %s" % (upstream, downstream, useCDS, spanTSS)
 103
 104     if acceptfile:
 105         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
 106
 107     hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True)
 108
 109     totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices)
 110
 111     hg = Genome(genome)
 112     gidDict = {}
 113     locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS=spanTSS, lengthCDS=bplength)
 114     locusChroms = locusByChromDict.keys()
 115     chromList = hitRDS.getChromosomes(fullChrom=False)
 116     chromList.sort()
 117     for chrom in chromList:
 118         if doNotProcessChromosome(chrom, locusChroms):
 119             continue
 120
 121         fullchrom = "chr%s" % chrom
 122         print fullchrom
 123         hitRDS.memSync(fullchrom, index=True)
 124         for (start, stop, gid, length) in locusByChromDict[chrom]:
 125             if not gid in gidDict:
 126                 gidDict[gid] = {"count": 0, "length": length}
 127
 128             gidDict[gid]["count"] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 129
 130     outfile = open(outfilename, "w")
 131
 132     totalCount /= 1000000.
 133
 134     outfile.write("#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n")
 135     gidList = gidDict.keys()
 136     gidList.sort()
 137     geneinfoDict = getGeneInfoDict(genome, cache=True)
 138     for gid in gidList:
 139         if "FAR" not in gid:
 140             symbol = "LOC%s" % gid
 141             geneinfo = ""
 142             try:
 143                 geneinfo = geneinfoDict[gid]
 144                 symbol = geneinfo[0][0]
 145             except (KeyError, IndexError):
 146                 pass
 147         else:
 148             symbol = gid
 149
 150         gidCount = gidDict[gid]["count"]
 151         gidLength = gidDict[gid]["length"]
 152         rpm  = gidCount / totalCount
 153         rpkm = 1000. * rpm / gidLength
 154         outfile.write("%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n" % (gid, symbol, gidCount, gidLength, rpm, rpkm))
 155
 156     outfile.close()
 157
 158
 159 def doNotProcessChromosome(chrom, locusChroms):
 160     return chrom == "M" or chrom not in locusChroms
 161
 162
 163 if __name__ == "__main__":
 164     main(sys.argv)