geneLocusCounts.py

   1 #
   2 #  geneLocusCounts.py
   3 #  ENRAGE
   4 #
   5 """  usage: python geneLocusCounts genome readDB outfilename [upstream] [downstream] [--noCDS] [--spanTSS] [--locusLength bplength] [--regions acceptfile] [--noUniqs] [--multi] [--splices]
   6             where upstream and downstream are in bp and and optional
   7             using noCDS requires either upstream or downstream (but not both)
   8             to be nonzero. Using -locuslength will report the first bplength
   9             or the last bplength of the gene region depending on whether it
  10             is positive or negative.
  11             will by default only count the uniq reads (use -noUniqs to turn off)
  12             but can also count multi and splice reads given the appropriate flags
  13 """
  14 try:
  15     import psyco
  16     psyco.full()
  17 except:
  18     pass
  19
  20 import sys, optparse
  21 from commoncode import readDataset, getMergedRegions, getLocusByChromDict
  22 from cistematic.genomes import Genome
  23 from cistematic.core.geneinfo import geneinfoDB
  24
  25 print '%s: version 3.0' % sys.argv[0]
  26
  27 def main(argv=None):
  28     if not argv:
  29         argv = sys.argv
  30
  31     usage = "usage: python %prog genome readDB outfilename [options]"
  32
  33     parser = optparse.OptionParser(usage=usage)
  34     parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
  35                       help="do not count unique reads")
  36     parser.add_option("--multi", action="store_true", dest="doUniqs",
  37                       help="count multi reads")
  38     parser.add_option("--splices", action="store_true", dest="doUniqs",
  39                       help="count splice reads")
  40     parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
  41     parser.add_option("--regions", dest="acceptfile")
  42     parser.add_option("--noCDS", action="store_false", dest="useCDS")
  43     parser.add_option("--locusLength", type="int", dest="bplength",
  44                       help="number of bases to report")
  45     parser.set_defaults(doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile="")
  46     (options, args) = parser.parse_args(argv[1:])
  47
  48     if len(args) < 3:
  49         print __doc__
  50         sys.exit(1)
  51
  52     genome = args[0]
  53     hitfile =  args[1]
  54     outfilename = args[2]
  55
  56     upstream = 0
  57     downstream = 0
  58     try:
  59         upstream = int(args[3])
  60     except ValueError:
  61         pass
  62     except IndexError:
  63         pass
  64
  65     try:
  66         if "-" not in args[3]:
  67             downstream = int(args[4])
  68     except ValueError:
  69         pass
  70
  71     geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, options.doMulti, options.doSplices, options.useCDS, options.spanTSS, options.bplength, options.acceptfile)
  72
  73
  74 def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile=""):
  75     print 'returning only up to %d bp from gene locus' % bplength
  76     print 'upstream = %d downstream = %d useCDS = %s spanTSS = %s' % (upstream, downstream, useCDS, spanTSS)
  77
  78     if acceptfile:
  79         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
  80
  81     hitRDS = readDataset(hitfile, verbose = True)
  82
  83     totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices)
  84
  85     hg = Genome(genome)
  86     idb = geneinfoDB(cache=True)
  87
  88     gidCount = {}
  89     gidList = []
  90     gidLen = {}
  91     geneinfoDict = idb.getallGeneInfo(genome)
  92     locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS = spanTSS, lengthCDS = bplength)
  93
  94     locusChroms = locusByChromDict.keys()
  95     chromList = hitRDS.getChromosomes(fullChrom=False)
  96     chromList.sort()
  97     for chrom in chromList:
  98         if chrom == 'M' or chrom not in locusChroms:
  99             continue
 100
 101         print 'chr' + chrom
 102         fullchrom = 'chr' + chrom
 103         hitRDS.memSync(fullchrom, index=True)
 104         for (start, stop, gid, length) in locusByChromDict[chrom]:
 105             if gid not in gidList:
 106                 gidList.append(gid)
 107                 gidCount[gid] = 0
 108                 gidLen[gid] = length
 109
 110             gidCount[gid] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 111
 112     outfile = open(outfilename,'w')
 113
 114     totalCount /= 1000000.
 115
 116     outfile.write('#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n')
 117     gidList.sort()
 118     for gid in gidList:
 119         if 'FAR' not in gid:
 120             symbol = 'LOC' + gid
 121             geneinfo = ''
 122             try:
 123                 geneinfo = geneinfoDict[gid]
 124                 symbol = geneinfo[0][0]
 125             except:
 126                 pass
 127         else:
 128             symbol = gid
 129
 130         if gid in gidCount and gid in gidLen:
 131             rpm  = gidCount[gid] / totalCount
 132             rpkm = 1000. * rpm / gidLen[gid]
 133             outfile.write('%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n' % (gid, symbol, gidCount[gid], gidLen[gid], rpm, rpkm))
 134
 135     outfile.close()
 136
 137 if __name__ == "__main__":
 138     main(sys.argv)