geneMrnaCounts.py

   1 try:
   2     import psyco
   3     psyco.full()
   4 except:
   5     print "psyco not running"
   6
   7 import sys
   8 import optparse
   9 from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, countThisRead
  10 from cistematic.genomes import Genome
  11 from cistematic.core.geneinfo import geneinfoDB
  12
  13 print "geneMrnaCounts: version 6.0"
  14
  15
  16 def main(argv=None):
  17     if not argv:
  18         argv = sys.argv
  19
  20     usage = "usage: python %prog genome rdsfile outfilename [options]"
  21
  22     parser = getParser(usage)
  23     (options, args) = parser.parse_args(argv[1:])
  24
  25     if len(args) < 3:
  26         print usage
  27         sys.exit(1)
  28
  29     genomeName = args[0]
  30     hitfile =  args[1]
  31     outfilename = args[2]
  32
  33     geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
  34                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
  35                    options.searchGID, options.countFeats, options.cachePages)
  36
  37
  38 def getParser(usage):
  39     parser = optparse.OptionParser(usage=usage)
  40     parser.add_option("--stranded", action="store_true", dest="trackStrand")
  41     parser.add_option("--splices", action="store_true", dest="doSplices")
  42     parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
  43     parser.add_option("--multi", action="store_true", dest="doMulti")
  44     parser.add_option("--models", dest="extendGenome")
  45     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
  46     parser.add_option("--searchGID", action="store_true", dest="searchGID")
  47     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
  48     parser.add_option("--cache", type="int", dest="cachePages")
  49
  50     configParser = getConfigParser()
  51     section = "geneMrnaCounts"
  52     trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
  53     doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
  54     doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
  55     doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
  56     extendGenome = getConfigOption(configParser, section, "extendGenome", "")
  57     replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
  58     searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
  59     countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
  60     cachePages = getConfigOption(configParser, section, "cachePages", None)
  61
  62     parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
  63                         extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
  64                         countFeats=countFeats, cachePages=cachePages)
  65
  66     return parser
  67
  68 def geneMrnaCounts(genomeName, bamfile, outfilename, trackStrand=False, doSplices=False,
  69                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
  70                    searchGID=False, countFeats=False):
  71
  72     if trackStrand:
  73         print "will track strandedness"
  74         doStranded = "track"
  75     else:
  76         doStranded = "both"
  77
  78     if extendGenome:
  79         if replaceModels:
  80             print "will replace gene models with %s" % extendGenome
  81         else:
  82             print "will extend gene models with %s" % extendGenome
  83     else:
  84         replaceModels = False
  85
  86     genome = Genome(genomeName, inRAM=True)
  87     if extendGenome != "":
  88         genome.extendFeatures(extendGenome, replace=replaceModels)
  89
  90     print "getting gene features...."
  91     featuresByChromDict = getFeaturesByChromDict(genome)
  92
  93     seenFeaturesByChromDict = {}
  94     print "getting geneIDs...."
  95     gidList = genome.allGIDs()
  96     gidList.sort()
  97     gidCount = {}
  98     for gid in gidList:
  99         gidCount[gid] = 0
 100
 101     chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
 102     for chrom in chromosomeList:
 103         if chrom not in featuresByChromDict:
 104             continue
 105
 106         if countFeats:
 107             seenFeaturesByChromDict[chrom] = set([])
 108
 109         print "\nchr%s" % chrom
 110         print "counting GIDs"
 111         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
 112             try:
 113                 if doStranded == "track":
 114                     checkSense = "+"
 115                     if featureSense == "R":
 116                         checkSense = "-"
 117
 118                     count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
 119                 else:
 120                     count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 121
 122                 gidCount[gid] += count
 123                 if countFeats:
 124                     seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
 125             except:
 126                 print "problem with %s - skipping" % gid
 127
 128     print " "
 129     if countFeats:
 130         numFeatures = countFeatures(seenFeaturesByChromDict)
 131         print "saw %d features" % numFeatures
 132
 133     writeOutputFile(outfilename, genome, gidCount, searchGID)
 134
 135
 136 def getCounts(bamfile, chrom, start, stop, uniqs=True, multi=False, splices=False, sense=''):
 137     count = 0.0
 138     for alignedread in bamfile.fetch(chrom, start, stop):
 139         if countThisRead(alignedread, uniqs, multi, splices, sense):
 140             count += 1.0/alignedread.opt('NH')
 141
 142     return count
 143
 144
 145 def countFeatures(seenFeaturesByChromDict):
 146     count = 0
 147     for chrom in seenFeaturesByChromDict.keys():
 148         try:
 149             count += len(seenFeaturesByChromDict[chrom])
 150         except TypeError:
 151             pass
 152
 153     return count
 154
 155
 156 def writeOutputFile(outfilename, genome, gidCount, searchGID):
 157     geneAnnotDict = genome.allAnnotInfo()
 158     genomeName = genome.genome
 159     outfile = open(outfilename, "w")
 160     idb = geneinfoDB(cache=True)
 161     geneInfoDict = idb.getallGeneInfo(genomeName)
 162     for gid in gidCount:
 163         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
 164         outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
 165
 166     outfile.close()
 167
 168
 169 def getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict):
 170     lookupGID = gid
 171     if searchGID and gid not in geneInfoDict:
 172         actualGeneID = idb.getGeneID(genomeName, gid)
 173         if len(actualGeneID) > 0:
 174             lookupGID = actualGeneID[1]
 175
 176     try:
 177         geneinfo = geneInfoDict[lookupGID]
 178         symbol = geneinfo[0][0]
 179     except (KeyError, IndexError):
 180         try:
 181             symbol = geneAnnotDict[(genomeName, gid)][0]
 182         except (KeyError, IndexError):
 183             symbol = "LOC%s" % gid
 184
 185     return symbol
 186
 187
 188 if __name__ == "__main__":
 189     main(sys.argv)