geneMrnaCounts.py

   1 try:
   2     import psyco
   3     psyco.full()
   4 except:
   5     print "psyco not running"
   6
   7 import sys
   8 import optparse
   9 from commoncode import readDataset, getFeaturesByChromDict
  10 from cistematic.genomes import Genome
  11 from cistematic.core.geneinfo import geneinfoDB
  12
  13 print "%s: version 5.1" % sys.argv[0]
  14
  15
  16 def main(argv=None):
  17     if not argv:
  18         argv = sys.argv
  19
  20     usage = "usage: python %prog genome rdsfile outfilename [options]"
  21
  22     parser = optparse.OptionParser(usage=usage)
  23     parser.add_option("--stranded", action="store_true", dest="trackStrand")
  24     parser.add_option("--splices", action="store_true", dest="doSplices")
  25     parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
  26     parser.add_option("--multi", action="store_true", dest="doMulti")
  27     parser.add_option("--models", dest="extendGenome")
  28     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
  29     parser.add_option("--searchGID", action="store_true", dest="searchGID")
  30     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
  31     parser.add_option("--cache", type="int", dest="cachePages")
  32     parser.add_option("--markGID", action="store_true", dest="markGID")
  33     parser.set_defaults(trackStrand=False, doSplices=False, doUniqs=True, doMulti=False,
  34                         extendGenome="", replaceModels=False, searchGID=False,
  35                         countFeats=False, cachePages=None, markGID=False)
  36
  37     (options, args) = parser.parse_args(argv[1:])
  38
  39     if len(args) < 3:
  40         print usage
  41         sys.exit(1)
  42
  43     genomeName = args[0]
  44     hitfile =  args[1]
  45     outfilename = args[2]
  46
  47     geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
  48                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
  49                    options.searchGID, options.countFeats, options.cachePages, options.markGID)
  50
  51
  52 def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
  53                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
  54                    searchGID=False, countFeats=False, cachePages=None, markGID=False):
  55
  56     if trackStrand:
  57         print "will track strandedness"
  58         doStranded = "track"
  59     else:
  60         doStranded = "both"
  61
  62     if extendGenome:
  63         if replaceModels:
  64             print "will replace gene models with %s" % extendGenome
  65         else:
  66             print "will extend gene models with %s" % extendGenome
  67     else:
  68         replaceModels = False
  69
  70     if cachePages is not None:
  71         doCache = True
  72     else:
  73         cachePages = 100000
  74         doCache = False
  75
  76     hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
  77     if cachePages > hitRDS.getDefaultCacheSize():
  78         hitRDS.setDBcache(cachePages)
  79
  80     genome = Genome(genomeName, inRAM=True)
  81     if extendGenome != "":
  82         genome.extendFeatures(extendGenome, replace=replaceModels)
  83
  84     print "getting gene features...."
  85     featuresByChromDict = getFeaturesByChromDict(genome)
  86
  87     seenFeaturesByChromDict = {}
  88     print "getting geneIDs...."
  89     gidList = genome.allGIDs()
  90     gidList.sort()
  91     gidCount = {}
  92     for gid in gidList:
  93         gidCount[gid] = 0
  94
  95     chromList = hitRDS.getChromosomes(fullChrom=False)
  96     if len(chromList) == 0 and doSplices:
  97         chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
  98
  99     if markGID:
 100         print "Flagging all reads as NM"
 101         hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
 102
 103     for chrom in chromList:
 104         if chrom not in featuresByChromDict:
 105             continue
 106
 107         if countFeats:
 108             seenFeaturesByChromDict[chrom] = []
 109
 110         print "\nchr%s" % chrom
 111         fullchrom = "chr%s" % chrom
 112         regionList = []
 113         print "counting GIDs"
 114         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
 115             try:
 116                 if doStranded == "track":
 117                     checkSense = "+"
 118                     if featureSense == "R":
 119                         checkSense = "-"
 120
 121                     regionList.append((gid, fullchrom, start, stop, checkSense))
 122                     count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
 123                 else:
 124                     regionList.append((gid, fullchrom, start, stop))
 125                     count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 126                     if count != 0:
 127                         print count
 128
 129                 gidCount[gid] += count
 130                 if countFeats:
 131                     if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]:
 132                         seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense))
 133             except:
 134                 print "problem with %s - skipping" % gid
 135
 136         if markGID:
 137             print "marking GIDs"
 138             hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
 139             print "finished marking"
 140
 141     print " "
 142     if countFeats:
 143         numFeatures = countFeatures(seenFeaturesByChromDict)
 144         print "saw %d features" % numFeatures
 145
 146     writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
 147     if markGID and doCache:
 148         hitRDS.saveCacheDB(hitfile)
 149
 150
 151 def countFeatures(seenFeaturesByChromDict):
 152     count = 0
 153     for chrom in seenFeaturesByChromDict.keys():
 154         try:
 155             count += len(seenFeaturesByChromDict[chrom])
 156         except TypeError:
 157             pass
 158
 159     return count
 160
 161
 162 def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
 163     geneAnnotDict = genome.allAnnotInfo()
 164     genomeName = genome.genome
 165     outfile = open(outfilename, "w")
 166     idb = geneinfoDB(cache=True)
 167     geneInfoDict = idb.getallGeneInfo(genomeName)
 168     for gid in gidList:
 169         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
 170         if gid in gidCount:
 171             outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
 172         else:
 173             outfile.write("%s\t%s\t0\n" % (gid, symbol))
 174
 175     outfile.close()
 176
 177
 178 def getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict):
 179     lookupGID = gid
 180     if searchGID and gid not in geneInfoDict:
 181         actualGeneID = idb.getGeneID(genomeName, gid)
 182         if len(actualGeneID) > 0:
 183             lookupGID = actualGeneID[1]
 184
 185     try:
 186         geneinfo = geneInfoDict[lookupGID]
 187         symbol = geneinfo[0][0]
 188     except (KeyError, IndexError):
 189         try:
 190             symbol = geneAnnotDict[(genomeName, gid)][0]
 191         except (KeyError, IndexError):
 192             symbol = "LOC%s" % gid
 193
 194     return symbol
 195
 196
 197 if __name__ == "__main__":
 198     main(sys.argv)