geneMrnaCountsWeighted.py

   1 try:
   2     import psyco
   3     psyco.full()
   4 except:
   5     print "psyco not running"
   6
   7 import sys
   8 import optparse
   9 from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
  10 import ReadDataset
  11 from cistematic.genomes import Genome
  12 from commoncode import getGeneInfoDict
  13 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
  14
  15 print "geneMrnaCountsWeighted: version 4.3"
  16
  17
  18 def main(argv=None):
  19     if not argv:
  20         argv = sys.argv
  21
  22     usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
  23
  24     parser = makeParser(usage)
  25     (options, args) = parser.parse_args(argv[1:])
  26
  27     if len(args) < 4:
  28         print usage
  29         sys.exit(1)
  30
  31     genome = args[0]
  32     hitfile =  args[1]
  33     countfile = args[2]
  34     outfilename = args[3]
  35
  36     geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
  37                            options.withUniqs, options.withMulti,
  38                            options.acceptfile, options.cachePages, options.doVerbose,
  39                            options.extendGenome, options.replaceModels)
  40
  41
  42 def makeParser(usage=""):
  43     parser = optparse.OptionParser(usage=usage)
  44     parser.add_option("--stranded", action="store_false", dest="ignoreSense")
  45     parser.add_option("--uniq", action="store_true", dest="withUniqs")
  46     parser.add_option("--multi", action="store_true", dest="withMulti")
  47     parser.add_option("--accept", dest="acceptfile")
  48     parser.add_option("--cache", type="int", dest="cachePages")
  49     parser.add_option("--verbose", action="store_true", dest="doVerbose")
  50     parser.add_option("--models", dest="extendGenome")
  51     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
  52
  53     configParser = getConfigParser()
  54     section = "geneMrnaCountsWeighted"
  55     ignoreSense = getConfigBoolOption(configParser, section, "ignoreSense", True)
  56     withUniqs = getConfigBoolOption(configParser, section, "withUniqs", False)
  57     withMulti = getConfigBoolOption(configParser, section, "withMulti", False)
  58     acceptfile = getConfigOption(configParser, section, "acceptfile", None)
  59     cachePages = getConfigOption(configParser, section, "cachePages", None)
  60     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
  61     extendGenome = getConfigOption(configParser, section, "extendGenome", "")
  62     replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
  63
  64     parser.set_defaults(ignoreSense=ignoreSense, withUniqs=withUniqs, withMulti=withMulti,
  65                         acceptfile=acceptfile, cachePages=cachePages, doVerbose=doVerbose, extendGenome=extendGenome,
  66                         replaceModels=replaceModels)
  67
  68     return parser
  69
  70
  71 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
  72                            withUniqs=False, withMulti=False, acceptfile=None,
  73                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
  74
  75     if (not withUniqs and not withMulti) or (withUniqs and withMulti):
  76         print "must have either one of -uniq or -multi set. Exiting"
  77         sys.exit(1)
  78
  79     if cachePages is not None:
  80         cacheGeneDB(genome)
  81         hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
  82         print "%s cached" % genome
  83         doCache = True
  84     else:
  85         doCache = False
  86         cachePages = 0
  87         hg = Genome(genome, inRAM=True)
  88
  89     if extendGenome:
  90         if replaceModels:
  91             print "will replace gene models with %s" % extendGenome
  92         else:
  93             print "will extend gene models with %s" % extendGenome
  94
  95         hg.extendFeatures(extendGenome, replace=replaceModels)
  96
  97     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
  98     if cachePages > hitRDS.getDefaultCacheSize():
  99         hitRDS.setDBcache(cachePages)
 100
 101     allGIDs = set(hg.allGIDs())
 102     if acceptfile is not None:
 103         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
 104         for chrom in regionDict:
 105             for region in regionDict[chrom]:
 106                 allGIDs.add(region.label)
 107     else:
 108         regionDict = {}
 109
 110     featuresByChromDict = getFeaturesByChromDict(hg, regionDict)
 111
 112     gidReadDict = {}
 113     read2GidDict = {}
 114     for gid in allGIDs:
 115         gidReadDict[gid] = []
 116
 117     index = 0
 118     if withMulti and not withUniqs:
 119         chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
 120     else:
 121         chromList = hitRDS.getChromosomes(fullChrom=False)
 122
 123     readlen = hitRDS.getReadSize()
 124     for chromosome in chromList:
 125         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
 126             continue
 127
 128         print "\n%s " % chromosome,
 129         fullchrom = "chr%s" % chromosome
 130         hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
 131         featureList = featuresByChromDict[chromosome]
 132
 133         readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
 134         index = totalProcessedReads
 135         for (tagReadID, gid) in readGidList:
 136             try:
 137                 gidReadDict[gid].append(tagReadID)
 138                 if tagReadID in read2GidDict:
 139                     read2GidDict[tagReadID].add(gid)
 140                 else:
 141                     read2GidDict[tagReadID] = set([gid])
 142             except KeyError:
 143                 print "gid %s not in gidReadDict" % gid
 144
 145     writeCountsToFile(outfilename, countfile, allGIDs, hg, gidReadDict, read2GidDict, doVerbose, doCache)
 146     if doCache:
 147         uncacheGeneDB(genome)
 148
 149
 150 def doNotProcessChromosome(chromosome, chromosomeList):
 151     return chromosome not in chromosomeList
 152
 153
 154 def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
 155
 156     startFeature = 0
 157     readGidList = []
 158     ignoreSense = True
 159     for read in hitDict[fullchrom]:
 160         tagStart = read["start"]
 161         tagReadID = read["readID"]
 162         if read.has_key("sense"):
 163             tagSense = read["sense"]
 164             ignoreSense = False
 165
 166         index += 1
 167         if index % 100000 == 0:
 168             print "read %d" % index,
 169
 170         stopPoint = tagStart + readlen
 171         if startFeature < 0:
 172             startFeature = 0
 173
 174         for (start, stop, gid, sense, ftype) in featList[startFeature:]:
 175             if tagStart > stop:
 176                 startFeature += 1
 177                 continue
 178
 179             if start > stopPoint:
 180                 startFeature -= 100
 181                 break
 182
 183             if not ignoreSense:
 184                 if sense == "R":
 185                     sense = "-"
 186                 else:
 187                     sense = "+"
 188
 189             if start <= tagStart <= stop and (ignoreSense or tagSense == sense):
 190                 readGidList.append((tagReadID, gid))
 191                 stopPoint = stop
 192
 193     return readGidList, index
 194
 195
 196 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 197
 198     uniqueCountDict = {}
 199     uniquecounts = open(countFilename)
 200     for line in uniquecounts:
 201         fields = line.strip().split()
 202         # add a pseudo-count here to ease calculations below
 203         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 204
 205     uniquecounts.close()
 206
 207     genomeName = genome.genome
 208     geneinfoDict = getGeneInfoDict(genomeName, cache=doCache)
 209     geneannotDict = genome.allAnnotInfo()
 210     outfile = open(outFilename, "w")
 211     for gid in allGIDs:
 212         symbol = getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict)
 213         tagCount = getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict)
 214         if doVerbose:
 215             print "%s %s %f" % (gid, symbol, tagCount)
 216
 217         outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
 218
 219     outfile.close()
 220
 221
 222 def getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict):
 223     if "FAR" not in gid:
 224         symbol = "LOC%s" % gid
 225         geneinfo = ""
 226         try:
 227             geneinfo = geneinfoDict[gid]
 228             if genomeName == "celegans":
 229                 symbol = geneinfo[0][1]
 230             else:
 231                 symbol = geneinfo[0][0]
 232         except (KeyError, IndexError):
 233             try:
 234                 symbol = geneannotDict[(genomeName, gid)][0]
 235             except (KeyError, IndexError):
 236                 symbol = "LOC%s" % gid
 237     else:
 238         symbol = gid
 239
 240     return symbol
 241
 242
 243 def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
 244     tagCount = 0.
 245     for readID in gidReadDict[gid]:
 246         try:
 247             tagValue = uniqueCountDict[gid]
 248         except KeyError:
 249             tagValue = 1
 250
 251         tagDenom = 0.
 252         for relatedGID in read2GidDict[readID]:
 253             try:
 254                 tagDenom += uniqueCountDict[relatedGID]
 255             except KeyError:
 256                 tagDenom += 1
 257
 258         try:
 259             tagCount += tagValue / tagDenom
 260         except ZeroDivisionError:
 261             pass
 262
 263     return tagCount
 264
 265
 266 if __name__ == "__main__":
 267     main(sys.argv)