geneMrnaCountsWeighted.py

   1 try:
   2     import psyco
   3     psyco.full()
   4 except:
   5     print 'psyco not running'
   6
   7 import sys, optparse
   8 from commoncode import readDataset, getMergedRegions, getFeaturesByChromDict
   9 from cistematic.genomes import Genome
  10 from cistematic.core.geneinfo import geneinfoDB
  11 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
  12
  13 print '%s: version 4.1' % sys.argv[0]
  14
  15
  16 def main(argv=None):
  17     if not argv:
  18         argv = sys.argv
  19
  20     usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
  21
  22     parser = optparse.OptionParser(usage=usage)
  23     parser.add_option("--stranded", action="store_false", dest="ignoreSense")
  24     parser.add_option("--uniq", action="store_true", dest="withUniqs")
  25     parser.add_option("--multi", action="store_true", dest="withMulti")
  26     parser.add_option("--record", action="store_true", dest="recording",
  27                       help="ignored with uniq reads")
  28     parser.add_option("--accept", dest="acceptfile")
  29     parser.add_option("--cache", type="int", dest="cachePages")
  30     parser.add_option("--verbose", action="store_true", dest="doVerbose")
  31     parser.add_option("--models", dest="extendGenome")
  32     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
  33     parser.set_defaults(ignoreSense=True, withUniqs=False, withMulti=False, recording=False,
  34                         acceptfile=None, cachePages=None, doVerbose=False, extendGenome="",
  35                         replaceModels=False)
  36
  37     (options, args) = parser.parse_args(argv[1:])
  38
  39     if len(args) < 4:
  40         print usage
  41         sys.exit(1)
  42
  43     genome = args[0]
  44     hitfile =  args[1]
  45     countfile = args[2]
  46     outfilename = args[3]
  47
  48     geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
  49                            options.withUniqs, options.withMulti, options.recording,
  50                            options.acceptfile, options.cachePages, options.doVerbose,
  51                            options.extendGenome, options.replaceModels)
  52
  53
  54 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
  55                            withUniqs=False, withMulti=False, recording=False, acceptfile=None,
  56                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
  57
  58     if (not withUniqs and not withMulti) or (withUniqs and withMulti):
  59         print "must have either one of -uniq or -multi set. Exiting"
  60         sys.exit(1)
  61
  62     if cachePages is not None:
  63         cacheGeneDB(genome)
  64         hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
  65         idb = geneinfoDB(cache=True)
  66         print "%s cached" % genome
  67         doCache = True
  68     else:
  69         doCache = False
  70         cachePages = 0
  71         hg = Genome(genome, inRAM=True)
  72         idb = geneinfoDB()
  73
  74     if acceptfile is not None:
  75         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
  76     else:
  77         acceptDict = {}
  78
  79     if recording and withUniqs:
  80         recording = False
  81
  82     if extendGenome:
  83         if replaceModels:
  84             print "will replace gene models with %s" % extendGenome
  85         else:
  86             print "will extend gene models with %s" % extendGenome
  87     else:
  88         replaceModels = False
  89
  90     if extendGenome != "":
  91         hg.extendFeatures(extendGenome, replace = replaceModels)
  92
  93     hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
  94     if cachePages > hitRDS.getDefaultCacheSize():
  95         hitRDS.setDBcache(cachePages)
  96
  97     readlen = hitRDS.getReadSize()
  98
  99     geneinfoDict = idb.getallGeneInfo(genome)
 100     geneannotDict = hg.allAnnotInfo()
 101     gidCount = {}
 102     gidReadDict = {}
 103
 104     featuresByChromDict = getFeaturesByChromDict(hg, acceptDict)
 105     gidList = hg.allGIDs()
 106
 107     gidList.sort()
 108     for chrom in acceptDict:
 109         for (label, start, stop, length) in acceptDict[chrom]:
 110             if label not in gidList:
 111                 gidList.append(label)
 112
 113     for gid in gidList:
 114         gidCount[gid] = 0
 115         gidReadDict[gid] = []
 116
 117     uniqueCountDict = {}
 118     read2GidDict = {}
 119
 120     uniquecounts = open(countfile)
 121     for line in uniquecounts:
 122         fields = line.strip().split()
 123         # add a pseudo-count here to ease calculations below
 124         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 125
 126     uniquecounts.close()
 127
 128     outfile = open(outfilename, "w")
 129
 130     index = 0
 131     if withMulti and not withUniqs:
 132         chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
 133     else:
 134         chromList = hitRDS.getChromosomes(fullChrom=False)
 135
 136     for achrom in chromList:
 137         if achrom not in featuresByChromDict:
 138             continue
 139
 140         print "\n" + achrom + " ",
 141         startFeature = 0
 142         fullchrom = "chr" + achrom
 143         hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
 144         featList = featuresByChromDict[achrom]
 145         if ignoreSense:
 146             for (tagStart, tagReadID) in hitDict[fullchrom]:
 147                 index += 1
 148                 if index % 100000 == 0:
 149                     print "read %d" % index,
 150
 151                 stopPoint = tagStart + readlen
 152                 if startFeature < 0:
 153                     startFeature = 0
 154
 155                 for (start, stop, gid, sense, ftype) in featList[startFeature:]:
 156                     if tagStart > stop:
 157                         startFeature += 1
 158                         continue
 159
 160                     if start > stopPoint:
 161                         startFeature -= 100
 162                         break
 163
 164                     if start <= tagStart <= stop:
 165                         try:
 166                             gidReadDict[gid].append(tagReadID)
 167                             if tagReadID in read2GidDict:
 168                                 if gid not in read2GidDict[tagReadID]:
 169                                     read2GidDict[tagReadID].append(gid)
 170                             else:
 171                                 read2GidDict[tagReadID] = [gid]
 172
 173                             gidCount[gid] += 1
 174                         except:
 175                             print "gid %s not in gidReadDict" % gid
 176
 177                         stopPoint = stop
 178         else:
 179             for (tagStart, tSense, tagReadID) in hitDict[fullchrom]:
 180                 index += 1
 181                 if index % 100000 == 0:
 182                     print "read %d" % index,
 183
 184                 stopPoint = tagStart + readlen
 185                 if startFeature < 0:
 186                     startFeature = 0
 187
 188                 for (start, stop, gid, sense, ftype) in featList[startFeature:]:
 189                     if tagStart > stop:
 190                         startFeature += 1
 191                         continue
 192
 193                     if start > stopPoint:
 194                         startFeature -= 100
 195                         break
 196
 197                     if sense == "R":
 198                         sense = "-"
 199                     else:
 200                         sense = "+"
 201
 202                     if start <= tagStart <= stop and sense == tSense:
 203                         try:
 204                             gidReadDict[gid].append(tagReadID)
 205                             if tagReadID in read2GidDict:
 206                                 if gid not in read2GidDict[tagReadID]:
 207                                     read2GidDict[tagReadID].append(gid)
 208                             else:
 209                                 read2GidDict[tagReadID] = [gid]
 210
 211                             gidCount[gid] += 1
 212                         except:
 213                             print "gid %s not in gidReadDict" % gid
 214
 215                         stopPoint = stop
 216
 217     for gid in gidList:
 218         if "FAR" not in gid:
 219             symbol = "LOC" + gid
 220             geneinfo = ""
 221             try:
 222                 geneinfo = geneinfoDict[gid]
 223                 if genome == "celegans":
 224                     symbol = geneinfo[0][1]
 225                 else:
 226                     symbol = geneinfo[0][0]
 227             except:
 228                 try:
 229                     symbol = geneannotDict[(genome, gid)][0]
 230                 except:
 231                     symbol = "LOC" + gid
 232         else:
 233             symbol = gid
 234
 235         tagCount = 0.
 236         for readID in gidReadDict[gid]:
 237             try:
 238                 tagValue = uniqueCountDict[gid]
 239             except:
 240                 tagValue = 1
 241
 242             tagDenom = 0.
 243             for aGid in read2GidDict[readID]:
 244                 try:
 245                     tagDenom += uniqueCountDict[aGid]
 246                 except:
 247                     tagDenom += 1
 248
 249         try:
 250             tagCount += tagValue / tagDenom
 251         except ZeroDivisionError:
 252             tagCount = 0
 253
 254         if doVerbose:
 255             print "%s %s %f" % (gid, symbol, tagCount)
 256
 257         outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
 258
 259     outfile.close()
 260
 261     if doCache:
 262         uncacheGeneDB(genome)
 263
 264
 265 if __name__ == "__main__":
 266     main(sys.argv)