Release version for Erange 4.0a
[erange.git] / geneMrnaCountsWeighted.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print "psyco not running"
6
7 import sys
8 import optparse
9 from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
10 import ReadDataset
11 from cistematic.genomes import Genome
12 from commoncode import getGeneInfoDict
13 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
14
15 print "geneMrnaCountsWeighted: version 4.3"
16
17
18 def main(argv=None):
19     if not argv:
20         argv = sys.argv
21
22     usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
23
24     parser = makeParser(usage)
25     (options, args) = parser.parse_args(argv[1:])
26
27     if len(args) < 4:
28         print usage
29         sys.exit(1)
30
31     genome = args[0]
32     hitfile =  args[1]
33     countfile = args[2]
34     outfilename = args[3]
35
36     geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
37                            options.withUniqs, options.withMulti,
38                            options.acceptfile, options.cachePages, options.doVerbose,
39                            options.extendGenome, options.replaceModels)
40
41
42 def makeParser(usage=""):
43     parser = optparse.OptionParser(usage=usage)
44     parser.add_option("--stranded", action="store_false", dest="ignoreSense")
45     parser.add_option("--uniq", action="store_true", dest="withUniqs")
46     parser.add_option("--multi", action="store_true", dest="withMulti")
47     parser.add_option("--accept", dest="acceptfile")
48     parser.add_option("--cache", type="int", dest="cachePages")
49     parser.add_option("--verbose", action="store_true", dest="doVerbose")
50     parser.add_option("--models", dest="extendGenome")
51     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
52
53     configParser = getConfigParser()
54     section = "geneMrnaCountsWeighted"
55     ignoreSense = getConfigBoolOption(configParser, section, "ignoreSense", True)
56     withUniqs = getConfigBoolOption(configParser, section, "withUniqs", False)
57     withMulti = getConfigBoolOption(configParser, section, "withMulti", False)
58     acceptfile = getConfigOption(configParser, section, "acceptfile", None)
59     cachePages = getConfigOption(configParser, section, "cachePages", None)
60     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
61     extendGenome = getConfigOption(configParser, section, "extendGenome", "")
62     replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
63
64     parser.set_defaults(ignoreSense=ignoreSense, withUniqs=withUniqs, withMulti=withMulti,
65                         acceptfile=acceptfile, cachePages=cachePages, doVerbose=doVerbose, extendGenome=extendGenome,
66                         replaceModels=replaceModels)
67
68     return parser
69
70
71 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
72                            withUniqs=False, withMulti=False, acceptfile=None,
73                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
74
75     if (not withUniqs and not withMulti) or (withUniqs and withMulti):
76         print "must have either one of -uniq or -multi set. Exiting"
77         sys.exit(1)
78
79     if cachePages is not None:
80         cacheGeneDB(genome)
81         hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
82         print "%s cached" % genome
83         doCache = True
84     else:
85         doCache = False
86         cachePages = 0
87         hg = Genome(genome, inRAM=True)
88
89     if extendGenome:
90         if replaceModels:
91             print "will replace gene models with %s" % extendGenome
92         else:
93             print "will extend gene models with %s" % extendGenome
94
95         hg.extendFeatures(extendGenome, replace=replaceModels)
96
97     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
98     if cachePages > hitRDS.getDefaultCacheSize():
99         hitRDS.setDBcache(cachePages)
100
101     allGIDs = set(hg.allGIDs())
102     if acceptfile is not None:
103         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
104         for chrom in regionDict:
105             for region in regionDict[chrom]:
106                 allGIDs.add(region.label)
107     else:
108         regionDict = {}
109
110     featuresByChromDict = getFeaturesByChromDict(hg, regionDict)
111
112     gidReadDict = {}
113     read2GidDict = {}
114     for gid in allGIDs:
115         gidReadDict[gid] = []
116
117     index = 0
118     if withMulti and not withUniqs:
119         chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
120     else:
121         chromList = hitRDS.getChromosomes(fullChrom=False)
122
123     readlen = hitRDS.getReadSize()
124     for chromosome in chromList:
125         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
126             continue
127
128         print "\n%s " % chromosome,
129         fullchrom = "chr%s" % chromosome
130         hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
131         featureList = featuresByChromDict[chromosome]
132
133         readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
134         index = totalProcessedReads
135         for (tagReadID, gid) in readGidList:
136             try:
137                 gidReadDict[gid].append(tagReadID)
138                 if tagReadID in read2GidDict:
139                     read2GidDict[tagReadID].add(gid)
140                 else:
141                     read2GidDict[tagReadID] = set([gid])
142             except KeyError:
143                 print "gid %s not in gidReadDict" % gid
144
145     writeCountsToFile(outfilename, countfile, allGIDs, hg, gidReadDict, read2GidDict, doVerbose, doCache)
146     if doCache:
147         uncacheGeneDB(genome)
148
149
150 def doNotProcessChromosome(chromosome, chromosomeList):
151     return chromosome not in chromosomeList
152
153
154 def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
155
156     startFeature = 0
157     readGidList = []
158     ignoreSense = True
159     for read in hitDict[fullchrom]:
160         tagStart = read["start"]
161         tagReadID = read["readID"]
162         if read.has_key("sense"):
163             tagSense = read["sense"]
164             ignoreSense = False
165
166         index += 1
167         if index % 100000 == 0:
168             print "read %d" % index,
169
170         stopPoint = tagStart + readlen
171         if startFeature < 0:
172             startFeature = 0
173
174         for (start, stop, gid, sense, ftype) in featList[startFeature:]:
175             if tagStart > stop:
176                 startFeature += 1
177                 continue
178
179             if start > stopPoint:
180                 startFeature -= 100
181                 break
182
183             if not ignoreSense:
184                 if sense == "R":
185                     sense = "-"
186                 else:
187                     sense = "+"
188
189             if start <= tagStart <= stop and (ignoreSense or tagSense == sense):
190                 readGidList.append((tagReadID, gid))
191                 stopPoint = stop
192
193     return readGidList, index
194
195
196 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
197
198     uniqueCountDict = {}
199     uniquecounts = open(countFilename)
200     for line in uniquecounts:
201         fields = line.strip().split()
202         # add a pseudo-count here to ease calculations below
203         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
204
205     uniquecounts.close()
206
207     genomeName = genome.genome
208     geneinfoDict = getGeneInfoDict(genomeName, cache=doCache)
209     geneannotDict = genome.allAnnotInfo()
210     outfile = open(outFilename, "w")
211     for gid in allGIDs:
212         symbol = getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict)
213         tagCount = getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict)
214         if doVerbose:
215             print "%s %s %f" % (gid, symbol, tagCount)
216
217         outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
218
219     outfile.close()
220
221
222 def getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict):
223     if "FAR" not in gid:
224         symbol = "LOC%s" % gid
225         geneinfo = ""
226         try:
227             geneinfo = geneinfoDict[gid]
228             if genomeName == "celegans":
229                 symbol = geneinfo[0][1]
230             else:
231                 symbol = geneinfo[0][0]
232         except (KeyError, IndexError):
233             try:
234                 symbol = geneannotDict[(genomeName, gid)][0]
235             except (KeyError, IndexError):
236                 symbol = "LOC%s" % gid
237     else:
238         symbol = gid
239
240     return symbol
241
242
243 def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
244     tagCount = 0.
245     for readID in gidReadDict[gid]:
246         try:
247             tagValue = uniqueCountDict[gid]
248         except KeyError:
249             tagValue = 1
250
251         tagDenom = 0.
252         for relatedGID in read2GidDict[readID]:
253             try:
254                 tagDenom += uniqueCountDict[relatedGID]
255             except KeyError:
256                 tagDenom += 1
257
258         try:
259             tagCount += tagValue / tagDenom
260         except ZeroDivisionError:
261             pass
262
263     return tagCount
264
265
266 if __name__ == "__main__":
267     main(sys.argv)