snapshot of 4.0a development. initial git repo commit
[erange.git] / geneMrnaCountsWeighted.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print 'psyco not running'
6
7 import sys, optparse
8 from commoncode import readDataset, getMergedRegions, getFeaturesByChromDict
9 from cistematic.genomes import Genome
10 from cistematic.core.geneinfo import geneinfoDB
11 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
12
13 print '%s: version 4.1' % sys.argv[0]
14
15
16 def main(argv=None):
17     if not argv:
18         argv = sys.argv
19
20     usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
21
22     parser = optparse.OptionParser(usage=usage)
23     parser.add_option("--stranded", action="store_false", dest="ignoreSense")
24     parser.add_option("--uniq", action="store_true", dest="withUniqs")
25     parser.add_option("--multi", action="store_true", dest="withMulti")
26     parser.add_option("--record", action="store_true", dest="recording",
27                       help="ignored with uniq reads")
28     parser.add_option("--accept", dest="acceptfile")
29     parser.add_option("--cache", type="int", dest="cachePages")
30     parser.add_option("--verbose", action="store_true", dest="doVerbose")
31     parser.add_option("--models", dest="extendGenome")
32     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
33     parser.set_defaults(ignoreSense=True, withUniqs=False, withMulti=False, recording=False,
34                         acceptfile=None, cachePages=None, doVerbose=False, extendGenome="",
35                         replaceModels=False)
36
37     (options, args) = parser.parse_args(argv[1:])
38
39     if len(args) < 4:
40         print usage
41         sys.exit(1)
42
43     genome = args[0]
44     hitfile =  args[1]
45     countfile = args[2]
46     outfilename = args[3]
47
48     geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
49                            options.withUniqs, options.withMulti, options.recording,
50                            options.acceptfile, options.cachePages, options.doVerbose,
51                            options.extendGenome, options.replaceModels)
52
53
54 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
55                            withUniqs=False, withMulti=False, recording=False, acceptfile=None,
56                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
57
58     if (not withUniqs and not withMulti) or (withUniqs and withMulti):
59         print "must have either one of -uniq or -multi set. Exiting"
60         sys.exit(1)
61
62     if cachePages is not None:
63         cacheGeneDB(genome)
64         hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
65         idb = geneinfoDB(cache=True)
66         print "%s cached" % genome
67         doCache = True
68     else:
69         doCache = False
70         cachePages = 0
71         hg = Genome(genome, inRAM=True)
72         idb = geneinfoDB()
73
74     if acceptfile is not None:
75         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
76     else:
77         acceptDict = {}
78
79     if recording and withUniqs:
80         recording = False
81
82     if extendGenome:
83         if replaceModels:
84             print "will replace gene models with %s" % extendGenome
85         else:
86             print "will extend gene models with %s" % extendGenome
87     else:
88         replaceModels = False
89
90     if extendGenome != "":
91         hg.extendFeatures(extendGenome, replace = replaceModels)
92     
93     hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
94     if cachePages > hitRDS.getDefaultCacheSize():
95         hitRDS.setDBcache(cachePages)
96
97     readlen = hitRDS.getReadSize()
98
99     geneinfoDict = idb.getallGeneInfo(genome)
100     geneannotDict = hg.allAnnotInfo()
101     gidCount = {}
102     gidReadDict = {}
103
104     featuresByChromDict = getFeaturesByChromDict(hg, acceptDict)
105     gidList = hg.allGIDs()
106
107     gidList.sort()
108     for chrom in acceptDict:
109         for (label, start, stop, length) in acceptDict[chrom]:
110             if label not in gidList:
111                 gidList.append(label)
112
113     for gid in gidList:
114         gidCount[gid] = 0
115         gidReadDict[gid] = []
116
117     uniqueCountDict = {}
118     read2GidDict = {}
119
120     uniquecounts = open(countfile)
121     for line in uniquecounts:
122         fields = line.strip().split()
123         # add a pseudo-count here to ease calculations below
124         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
125
126     uniquecounts.close()
127
128     outfile = open(outfilename, "w")
129
130     index = 0
131     if withMulti and not withUniqs:
132         chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
133     else:
134         chromList = hitRDS.getChromosomes(fullChrom=False)
135
136     for achrom in chromList:
137         if achrom not in featuresByChromDict:
138             continue
139
140         print "\n" + achrom + " ",
141         startFeature = 0
142         fullchrom = "chr" + achrom
143         hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
144         featList = featuresByChromDict[achrom]
145         if ignoreSense:
146             for (tagStart, tagReadID) in hitDict[fullchrom]:
147                 index += 1
148                 if index % 100000 == 0:
149                     print "read %d" % index,
150
151                 stopPoint = tagStart + readlen
152                 if startFeature < 0:
153                     startFeature = 0
154
155                 for (start, stop, gid, sense, ftype) in featList[startFeature:]:
156                     if tagStart > stop:
157                         startFeature += 1
158                         continue
159
160                     if start > stopPoint:
161                         startFeature -= 100
162                         break
163
164                     if start <= tagStart <= stop:
165                         try:
166                             gidReadDict[gid].append(tagReadID)
167                             if tagReadID in read2GidDict:
168                                 if gid not in read2GidDict[tagReadID]:
169                                     read2GidDict[tagReadID].append(gid)
170                             else:
171                                 read2GidDict[tagReadID] = [gid]
172
173                             gidCount[gid] += 1
174                         except:
175                             print "gid %s not in gidReadDict" % gid
176
177                         stopPoint = stop
178         else:
179             for (tagStart, tSense, tagReadID) in hitDict[fullchrom]:
180                 index += 1
181                 if index % 100000 == 0:
182                     print "read %d" % index,
183
184                 stopPoint = tagStart + readlen
185                 if startFeature < 0:
186                     startFeature = 0
187
188                 for (start, stop, gid, sense, ftype) in featList[startFeature:]:
189                     if tagStart > stop:
190                         startFeature += 1
191                         continue
192
193                     if start > stopPoint:
194                         startFeature -= 100
195                         break
196
197                     if sense == "R":
198                         sense = "-"
199                     else:
200                         sense = "+"
201
202                     if start <= tagStart <= stop and sense == tSense:
203                         try:
204                             gidReadDict[gid].append(tagReadID)
205                             if tagReadID in read2GidDict:
206                                 if gid not in read2GidDict[tagReadID]:
207                                     read2GidDict[tagReadID].append(gid)
208                             else:
209                                 read2GidDict[tagReadID] = [gid]
210
211                             gidCount[gid] += 1
212                         except:
213                             print "gid %s not in gidReadDict" % gid
214
215                         stopPoint = stop
216
217     for gid in gidList:
218         if "FAR" not in gid:
219             symbol = "LOC" + gid
220             geneinfo = ""
221             try:
222                 geneinfo = geneinfoDict[gid]
223                 if genome == "celegans":
224                     symbol = geneinfo[0][1]
225                 else:
226                     symbol = geneinfo[0][0]
227             except:
228                 try:
229                     symbol = geneannotDict[(genome, gid)][0]
230                 except:
231                     symbol = "LOC" + gid
232         else:
233             symbol = gid
234
235         tagCount = 0.
236         for readID in gidReadDict[gid]:
237             try:
238                 tagValue = uniqueCountDict[gid]
239             except:
240                 tagValue = 1
241
242             tagDenom = 0.
243             for aGid in read2GidDict[readID]:
244                 try:
245                     tagDenom += uniqueCountDict[aGid]
246                 except:
247                     tagDenom += 1
248
249         try:
250             tagCount += tagValue / tagDenom
251         except ZeroDivisionError:
252             tagCount = 0
253     
254         if doVerbose:
255             print "%s %s %f" % (gid, symbol, tagCount)
256
257         outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
258
259     outfile.close()
260
261     if doCache:
262         uncacheGeneDB(genome)
263
264
265 if __name__ == "__main__":
266     main(sys.argv)