6 # Created by Ali Mortazavi on 10/02/08.
20 from commoncode import getConfigParser, getConfigBoolOption, getConfigOption
23 print "weighMultireads: version 3.3"
29 usage = "usage: python %s rdsfile [--radius bp] [--noradius] [--usePairs maxDist] [--verbose] [--cache pages]"
31 parser = getParser(usage)
32 (options, args) = parser.parse_args(argv[1:])
40 weighMultireads(rdsfile, options.radius, options.doRadius, options.pairDist, options.verbose, options.cachePages)
44 parser = optparse.OptionParser(usage=usage)
45 parser.add_option("--radius", type="int", dest="radius")
46 parser.add_option("--noradius", action="store_false", dest="doRadius")
47 parser.add_option("--usePairs", type="int", dest="pairDist")
48 parser.add_option("--verbose", action="store_true", dest="verbose")
49 parser.add_option("--cache", type="int", dest="cachePages")
51 configParser = getConfigParser()
52 section = "weighMultireads"
53 radius = getConfigOption(configParser, section, "radius", None)
54 doRadius = getConfigBoolOption(configParser, section, "doRadius", True)
55 pairDist = getConfigOption(configParser, section, "pairDist", None)
56 verbose = getConfigBoolOption(configParser, section, "verbose", False)
57 cachePages = getConfigOption(configParser, section, "cachePages", None)
59 parser.set_defaults(radius=radius, doRadius=doRadius, pairDist=pairDist, verbose=verbose, cachePages=cachePages)
64 def weighMultireads(rdsfile, radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None):
66 if cachePages is not None:
72 RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
73 if cachePages > RDS.getDefaultCacheSize():
74 RDS.setDBcache(cachePages)
79 multiIDs = RDS.getReadIDs(uniqs=False, multi=True)
81 print "got multiIDs ", time.ctime()
84 if pairDist is not None:
85 fixedReads = reweighUsingPairs(RDS, pairDist, multiIDs, verbose)
87 if radius is not None:
93 reweighUsingRadius(RDS, radius, multiIDs, fixedReads, verbose)
96 RDS.saveCacheDB(rdsfile)
99 print "finished", time.ctime()
102 def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
104 tooFar = pairDist * 10
105 readlen = RDS.getReadSize()
107 print "doing pairs with pairDist = %d" % pairDist
108 hasSplices = RDS.dataType == "RNA"
109 uniqIDs = RDS.getReadIDs(uniqs=True, multi=False, splices=hasSplices)
112 print "got uniqIDs ", time.ctime()
114 jointList, bothMultiList = getReadIDLists(uniqIDs, multiIDs, verbose)
115 uniqDict = getUniqAndSpliceReadsFromReadIDs(RDS, jointList, verbose)
117 print "guDict actual ", len(uniqDict), time.ctime()
119 multiDict = getMultiReadsFromReadIDs(RDS, jointList, bothMultiList, verbose)
121 print "muDict actual ", len(multiDict), time.ctime()
123 RDS.setSynchronousPragma("OFF")
124 for readID in jointList:
126 ustart = uniqDict[readID]["start"]
127 ustop = ustart + readlen
129 ustart = uniqDict[readID]["startL"]
130 ustop = uniqDict[readID]["stopR"]
132 uniqReadChrom = uniqDict[readID]["chrom"]
133 multiReadList = multiDict[readID]
134 numMultiReads = len(multiReadList)
135 bestMatch = [tooFar] * numMultiReads
137 for index in range(numMultiReads):
138 mstart = multiReadList[index]["start"]
139 multiReadChrom = multiReadList[index]["chrom"]
140 mpair = multiReadList[index]["pairID"]
141 if uniqReadChrom != multiReadChrom:
144 if abs(mstart - ustart) < pairDist:
145 bestMatch[index] = abs(mstart - ustart)
147 elif abs(mstart - ustop) < pairDist:
148 bestMatch[index] = abs(mstart - ustop)
155 for index in range(numMultiReads):
156 if theDist > bestMatch[index]:
158 theDist = bestMatch[index]
160 theID = string.join([readID, mpair], "/")
161 for index in range(numMultiReads):
162 if index == theMatch:
163 score = 1 - ((numMultiReads - 1) / (100. * numMultiReads))
165 score = 1 / (100. * numMultiReads)
167 start = multiReadList[index][0]
168 chrom = "chr%s" % multiReadList[index][1]
169 reweighList.append((round(score,3), chrom, start, theID))
171 #TODO: Is this right? If match index is 0 are we doing nothing?
173 RDS.reweighMultireads(reweighList)
175 if verbose and fixedPair % 10000 == 1:
176 print "fixed %d" % fixedPair
177 print uniqDict[readID]
178 print multiDict[readID]
181 fixedReads.append(theID)
183 RDS.setSynchronousPragma("ON")
185 print "fixed %d pairs" % fixedPair
191 def getReadIDLists(uniqIDs, multiIDs, verbose=False):
194 for readID in uniqIDs:
195 (mainID, pairID) = readID.split("/")
197 uidDict[mainID].append(pairID)
199 uidDict[mainID] = [pairID]
200 mainIDList.append(mainID)
203 print "uidDict all ", len(uidDict), time.ctime()
205 for mainID in mainIDList:
206 if len(uidDict[mainID]) == 2:
210 print "uidDict first candidates ", len(uidDict), time.ctime()
213 for readID in multiIDs:
214 (frontID, multiplicity) = readID.split("::")
215 (mainID, pairID) = frontID.split("/")
217 if pairID not in midDict[mainID]:
218 midDict[mainID].append(pairID)
220 midDict[mainID] = [pairID]
223 print "all multis ", len(midDict), time.ctime()
225 mainIDList = uidDict.keys()
226 for mainID in mainIDList:
227 if mainID not in midDict:
231 print "uidDict actual candidates ", len(uidDict), time.ctime()
235 for readID in midDict:
236 listLen = len(midDict[readID])
238 if readID in uidDict:
239 jointList.append(readID)
241 bothMultiList.append(readID)
244 print "joint ", len(jointList), time.ctime()
245 print "bothMulti ", len(bothMultiList), time.ctime()
247 return jointList, bothMultiList
250 def getUniqAndSpliceReadsFromReadIDs(RDS, jointList, verbose=False):
252 uniqDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
254 print "got uniq dict ", len(uniqDict), time.ctime()
256 if RDS.dataType == "RNA":
257 spliceDict = RDS.getSplicesDict(noSense=True, withChrom=True, withPairID=True, readIDDict=True)
259 print "got splice dict ", len(spliceDict), time.ctime()
261 for readID in jointList:
263 uniqReadsDict[readID] = uniqDict[readID][0]
265 if RDS.dataType == "RNA":
266 uniqReadsDict[readID] = spliceDict[readID][0]
271 def getMultiReadsFromReadIDs(RDS, jointList, bothMultiList, verbose=False):
272 multiReadSubsetDict = {}
273 multiDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=False, doMulti=True, readIDDict=True)
275 print "got multi dict ", len(multiDict), time.ctime()
277 for readID in jointList:
278 multiReadSubsetDict[readID] = multiDict[readID]
280 for readID in bothMultiList:
281 multiReadSubsetDict[readID] = multiDict[readID]
283 return multiReadSubsetDict
286 def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=[], verbose=False):
288 readlen = RDS.getReadSize()
289 halfreadlen = readlen / 2
290 print "doing uniq read radius with radius = %d" % radius
291 multiDict = RDS.getReadsDict(noSense=True, withWeight=True, withChrom=True, withID=True, doUniqs=False, doMulti=True, readIDDict=True)
292 print "got multiDict"
293 RDS.setSynchronousPragma("OFF")
295 for readID in multiIDs:
296 originalMultiReadID = readID
297 if originalMultiReadID in readsToSkip:
302 (readID, multiplicity) = readID.split("::")
306 for read in multiDict[readID]:
307 start = read["start"]
308 chromosome = "chr%s" % read["chrom"]
309 regionStart = start + halfreadlen - radius
310 regionStop = start + halfreadlen + radius
311 uniqs = RDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=False, splices=False, reportCombined=True)
312 scores.append(uniqs + 1)
313 coords.append((chromosome, start, originalMultiReadID))
315 total = float(sum(scores))
317 for index in range(len(scores)):
318 reweighList.append((round(scores[index]/total,2), coords[index][0], coords[index][1], coords[index][2]))
320 RDS.reweighMultireads(reweighList)
322 if reweighedCount % 10000 == 0:
325 RDS.setSynchronousPragma("ON")
327 print "skipped ", skippedReads
329 print "reweighted ", reweighedCount
332 if __name__ == "__main__":