6 # Created by Ali Mortazavi on 10/02/08.
20 from commoncode import getConfigParser, getConfigBoolOption, getConfigOption
23 print "weighMultireads: version 3.3"
29 usage = "usage: python %s rdsfile [--radius bp] [--noradius] [--usePairs maxDist] [--verbose] [--cache pages]"
31 parser = getParser(usage)
32 (options, args) = parser.parse_args(argv[1:])
40 weighMultireads(rdsfile, options.radius, options.doRadius, options.pairDist, options.verbose, options.cachePages)
44 parser = optparse.OptionParser(usage=usage)
45 parser.add_option("--radius", type="int", dest="radius")
46 parser.add_option("--noradius", action="store_false", dest="doRadius")
47 parser.add_option("--usePairs", type="int", dest="pairDist")
48 parser.add_option("--verbose", action="store_true", dest="verbose")
49 parser.add_option("--cache", type="int", dest="cachePages")
51 configParser = getConfigParser()
52 section = "weighMultireads"
53 radius = getConfigOption(configParser, section, "radius", None)
54 doRadius = getConfigBoolOption(configParser, section, "doRadius", True)
55 pairDist = getConfigOption(configParser, section, "pairDist", None)
56 verbose = getConfigBoolOption(configParser, section, "verbose", False)
57 cachePages = getConfigOption(configParser, section, "cachePages", None)
59 parser.set_defaults(radius=radius, doRadius=doRadius, pairDist=pairDist, verbose=verbose, cachePages=cachePages)
64 def weighMultireads(rdsfile, radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None):
66 if cachePages is not None:
72 RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
73 if cachePages > RDS.getDefaultCacheSize():
74 RDS.setDBcache(cachePages)
79 multiIDs = RDS.getReadIDs(uniqs=False, multi=True)
81 print "got multiIDs ", time.ctime()
84 if pairDist is not None:
85 fixedReads = reweighUsingPairs(RDS, pairDist, multiIDs, verbose)
87 if radius is not None:
93 reweighUsingRadius(RDS, radius, multiIDs, fixedReads, verbose)
96 RDS.saveCacheDB(rdsfile)
99 print "finished", time.ctime()
102 def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
104 tooFar = pairDist * 10
105 readlen = RDS.getReadSize()
107 print "doing pairs with pairDist = %d" % pairDist
108 hasSplices = RDS.dataType == "RNA"
109 uniqIDs = RDS.getReadIDs(uniqs=True, multi=False, splices=hasSplices)
112 print "got uniqIDs ", time.ctime()
114 jointList, bothMultiList = getReadIDLists(uniqIDs, multiIDs, verbose)
115 uniqDict = getUniqAndSpliceReadsFromReadIDs(RDS, jointList, verbose)
117 print "guDict actual ", len(uniqDict), time.ctime()
119 multiDict = getMultiReadsFromReadIDs(RDS, jointList, bothMultiList, verbose)
121 print "muDict actual ", len(multiDict), time.ctime()
123 RDS.setSynchronousPragma("OFF")
124 for readID in jointList:
126 ustart = uniqDict[readID]["start"]
127 ustop = ustart + readlen
129 ustart = uniqDict[readID]["startL"]
130 ustop = uniqDict[readID]["stopR"]
132 uniqReadChrom = uniqDict[readID]["chrom"]
133 multiReadList = multiDict[readID]
134 numMultiReads = len(multiReadList)
135 bestMatch = [tooFar] * numMultiReads
137 for index in range(numMultiReads):
138 mstart = multiReadList[index]["start"]
139 multiReadChrom = multiReadList[index]["chrom"]
140 mpair = multiReadList[index]["pairID"]
141 if uniqReadChrom != multiReadChrom:
144 if abs(mstart - ustart) < pairDist:
145 bestMatch[index] = abs(mstart - ustart)
147 elif abs(mstart - ustop) < pairDist:
148 bestMatch[index] = abs(mstart - ustop)
155 for index in range(numMultiReads):
156 if theDist > bestMatch[index]:
158 theDist = bestMatch[index]
160 theID = string.join([readID, mpair], "/")
161 for index in range(numMultiReads):
162 if index == theMatch:
163 score = 1 - ((numMultiReads - 1) / (100. * numMultiReads))
165 score = 1 / (100. * numMultiReads)
167 start = multiReadList[index][0]
168 chrom = "chr%s" % multiReadList[index][1]
169 reweighList.append((round(score,3), chrom, start, theID))
172 RDS.reweighMultireads(reweighList)
174 if verbose and fixedPair % 10000 == 1:
175 print "fixed %d" % fixedPair
176 print uniqDict[readID]
177 print multiDict[readID]
180 fixedReads.append(theID)
182 RDS.setSynchronousPragma("ON")
184 print "fixed %d pairs" % fixedPair
190 def getReadIDLists(uniqIDs, multiIDs, verbose=False):
193 for readID in uniqIDs:
194 (mainID, pairID) = readID.split("/")
196 uidDict[mainID].append(pairID)
198 uidDict[mainID] = [pairID]
199 mainIDList.append(mainID)
202 print "uidDict all ", len(uidDict), time.ctime()
204 for mainID in mainIDList:
205 if len(uidDict[mainID]) == 2:
209 print "uidDict first candidates ", len(uidDict), time.ctime()
212 for readID in multiIDs:
213 (frontID, multiplicity) = readID.split("::")
214 (mainID, pairID) = frontID.split("/")
216 if pairID not in midDict[mainID]:
217 midDict[mainID].append(pairID)
219 midDict[mainID] = [pairID]
222 print "all multis ", len(midDict), time.ctime()
224 mainIDList = uidDict.keys()
225 for mainID in mainIDList:
226 if mainID not in midDict:
230 print "uidDict actual candidates ", len(uidDict), time.ctime()
234 for readID in midDict:
235 listLen = len(midDict[readID])
237 if readID in uidDict:
238 jointList.append(readID)
240 bothMultiList.append(readID)
243 print "joint ", len(jointList), time.ctime()
244 print "bothMulti ", len(bothMultiList), time.ctime()
246 return jointList, bothMultiList
249 def getUniqAndSpliceReadsFromReadIDs(RDS, jointList, verbose=False):
251 uniqDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
253 print "got uniq dict ", len(uniqDict), time.ctime()
255 if RDS.dataType == "RNA":
256 spliceDict = RDS.getSplicesDict(noSense=True, withChrom=True, withPairID=True, readIDDict=True)
258 print "got splice dict ", len(spliceDict), time.ctime()
260 for readID in jointList:
262 uniqReadsDict[readID] = uniqDict[readID][0]
264 if RDS.dataType == "RNA":
265 uniqReadsDict[readID] = spliceDict[readID][0]
270 def getMultiReadsFromReadIDs(RDS, jointList, bothMultiList, verbose=False):
271 multiReadSubsetDict = {}
272 multiDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=False, doMulti=True, readIDDict=True)
274 print "got multi dict ", len(multiDict), time.ctime()
276 for readID in jointList:
277 multiReadSubsetDict[readID] = multiDict[readID]
279 for readID in bothMultiList:
280 multiReadSubsetDict[readID] = multiDict[readID]
282 return multiReadSubsetDict
285 def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=[], verbose=False):
287 readlen = RDS.getReadSize()
288 halfreadlen = readlen / 2
289 print "doing uniq read radius with radius = %d" % radius
290 multiDict = RDS.getReadsDict(noSense=True, withWeight=True, withChrom=True, withID=True, doUniqs=False, doMulti=True, readIDDict=True)
291 print "got multiDict"
292 RDS.setSynchronousPragma("OFF")
294 for readID in multiIDs:
295 originalMultiReadID = readID
296 if originalMultiReadID in readsToSkip:
301 (readID, multiplicity) = readID.split("::")
305 for read in multiDict[readID]:
306 start = read["start"]
307 chromosome = "chr%s" % read["chrom"]
308 regionStart = start + halfreadlen - radius
309 regionStop = start + halfreadlen + radius
310 uniqs = RDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=False, splices=False, reportCombined=True)
311 scores.append(uniqs + 1)
312 coords.append((chromosome, start, originalMultiReadID))
314 total = float(sum(scores))
316 for index in range(len(scores)):
317 reweighList.append((round(scores[index]/total,2), coords[index][0], coords[index][1], coords[index][2]))
319 RDS.reweighMultireads(reweighList)
321 if reweighedCount % 10000 == 0:
324 RDS.setSynchronousPragma("ON")
326 print "skipped ", skippedReads
328 print "reweighted ", reweighedCount
331 if __name__ == "__main__":