6 # Created by Ali Mortazavi on 10/02/08.
15 from commoncode import readDataset
16 import sys, time, string, optparse
18 print "%prog: version 3.1"
24 usage = "usage: python %s rdsfile [--radius bp] [--noradius] [--usePairs maxDist] [--verbose] [--cache pages]"
26 parser = optparse.OptionParser(usage=usage)
27 parser.add_option("--radius", type="int", dest="radius")
28 parser.add_option("--noradius", action="store_false", dest="doRadius")
29 parser.add_option("--usePairs", type="int", dest="pairDist")
30 parser.add_option("--verbose", action="store_true", dest="verbose")
31 parser.add_option("--cache", type="int", dest="cachePages")
32 parser.set_defaults(radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None)
33 (options, args) = parser.parse_args(argv[1:])
41 weighMultireads(rdsfile, options.radius, options.doRadius, options.pairDist, options.verbose, options.cachePages)
44 def weighMultireads(rdsfile, radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None):
46 if radius is not None:
52 if pairDist is not None:
55 tooFar = pairDist * 10
58 if cachePages is not None:
63 RDS = readDataset(rdsfile, verbose = True, cache=doCache)
64 readlen = RDS.getReadSize()
65 halfreadlen = readlen / 2
67 if cachePages > RDS.getDefaultCacheSize():
68 RDS.setDBcache(cachePages)
73 multiIDs = RDS.getReadIDs(uniqs=False,multi=True)
75 print "got multiIDs ", time.ctime()
80 print "doing pairs with pairDist = %d" % pairDist
89 if RDS.dataType == "RNA":
90 uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=True)
92 uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=False)
95 print "got uniqIDs ", time.ctime()
97 for readID in uniqIDs:
98 (mainID, pairID) = readID.split("/")
100 uidDict[mainID].append(pairID)
102 uidDict[mainID] = [pairID]
103 mainIDList.append(mainID)
106 print "uidDict all ", len(uidDict), time.ctime()
108 for mainID in mainIDList:
109 if len(uidDict[mainID]) == 2:
113 print "uidDict first candidates ", len(uidDict), time.ctime()
115 for readID in multiIDs:
116 (frontID, multiplicity) = readID.split("::")
117 (mainID, pairID) = frontID.split("/")
119 if pairID not in midDict[mainID]:
120 midDict[mainID].append(pairID)
122 midDict[mainID] = [pairID]
125 print "all multis ", len(midDict), time.ctime()
127 mainIDList = uidDict.keys()
128 for mainID in mainIDList:
129 if mainID not in midDict:
133 print "uidDict actual candidates ", len(uidDict), time.ctime()
135 for readID in midDict:
136 listLen = len(midDict[readID])
138 if readID in uidDict:
139 jointList.append(readID)
141 bothMultiList.append(readID)
144 print "joint ", len(jointList), time.ctime()
145 print "bothMulti ", len(bothMultiList), time.ctime()
152 uniqDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
154 print "got uniq dict ", len(uniqDict), time.ctime()
156 if RDS.dataType == "RNA":
157 spliceDict = RDS.getSplicesDict(noSense=True, withChrom=True, withPairID=True, readIDDict=True)
159 print "got splice dict ", len(spliceDict), time.ctime()
161 for readID in jointList:
163 guDict[readID] = uniqDict[readID][0]
165 if RDS.dataType == "RNA":
166 guDict[readID] = spliceDict[readID][0]
171 print "guDict actual ", len(guDict), time.ctime()
173 multiDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=False, doMulti=True, readIDDict=True)
175 print "got multi dict ", len(multiDict), time.ctime()
177 for readID in jointList:
178 muDict[readID] = multiDict[readID]
180 for readID in bothMultiList:
181 muDict[readID] = multiDict[readID]
185 print "muDict actual ", len(muDict), time.ctime()
187 RDS.setSynchronousPragma("OFF")
188 for readID in jointList:
190 (ustart, uchrom, upair) = guDict[readID]
191 ustop = ustart + readlen
193 (ustart, lstop, rstart, ustop, uchrom, upair) = guDict[readID]
195 muList = muDict[readID]
197 bestMatch = [tooFar] * muLen
199 for index in range(muLen):
200 (mstart, mchrom, mpair) = muList[index]
204 if abs(mstart - ustart) < pairDist:
205 bestMatch[index] = abs(mstart - ustart)
207 elif abs(mstart - ustop) < pairDist:
208 bestMatch[index] = abs(mstart - ustop)
215 for index in range(muLen):
216 if theDist > bestMatch[index]:
218 theDist = bestMatch[index]
220 theID = string.join([readID, mpair], "/")
221 for index in range(muLen):
222 if index == theMatch:
223 score = 1 - (muLen - 1) / (100. * (muLen))
225 score = 1 / (100. * muLen)
227 start = muList[index][0]
228 chrom = "chr%s" % muList[index][1]
229 reweighList.append((round(score,3), chrom, start, theID))
232 RDS.reweighMultireads(reweighList)
234 if verbose and fixedPair % 10000 == 1:
235 print "fixed %d" % fixedPair
240 fixedReads.append(theID)
242 RDS.setSynchronousPragma("ON")
246 print "fixed %d pairs" % fixedPair
251 print "doing uniq read radius with radius = %d" % radius
252 multiDict = RDS.getReadsDict(noSense=True, withWeight=True, withChrom=True, withID=True, doUniqs=False, doMulti=True, readIDDict=True)
253 print "got multiDict"
254 RDS.setSynchronousPragma("OFF")
256 for readID in multiIDs:
258 if theID in fixedReads:
263 (readID, multiplicity) = readID.split("::")
267 for read in multiDict[readID]:
268 (start, weight, rID, chrom) = read
269 achrom = "chr%s" % chrom
270 regionStart = start + halfreadlen - radius
271 regionStop = start + halfreadlen + radius
272 uniqs = RDS.getCounts(achrom, regionStart, regionStop, uniqs=True, multi=False, splices=False, reportCombined=True)
273 scores.append(uniqs + 1)
274 coords.append((achrom, start, theID))
276 total = float(sum(scores))
278 for index in range(len(scores)):
279 reweighList.append((round(scores[index]/total,2), coords[index][0], coords[index][1], coords[index][2]))
281 RDS.reweighMultireads(reweighList)
283 if rindex % 10000 == 0:
286 RDS.setSynchronousPragma("ON")
288 print "skipped ", skippedReads
290 print "reweighted ", rindex
293 RDS.saveCacheDB(rdsfile)
296 print "finished", time.ctime()
299 if __name__ == "__main__":