5 # Created by Ali Mortazavi on 12/7/08.
17 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
20 verstring = "makerdsfromblat: version 3.10"
30 usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
32 parser = getParser(usage)
33 (options, args) = parser.parse_args(argv[1:])
43 if options.geneDataFileName:
55 (pname, pvalue) = arg.strip().split("::")
56 propertyList.append((pname, pvalue))
58 makerdsfromblat(label, filename, outdbname, dataType, options.init,
59 options.doIndex, options.trimReadID, options. minSpliceLength,
60 options.forceRNA, theFlag, options.spliceOnly, options.verbose,
61 options.cachePages, options.geneDataFileName, propertyList)
65 parser = optparse.OptionParser(usage=usage)
66 parser.add_option("--append", action="store_false", dest="init")
67 parser.add_option("--index", action="store_true", dest="doIndex")
68 parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
69 parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
70 parser.add_option("--flag", action="store_true", dest="flagReads")
71 parser.add_option("--strict", type="int", dest="minSpliceLength",
72 help="min required bp on each side of a splice")
73 parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
74 parser.add_option("--verbose", action="store_true", dest="verbose")
75 parser.add_option("--cache", type="int", dest="cachePages")
76 parser.add_option("--RNA", dest="geneDataFileName")
78 configParser = getConfigParser()
79 section = "makerdsfromblat"
80 init = getConfigBoolOption(configParser, section, "init", True)
81 doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
82 trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
83 minSpliceLength = getConfigIntOption(configParser, section, "minSpliceLength", 0)
84 forceRNA = getConfigBoolOption(configParser, section, "forceRNA", False)
85 flagReads = getConfigBoolOption(configParser, section, "flagReads", False)
86 spliceOnly = getConfigBoolOption(configParser, section, "spliceOnly", False)
87 verbose = getConfigBoolOption(configParser, section, "verbose", False)
88 cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
89 geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", "")
91 parser.set_defaults(init=init, doIndex=doIndex, trimReadID=trimReadID, minSpliceLength=minSpliceLength, forceRNA=forceRNA,
92 flagReads=flagReads, spliceOnly=spliceOnly, verbose=verbose, cachePages=cachePages,
93 geneDataFileName=geneDataFileName)
98 def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
99 doIndex=False,trimReadID=True, minSpliceLength=0,
100 forceRNA=False, theFlag="", spliceOnly=False,
101 verbose=False, cachePages=100000, geneDataFileName="",
104 writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
106 print "forcing datatype to RNA"
111 if dataType == "RNA" and not forceRNA:
112 genedatafile = open(geneDataFileName)
113 for line in genedatafile:
114 fields = line.strip().split("\t")
115 blockCount = int(fields[7])
122 chromstarts = fields[8][:-1].split(",")
123 chromstops = fields[9][:-1].split(",")
126 for index in range(blockCount):
127 chromstarts[index] = int(chromstarts[index])
128 chromstops[index] = int(chromstops[index])
129 exonLengths.append(chromstops[index] - chromstarts[index])
130 totalLength += exonLengths[index]
132 geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
137 rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
139 #check that our cacheSize is better than the dataset's default cache size
140 defaultCacheSize = rds.getDefaultCacheSize()
141 if cachePages > defaultCacheSize:
143 rds.setDBcache(cachePages, default=True)
145 rds.setDBcache(cachePages)
147 if not init and doIndex:
153 print "couldn't drop Index"
156 if len(propertyList) > 0:
157 rds.insertMetadata(propertyList)
159 # make some assumptions based on first read
160 infile = open(filename, "r")
161 for arg in range(NUM_HEADER_LINES):
162 line = infile.readline()
164 line = infile.readline()
165 fields = line.split()
166 readsize = int(fields[10])
167 pairedTest = fields[9][-2:]
169 if pairedTest in ["/1", "/2"]:
170 print "assuming reads are paired"
173 print "read size: %d bp" % readsize
175 rds.insertMetadata([("readsize", readsize)])
177 rds.insertMetadata([("paired", "True")])
180 if "blat_mapped" not in rds.getMetadata():
181 rds.insertMetadata([("blat_mapped", "True")])
183 minReadScore = readsize - readsize/25 - 1
185 if dataType == "RNA":
187 maxBorder = readsize + trim
189 infile = open(filename, "r")
195 index = uIndex = mIndex = sIndex = lIndex = 0
198 for arg in range(NUM_HEADER_LINES):
199 line = infile.readline()
206 fields = line.strip().split()
209 readID = string.join(readID.split(":")[1:], ":")
213 if bestScore > minReadScore:
214 for readData in readList:
215 if readData[1] == bestScore:
216 newReadList.append(readData)
219 prevID = label + "-" + prevID
221 listlen = len(newReadList)
223 parts = int(newReadList[0][0])
224 if parts == 1 and not spliceOnly:
225 (part, score, sense, chrom, start, mismatches) = newReadList[0]
226 stop = start + readsize
227 uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, mismatches))
229 elif forceRNA and parts == 2:
230 (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0]
231 startL = int(startList[0])
232 stopL = startL + int(lengthList[0])
233 startR = int(startList[1])
234 stopR = startR + int(lengthList[1])
235 if stopL + minIntron < startR:
236 sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
240 (part, score, sense, chrom, start, mismatches) = newReadList[0]
241 currentSplice = chrom
242 (model, spliceID, regionStart) = currentSplice.split(delimiter)
243 if model not in geneDict:
247 (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
248 spliceID = int(spliceID)
249 rstart = int(start) - 2
250 lefthalf = maxBorder - rstart
251 if lefthalf < 1 or lefthalf > maxBorder:
254 righthalf = readsize - lefthalf
255 startL = int(regionStart) + rstart
256 stopL = startL + lefthalf
257 startR = chromstarts[spliceID + 1]
258 stopR = chromstarts[spliceID + 1] + righthalf
259 if stopL + minIntron < startR:
260 sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
262 elif listlen > 1 and not spliceOnly:
263 prevID = prevID + "::" + str(listlen)
265 # ignore multireads that can also map across splices
267 for readData in newReadList:
272 for (part, score, sense, chrom, start, mismatches) in newReadList:
273 stop = start + readsize
274 mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches))
278 if index % insertSize == 0:
279 rds.insertUniqs(uInsertList)
280 rds.insertMulti(mInsertList)
283 if dataType == "RNA":
284 rds.insertSplices(sInsertList)
290 # start processing new read
297 score = int(fields[0])
300 parts = int(fields[17])
303 lengthList = fields[18][:-1].split(",")
304 startList = fields[20][:-1].split(",")
305 listlen = len(lengthList)
306 for lpos in range(listlen):
307 if int(lengthList[lpos]) < minSpliceLength:
310 # throw out deletions, for now
312 if int(lengthList[lpos - 1]) == int(startList[lpos]):
316 start = int(fields[15])
319 if score > bestScore:
323 if int(fields[1]) > 0:
325 mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense)
330 readList.append((parts, score, sense, chrom, start, mismatches))
332 readList.append((parts, score, sense, chrom, startList, lengthList, mismatches))
334 if lIndex % 1000000 == 0:
335 print "processed %d lines" % lIndex
337 print "%d lines processed" % lIndex
339 if len(uInsertList) > 0:
340 rds.insertUniqs(uInsertList)
341 if len(mInsertList) > 0:
342 rds.insertMulti(mInsertList)
343 if len(sInsertList) > 0:
344 rds.insertSplices(sInsertList)
346 combString = "%d unique reads" % uIndex
347 combString += "\t%d multi reads" % mIndex
348 if dataType == "RNA":
349 combString += "\t%d spliced reads" % sIndex
352 print combString.replace("\t", "\n")
354 writeLog(outdbname + ".log", verstring, combString)
357 print "building index...."
358 if cachePages > defaultCacheSize:
359 rds.setDBcache(cachePages)
360 rds.buildIndex(cachePages)
362 rds.buildIndex(defaultCacheSize)
365 def decodeMismatches(gString, rString, rsense):
370 for rindex in xrange(rlen):
374 if gString[rindex] == rString[rindex]:
377 genNT = gString[rindex]
378 readNT = rString[rindex]
379 # for eland-compatibility, we are 1-based
380 output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT))
382 return string.join(output, ",")
385 if __name__ == "__main__":