5 # Created by Ali Mortazavi on 12/7/08.
14 import sys, string, optparse
15 from commoncode import readDataset, writeLog
# --- Command-line entry point (fragment) ---------------------------------
# NOTE(review): this view is a fragment of the original file — the enclosing
# `def main(argv)` header and several interior lines are elided (the stale
# line numbers in the original jump).  `label`, `filename`, `outdbname`,
# `dataType`, `theFlag`, and `propertyList` are bound in lines not shown here.
verstring = "%prog: version 3.9"

usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"

# optparse-based CLI (Python 2 era).  Each flag maps onto one keyword
# argument of makerdsfromblat() below.
parser = optparse.OptionParser(usage=usage)
parser.add_option("--append", action="store_false", dest="init")      # append to an existing RDS rather than (re)initializing it
parser.add_option("--index", action="store_true", dest="doIndex")     # build the DB index after loading
parser.add_option("--rawreadID", action="store_false", dest="trimReadID")  # keep read IDs verbatim (default trims the first ':'-field)
parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
parser.add_option("--flag", action="store_true", dest="flagReads")
parser.add_option("--strict", type="int", dest="minSpliceLength",
                  help="min required bp on each side of a splice")
parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")  # only record spliced reads
parser.add_option("--verbose", action="store_true", dest="verbose")
parser.add_option("--cache", type="int", dest="cachePages")           # sqlite page-cache size hint
parser.add_option("--RNA", dest="geneDataFileName")                   # gene model file; also switches dataType to RNA upstream (not shown)
parser.set_defaults(init=True, doIndex=False, trimReadID=True, minSpliceLength=0, forceRNA=False, flagReads=False, spliceOnly=False, verbose=False, cachePages=100000, geneDataFileName="")
(options, args) = parser.parse_args(argv[1:])

if options.geneDataFileName:
    # [gap: lines elided in this view — presumably sets dataType = "RNA"
    #  and iterates trailing "name::value" args; TODO confirm against full file]
        (pname, pvalue) = arg.strip().split("::")   # split "propertyName::propertyValue" pairs from the command line
        propertyList.append((pname, pvalue))

# NOTE(review): `options. minSpliceLength` below has a stray space after the
# dot — legal Python, but untidy; left untouched in this comments-only pass.
makerdsfromblat(label, filename, outdbname, dataType, options.init,
                options.doIndex, options.trimReadID, options. minSpliceLength,
                options.forceRNA, theFlag, options.spliceOnly, options.verbose,
                options.cachePages, options.geneDataFileName, propertyList)
# Load BLAT-mapped reads (PSL-like, tab-separated) into an RDS read dataset,
# classifying each read group as unique / multi / spliced and bulk-inserting.
# NOTE(review): this view is a fragment — the parameter list below is visibly
# truncated (no closing paren; the original presumably ends with a
# `propertyList=[]` parameter — TODO confirm), and many interior lines are
# elided.  Gap markers below flag the elisions; indentation is reconstructed.
def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
                    doIndex=False,trimReadID=True, minSpliceLength=0,
                    forceRNA=False, theFlag="", spliceOnly=False,
                    verbose=False, cachePages=100000, geneDataFileName="",
    # [gap: end of signature and initial setup elided]
        # --forceRNA overrides whatever dataType was passed in
        print "forcing datatype to RNA"
    # [gap]
        genedatafile = open(geneDataFileName)
    # [gap]
    # Record the invocation in the output's companion .log file.
    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
    # [gap: geneDict initialization elided]
    # Parse the gene model file into geneDict so spliced hits (mapped against
    # per-gene splice constructs) can later be projected back to genome
    # coordinates.  Only needed when RNA handling is NOT forced onto raw hits.
    if dataType == "RNA" and not forceRNA:
        for line in genedatafile:
            fields = line.strip().split("\t")
            blockCount = int(fields[7])
            # [gap: skip of single-block entries and uname/chrom/sense
            #  assignments elided — presumably from fields[0..2]; TODO confirm]
            # Exon start/stop lists are comma-terminated; drop trailing ','.
            chromstarts = fields[8][:-1].split(",")
            chromstops = fields[9][:-1].split(",")
            # [gap: exonLengths/totalLength initialization elided]
            for index in range(blockCount):
                chromstarts[index] = int(chromstarts[index])
                chromstops[index] = int(chromstops[index])
                exonLengths.append(chromstops[index] - chromstarts[index])
                totalLength += exonLengths[index]
            # [gap]
            geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
    # [gap]
    # Open (or create, when init=True) the RDS dataset.
    rds = readDataset(outdbname, init, dataType, verbose=True)

    #check that our cacheSize is better than the dataset's default cache size
    defaultCacheSize = rds.getDefaultCacheSize()
    if cachePages > defaultCacheSize:
        # [gap: the lines choosing between these two calls are elided —
        #  presumably `if init:` / `else:`; TODO confirm]
            rds.setDBcache(cachePages, default=True)
            rds.setDBcache(cachePages)
    # [gap]
    # When appending with indexing requested, drop any existing index first
    # so the bulk inserts below aren't slowed by index maintenance.
    if not init and doIndex:
        # [gap: try/hasIndex/dropIndex scaffolding elided]
            print "couldn't drop Index"
    # [gap]
    if len(propertyList) > 0:
        rds.insertMetadata(propertyList)

    # make some assumptions based on first read
    infile = open(filename, "r")
    # [gap]
    line = infile.readline()
    # [gap]
    fields = line.split()
    readsize = int(fields[10])       # PSL qSize column: query (read) length
    pairedTest = fields[9][-2:]      # last 2 chars of read ID: "/1" or "/2" marks paired data
    # [gap]
    if pairedTest in ["/1", "/2"]:
        print "assuming reads are paired"
    # [gap]
    print "read size: %d bp" % readsize
    # [gap: metadata writes below are presumably guarded by `if init:` and a
    #  paired flag — guards elided in this view; TODO confirm]
        rds.insertMetadata([("readsize", readsize)])
        # [gap]
        rds.insertMetadata([("paired", "True")])
    # [gap]
    if "blat_mapped" not in rds.getMetadata():
        rds.insertMetadata([("blat_mapped", "True")])
    # [gap]
    # Minimum acceptable BLAT score: read length minus ~4% mismatch allowance.
    # NOTE(review): Python 2 integer division — readsize/25 truncates.
    minReadScore = readsize - readsize/25 - 1
    # [gap]
    if dataType == "RNA":
        # `trim` is bound in an elided line — presumably the splice-overhang
        # window; TODO confirm.
        maxBorder = readsize + trim
    # [gap]
    # Re-open the input and stream through it, grouping consecutive lines by
    # read ID and flushing insert batches periodically.
    infile = open(filename, "r")
    # [gap: prevID/readList/insert-list initializations elided]
    index = uIndex = mIndex = sIndex = lIndex = 0
    # [gap]
    line = infile.readline()
    # [gap: main `while` loop header elided — the block below is its body]
        fields = line.strip().split()
        # [gap: readID = fields[9] and the trimReadID guard header elided]
            # Drop the leading ':'-field of the read ID (e.g. machine name).
            readID = string.join(readID.split(":")[1:], ":")
        # [gap: `if readID != prevID:` — finish the previous read group]
            # Keep only the alignments tied for the best score.
            if bestScore > minReadScore:
                for readData in readList:
                    if readData[1] == bestScore:
                        newReadList.append(readData)
            # [gap]
                prevID = label + "-" + prevID
            # [gap]
            listlen = len(newReadList)
            # [gap: `if listlen == 1:` — uniquely mapped read]
                parts = int(newReadList[0][0])
                if parts == 1 and not spliceOnly:
                    # Single-block unique hit → uniqs table.
                    (part, score, sense, chrom, start, mismatches) = newReadList[0]
                    stop = start + readsize
                    uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, mismatches))
                # [gap]
                elif forceRNA and parts == 2:
                    # Two-block genomic hit treated as a splice: left/right
                    # exonic halves from BLAT's block start/length lists.
                    (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0]
                    startL = int(startList[0])
                    stopL = startL + int(lengthList[0])
                    startR = int(startList[1])
                    stopR = startR + int(lengthList[1])
                    # Require a minimum intron-sized gap between the halves.
                    # NOTE(review): appends `mismatches` although this branch
                    # unpacked `mismatchList` — looks like a latent bug in the
                    # original; left untouched in this comments-only pass.
                    if stopL + minIntron < startR:
                        sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
                # [gap: else branch — hit against a splice construct whose
                #  name encodes model|spliceID|regionStart]
                    (part, score, sense, chrom, start, mismatches) = newReadList[0]
                    currentSplice = chrom
                    (model, spliceID, regionStart) = currentSplice.split(delimiter)
                    if model not in geneDict:
                        # [gap: presumably skips to the next read; TODO confirm]
                    (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
                    spliceID = int(spliceID)
                    # Project the construct-local start back; -2 presumably
                    # compensates for construct padding — TODO confirm.
                    rstart = int(start) - 2
                    lefthalf = maxBorder - rstart
                    if lefthalf < 1 or lefthalf > maxBorder:
                        # [gap: presumably rejects reads not straddling the junction]
                    righthalf = readsize - lefthalf
                    # Genomic coordinates of the two exonic halves.
                    startL = int(regionStart) + rstart
                    stopL = startL + lefthalf
                    startR = chromstarts[spliceID + 1]
                    stopR = chromstarts[spliceID + 1] + righthalf
                    if stopL + minIntron < startR:
                        sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
            # [gap]
            elif listlen > 1 and not spliceOnly:
                # Multiread: tag the ID with its multiplicity and weight each
                # location by 1/listlen.
                prevID = prevID + "::" + str(listlen)
                # [gap]
                # ignore multireads that can also map across splices
                # [gap: the disqualification test's surroundings are elided]
                for readData in newReadList:
                    # [gap]
                for (part, score, sense, chrom, start, mismatches) in newReadList:
                    stop = start + readsize
                    mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches))
            # [gap]
            # Periodically flush the accumulated batches to the database.
            if index % insertSize == 0:
                rds.insertUniqs(uInsertList)
                rds.insertMulti(mInsertList)
                # [gap: list resets elided]
                if dataType == "RNA":
                    rds.insertSplices(sInsertList)
                # [gap]
            # start processing new read
            # [gap: per-read state resets elided]
        # Parse the current alignment line (PSL columns).
            score = int(fields[0])         # matches
            # [gap]
            parts = int(fields[17])        # blockCount
            # [gap: multi-block handling guard elided]
            # blockSizes / tStarts are comma-terminated lists; drop last ','.
            lengthList = fields[18][:-1].split(",")
            startList = fields[20][:-1].split(",")
            listlen = len(lengthList)
            for lpos in range(listlen):
                # Enforce --strict: every block must be long enough.
                if int(lengthList[lpos]) < minSpliceLength:
                    # [gap: presumably marks the read for skipping]
                # throw out deletions, for now
                # NOTE(review): this adjacency test compares a block length to
                # a block start — the full expression may be elided; confirm
                # against the complete file.
                if int(lengthList[lpos - 1]) == int(startList[lpos]):
                    # [gap]
            start = int(fields[15])        # tStart: target (genome) start
            # [gap]
            if score > bestScore:
                # [gap: bestScore update elided]
            # [gap]
            if int(fields[1]) > 0:
                # [gap: mismatch decoding is conditional on mismatch count > 0]
                mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense)
            # [gap]
            # Single-block vs multi-block alignments carry different payloads.
                readList.append((parts, score, sense, chrom, start, mismatches))
            # [gap]
                readList.append((parts, score, sense, chrom, startList, lengthList, mismatches))
        # [gap: line advance elided]
        if lIndex % 1000000 == 0:
            print "processed %d lines" % lIndex
    # [gap: end of main loop]
    print "%d lines processed" % lIndex
    # Flush whatever is left in the batches.
    if len(uInsertList) > 0:
        rds.insertUniqs(uInsertList)
    if len(mInsertList) > 0:
        rds.insertMulti(mInsertList)
    if len(sInsertList) > 0:
        rds.insertSplices(sInsertList)
    # [gap]
    # Summarize counts to stdout and the log file.
    combString = "%d unique reads" % uIndex
    combString += "\t%d multi reads" % mIndex
    if dataType == "RNA":
        combString += "\t%d spliced reads" % sIndex
    # [gap]
    print combString.replace("\t", "\n")
    # [gap]
    writeLog(outdbname + ".log", verstring, combString)
    # [gap: `if doIndex:` guard elided]
        print "building index...."
        # Build with the larger of the requested / default cache sizes.
        if cachePages > defaultCacheSize:
            rds.setDBcache(cachePages)
            rds.buildIndex(cachePages)
        # [gap: else branch header elided]
            rds.buildIndex(defaultCacheSize)
# Build an eland-style mismatch string ("<readNT><1-based pos><genomeNT>",
# comma-joined) by comparing the aligned genome and read strings column by
# column.  NOTE(review): fragment — the initialization of `output`, `rlen`,
# and `partIndex`, and the interior of the match branch, are elided in this
# view; `rsense` is accepted but not used in any visible line.
def decodeMismatches(gString, rString, rsense):
    # [gap: output/rlen/partIndex setup elided]
    for rindex in xrange(rlen):
        # [gap]
        # Matching column → no mismatch token (continue presumably elided).
        if gString[rindex] == rString[rindex]:
        # [gap]
        genNT = gString[rindex]
        readNT = rString[rindex]
        # for eland-compatibility, we are 1-based
        # `partIndex` presumably offsets positions per alignment block — TODO confirm.
        output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT))
    # [gap]
    return string.join(output, ",")
361 if __name__ == "__main__":