5 # Created by Ali Mortazavi on 10/20/08.
17 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
# Version banner, written to the per-dataset log file alongside run summaries.
20 verstring = "makerdsfrombowtie: version 4.2"
# CLI usage: positional args are label, bowtie infile, output RDS db, plus
# optional propertyName::propertyValue metadata pairs.
27 usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
29 parser = getParser(usage)
30 (options, args) = parser.parse_args(argv[1:])
# Trailing "name::value" args become dataset metadata (inserted via
# rds.insertMetadata later in makerdsfrombowtie).
43 (pname, pvalue) = arg.strip().split("::")
44 propertyList.append((pname, pvalue))
# Hand everything off to the worker function; options mirror its keyword args.
46 makerdsfrombowtie(label, filename, outdbname, options.genedatafilename, options.init,
47 options.doIndex, options.spacer, options.trimReadID, options.forceID,
48 options.flip, options.verbose, options.stripSpace, options.cachePages,
# Build the option parser. Giving --RNA a gene-model file path switches the
# run into RNA mode (splice-aware inserts); --append disables init so reads
# are added to an existing RDS file.
53 parser = optparse.OptionParser(usage=usage)
54 parser.add_option("--RNA", dest="genedatafilename")
55 parser.add_option("--append", action="store_false", dest="init")
56 parser.add_option("--index", action="store_true", dest="doIndex")
57 parser.add_option("--spacer", type="int", dest="spacer")
58 parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
59 parser.add_option("--forcepair", type="int", dest="forceID")
60 parser.add_option("--flip", action="store_true", dest="flip")
61 parser.add_option("--verbose", action="store_true", dest="verbose")
62 parser.add_option("--strip", action="store_true", dest="stripSpace")
63 parser.add_option("--cache", type="int", dest="cachePages")
# Defaults come from the shared ERANGE config file when present.
65 configParser = getConfigParser()
# NOTE(review): section name "makerdsfrom bowtie" contains a space — looks
# like a typo for "makerdsfrombowtie"; confirm against the config files
# before changing, since existing configs may use the spaced form.
66 section = "makerdsfrom bowtie"
67 genedatafilename = getConfigOption(configParser, section, "genedatafilename", None)
68 init = getConfigBoolOption(configParser, section, "init", True)
69 doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
70 spacer = getConfigIntOption(configParser, section, "spacer", 2)
71 trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
72 forceID = getConfigOption(configParser, section, "forceID", None)
73 flip = getConfigBoolOption(configParser, section, "flip", False)
74 verbose = getConfigBoolOption(configParser, section, "verbose", False)
75 stripSpace = getConfigBoolOption(configParser, section, "stripSpace", False)
76 cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
# Config-derived values become the optparse defaults, so CLI flags win.
78 parser.set_defaults(genedatafilename=genedatafilename, init=init, doIndex=doIndex, spacer=spacer,
79 trimReadID=trimReadID, forceID=forceID, flip=flip, verbose=verbose,
80 stripSpace=stripSpace, cachePages=cachePages)
# Convert a bowtie alignment file into an RDS (ReadDataset) database named
# outdbname. Reads are grouped by read ID and stored as unique reads,
# multireads (weight split 1/listlen), or — in RNA mode — spliced reads
# resolved against the gene models in genedatafilename.
85 def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,
86 doIndex=False, spacer=2, trimReadID=True, forceID=None,
87 flip=False, verbose=False, stripSpace=False, cachePages=100000,
# Presence of a gene-model file is what selects RNA mode downstream.
93 if genedatafilename is not None:
95 genedatafile = open(genedatafilename)
99 if forceID is not None:
# Record version and full command line in the dataset's log.
108 writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
# RNA mode: load gene models into geneDict. Each tab-delimited line carries
# the exon block count in field 7 and comma-terminated start/stop lists in
# fields 8 and 9 (the [:-1] drops the trailing comma before splitting).
112 if dataType == "RNA":
113 for line in genedatafile:
114 fields = line.strip().split("\t")
115 blockCount = int(fields[7])
122 chromstarts = fields[8][:-1].split(",")
123 chromstops = fields[9][:-1].split(",")
126 for index in range(blockCount):
127 chromstarts[index] = int(chromstarts[index])
128 chromstops[index] = int(chromstops[index])
129 exonLengths.append(chromstops[index] - chromstarts[index])
130 totalLength += exonLengths[index]
# uname is built in lines not visible here — presumably a unique gene/model
# key; verify against the full source.
132 geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
# Open (or create, when init is True) the output dataset.
137 rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
139 #check that our cacheSize is better than the dataset's default cache size
140 defaultCacheSize = rds.getDefaultCacheSize()
141 if cachePages > defaultCacheSize:
143 rds.setDBcache(cachePages, default=True)
145 rds.setDBcache(cachePages)
# Appending with indexing requested: the index is dropped first (rebuild
# happens at the end); failure to drop is reported but not fatal.
147 if not init and doIndex:
153 print "couldn't drop Index"
# Store any name::value metadata pairs supplied on the command line.
155 if len(propertyList) > 0:
156 rds.insertMetadata(propertyList)
158 # make some assumptions based on first read
159 infile = open(filename, "r")
160 line = infile.readline()
# Space-stripping of the raw line — presumably guarded by stripSpace in a
# line not visible here; confirm.
162 line = line.replace(" ","")
164 fields = line.split()
# Read length is taken from field 5 of the first bowtie line (the sequence
# column in bowtie's default output) and assumed constant for the file.
165 readsize = len(fields[5])
# Paired-end data is detected from a "/1" or "/2" suffix on the read name.
166 pairedTest = fields[0][-2:]
168 if pairedTest in ["/1", "/2"] or forcePair:
169 print "assuming reads are paired"
173 print "read size: %d bp" % readsize
175 rds.insertMetadata([("readsize", readsize)])
177 rds.insertMetadata([("paired", "True")])
# Flag metadata, inserted only once per dataset.
179 if "bowtie_mapped" not in rds.getMetadata():
180 rds.insertMetadata([("bowtie_mapped", "True")])
182 if dataType == "RNA" and "spacer" not in rds.getMetadata():
183 rds.insertMetadata([("spacer", spacer)])
# maxBorder bounds the left-exon portion of a splice-crossing read; trim is
# defined in lines not visible here — presumably derived from spacer.
188 if dataType == "RNA":
189 maxBorder = readsize + trim
# Re-open the input and scan every alignment line, accumulating per-readID
# alignment lists and batching inserts.
191 infile = open(filename, "r")
197 index = uIndex = mIndex = sIndex = lIndex = 0
201 line = line.replace(" ","")
203 fields = line.strip().split()
# trimReadID behavior: drop the first ":"-delimited token of the read ID
# (machine name), keeping the rest.
206 readID = string.join(readID.split(":")[1:], ":")
# A completed readList for the previous ID is flushed here; listlen decides
# unique (1) vs multiread (>1).
209 listlen = len(readList)
211 prevID = "%s-%s" % (label, prevID)
214 prevID += "/%d" % forceID
# Unique read: single alignment for this ID.
217 (sense, chrom, start, mismatches) = readList[0]
225 stop = start + readsize
226 uInsertList.append((prevID, chrom, start, stop, sense, 1.0, "", mismatches))
# Unique hit on a splice pseudo-chromosome: name encodes model, splice index
# and region start, separated by `delimiter` (defined in lines not shown).
228 elif dataType == "RNA":
229 currentSplice = chrom
230 (model, spliceID, regionStart) = currentSplice.split(delimiter)
231 if model not in geneDict:
234 (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
235 spliceID = int(spliceID)
# Map the pseudo-chromosome offset back onto genomic coordinates: the read
# spans the junction with `lefthalf` bases in the upstream exon and
# `righthalf` in the downstream exon (chromstarts[spliceID + 1]).
236 rstart = int(start) - spacer
237 lefthalf = maxBorder - rstart
# Reads that don't actually cross the junction are rejected here.
238 if lefthalf < 1 or lefthalf > maxBorder:
241 righthalf = readsize - lefthalf
242 startL = int(regionStart) + rstart
243 stopL = startL + lefthalf
244 startR = chromstarts[spliceID + 1]
245 stopR = chromstarts[spliceID + 1] + righthalf
246 sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches))
# Multiread: ID is suffixed with the hit count, weight split evenly.
249 prevID = "%s::%s" % (prevID, str(listlen))
251 # ignore multireads that can also map across splices
253 for (sense, chrom, start, mismatches) in readList:
258 for (sense, chrom, start, mismatches) in readList:
259 stop = start + readsize
266 mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, "", mismatches))
# Batched DB inserts every insertSize reads to bound memory use.
270 if index % insertSize == 0:
271 rds.insertUniqs(uInsertList)
272 rds.insertMulti(mInsertList)
275 if dataType == "RNA":
276 rds.insertSplices(sInsertList)
282 # start processing new read
290 # for eland compat, we are 1-based
291 start = int(fields[3]) + 1
# Bowtie's optional last field holds "pos:ref>read" mismatch descriptors.
293 if ":" in fields[-1]:
294 mismatches = decodeMismatches(fields[-1], sense)
296 readList.append((sense, chrom, start, mismatches))
# Progress reporting every 1M input lines.
297 if lIndex % 1000000 == 0:
298 print "processed %d lines" % lIndex
300 print "%d lines processed" % lIndex
# Flush any remaining batched inserts.
302 if len(uInsertList) > 0:
303 rds.insertUniqs(uInsertList)
305 if len(mInsertList) > 0:
306 rds.insertMulti(mInsertList)
308 if len(sInsertList) > 0:
309 rds.insertSplices(sInsertList)
# Summary counts go to stdout (newline-separated) and the log (tab-separated).
311 combString = "%d unique reads" % uIndex
312 combString += "\t%d multi reads" % mIndex
313 if dataType == "RNA":
314 combString += "\t%d spliced reads" % sIndex
317 print combString.replace("\t", "\n")
319 writeLog("%s.log" % outdbname, verstring, combString)
# Optional index build, using the larger of the requested/default cache.
322 print "building index...."
323 if cachePages > defaultCacheSize:
324 rds.setDBcache(cachePages)
325 rds.buildIndex(cachePages)
327 rds.buildIndex(defaultCacheSize)
# Convert a bowtie mismatch descriptor string ("pos:refNT>readNT,...") into
# an eland-style string ("readNT<pos+1>refNT,..."), complementing bases for
# reverse-strand alignments (rsense).
330 def decodeMismatches(mString, rsense):
# Base-complement lookup; remaining entries (T/G/C/N) are on lines not
# visible in this excerpt.
331 complement = {"A": "T",
339 mismatches = mString.split(",")
340 for mismatch in mismatches:
341 (pos,change) = mismatch.split(":")
342 (genNT, readNT) = change.split(">")
# Both bases are complemented here — presumably only when rsense indicates
# the minus strand (the guarding condition is on a line not shown); confirm.
344 readNT = complement[readNT]
345 genNT = complement[genNT]
# Bowtie positions are 0-based; eland-compatible output is 1-based.
347 elandCompatiblePos = int(pos) + 1
348 output.append("%s%d%s" % (readNT, elandCompatiblePos, genNT))
350 return string.join(output, ",")
353 if __name__ == "__main__":