5 # Created by Ali Mortazavi on 10/20/08.
17 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
20 verstring = "makerdsfrombowtie: version 4.2"
# --- CLI entry fragment -------------------------------------------------
# Parses command-line arguments and dispatches to makerdsfrombowtie().
# NOTE(review): this excerpt omits the enclosing def header, the required
# positional-argument handling (label / infilename / outrdsfile), and the
# loop that iterates extra args — only representative lines are visible.
27 usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
29 parser = getParser(usage)
30 (options, args) = parser.parse_args(argv[1:])
# Optional trailing args of the form "name::value" become RDS metadata
# (pname, pvalue) pairs; presumably iterated per extra positional arg.
43 (pname, pvalue) = arg.strip().split("::")
44 propertyList.append((pname, pvalue))
# Delegate to the worker with every parsed option; the call's remaining
# arguments (after cachePages) are outside this excerpt.
46 makerdsfrombowtie(label, filename, outdbname, options.genedatafilename, options.init,
47 options.doIndex, options.spacer, options.trimReadID, options.forceID,
48 options.flip, options.verbose, options.stripSpace, options.cachePages,
# --- Option-parser factory fragment ------------------------------------
# Builds the optparse parser and seeds its defaults from the shared ERANGE
# config file (via commoncode helpers), so config values act as fallbacks
# that explicit command-line flags override.
53 parser = optparse.OptionParser(usage=usage)
# --RNA points at a knownGene-style gene model file and switches the
# dataset to RNA (spliced) mode; --append reuses an existing RDS file.
54 parser.add_option("--RNA", dest="genedatafilename")
55 parser.add_option("--append", action="store_false", dest="init")
56 parser.add_option("--index", action="store_true", dest="doIndex")
57 parser.add_option("--spacer", type="int", dest="spacer")
# --rawreadID keeps the full read ID instead of trimming the machine prefix.
58 parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
# --forcepair appends "/<n>" to read IDs to force paired-end naming.
59 parser.add_option("--forcepair", type="int", dest="forceID")
60 parser.add_option("--flip", action="store_true", dest="flip")
61 parser.add_option("--verbose", action="store_true", dest="verbose")
# --strip removes spaces from input lines before parsing.
62 parser.add_option("--strip", action="store_true", dest="stripSpace")
# --cache sets the sqlite page-cache size used by ReadDataset.
63 parser.add_option("--cache", type="int", dest="cachePages")
65 configParser = getConfigParser()
# NOTE(review): section name contains a space ("makerdsfrom bowtie") —
# looks like a typo for "makerdsfrombowtie"; verify against the config
# files this tool ships with before changing it.
66 section = "makerdsfrom bowtie"
67 genedatafilename = getConfigOption(configParser, section, "genedatafilename", None)
68 init = getConfigBoolOption(configParser, section, "init", True)
69 doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
70 spacer = getConfigIntOption(configParser, section, "spacer", 2)
71 trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
72 forceID = getConfigOption(configParser, section, "forceID", None)
73 flip = getConfigBoolOption(configParser, section, "flip", False)
74 verbose = getConfigBoolOption(configParser, section, "verbose", False)
75 stripSpace = getConfigBoolOption(configParser, section, "stripSpace", False)
76 cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
# Config-derived values become the parser defaults; CLI flags still win.
78 parser.set_defaults(genedatafilename=genedatafilename, init=init, doIndex=doIndex, spacer=spacer,
79 trimReadID=trimReadID, forceID=forceID, flip=flip, verbose=verbose,
80 stripSpace=stripSpace, cachePages=cachePages)
# --- Worker: convert legacy bowtie output into an RDS ReadDataset -------
# Reads a bowtie (v1, legacy tab-delimited) alignment file, groups hits by
# read ID, and inserts unique / multi / spliced reads into an sqlite-backed
# ReadDataset. NOTE(review): this excerpt is missing many interior lines
# (the main "for line in infile" header, try/except around dropIndex, the
# else-branches, counter increments, trim handling) — comments below only
# describe what the visible lines demonstrate.
85 def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,
86 doIndex=False, spacer=2, trimReadID=True, forceID=None,
87 flip=False, verbose=False, stripSpace=False, cachePages=100000,
# Record the invocation (version string + CLI args) in the per-RDS log.
90 writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
# Load gene models when in RNA mode: the file looks knownGene-like, with
# blockCount at fields[7] and comma-terminated exon start/stop lists at
# fields[8]/fields[9] ([:-1] drops the trailing empty element after the
# final comma). uname/sense/chrom come from fields not visible here.
94 if genedatafilename is not None:
96 genedatafile = open(genedatafilename)
97 for line in genedatafile:
98 fields = line.strip().split("\t")
99 blockCount = int(fields[7])
106 chromstarts = fields[8][:-1].split(",")
107 chromstops = fields[9][:-1].split(",")
110 for index in range(blockCount):
111 chromstarts[index] = int(chromstarts[index])
112 chromstops[index] = int(chromstops[index])
113 exonLengths.append(chromstops[index] - chromstarts[index])
114 totalLength += exonLengths[index]
116 geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
# Open (or create, when init=True) the output dataset.
120 rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
122 #check that our cacheSize is better than the dataset's default cache size
123 defaultCacheSize = rds.getDefaultCacheSize()
124 if cachePages > defaultCacheSize:
126 rds.setDBcache(cachePages, default=True)
128 rds.setDBcache(cachePages)
# When appending with indexing requested, drop the old index first so bulk
# inserts are fast; failure is reported but tolerated (best-effort).
130 if not init and doIndex:
136 print "couldn't drop Index"
138 if len(propertyList) > 0:
139 rds.insertMetadata(propertyList)
141 # make some assumptions based on first read
142 infile = open(filename, "r")
143 line = infile.readline()
# Optionally collapse spaces (stripSpace path — condition not visible).
145 line = line.replace(" ","")
147 fields = line.split()
# Read length is taken from fields[5] of the first line; whether that is
# the sequence or the quality column depends on upstream munging of the
# bowtie record — TODO confirm against the actual input format.
148 readsize = len(fields[5])
# Reads named "...}/1" or "/2" are assumed paired-end.
149 pairedTest = fields[0][-2:]
151 if forceID is not None:
157 if pairedTest in ["/1", "/2"] or forcePair:
158 print "assuming reads are paired"
161 print "read size: %d bp" % readsize
163 rds.insertMetadata([("readsize", readsize)])
165 rds.insertMetadata([("paired", "True")])
# Tag the dataset so downstream tools know its provenance and spacer.
167 if "bowtie_mapped" not in rds.getMetadata():
168 rds.insertMetadata([("bowtie_mapped", "True")])
170 if dataType == "RNA" and "spacer" not in rds.getMetadata():
171 rds.insertMetadata([("spacer", spacer)])
# For RNA, a hit can straddle a splice junction; maxBorder bounds the
# left-half length ("trim" is defined on a line outside this excerpt).
176 if dataType == "RNA":
178 maxBorder = readsize + trim
# Re-open the input and scan it in full (loop header not visible).
180 infile = open(filename, "r")
186 index = uIndex = mIndex = sIndex = lIndex = 0
192 line = line.replace(" ","")
194 fields = line.strip().split()
# trimReadID path: drop the machine-name prefix before the first ":".
197 readID = string.join(readID.split(":")[1:], ":")
# On a change of read ID, flush the accumulated hits for the previous one.
200 listlen = len(readList)
202 prevID = "%s-%s" % (label, prevID)
205 prevID += "/%d" % forceID
# Exactly one hit -> unique read (the listlen == 1 test is not visible).
208 (sense, chrom, start, mismatches) = readList[0]
216 stop = start + readsize
217 uInsertList.append((prevID, chrom, start, stop, sense, 1.0, "", mismatches))
# Unique hit on a synthetic splice contig: the "chrom" field encodes
# model|spliceID|regionStart (delimiter defined outside this excerpt).
219 elif dataType == "RNA":
220 currentSplice = chrom
221 (model, spliceID, regionStart) = currentSplice.split(delimiter)
222 if model not in geneDict:
225 (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
226 spliceID = int(spliceID)
227 rstart = int(start) - spacer
# Split the read across the junction: lefthalf in the upstream exon,
# righthalf starting at the next exon's chromstart.
228 lefthalf = maxBorder - rstart
229 if lefthalf < 1 or lefthalf > maxBorder:
232 righthalf = readsize - lefthalf
233 startL = int(regionStart) + rstart
234 stopL = startL + lefthalf
235 startR = chromstarts[spliceID + 1]
236 stopR = chromstarts[spliceID + 1] + righthalf
237 sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches))
# Multiread: suffix the ID with the hit count and weight each 1/listlen.
240 prevID = "%s::%s" % (prevID, str(listlen))
242 # ignore multireads that can also map across splices
244 for (sense, chrom, start, mismatches) in readList:
249 for (sense, chrom, start, mismatches) in readList:
250 stop = start + readsize
257 mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, "", mismatches))
# Periodic bulk flush every insertSize processed reads.
261 if index % insertSize == 0:
262 rds.insertUniqs(uInsertList)
263 rds.insertMulti(mInsertList)
266 if dataType == "RNA":
267 rds.insertSplices(sInsertList)
273 # start processing new read
281 # for eland compat, we are 1-based
282 start = int(fields[3]) + 1
# Bowtie's optional mismatch-descriptor column ("pos:ref>read,...") is
# converted to eland-style notation when present.
284 if ":" in fields[-1]:
285 mismatches = decodeMismatches(fields[-1], sense)
287 readList.append((sense, chrom, start, mismatches))
288 if lIndex % 1000000 == 0:
289 print "processed %d lines" % lIndex
291 print "%d lines processed" % lIndex
# Final flush of whatever remains in the insert buffers.
293 if len(uInsertList) > 0:
294 rds.insertUniqs(uInsertList)
296 if len(mInsertList) > 0:
297 rds.insertMulti(mInsertList)
299 if len(sInsertList) > 0:
300 rds.insertSplices(sInsertList)
# Summary counts go to stdout and the log.
302 combString = "%d unique reads" % uIndex
303 combString += "\t%d multi reads" % mIndex
304 if dataType == "RNA":
305 combString += "\t%d spliced reads" % sIndex
308 print combString.replace("\t", "\n")
310 writeLog("%s.log" % outdbname, verstring, combString)
# Rebuild the index with the larger of the two cache sizes.
313 print "building index...."
314 if cachePages > defaultCacheSize:
315 rds.setDBcache(cachePages)
316 rds.buildIndex(cachePages)
318 rds.buildIndex(defaultCacheSize)
# --- Convert bowtie mismatch descriptors to eland-style notation --------
# Input mString is bowtie's comma-separated "offset:refNT>readNT" list;
# output is comma-joined "readNT<pos+1>refNT" tokens (1-based, eland
# convention). NOTE(review): the rest of the complement table, the output
# list initialization, and the sense test guarding the complement step are
# on lines outside this excerpt; presumably bases are complemented when
# rsense indicates a reverse-strand hit — confirm against the full file.
321 def decodeMismatches(mString, rsense):
322 complement = {"A": "T",
330 mismatches = mString.split(",")
331 for mismatch in mismatches:
332 (pos,change) = mismatch.split(":")
333 (genNT, readNT) = change.split(">")
# Complement both bases (reverse-strand path; condition not visible).
335 readNT = complement[readNT]
336 genNT = complement[genNT]
# Bowtie offsets are 0-based; eland positions are 1-based.
338 elandCompatiblePos = int(pos) + 1
339 output.append("%s%d%s" % (readNT, elandCompatiblePos, genNT))
341 return string.join(output, ",")
344 if __name__ == "__main__":