5 # Created by Ali Mortazavi on 10/20/08.
14 import sys, string, optparse
15 from commoncode import readDataset, writeLog
# NOTE(review): the integer at the start of each line below is a residual
# original-file line number from an extraction; the gaps between those numbers
# mean many original lines are missing from this excerpt, and indentation has
# been stripped. The code as shown is not runnable as-is — confirm against the
# full source before editing.
#
# CLI setup fragment: builds the optparse parser for importing bowtie mapped
# output into an RDS read dataset, then dispatches to makerdsfrombowtie().
# The use of argv[1:] suggests this sits inside a main(argv) wrapper whose
# def line is not visible here — TODO confirm.
17 verstring = "%prog: version 4.1"
24 usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
26 parser = optparse.OptionParser(usage=usage)
# --RNA takes a gene-model annotation file; its presence switches the import
# to RNA mode (splice handling) inside makerdsfrombowtie().
27 parser.add_option("--RNA", dest="genedatafilename")
# --append stores init=False, i.e. add to an existing dataset instead of
# creating a fresh one.
28 parser.add_option("--append", action="store_false", dest="init")
29 parser.add_option("--index", action="store_true", dest="doIndex")
30 parser.add_option("--spacer", type="int", dest="spacer")
# --rawreadID stores trimReadID=False, i.e. keep read IDs untouched.
31 parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
# --forcepair appends "/<forceID>" to every stored read ID (paired-end tag).
32 parser.add_option("--forcepair", type="int", dest="forceID")
33 parser.add_option("--flip", action="store_true", dest="flip")
34 parser.add_option("--verbose", action="store_true", dest="verbose")
35 parser.add_option("--strip", action="store_true", dest="stripSpace")
# --cache sets the sqlite page-cache size used by the RDS (default 100000).
36 parser.add_option("--cache", type="int", dest="cachePages")
37 parser.set_defaults(genedatafilename=None, init=True, doIndex=False, spacer=2,
38 trimReadID=True, forceID=None, flip=False, verbose=False,
39 stripSpace=False, cachePages=100000)
41 (options, args) = parser.parse_args(argv[1:])
# Positional "propertyName::propertyValue" arguments are split on "::" and
# collected as metadata pairs (the enclosing loop header and the propertyList
# initialization are on lines not visible in this excerpt).
54 (pname, pvalue) = arg.strip().split("::")
55 propertyList.append((pname, pvalue))
# Dispatch to the worker; the call is truncated here — its closing paren and
# trailing argument(s), presumably propertyList, are on lines not shown.
57 makerdsfrombowtie(label, filename, outdbname, options.genedatafilename, options.init,
58 options.doIndex, options.spacer, options.trimReadID, options.forceID,
59 options.flip, options.verbose, options.stripSpace, options.cachePages,
# Load bowtie mapped reads from `filename` into the RDS read dataset
# `outdbname`, storing unique reads, multireads, and (in RNA mode) spliced
# reads, each ID-prefixed with `label`.
#
# NOTE(review): leading integers are residual original-file line numbers; the
# gaps between them show that many original lines are missing from this
# excerpt (rest of the signature, definitions of dataType/delimiter/trim/
# insertSize/forcePair, several loop and branch guards). Comments below hedge
# accordingly; confirm against the full source.
#
# Parameters, as far as visible:
#   label            - prefix attached to every stored read ID
#   filename         - bowtie output file to import
#   outdbname        - target RDS database file ("<outdbname>.log" gets the log)
#   genedatafilename - optional gene-model file; presence implies RNA mode
#   init             - True to create a fresh dataset, False to append
#   doIndex          - build (and, when appending, first drop) the DB index
#   spacer           - splice-junction spacer length used in RNA coordinates
#   trimReadID       - presumably drops the leading ":"-field of read IDs
#   forceID          - if not None, "/<forceID>" is appended to read IDs
#   flip/verbose/stripSpace/cachePages - see the option handling above
63 def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,
64 doIndex=False, spacer=2, trimReadID=True, forceID=None,
65 flip=False, verbose=False, stripSpace=False, cachePages=100000,
# RNA mode is keyed off the presence of a gene-model file.
71 if genedatafilename is not None:
73 genedatafile = open(genedatafilename)
77 if forceID is not None:
# Record this invocation (version + argv) in the dataset's log file.
86 writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
# Parse the gene-model file: tab-separated rows with the exon block count in
# column 7 and comma-terminated exon start/stop lists in columns 8 and 9
# (the trailing "," is dropped via [:-1] before splitting).
91 for line in genedatafile:
92 fields = line.strip().split("\t")
93 blockCount = int(fields[7])
100 chromstarts = fields[8][:-1].split(",")
101 chromstops = fields[9][:-1].split(",")
# Convert exon coordinates to ints, accumulating per-exon lengths and the
# total transcript length (exonLengths/totalLength/uname/sense/chrom are
# initialized on lines not visible here).
104 for index in range(blockCount):
105 chromstarts[index] = int(chromstarts[index])
106 chromstops[index] = int(chromstops[index])
107 exonLengths.append(chromstops[index] - chromstarts[index])
108 totalLength += exonLengths[index]
110 geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
# Open (or create, when init=True) the read dataset.
115 rds = readDataset(outdbname, init, dataType, verbose=True)
117 #check that our cacheSize is better than the dataset's default cache size
118 defaultCacheSize = rds.getDefaultCacheSize()
119 if cachePages > defaultCacheSize:
121 rds.setDBcache(cachePages, default=True)
123 rds.setDBcache(cachePages)
# When appending with indexing requested, drop the existing index first;
# failure to drop is reported but non-fatal.
125 if not init and doIndex:
131 print "couldn't drop Index"
133 if len(propertyList) > 0:
134 rds.insertMetadata(propertyList)
136 # make some assumptions based on first read
137 infile = open(filename, "r")
138 line = infile.readline()
# Space-collapsing pass (presumably guarded by stripSpace — the guard line
# is not visible here).
140 line = line.replace(" ","")
142 fields = line.split()
# Bowtie default output puts the read sequence in field 5, so its length is
# the read size; a "/1" or "/2" suffix on field 0 marks paired-end data.
143 readsize = len(fields[5])
144 pairedTest = fields[0][-2:]
146 if pairedTest in ["/1", "/2"] or forcePair:
147 print "assuming reads are paired"
151 print "read size: %d bp" % readsize
# Stamp dataset-level metadata: read size, pairedness, mapper, RNA spacer.
153 rds.insertMetadata([("readsize", readsize)])
155 rds.insertMetadata([("paired", "True")])
157 if "bowtie_mapped" not in rds.getMetadata():
158 rds.insertMetadata([("bowtie_mapped", "True")])
160 if dataType == "RNA" and "spacer" not in rds.getMetadata():
161 rds.insertMetadata([("spacer", spacer)])
# maxBorder bounds how far into a splice-junction sequence a read may start
# and still span the junction (trim is defined on a line not visible).
166 if dataType == "RNA":
167 maxBorder = readsize + trim
# Re-open the input to process all reads (the first line was consumed above).
169 infile = open(filename, "r")
# Counters: running line index, unique/multi/spliced insert counts, lines.
175 index = uIndex = mIndex = sIndex = lIndex = 0
179 line = line.replace(" ","")
181 fields = line.strip().split()
# Read-ID trimming: drop the first ":"-field — presumably guarded by
# trimReadID; the guard line is not visible. TODO confirm.
184 readID = string.join(readID.split(":")[1:], ":")
# A new readID was seen: flush the alignments accumulated for the previous
# read. One alignment => unique (or spliced, in RNA mode); several => multi.
187 listlen = len(readList)
189 prevID = "%s-%s" % (label, prevID)
192 prevID += "/%d" % forceID
195 (sense, chrom, start, mismatches) = readList[0]
# Unique genomic read: stored with weight 1.0.
203 stop = start + readsize
204 uInsertList.append((prevID, chrom, start, stop, sense, 1.0, "", mismatches))
# RNA mode: "chrom" is actually a splice identifier of the form
# "<model><delimiter><spliceID><delimiter><regionStart>"; map it back to
# genomic coordinates via the gene model (delimiter is defined on a line
# not visible here).
206 elif dataType == "RNA":
207 currentSplice = chrom
208 (model, spliceID, regionStart) = currentSplice.split(delimiter)
209 if model not in geneDict:
212 (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
213 spliceID = int(spliceID)
# Offset within the junction sequence after removing the spacer.
214 rstart = int(start) - spacer
215 lefthalf = maxBorder - rstart
# Reads that do not actually straddle the junction are skipped (the skip
# statement is on a line not visible).
216 if lefthalf < 1 or lefthalf > maxBorder:
219 righthalf = readsize - lefthalf
# Left piece ends at the exon boundary; right piece starts at the next
# exon's genomic start.
220 startL = int(regionStart) + rstart
221 stopL = startL + lefthalf
222 startR = chromstarts[spliceID + 1]
223 stopR = chromstarts[spliceID + 1] + righthalf
224 sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches))
# Multiread: tag the ID with its multiplicity and down-weight each hit to
# 1.0/listlen so the read contributes unit weight in total.
227 prevID = "%s::%s" % (prevID, str(listlen))
229 # ignore multireads that can also map across splices
231 for (sense, chrom, start, mismatches) in readList:
236 for (sense, chrom, start, mismatches) in readList:
237 stop = start + readsize
244 mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, "", mismatches))
# Batch-flush the accumulated inserts every insertSize reads (insertSize is
# defined on a line not visible here).
248 if index % insertSize == 0:
249 rds.insertUniqs(uInsertList)
250 rds.insertMulti(mInsertList)
253 if dataType == "RNA":
254 rds.insertSplices(sInsertList)
260 # start processing new read
268 # for eland compat, we are 1-based
269 start = int(fields[3]) + 1
# The last field carries the bowtie mismatch descriptor when it has a ":".
271 if ":" in fields[-1]:
272 mismatches = decodeMismatches(fields[-1], sense)
274 readList.append((sense, chrom, start, mismatches))
# Progress report every million input lines.
275 if lIndex % 1000000 == 0:
276 print "processed %d lines" % lIndex
278 print "%d lines processed" % lIndex
# Final flush of whatever remains in the three insert buffers.
280 if len(uInsertList) > 0:
281 rds.insertUniqs(uInsertList)
283 if len(mInsertList) > 0:
284 rds.insertMulti(mInsertList)
286 if len(sInsertList) > 0:
287 rds.insertSplices(sInsertList)
# Summarize and log the import statistics.
289 combString = "%d unique reads" % uIndex
290 combString += "\t%d multi reads" % mIndex
291 if dataType == "RNA":
292 combString += "\t%d spliced reads" % sIndex
295 print combString.replace("\t", "\n")
297 writeLog("%s.log" % outdbname, verstring, combString)
# Build the DB index, using the larger of requested and default cache sizes.
300 print "building index...."
301 if cachePages > defaultCacheSize:
302 rds.setDBcache(cachePages)
303 rds.buildIndex(cachePages)
305 rds.buildIndex(defaultCacheSize)
# Convert a bowtie mismatch descriptor string "pos:refNT>readNT,..." into the
# eland-style form "<readNT><pos+1><refNT>" with entries joined by commas.
# Both nucleotides are passed through the complement table — presumably only
# when rsense is the reverse strand; the guard line is not visible in this
# excerpt, nor are the rest of the complement table and the initialization
# of `output`.
# NOTE(review): leading integers are residual original-file line numbers;
# gaps between them indicate missing lines — confirm against the full source.
308 def decodeMismatches(mString, rsense):
309 complement = {"A": "T",
317 mismatches = mString.split(",")
318 for mismatch in mismatches:
# Each entry is "position:genomicNT>readNT".
319 (pos,change) = mismatch.split(":")
320 (genNT, readNT) = change.split(">")
322 readNT = complement[readNT]
323 genNT = complement[genNT]
# Bowtie positions are 0-based; eland-compatible output is 1-based.
325 elandCompatiblePos = int(pos) + 1
326 output.append("%s%d%s" % (readNT, elandCompatiblePos, genNT))
328 return string.join(output, ",")
331 if __name__ == "__main__":