erange version 4.0a dev release
[erange.git] / makerdsfromblat.py
1 #
2 #  makerdsfromblat.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 12/7/08.
6 #
7
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys
15 import string
16 import optparse
17 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
18 import ReadDataset
19
20 verstring = "makerdsfromblat: version 3.10"
21 print verstring
22
23 def main(argv=None):
24     if not argv:
25         argv = sys.argv
26
27     usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
28
29     parser = getParser(usage)
30     (options, args) = parser.parse_args(argv[1:])
31
32     if len(args) < 3:
33         print usage
34         sys.exit(1)
35
36     label = args[0]
37     filename = args[1]
38     outdbname = args[2]
39
40     if options.geneDataFileName:
41         dataType = "RNA"
42     else:
43         dataType = "DNA"
44
45     theFlag = ""
46     if options.flagReads:
47         theFlag = "blat"
48
49     propertyList = []
50     for arg in args:
51         if "::" in arg:
52             (pname, pvalue) = arg.strip().split("::")
53             propertyList.append((pname, pvalue))
54
55     makerdsfromblat(label, filename, outdbname, dataType, options.init,
56                    options.doIndex, options.trimReadID, options. minSpliceLength,
57                    options.forceRNA, theFlag, options.spliceOnly, options.verbose,
58                    options.cachePages, options.geneDataFileName, propertyList)
59
60
61 def getParser(usage):
62     parser = optparse.OptionParser(usage=usage)
63     parser.add_option("--append", action="store_false", dest="init")
64     parser.add_option("--index", action="store_true", dest="doIndex")
65     parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
66     parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
67     parser.add_option("--flag", action="store_true", dest="flagReads")
68     parser.add_option("--strict", type="int", dest="minSpliceLength",
69                       help="min required bp on each side of a splice")
70     parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
71     parser.add_option("--verbose", action="store_true", dest="verbose")
72     parser.add_option("--cache", type="int", dest="cachePages")
73     parser.add_option("--RNA", dest="geneDataFileName")
74
75     configParser = getConfigParser()
76     section = "makerdsfromblat"
77     init = getConfigBoolOption(configParser, section, "init", True)
78     doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
79     trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
80     minSpliceLength = getConfigIntOption(configParser, section, "minSpliceLength", 0)
81     forceRNA = getConfigBoolOption(configParser, section, "forceRNA", False)
82     flagReads = getConfigBoolOption(configParser, section, "flagReads", False)
83     spliceOnly = getConfigBoolOption(configParser, section, "spliceOnly", False)
84     verbose = getConfigBoolOption(configParser, section, "verbose", False)
85     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
86     geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", "")
87
88     parser.set_defaults(init=init, doIndex=doIndex, trimReadID=trimReadID, minSpliceLength=minSpliceLength, forceRNA=forceRNA,
89                         flagReads=flagReads, spliceOnly=spliceOnly, verbose=verbose, cachePages=cachePages,
90                         geneDataFileName=geneDataFileName)
91
92     return parser
93
94
95 def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
96                     doIndex=False,trimReadID=True, minSpliceLength=0,
97                     forceRNA=False, theFlag="", spliceOnly=False,
98                     verbose=False, cachePages=100000, geneDataFileName="",
99                     propertyList=[]):
100
101     delimiter = "|"
102     minIntron = 10
103     maxBorder = 0
104     index = 0
105     insertSize = 100000
106
107     if forceRNA:
108         print "forcing datatype to RNA"
109         dataType = "RNA"
110
111     if dataType == "RNA":
112         genedatafile = open(geneDataFileName)
113
114     writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
115
116     geneDict = {}
117     mapDict = {}
118     if dataType == "RNA" and not forceRNA:
119         for line in genedatafile:
120             fields = line.strip().split("\t")
121             blockCount = int(fields[7])
122             if blockCount < 2:
123                 continue
124
125             uname = fields[0]
126             chrom = fields[1]
127             sense = fields[2]
128             chromstarts = fields[8][:-1].split(",")
129             chromstops = fields[9][:-1].split(",")
130             exonLengths = []
131             totalLength = 0
132             for index in range(blockCount):
133                 chromstarts[index] = int(chromstarts[index])
134                 chromstops[index] = int(chromstops[index])
135                 exonLengths.append(chromstops[index] - chromstarts[index])
136                 totalLength += exonLengths[index]
137
138             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
139             mapDict[uname] = []
140
141         genedatafile.close()
142
143     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
144
145     #check that our cacheSize is better than the dataset's default cache size
146     defaultCacheSize = rds.getDefaultCacheSize()
147     if cachePages > defaultCacheSize:
148         if init:
149             rds.setDBcache(cachePages, default=True)
150         else:
151             rds.setDBcache(cachePages)
152
153     if not init and doIndex:
154         try:
155             if rds.hasIndex():
156                 rds.dropIndex()
157         except:
158             if verbose:
159                 print "couldn't drop Index"
160
161
162     if len(propertyList) > 0:
163         rds.insertMetadata(propertyList)
164
165     # make some assumptions based on first read
166     infile = open(filename, "r")
167     for arg in range(6):
168         line = infile.readline()
169
170     fields = line.split()
171     readsize = int(fields[10])
172     pairedTest = fields[9][-2:]
173     paired = False
174     if pairedTest in ["/1", "/2"]:
175         print "assuming reads are paired"
176         paired = True
177
178     print "read size: %d bp" % readsize
179     if init:
180         rds.insertMetadata([("readsize", readsize)])
181         if paired:
182             rds.insertMetadata([("paired", "True")])
183
184     infile.close()
185     if "blat_mapped" not in rds.getMetadata():
186         rds.insertMetadata([("blat_mapped", "True")])
187
188     minReadScore = readsize - readsize/25 - 1
189     trim = -4
190     if dataType == "RNA":
191         maxBorder = readsize + trim
192
193     infile = open(filename, "r")
194     prevID = ""
195     readList = []
196     uInsertList = []
197     mInsertList = []
198     sInsertList = []
199     index = uIndex = mIndex = sIndex = lIndex = 0
200     bestScore = 0
201     # skip headers
202     for arg in range(5):
203         line = infile.readline()
204
205     for line in infile:
206         lIndex += 1
207         fields = line.strip().split()
208         readID = fields[9]
209         if trimReadID:
210             readID = string.join(readID.split(":")[1:], ":")
211
212         if readID != prevID:
213             newReadList = []
214             if bestScore > minReadScore:
215                 for readData in readList:
216                     if readData[1] == bestScore:
217                         newReadList.append(readData)
218
219             if trimReadID:
220                 prevID = label + "-" + prevID
221
222             listlen = len(newReadList)
223             if listlen == 1:
224                 parts = int(newReadList[0][0])
225                 if parts == 1 and not spliceOnly:
226                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
227                     stop = start + readsize
228                     uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, mismatches))
229                     uIndex += 1
230                 elif forceRNA and parts == 2:
231                     (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0]
232                     startL = int(startList[0]) 
233                     stopL = startL + int(lengthList[0])
234                     startR = int(startList[1])
235                     stopR = startR + int(lengthList[1])
236                     if stopL + minIntron < startR:
237                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
238                         sIndex += 1
239                 elif parts == 2:
240                     print newReadList
241                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
242                     currentSplice = chrom
243                     (model, spliceID, regionStart) = currentSplice.split(delimiter)
244                     if model not in geneDict:
245                         print fields
246                         continue
247
248                     (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
249                     spliceID = int(spliceID)
250                     rstart = int(start) - 2
251                     lefthalf = maxBorder - rstart
252                     if lefthalf < 1 or lefthalf > maxBorder:
253                         continue
254
255                     righthalf = readsize - lefthalf
256                     startL = int(regionStart)  + rstart
257                     stopL = startL + lefthalf
258                     startR = chromstarts[spliceID + 1]
259                     stopR = chromstarts[spliceID + 1] + righthalf
260                     if stopL + minIntron < startR:
261                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
262                         sIndex += 1
263             elif listlen > 1 and not spliceOnly:
264                 prevID = prevID + "::" + str(listlen)
265                 mIndex += 1
266                 # ignore multireads that can also map across splices
267                 skip = False
268                 for readData in newReadList:
269                     if readData[0] > 1:
270                         skip = True
271
272                 if not skip:
273                     for (part, score, sense, chrom, start, mismatches) in newReadList:
274                         stop = start + readsize
275                         mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches))
276             else:
277                 prevID = readID
278
279             if index % insertSize == 0:
280                 rds.insertUniqs(uInsertList)
281                 rds.insertMulti(mInsertList)
282                 uInsertList = []
283                 mInsertList = []
284                 if dataType == "RNA":
285                     rds.insertSplices(sInsertList)
286                     sInsertList = []
287
288                 print ".",
289                 sys.stdout.flush()
290
291             # start processing new read
292             readList = []
293             prevID = readID
294             bestScore = 0
295             index += 1
296
297         # add the new read
298         score = int(fields[0])
299         sense = fields[8]
300         chrom = fields[13]
301         parts = int(fields[17])
302         passStrict = True
303         if parts > 1:
304             lengthList = fields[18][:-1].split(",")
305             startList = fields[20][:-1].split(",")
306             listlen = len(lengthList)
307             for lpos in range(listlen):
308                 if int(lengthList[lpos]) < minSpliceLength:
309                     passStrict = False
310
311                 # throw out deletions, for now
312                 if lpos > 0:
313                     if int(lengthList[lpos - 1]) == int(startList[lpos]):
314                         passStrict = False
315             pass
316         else:
317             start = int(fields[15])
318
319         if passStrict:
320             if score > bestScore:
321                 bestScore = score
322
323             mismatches = ""
324             if int(fields[1]) > 0:
325                 try:
326                     mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense)
327                 except:
328                     mismatches = ""
329
330             if parts == 1:
331                 readList.append((parts, score, sense, chrom, start, mismatches))
332             else:
333                 readList.append((parts, score, sense, chrom, startList, lengthList, mismatches))
334
335         if lIndex % 1000000 == 0:
336             print "processed %d lines" % lIndex
337
338     print "%d lines processed" % lIndex
339
340     if len(uInsertList) > 0:
341         rds.insertUniqs(uInsertList)
342     if len(mInsertList) > 0:
343         rds.insertMulti(mInsertList)
344     if len(sInsertList) > 0:
345         rds.insertSplices(sInsertList)
346
347     combString = "%d unique reads" % uIndex
348     combString += "\t%d multi reads" % mIndex
349     if dataType == "RNA":
350         combString += "\t%d spliced reads" % sIndex
351
352     print
353     print combString.replace("\t", "\n")
354
355     writeLog(outdbname + ".log", verstring, combString)
356
357     if doIndex:
358         print "building index...."
359         if cachePages > defaultCacheSize:
360             rds.setDBcache(cachePages)
361             rds.buildIndex(cachePages)
362         else:
363             rds.buildIndex(defaultCacheSize)
364
365
366 def decodeMismatches(gString, rString, rsense):
367     
368     output = []
369     rlen = len(gString)
370     partIndex = 0
371     for rindex in xrange(rlen):
372         if gString == ",":
373             partIndex += 1
374
375         if gString[rindex] == rString[rindex]:
376             continue
377
378         genNT = gString[rindex]
379         readNT = rString[rindex]
380         # for eland-compatibility, we are 1-based
381         output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT))
382             
383     return string.join(output, ",")
384
385
386 if __name__ == "__main__":
387     main(sys.argv)