snapshot of 4.0a development. initial git repo commit
[erange.git] / makerdsfromblat.py
1 #
2 #  makerdsfromblat.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 12/7/08.
6 #
7
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys, string, optparse
15 from commoncode import readDataset, writeLog
16
17 verstring = "%prog: version 3.9"
18 print verstring
19
20 def main(argv=None):
21     if not argv:
22         argv = sys.argv
23
24     usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
25
26     parser = optparse.OptionParser(usage=usage)
27     parser.add_option("--append", action="store_false", dest="init")
28     parser.add_option("--index", action="store_true", dest="doIndex")
29     parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
30     parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
31     parser.add_option("--flag", action="store_true", dest="flagReads")
32     parser.add_option("--strict", type="int", dest="minSpliceLength",
33                       help="min required bp on each side of a splice")
34     parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
35     parser.add_option("--verbose", action="store_true", dest="verbose")
36     parser.add_option("--cache", type="int", dest="cachePages")
37     parser.add_option("--RNA", dest="geneDataFileName")
38     parser.set_defaults(init=True, doIndex=False, trimReadID=True, minSpliceLength=0, forceRNA=False, flagReads=False, spliceOnly=False, verbose=False, cachePages=100000, geneDataFileName="")
39     (options, args) = parser.parse_args(argv[1:])
40
41     if len(args) < 3:
42         print usage
43         sys.exit(1)
44
45     label = args[0]
46     filename = args[1]
47     outdbname = args[2]
48
49     if options.geneDataFileName:
50         dataType = "RNA"
51     else:
52         dataType = "DNA"
53
54     theFlag = ""
55     if options.flagReads:
56         theFlag = "blat"
57
58     propertyList = []
59     for arg in args:
60         if "::" in arg:
61             (pname, pvalue) = arg.strip().split("::")
62             propertyList.append((pname, pvalue))
63
64     makerdsfromblat(label, filename, outdbname, dataType, options.init,
65                    options.doIndex, options.trimReadID, options. minSpliceLength,
66                    options.forceRNA, theFlag, options.spliceOnly, options.verbose,
67                    options.cachePages, options.geneDataFileName, propertyList)
68
69
70 def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
71                     doIndex=False,trimReadID=True, minSpliceLength=0,
72                     forceRNA=False, theFlag="", spliceOnly=False,
73                     verbose=False, cachePages=100000, geneDataFileName="",
74                     propertyList=[]):
75
76     delimiter = "|"
77     minIntron = 10
78     maxBorder = 0
79     index = 0
80     insertSize = 100000
81
82     if forceRNA:
83         print "forcing datatype to RNA"
84         dataType = "RNA"
85
86     if dataType == "RNA":
87         genedatafile = open(geneDataFileName)
88
89     writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
90
91     geneDict = {}
92     mapDict = {}
93     if dataType == "RNA" and not forceRNA:
94         for line in genedatafile:
95             fields = line.strip().split("\t")
96             blockCount = int(fields[7])
97             if blockCount < 2:
98                 continue
99
100             uname = fields[0]
101             chrom = fields[1]
102             sense = fields[2]
103             chromstarts = fields[8][:-1].split(",")
104             chromstops = fields[9][:-1].split(",")
105             exonLengths = []
106             totalLength = 0
107             for index in range(blockCount):
108                 chromstarts[index] = int(chromstarts[index])
109                 chromstops[index] = int(chromstops[index])
110                 exonLengths.append(chromstops[index] - chromstarts[index])
111                 totalLength += exonLengths[index]
112
113             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
114             mapDict[uname] = []
115
116         genedatafile.close()
117
118     rds = readDataset(outdbname, init, dataType, verbose=True)
119
120     #check that our cacheSize is better than the dataset's default cache size
121     defaultCacheSize = rds.getDefaultCacheSize()
122     if cachePages > defaultCacheSize:
123         if init:
124             rds.setDBcache(cachePages, default=True)
125         else:
126             rds.setDBcache(cachePages)
127
128     if not init and doIndex:
129         try:
130             if rds.hasIndex():
131                 rds.dropIndex()
132         except:
133             if verbose:
134                 print "couldn't drop Index"
135
136
137     if len(propertyList) > 0:
138         rds.insertMetadata(propertyList)
139
140     # make some assumptions based on first read
141     infile = open(filename, "r")
142     for arg in range(6):
143         line = infile.readline()
144
145     fields = line.split()
146     readsize = int(fields[10])
147     pairedTest = fields[9][-2:]
148     paired = False
149     if pairedTest in ["/1", "/2"]:
150         print "assuming reads are paired"
151         paired = True
152
153     print "read size: %d bp" % readsize
154     if init:
155         rds.insertMetadata([("readsize", readsize)])
156         if paired:
157             rds.insertMetadata([("paired", "True")])
158
159     infile.close()
160     if "blat_mapped" not in rds.getMetadata():
161         rds.insertMetadata([("blat_mapped", "True")])
162
163     minReadScore = readsize - readsize/25 - 1
164     trim = -4
165     if dataType == "RNA":
166         maxBorder = readsize + trim
167
168     infile = open(filename, "r")
169     prevID = ""
170     readList = []
171     uInsertList = []
172     mInsertList = []
173     sInsertList = []
174     index = uIndex = mIndex = sIndex = lIndex = 0
175     bestScore = 0
176     # skip headers
177     for arg in range(5):
178         line = infile.readline()
179
180     for line in infile:
181         lIndex += 1
182         fields = line.strip().split()
183         readID = fields[9]
184         if trimReadID:
185             readID = string.join(readID.split(":")[1:], ":")
186
187         if readID != prevID:
188             newReadList = []
189             if bestScore > minReadScore:
190                 for readData in readList:
191                     if readData[1] == bestScore:
192                         newReadList.append(readData)
193
194             if trimReadID:
195                 prevID = label + "-" + prevID
196
197             listlen = len(newReadList)
198             if listlen == 1:
199                 parts = int(newReadList[0][0])
200                 if parts == 1 and not spliceOnly:
201                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
202                     stop = start + readsize
203                     uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, mismatches))
204                     uIndex += 1
205                 elif forceRNA and parts == 2:
206                     (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0]
207                     startL = int(startList[0]) 
208                     stopL = startL + int(lengthList[0])
209                     startR = int(startList[1])
210                     stopR = startR + int(lengthList[1])
211                     if stopL + minIntron < startR:
212                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
213                         sIndex += 1
214                 elif parts == 2:
215                     print newReadList
216                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
217                     currentSplice = chrom
218                     (model, spliceID, regionStart) = currentSplice.split(delimiter)
219                     if model not in geneDict:
220                         print fields
221                         continue
222
223                     (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
224                     spliceID = int(spliceID)
225                     rstart = int(start) - 2
226                     lefthalf = maxBorder - rstart
227                     if lefthalf < 1 or lefthalf > maxBorder:
228                         continue
229
230                     righthalf = readsize - lefthalf
231                     startL = int(regionStart)  + rstart
232                     stopL = startL + lefthalf
233                     startR = chromstarts[spliceID + 1]
234                     stopR = chromstarts[spliceID + 1] + righthalf
235                     if stopL + minIntron < startR:
236                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
237                         sIndex += 1
238             elif listlen > 1 and not spliceOnly:
239                 prevID = prevID + "::" + str(listlen)
240                 mIndex += 1
241                 # ignore multireads that can also map across splices
242                 skip = False
243                 for readData in newReadList:
244                     if readData[0] > 1:
245                         skip = True
246
247                 if not skip:
248                     for (part, score, sense, chrom, start, mismatches) in newReadList:
249                         stop = start + readsize
250                         mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches))
251             else:
252                 prevID = readID
253
254             if index % insertSize == 0:
255                 rds.insertUniqs(uInsertList)
256                 rds.insertMulti(mInsertList)
257                 uInsertList = []
258                 mInsertList = []
259                 if dataType == "RNA":
260                     rds.insertSplices(sInsertList)
261                     sInsertList = []
262
263                 print ".",
264                 sys.stdout.flush()
265
266             # start processing new read
267             readList = []
268             prevID = readID
269             bestScore = 0
270             index += 1
271
272         # add the new read
273         score = int(fields[0])
274         sense = fields[8]
275         chrom = fields[13]
276         parts = int(fields[17])
277         passStrict = True
278         if parts > 1:
279             lengthList = fields[18][:-1].split(",")
280             startList = fields[20][:-1].split(",")
281             listlen = len(lengthList)
282             for lpos in range(listlen):
283                 if int(lengthList[lpos]) < minSpliceLength:
284                     passStrict = False
285
286                 # throw out deletions, for now
287                 if lpos > 0:
288                     if int(lengthList[lpos - 1]) == int(startList[lpos]):
289                         passStrict = False
290             pass
291         else:
292             start = int(fields[15])
293
294         if passStrict:
295             if score > bestScore:
296                 bestScore = score
297
298             mismatches = ""
299             if int(fields[1]) > 0:
300                 try:
301                     mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense)
302                 except:
303                     mismatches = ""
304
305             if parts == 1:
306                 readList.append((parts, score, sense, chrom, start, mismatches))
307             else:
308                 readList.append((parts, score, sense, chrom, startList, lengthList, mismatches))
309
310         if lIndex % 1000000 == 0:
311             print "processed %d lines" % lIndex
312
313     print "%d lines processed" % lIndex
314
315     if len(uInsertList) > 0:
316         rds.insertUniqs(uInsertList)
317     if len(mInsertList) > 0:
318         rds.insertMulti(mInsertList)
319     if len(sInsertList) > 0:
320         rds.insertSplices(sInsertList)
321
322     combString = "%d unique reads" % uIndex
323     combString += "\t%d multi reads" % mIndex
324     if dataType == "RNA":
325         combString += "\t%d spliced reads" % sIndex
326
327     print
328     print combString.replace("\t", "\n")
329
330     writeLog(outdbname + ".log", verstring, combString)
331
332     if doIndex:
333         print "building index...."
334         if cachePages > defaultCacheSize:
335             rds.setDBcache(cachePages)
336             rds.buildIndex(cachePages)
337         else:
338             rds.buildIndex(defaultCacheSize)
339
340
341 def decodeMismatches(gString, rString, rsense):
342     
343     output = []
344     rlen = len(gString)
345     partIndex = 0
346     for rindex in xrange(rlen):
347         if gString == ",":
348             partIndex += 1
349
350         if gString[rindex] == rString[rindex]:
351             continue
352
353         genNT = gString[rindex]
354         readNT = rString[rindex]
355         # for eland-compatibility, we are 1-based
356         output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT))
357             
358     return string.join(output, ",")
359
360
361 if __name__ == "__main__":
362     main(sys.argv)