first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / makerdsfromblat.py
1 #
2 #  makerdsfromblat.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 12/7/08.
6 #
7
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys
15 import string
16 import optparse
17 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
18 import ReadDataset
19
20 verstring = "makerdsfromblat: version 3.10"
21 print verstring
22
23 NUM_HEADER_LINES = 5
24
25
26 def main(argv=None):
27     if not argv:
28         argv = sys.argv
29
30     usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
31
32     parser = getParser(usage)
33     (options, args) = parser.parse_args(argv[1:])
34
35     if len(args) < 3:
36         print usage
37         sys.exit(1)
38
39     label = args[0]
40     filename = args[1]
41     outdbname = args[2]
42
43     if options.geneDataFileName:
44         dataType = "RNA"
45     else:
46         dataType = "DNA"
47
48     theFlag = ""
49     if options.flagReads:
50         theFlag = "blat"
51
52     propertyList = []
53     for arg in args:
54         if "::" in arg:
55             (pname, pvalue) = arg.strip().split("::")
56             propertyList.append((pname, pvalue))
57
58     makerdsfromblat(label, filename, outdbname, dataType, options.init,
59                    options.doIndex, options.trimReadID, options. minSpliceLength,
60                    options.forceRNA, theFlag, options.spliceOnly, options.verbose,
61                    options.cachePages, options.geneDataFileName, propertyList)
62
63
64 def getParser(usage):
65     parser = optparse.OptionParser(usage=usage)
66     parser.add_option("--append", action="store_false", dest="init")
67     parser.add_option("--index", action="store_true", dest="doIndex")
68     parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
69     parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
70     parser.add_option("--flag", action="store_true", dest="flagReads")
71     parser.add_option("--strict", type="int", dest="minSpliceLength",
72                       help="min required bp on each side of a splice")
73     parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
74     parser.add_option("--verbose", action="store_true", dest="verbose")
75     parser.add_option("--cache", type="int", dest="cachePages")
76     parser.add_option("--RNA", dest="geneDataFileName")
77
78     configParser = getConfigParser()
79     section = "makerdsfromblat"
80     init = getConfigBoolOption(configParser, section, "init", True)
81     doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
82     trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
83     minSpliceLength = getConfigIntOption(configParser, section, "minSpliceLength", 0)
84     forceRNA = getConfigBoolOption(configParser, section, "forceRNA", False)
85     flagReads = getConfigBoolOption(configParser, section, "flagReads", False)
86     spliceOnly = getConfigBoolOption(configParser, section, "spliceOnly", False)
87     verbose = getConfigBoolOption(configParser, section, "verbose", False)
88     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
89     geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", "")
90
91     parser.set_defaults(init=init, doIndex=doIndex, trimReadID=trimReadID, minSpliceLength=minSpliceLength, forceRNA=forceRNA,
92                         flagReads=flagReads, spliceOnly=spliceOnly, verbose=verbose, cachePages=cachePages,
93                         geneDataFileName=geneDataFileName)
94
95     return parser
96
97
98 def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
99                     doIndex=False,trimReadID=True, minSpliceLength=0,
100                     forceRNA=False, theFlag="", spliceOnly=False,
101                     verbose=False, cachePages=100000, geneDataFileName="",
102                     propertyList=[]):
103
104     writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
105     if forceRNA:
106         print "forcing datatype to RNA"
107         dataType = "RNA"
108
109     geneDict = {}
110     mapDict = {}
111     if dataType == "RNA" and not forceRNA:
112         genedatafile = open(geneDataFileName)
113         for line in genedatafile:
114             fields = line.strip().split("\t")
115             blockCount = int(fields[7])
116             if blockCount < 2:
117                 continue
118
119             uname = fields[0]
120             chrom = fields[1]
121             sense = fields[2]
122             chromstarts = fields[8][:-1].split(",")
123             chromstops = fields[9][:-1].split(",")
124             exonLengths = []
125             totalLength = 0
126             for index in range(blockCount):
127                 chromstarts[index] = int(chromstarts[index])
128                 chromstops[index] = int(chromstops[index])
129                 exonLengths.append(chromstops[index] - chromstarts[index])
130                 totalLength += exonLengths[index]
131
132             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
133             mapDict[uname] = []
134
135         genedatafile.close()
136
137     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
138
139     #check that our cacheSize is better than the dataset's default cache size
140     defaultCacheSize = rds.getDefaultCacheSize()
141     if cachePages > defaultCacheSize:
142         if init:
143             rds.setDBcache(cachePages, default=True)
144         else:
145             rds.setDBcache(cachePages)
146
147     if not init and doIndex:
148         try:
149             if rds.hasIndex():
150                 rds.dropIndex()
151         except:
152             if verbose:
153                 print "couldn't drop Index"
154
155
156     if len(propertyList) > 0:
157         rds.insertMetadata(propertyList)
158
159     # make some assumptions based on first read
160     infile = open(filename, "r")
161     for arg in range(NUM_HEADER_LINES):
162         line = infile.readline()
163
164     line = infile.readline()
165     fields = line.split()
166     readsize = int(fields[10])
167     pairedTest = fields[9][-2:]
168     paired = False
169     if pairedTest in ["/1", "/2"]:
170         print "assuming reads are paired"
171         paired = True
172
173     print "read size: %d bp" % readsize
174     if init:
175         rds.insertMetadata([("readsize", readsize)])
176         if paired:
177             rds.insertMetadata([("paired", "True")])
178
179     infile.close()
180     if "blat_mapped" not in rds.getMetadata():
181         rds.insertMetadata([("blat_mapped", "True")])
182
183     minReadScore = readsize - readsize/25 - 1
184     maxBorder = 0
185     if dataType == "RNA":
186         trim = -4
187         maxBorder = readsize + trim
188
189     infile = open(filename, "r")
190     prevID = ""
191     readList = []
192     uInsertList = []
193     mInsertList = []
194     sInsertList = []
195     index = uIndex = mIndex = sIndex = lIndex = 0
196     bestScore = 0
197     # skip headers
198     for arg in range(NUM_HEADER_LINES):
199         line = infile.readline()
200
201     insertSize = 100000
202     delimiter = "|"
203     minIntron = 10
204     for line in infile:
205         lIndex += 1
206         fields = line.strip().split()
207         readID = fields[9]
208         if trimReadID:
209             readID = string.join(readID.split(":")[1:], ":")
210
211         if readID != prevID:
212             newReadList = []
213             if bestScore > minReadScore:
214                 for readData in readList:
215                     if readData[1] == bestScore:
216                         newReadList.append(readData)
217
218             if trimReadID:
219                 prevID = label + "-" + prevID
220
221             listlen = len(newReadList)
222             if listlen == 1:
223                 parts = int(newReadList[0][0])
224                 if parts == 1 and not spliceOnly:
225                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
226                     stop = start + readsize
227                     uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, mismatches))
228                     uIndex += 1
229                 elif forceRNA and parts == 2:
230                     (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0]
231                     startL = int(startList[0]) 
232                     stopL = startL + int(lengthList[0])
233                     startR = int(startList[1])
234                     stopR = startR + int(lengthList[1])
235                     if stopL + minIntron < startR:
236                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
237                         sIndex += 1
238                 elif parts == 2:
239                     print newReadList
240                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
241                     currentSplice = chrom
242                     (model, spliceID, regionStart) = currentSplice.split(delimiter)
243                     if model not in geneDict:
244                         print fields
245                         continue
246
247                     (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
248                     spliceID = int(spliceID)
249                     rstart = int(start) - 2
250                     lefthalf = maxBorder - rstart
251                     if lefthalf < 1 or lefthalf > maxBorder:
252                         continue
253
254                     righthalf = readsize - lefthalf
255                     startL = int(regionStart)  + rstart
256                     stopL = startL + lefthalf
257                     startR = chromstarts[spliceID + 1]
258                     stopR = chromstarts[spliceID + 1] + righthalf
259                     if stopL + minIntron < startR:
260                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
261                         sIndex += 1
262             elif listlen > 1 and not spliceOnly:
263                 prevID = prevID + "::" + str(listlen)
264                 mIndex += 1
265                 # ignore multireads that can also map across splices
266                 skip = False
267                 for readData in newReadList:
268                     if readData[0] > 1:
269                         skip = True
270
271                 if not skip:
272                     for (part, score, sense, chrom, start, mismatches) in newReadList:
273                         stop = start + readsize
274                         mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches))
275             else:
276                 prevID = readID
277
278             if index % insertSize == 0:
279                 rds.insertUniqs(uInsertList)
280                 rds.insertMulti(mInsertList)
281                 uInsertList = []
282                 mInsertList = []
283                 if dataType == "RNA":
284                     rds.insertSplices(sInsertList)
285                     sInsertList = []
286
287                 print ".",
288                 sys.stdout.flush()
289
290             # start processing new read
291             readList = []
292             prevID = readID
293             bestScore = 0
294             index += 1
295
296         # add the new read
297         score = int(fields[0])
298         sense = fields[8]
299         chrom = fields[13]
300         parts = int(fields[17])
301         passStrict = True
302         if parts > 1:
303             lengthList = fields[18][:-1].split(",")
304             startList = fields[20][:-1].split(",")
305             listlen = len(lengthList)
306             for lpos in range(listlen):
307                 if int(lengthList[lpos]) < minSpliceLength:
308                     passStrict = False
309
310                 # throw out deletions, for now
311                 if lpos > 0:
312                     if int(lengthList[lpos - 1]) == int(startList[lpos]):
313                         passStrict = False
314             pass
315         else:
316             start = int(fields[15])
317
318         if passStrict:
319             if score > bestScore:
320                 bestScore = score
321
322             mismatches = ""
323             if int(fields[1]) > 0:
324                 try:
325                     mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense)
326                 except:
327                     mismatches = ""
328
329             if parts == 1:
330                 readList.append((parts, score, sense, chrom, start, mismatches))
331             else:
332                 readList.append((parts, score, sense, chrom, startList, lengthList, mismatches))
333
334         if lIndex % 1000000 == 0:
335             print "processed %d lines" % lIndex
336
337     print "%d lines processed" % lIndex
338
339     if len(uInsertList) > 0:
340         rds.insertUniqs(uInsertList)
341     if len(mInsertList) > 0:
342         rds.insertMulti(mInsertList)
343     if len(sInsertList) > 0:
344         rds.insertSplices(sInsertList)
345
346     combString = "%d unique reads" % uIndex
347     combString += "\t%d multi reads" % mIndex
348     if dataType == "RNA":
349         combString += "\t%d spliced reads" % sIndex
350
351     print
352     print combString.replace("\t", "\n")
353
354     writeLog(outdbname + ".log", verstring, combString)
355
356     if doIndex:
357         print "building index...."
358         if cachePages > defaultCacheSize:
359             rds.setDBcache(cachePages)
360             rds.buildIndex(cachePages)
361         else:
362             rds.buildIndex(defaultCacheSize)
363
364
365 def decodeMismatches(gString, rString, rsense):
366     
367     output = []
368     rlen = len(gString)
369     partIndex = 0
370     for rindex in xrange(rlen):
371         if gString == ",":
372             partIndex += 1
373
374         if gString[rindex] == rString[rindex]:
375             continue
376
377         genNT = gString[rindex]
378         readNT = rString[rindex]
379         # for eland-compatibility, we are 1-based
380         output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT))
381             
382     return string.join(output, ",")
383
384
385 if __name__ == "__main__":
386     main(sys.argv)