erange version 4.0a dev release
[erange.git] / makerdsfrombowtie.py
1 #
2 #  makerdsfrombowtie.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 10/20/08.
6 #
7
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys
15 import string
16 import optparse
17 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
18 import ReadDataset
19
20 verstring = "makerdsfrombowtie: version 4.2"
21 print verstring
22
23 def main(argv=None):
24     if not argv:
25         argv = sys.argv
26
27     usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
28
29     parser = getParser(usage)
30     (options, args) = parser.parse_args(argv[1:])
31
32     if len(args) < 3:
33         print usage
34         sys.exit(1)
35
36     label = args[0]
37     filename = args[1]
38     outdbname = args[2]
39
40     propertyList = []
41     for arg in args:
42         if "::" in arg:
43             (pname, pvalue) = arg.strip().split("::")
44             propertyList.append((pname, pvalue))
45
46     makerdsfrombowtie(label, filename, outdbname, options.genedatafilename, options.init,
47                       options.doIndex, options.spacer, options.trimReadID, options.forceID,
48                       options.flip, options.verbose, options.stripSpace, options.cachePages,
49                       propertyList)
50
51
52 def getParser(usage):
53     parser = optparse.OptionParser(usage=usage)
54     parser.add_option("--RNA", dest="genedatafilename")
55     parser.add_option("--append", action="store_false", dest="init")
56     parser.add_option("--index", action="store_true", dest="doIndex")
57     parser.add_option("--spacer", type="int", dest="spacer")
58     parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
59     parser.add_option("--forcepair", type="int", dest="forceID")
60     parser.add_option("--flip", action="store_true", dest="flip")
61     parser.add_option("--verbose", action="store_true", dest="verbose")
62     parser.add_option("--strip", action="store_true", dest="stripSpace")
63     parser.add_option("--cache", type="int", dest="cachePages")
64
65     configParser = getConfigParser()
66     section = "makerdsfrom bowtie"
67     genedatafilename = getConfigOption(configParser, section, "genedatafilename", None)
68     init = getConfigBoolOption(configParser, section, "init", True)
69     doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
70     spacer = getConfigIntOption(configParser, section, "spacer", 2)
71     trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
72     forceID = getConfigOption(configParser, section, "forceID", None)
73     flip = getConfigBoolOption(configParser, section, "flip", False)
74     verbose = getConfigBoolOption(configParser, section, "verbose", False)
75     stripSpace = getConfigBoolOption(configParser, section, "stripSpace", False)
76     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
77
78     parser.set_defaults(genedatafilename=genedatafilename, init=init, doIndex=doIndex, spacer=spacer,
79                         trimReadID=trimReadID, forceID=forceID, flip=flip, verbose=verbose,
80                         stripSpace=stripSpace, cachePages=cachePages)
81
82     return parser
83
84
85 def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,
86                       doIndex=False, spacer=2, trimReadID=True, forceID=None,
87                       flip=False, verbose=False, stripSpace=False, cachePages=100000,
88                       propertyList=[]):
89
90     delimiter = "|"
91
92     dataType = "DNA"
93     if genedatafilename is not None:
94         dataType = "RNA"
95         genedatafile = open(genedatafilename)
96
97
98     forcePair = False
99     if forceID is not None:
100         forcePair = True
101     else:
102         forceID = 0
103
104     maxBorder = 0
105     index = 0
106     insertSize = 100000
107
108     writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
109
110     geneDict = {}
111     mapDict = {}
112     if dataType == "RNA":
113         for line in genedatafile:
114             fields = line.strip().split("\t")
115             blockCount = int(fields[7])
116             if blockCount < 2:
117                 continue
118
119             uname = fields[0]
120             chrom = fields[1]
121             sense = fields[2]
122             chromstarts = fields[8][:-1].split(",")
123             chromstops = fields[9][:-1].split(",")
124             exonLengths = []
125             totalLength = 0
126             for index in range(blockCount):
127                 chromstarts[index] = int(chromstarts[index])
128                 chromstops[index] = int(chromstops[index])
129                 exonLengths.append(chromstops[index] - chromstarts[index])
130                 totalLength += exonLengths[index]
131
132             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
133             mapDict[uname] = []
134
135         genedatafile.close()
136
137     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
138
139     #check that our cacheSize is better than the dataset's default cache size
140     defaultCacheSize = rds.getDefaultCacheSize()
141     if cachePages > defaultCacheSize:
142         if init:
143             rds.setDBcache(cachePages, default=True)
144         else:
145             rds.setDBcache(cachePages)
146
147     if not init and doIndex:
148         try:
149             if rds.hasIndex():
150                 rds.dropIndex()
151         except:
152             if verbose:
153                 print "couldn't drop Index"
154
155     if len(propertyList) > 0:
156         rds.insertMetadata(propertyList)
157
158     # make some assumptions based on first read
159     infile = open(filename, "r")
160     line = infile.readline()
161     if stripSpace:
162         line = line.replace(" ","")
163
164     fields = line.split()
165     readsize = len(fields[5])
166     pairedTest = fields[0][-2:]
167     paired = False
168     if pairedTest in ["/1", "/2"] or forcePair:
169         print "assuming reads are paired"
170         paired = True
171
172
173     print "read size: %d bp" % readsize
174     if init:
175         rds.insertMetadata([("readsize", readsize)])
176         if paired:
177             rds.insertMetadata([("paired", "True")])
178
179     if "bowtie_mapped" not in rds.getMetadata():
180         rds.insertMetadata([("bowtie_mapped", "True")])
181
182     if dataType == "RNA" and "spacer" not in rds.getMetadata():
183         rds.insertMetadata([("spacer", spacer)])
184
185     infile.close()
186
187     trim = -4
188     if dataType == "RNA":
189         maxBorder = readsize + trim
190
191     infile = open(filename, "r")
192     prevID = ""
193     readList = []
194     uInsertList = []
195     mInsertList = []
196     sInsertList = []
197     index = uIndex = mIndex = sIndex = lIndex = 0
198     for line in infile:
199         lIndex += 1
200         if stripSpace:
201             line = line.replace(" ","")
202
203         fields = line.strip().split()
204         readID = fields[0]
205         if trimReadID:
206             readID = string.join(readID.split(":")[1:], ":")
207
208         if readID != prevID:
209             listlen = len(readList)
210             if trimReadID:
211                 prevID = "%s-%s" % (label, prevID)
212
213             if forcePair:
214                 prevID += "/%d" % forceID 
215
216             if listlen == 1:
217                 (sense, chrom, start, mismatches) = readList[0]
218                 if flip:
219                     if sense == "+":
220                         sense = "-"
221                     else:
222                         sense = "+"
223
224                 if "|" not in chrom:
225                     stop = start + readsize
226                     uInsertList.append((prevID, chrom, start, stop, sense, 1.0, "", mismatches))
227                     uIndex += 1
228                 elif dataType == "RNA":
229                     currentSplice = chrom
230                     (model, spliceID, regionStart) = currentSplice.split(delimiter)
231                     if model not in geneDict:
232                         prevID = readID
233                     else:
234                         (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
235                         spliceID = int(spliceID)
236                         rstart = int(start) - spacer
237                         lefthalf = maxBorder - rstart
238                         if lefthalf < 1 or lefthalf > maxBorder:
239                             prevID = readID
240                         else:
241                             righthalf = readsize - lefthalf
242                             startL = int(regionStart)  + rstart
243                             stopL = startL + lefthalf
244                             startR = chromstarts[spliceID + 1]
245                             stopR = chromstarts[spliceID + 1] + righthalf
246                             sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches))
247                             sIndex += 1
248             elif listlen > 1:
249                 prevID = "%s::%s" % (prevID, str(listlen))
250                 mIndex += 1
251                 # ignore multireads that can also map across splices
252                 skip = False
253                 for (sense, chrom, start, mismatches) in readList:
254                     if "|" in chrom:
255                         skip = True
256
257                 if not skip:
258                     for (sense, chrom, start, mismatches) in readList:
259                         stop = start + readsize
260                         if flip:
261                             if sense == "+":
262                                 sense = "-"
263                             else:
264                                 sense = "+"
265
266                         mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, "", mismatches))
267             else:
268                 prevID = readID
269
270             if index % insertSize == 0:
271                 rds.insertUniqs(uInsertList)
272                 rds.insertMulti(mInsertList)
273                 uInsertList = []
274                 mInsertList = []
275                 if dataType == "RNA":
276                     rds.insertSplices(sInsertList)
277                     sInsertList = []
278
279                 print ".",
280                 sys.stdout.flush()
281
282             # start processing new read
283             readList = []
284             prevID = readID
285             index += 1
286
287         # add the new read
288         sense = fields[1]
289         chrom = fields[2]
290         # for eland compat, we are 1-based
291         start = int(fields[3]) + 1
292         mismatches = ""
293         if ":" in fields[-1]:
294             mismatches = decodeMismatches(fields[-1], sense)
295
296         readList.append((sense, chrom, start, mismatches))
297         if lIndex % 1000000 == 0:
298             print "processed %d lines" % lIndex
299
300     print "%d lines processed" % lIndex
301
302     if len(uInsertList) > 0:
303         rds.insertUniqs(uInsertList)
304
305     if len(mInsertList) > 0:
306         rds.insertMulti(mInsertList)
307
308     if len(sInsertList) > 0:
309         rds.insertSplices(sInsertList)
310
311     combString = "%d unique reads" % uIndex
312     combString += "\t%d multi reads" % mIndex
313     if dataType == "RNA":
314         combString += "\t%d spliced reads" % sIndex
315
316     print
317     print combString.replace("\t", "\n")
318
319     writeLog("%s.log" % outdbname, verstring, combString)
320
321     if doIndex:
322         print "building index...."
323         if cachePages > defaultCacheSize:
324             rds.setDBcache(cachePages)
325             rds.buildIndex(cachePages)
326         else:
327             rds.buildIndex(defaultCacheSize)
328
329
330 def decodeMismatches(mString, rsense):
331     complement = {"A": "T",
332                   "T": "A",
333                   "C": "G",
334                   "G": "C",
335                   "N": "N"
336     }
337
338     output = []
339     mismatches = mString.split(",")
340     for mismatch in mismatches:
341         (pos,change) = mismatch.split(":")
342         (genNT, readNT) = change.split(">")
343         if rsense == "-":
344             readNT = complement[readNT]
345             genNT  = complement[genNT]
346
347         elandCompatiblePos = int(pos) + 1
348         output.append("%s%d%s" % (readNT, elandCompatiblePos, genNT))
349
350     return string.join(output, ",")
351
352
353 if __name__ == "__main__":
354     main(sys.argv)