snapshot of 4.0a development. initial git repo commit
[erange.git] / makerdsfrombowtie.py
1 #
2 #  makerdsfrombowtie.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 10/20/08.
6 #
7
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys, string, optparse
15 from commoncode import readDataset, writeLog
16
17 verstring = "%prog: version 4.1"
18 print verstring
19
20 def main(argv=None):
21     if not argv:
22         argv = sys.argv
23
24     usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
25
26     parser = optparse.OptionParser(usage=usage)
27     parser.add_option("--RNA", dest="genedatafilename")
28     parser.add_option("--append", action="store_false", dest="init")
29     parser.add_option("--index", action="store_true", dest="doIndex")
30     parser.add_option("--spacer", type="int", dest="spacer")
31     parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
32     parser.add_option("--forcepair", type="int", dest="forceID")
33     parser.add_option("--flip", action="store_true", dest="flip")
34     parser.add_option("--verbose", action="store_true", dest="verbose")
35     parser.add_option("--strip", action="store_true", dest="stripSpace")
36     parser.add_option("--cache", type="int", dest="cachePages")
37     parser.set_defaults(genedatafilename=None, init=True, doIndex=False, spacer=2,
38                         trimReadID=True, forceID=None, flip=False, verbose=False,
39                         stripSpace=False, cachePages=100000)
40
41     (options, args) = parser.parse_args(argv[1:])
42
43     if len(args) < 3:
44         print usage
45         sys.exit(1)
46
47     label = args[0]
48     filename = args[1]
49     outdbname = args[2]
50
51     propertyList = []
52     for arg in args:
53         if "::" in arg:
54             (pname, pvalue) = arg.strip().split("::")
55             propertyList.append((pname, pvalue))
56
57     makerdsfrombowtie(label, filename, outdbname, options.genedatafilename, options.init,
58                       options.doIndex, options.spacer, options.trimReadID, options.forceID,
59                       options.flip, options.verbose, options.stripSpace, options.cachePages,
60                       propertyList)
61
62
63 def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,
64                       doIndex=False, spacer=2, trimReadID=True, forceID=None,
65                       flip=False, verbose=False, stripSpace=False, cachePages=100000,
66                       propertyList=[]):
67
68     delimiter = "|"
69
70     dataType = "DNA"
71     if genedatafilename is not None:
72         dataType = "RNA"
73         genedatafile = open(genedatafilename)
74
75
76     forcePair = False
77     if forceID is not None:
78         forcePair = True
79     else:
80         forceID = 0
81
82     maxBorder = 0
83     index = 0
84     insertSize = 100000
85
86     writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
87
88     geneDict = {}
89     mapDict = {}
90     if dataType == "RNA":
91         for line in genedatafile:
92             fields = line.strip().split("\t")
93             blockCount = int(fields[7])
94             if blockCount < 2:
95                 continue
96
97             uname = fields[0]
98             chrom = fields[1]
99             sense = fields[2]
100             chromstarts = fields[8][:-1].split(",")
101             chromstops = fields[9][:-1].split(",")
102             exonLengths = []
103             totalLength = 0
104             for index in range(blockCount):
105                 chromstarts[index] = int(chromstarts[index])
106                 chromstops[index] = int(chromstops[index])
107                 exonLengths.append(chromstops[index] - chromstarts[index])
108                 totalLength += exonLengths[index]
109
110             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
111             mapDict[uname] = []
112
113         genedatafile.close()
114
115     rds = readDataset(outdbname, init, dataType, verbose=True)
116
117     #check that our cacheSize is better than the dataset's default cache size
118     defaultCacheSize = rds.getDefaultCacheSize()
119     if cachePages > defaultCacheSize:
120         if init:
121             rds.setDBcache(cachePages, default=True)
122         else:
123             rds.setDBcache(cachePages)
124
125     if not init and doIndex:
126         try:
127             if rds.hasIndex():
128                 rds.dropIndex()
129         except:
130             if verbose:
131                 print "couldn't drop Index"
132
133     if len(propertyList) > 0:
134         rds.insertMetadata(propertyList)
135
136     # make some assumptions based on first read
137     infile = open(filename, "r")
138     line = infile.readline()
139     if stripSpace:
140         line = line.replace(" ","")
141
142     fields = line.split()
143     readsize = len(fields[5])
144     pairedTest = fields[0][-2:]
145     paired = False
146     if pairedTest in ["/1", "/2"] or forcePair:
147         print "assuming reads are paired"
148         paired = True
149
150
151     print "read size: %d bp" % readsize
152     if init:
153         rds.insertMetadata([("readsize", readsize)])
154         if paired:
155             rds.insertMetadata([("paired", "True")])
156
157     if "bowtie_mapped" not in rds.getMetadata():
158         rds.insertMetadata([("bowtie_mapped", "True")])
159
160     if dataType == "RNA" and "spacer" not in rds.getMetadata():
161         rds.insertMetadata([("spacer", spacer)])
162
163     infile.close()
164
165     trim = -4
166     if dataType == "RNA":
167         maxBorder = readsize + trim
168
169     infile = open(filename, "r")
170     prevID = ""
171     readList = []
172     uInsertList = []
173     mInsertList = []
174     sInsertList = []
175     index = uIndex = mIndex = sIndex = lIndex = 0
176     for line in infile:
177         lIndex += 1
178         if stripSpace:
179             line = line.replace(" ","")
180
181         fields = line.strip().split()
182         readID = fields[0]
183         if trimReadID:
184             readID = string.join(readID.split(":")[1:], ":")
185
186         if readID != prevID:
187             listlen = len(readList)
188             if trimReadID:
189                 prevID = "%s-%s" % (label, prevID)
190
191             if forcePair:
192                 prevID += "/%d" % forceID 
193
194             if listlen == 1:
195                 (sense, chrom, start, mismatches) = readList[0]
196                 if flip:
197                     if sense == "+":
198                         sense = "-"
199                     else:
200                         sense = "+"
201
202                 if "|" not in chrom:
203                     stop = start + readsize
204                     uInsertList.append((prevID, chrom, start, stop, sense, 1.0, "", mismatches))
205                     uIndex += 1
206                 elif dataType == "RNA":
207                     currentSplice = chrom
208                     (model, spliceID, regionStart) = currentSplice.split(delimiter)
209                     if model not in geneDict:
210                         prevID = readID
211                     else:
212                         (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
213                         spliceID = int(spliceID)
214                         rstart = int(start) - spacer
215                         lefthalf = maxBorder - rstart
216                         if lefthalf < 1 or lefthalf > maxBorder:
217                             prevID = readID
218                         else:
219                             righthalf = readsize - lefthalf
220                             startL = int(regionStart)  + rstart
221                             stopL = startL + lefthalf
222                             startR = chromstarts[spliceID + 1]
223                             stopR = chromstarts[spliceID + 1] + righthalf
224                             sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches))
225                             sIndex += 1
226             elif listlen > 1:
227                 prevID = "%s::%s" % (prevID, str(listlen))
228                 mIndex += 1
229                 # ignore multireads that can also map across splices
230                 skip = False
231                 for (sense, chrom, start, mismatches) in readList:
232                     if "|" in chrom:
233                         skip = True
234
235                 if not skip:
236                     for (sense, chrom, start, mismatches) in readList:
237                         stop = start + readsize
238                         if flip:
239                             if sense == "+":
240                                 sense = "-"
241                             else:
242                                 sense = "+"
243
244                         mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, "", mismatches))
245             else:
246                 prevID = readID
247
248             if index % insertSize == 0:
249                 rds.insertUniqs(uInsertList)
250                 rds.insertMulti(mInsertList)
251                 uInsertList = []
252                 mInsertList = []
253                 if dataType == "RNA":
254                     rds.insertSplices(sInsertList)
255                     sInsertList = []
256
257                 print ".",
258                 sys.stdout.flush()
259
260             # start processing new read
261             readList = []
262             prevID = readID
263             index += 1
264
265         # add the new read
266         sense = fields[1]
267         chrom = fields[2]
268         # for eland compat, we are 1-based
269         start = int(fields[3]) + 1
270         mismatches = ""
271         if ":" in fields[-1]:
272             mismatches = decodeMismatches(fields[-1], sense)
273
274         readList.append((sense, chrom, start, mismatches))
275         if lIndex % 1000000 == 0:
276             print "processed %d lines" % lIndex
277
278     print "%d lines processed" % lIndex
279
280     if len(uInsertList) > 0:
281         rds.insertUniqs(uInsertList)
282
283     if len(mInsertList) > 0:
284         rds.insertMulti(mInsertList)
285
286     if len(sInsertList) > 0:
287         rds.insertSplices(sInsertList)
288
289     combString = "%d unique reads" % uIndex
290     combString += "\t%d multi reads" % mIndex
291     if dataType == "RNA":
292         combString += "\t%d spliced reads" % sIndex
293
294     print
295     print combString.replace("\t", "\n")
296
297     writeLog("%s.log" % outdbname, verstring, combString)
298
299     if doIndex:
300         print "building index...."
301         if cachePages > defaultCacheSize:
302             rds.setDBcache(cachePages)
303             rds.buildIndex(cachePages)
304         else:
305             rds.buildIndex(defaultCacheSize)
306
307
def decodeMismatches(mString, rsense):
    """Convert a bowtie mismatch descriptor to eland-style mismatch tokens.

    mString is bowtie's comma-separated list of '<pos>:<refNT>><readNT>'
    entries with 0-based positions; rsense is the read strand, '+' or '-'.
    For '-' reads both nucleotides are complemented.  Returns a
    comma-separated string of '<readNT><pos+1><refNT>' tokens (positions
    1-based for eland compatibility).
    """
    complement = {"A": "T",
                  "T": "A",
                  "C": "G",
                  "G": "C",
                  "N": "N"
    }

    output = []
    mismatches = mString.split(",")
    for mismatch in mismatches:
        (pos,change) = mismatch.split(":")
        (genNT, readNT) = change.split(">")
        if rsense == "-":
            # alignment reported on the reverse strand: complement both bases
            readNT = complement[readNT]
            genNT  = complement[genNT]

        # bowtie positions are 0-based, eland's are 1-based
        elandCompatiblePos = int(pos) + 1
        output.append("%s%d%s" % (readNT, elandCompatiblePos, genNT))

    # str.join instead of string.join: the latter is deprecated in Python 2
    # and removed in Python 3; behavior is identical
    return ",".join(output)
329
330
# run as a script: hand the raw argv straight to main()
if __name__ == "__main__":
    main(sys.argv)