snapshot of 4.0a development. initial git repo commit
[erange.git] / MakeBamFromRds.py
1 """
2 MakeBamFromRds
3
4 Converts ERANGE RDS zero based file to Bam zero based format.
5
6 Usage: python MakeBamFromRDS.py rdsFile bamFile [options]
7
8 """
9
10 try:
11     import psyco
12     psyco.full()
13 except:
14     pass
15
16 import sys
17 import re
18 import optparse
19 import random
20 import pysam
21 from commoncode import readDataset
22
23
24 def main(argv=None):
25     if not argv:
26         argv = sys.argv
27
28     verstring = "MakeBamFromRds: version 1.0"
29     print verstring
30
31     doPairs = False
32     
33     usage = "usage: python %prog rdsFile bamFile [options]"
34
35     parser = optparse.OptionParser(usage=usage)
36     parser.add_option("--nouniq", action="store_false", dest="withUniqs")
37     parser.add_option("--nomulti", action="store_false", dest="withMulti")
38     parser.add_option("--splices", action="store_true", dest="doSplices")
39     parser.add_option("--flag", dest="withFlag")
40     parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
41     parser.add_option("--pairs", action="store_true", dest="doPairs")
42     parser.add_option("--cache", type="int", dest="cachePages")
43     parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
44     parser.add_option("--chrom", action="append", dest="chromList")
45     parser.add_option("--fasta", dest="fastaFileName")
46     parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False,
47                         doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
48                         doCache=False, cachePages=100000, fastaFileName="",
49                         chromList=[])
50
51     (options, args) = parser.parse_args(argv[1:])
52
53     if len(args) < 2:
54         print usage
55         sys.exit(1)
56
57     rdsfile = args[0]
58     outfilename = args[1]
59
60     allChrom = True
61     if options.chromList:
62         allChrom = False
63
64     makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti,
65                      options.doSplices, doPairs, options.withFlag, options.useFlagLike,
66                      options.enforceChr, allChrom, options.doCache, options.cachePages,
67                      options.chromList, options.fastaFileName)
68
69
70 def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
71                      doSplices=False, doPairs=False, withFlag="",
72                      useFlagLike=False, enforceChr=False, allChrom=True,
73                      doCache=False, cachePages=100000, chromList=[], fastaFileName=""):
74
75     if not withUniqs and not withMulti and not doSplices:
76         print "must be outputting at least one of uniqs, multi, or -splices - exiting"
77         sys.exit(1)
78
79     print "\nsample:"
80     RDS = readDataset(rdsfile, verbose = True, cache=doCache)
81
82     if cachePages > RDS.getDefaultCacheSize():
83         RDS.setDBcache(cachePages)
84
85     readlength = RDS.getReadSize()
86
87     if allChrom:
88         if withUniqs:
89             chromList = RDS.getChromosomes()
90         elif withMulti:
91             chromList = RDS.getChromosomes(table="multi")
92         else:
93             chromList = RDS.getChromosomes(table="splices")
94
95         chromList.sort()
96
97     fastaSequenceDict = {}
98     if fastaFileName:
99         fastafile = open(fastaFileName)
100         fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)
101         fastafile.close()
102
103     referenceSequenceList = []
104     chromRemoveList = []
105     for chromosome in chromList:
106         if doNotOutputChromosome(chromosome, enforceChr):
107             chromRemoveList.append(chromosome)
108         else:
109             chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices)
110             referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)}
111             referenceSequenceList.append(referenceDataDict)
112
113     for chrom in chromRemoveList:
114         chromList.remove(chrom)
115
116     header = {"HD": {"VN": "1.0"}}
117     if referenceSequenceList:
118         header["SQ"] = referenceSequenceList
119
120     outfile = pysam.Samfile(outfilename, "wb", header=header)
121
122     totalWrites = 0
123     noncanonicalSplices = 0
124     for chrom in chromList:
125         index = 0
126         print "chromosome %s" % (chrom)
127         if withUniqs or withMulti:
128             hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
129                                        withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
130                                        flagLike=useFlagLike, entryDict=True)
131
132             for read in hitDict[chrom]:
133                 writeBAMEntry(outfile, chrom, read, readlength)
134                 index += 1
135
136         if doSplices:
137             numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict)
138             index += numSpliceReadsWritten
139             noncanonicalSplices += noncanonical
140
141         print index
142         totalWrites += index
143
144     outfile.close()
145     print "%d total reads written" % totalWrites
146     print "%d non-canonical splices" % noncanonicalSplices
147
148
149 def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}):
150     index = 0
151     noncanonicalSplices = 0
152     spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True)
153     if chrom not in spliceDict:
154         pass
155     else:
156         for read in spliceDict[chrom]:
157             if fastaSequenceDict.has_key(chrom):
158                 read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"])
159                 noncanonicalSplices += noncanonical
160
161             writeBAMEntry(outfile, chrom, read, readlength)
162             index += 1
163
164     return index, noncanonicalSplices
165
166
167 def writeBAMEntry(outfile, chrom, outputDict, readlength):
168     tagList = []
169     alignedRead = pysam.AlignedRead()
170     alignedRead.qname = outputDict["readID"]
171     if outputDict["sense"] == "-":
172         alignedRead.is_reverse = True
173
174     alignedRead.rname = outfile.references.index(chrom)
175
176     if outputDict.has_key("startL"):
177         startL = outputDict["startL"]
178         stopL = outputDict["stopL"]
179         startR = outputDict["startR"]
180         stopR = outputDict["stopR"]
181         alignedRead.pos = startL
182         alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)]
183         tagList.append(("XS", outputDict["sense"]))
184     else:
185         alignedRead.pos = outputDict["start"]
186         alignedRead.cigar = [(0, readlength)]
187
188     if outputDict.has_key("pairID"):
189         pairID = outputDict["pairID"]
190         if pairID == "1":
191             alignedRead.is_read1 = True
192             alignedRead.is_proper_pair = True
193         elif pairID == "2":
194             alignedRead.is_read2 = True
195             alignedRead.is_proper_pair = True
196         else:
197             pass
198
199     if outputDict.has_key("mismatch"):
200         mismatchTag = getMismatches(outputDict["mismatch"])
201         if mismatchTag:
202             tagList.append(("MD", mismatchTag))
203     
204     if tagList:
205         alignedRead.tags = tagList
206
207     outfile.write(alignedRead)
208
209
210 def getMismatches(mismatchString):
211     mismatch = ""
212     positions = re.findall("\d+", mismatchString)
213     nucleotides = re.findall("([ACGTN])\d+", mismatchString)
214     for index in range(0, len(positions)):
215         mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index])
216
217     return mismatch
218
219
220 def doNotOutputChromosome(chrom, enforceChr):
221     result = False
222
223     if chrom == "chrM":
224         result = True
225
226     if enforceChr and ("chr" not in chrom):
227         result = True
228
229     return result
230
231
232 def getFastaSequenceDictionary(fastaFileName):
233     fastaSeqDict = {}
234     fchrom = ""
235     fseq = ""
236
237     fastafile = open(fastaFileName)
238     for line in fastafile:
239         if line[0] == ">":
240             if fchrom != "":
241                 fastaSeqDict[fchrom] = fseq
242
243             fseq = ""
244             fchrom = line[1:-1]
245         else:
246             fseq += line.strip()
247
248     fastafile.close()
249
250     return fastaSeqDict
251
252
253 def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
254     spliceSense = {"GTAG": "+",
255                    "GCAG": "+",
256                    "ATAC": "+",
257                    "CTAC": "-",
258                    "CTGC": "-",
259                    "GTAT": "-"
260     }
261
262     noncanonical = 0
263     intronstart = stopLeft
264     intronlen = startRight - stopLeft
265     leftJunctionSig =fastaSequence[intronstart:intronstart+2]
266     rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
267     spliceJunction = leftJunctionSig + rightJunctionSig
268     spliceJunction = spliceJunction.upper()
269     if spliceSense.has_key(spliceJunction):
270         sense = spliceSense[spliceJunction]
271     else:
272         noncanonical += 1
273         senses = ["+", "-"]
274         random.shuffle(senses)
275         sense = senses[0]
276
277     return sense, noncanonical
278
279
280 if __name__ == "__main__":
281     main(sys.argv)