first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / MakeBamFromRds.py
1 """
2 MakeBamFromRds
3
4 Converts ERANGE RDS zero based file to Bam zero based format.
5
6 Usage: python MakeBamFromRDS.py rdsFile bamFile [options]
7
8 """
9
10 try:
11     import psyco
12     psyco.full()
13 except:
14     pass
15
16 import sys
17 import re
18 import optparse
19 import random
20 import string
21 import pysam
22 import ReadDataset
23 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
24
25 VERSION = "1.0"
26
27 def main(argv=None):
28     if not argv:
29         argv = sys.argv
30
31     print "makeBamFromRds: version %s" % VERSION
32
33     doPairs = False
34     
35     usage = "usage: python %prog rdsFile bamFile [options]"
36
37     parser = getParser(usage)
38     (options, args) = parser.parse_args(argv[1:])
39
40     if len(args) < 2:
41         print usage
42         sys.exit(1)
43
44     rdsfile = args[0]
45     outfilename = args[1]
46
47     allChrom = True
48     if options.chromList:
49         allChrom = False
50
51     makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti,
52                      options.doSplices, doPairs, options.withFlag, options.useFlagLike,
53                      options.enforceChr, allChrom, options.doCache, options.cachePages,
54                      options.chromList, options.fastaFileName)
55
56
57 def getParser(usage):
58     parser = optparse.OptionParser(usage=usage)
59     parser.add_option("--nouniq", action="store_false", dest="withUniqs")
60     parser.add_option("--nomulti", action="store_false", dest="withMulti")
61     parser.add_option("--splices", action="store_true", dest="doSplices")
62     parser.add_option("--flag", dest="withFlag")
63     parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
64     parser.add_option("--pairs", action="store_true", dest="doPairs")
65     parser.add_option("--cache", type="int", dest="cachePages")
66     parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
67     parser.add_option("--chrom", action="append", dest="chromList")
68     parser.add_option("--fasta", dest="fastaFileName")
69
70     configParser = getConfigParser()
71     section = "MakeBamFromRds"
72     withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
73     withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
74     doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
75     doPairs = getConfigBoolOption(configParser, section, "doPairs", False)
76     withFlag = getConfigOption(configParser, section, "withFlag", "")
77     useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
78     enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
79     doCache = getConfigBoolOption(configParser, section, "doCache", False)
80     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
81     fastaFileName = getConfigOption(configParser, section, "fastaFileName", "")
82
83     parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices,
84                         doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
85                         doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName,
86                         chromList=[])
87
88     return parser
89
90
91 def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
92                      doSplices=False, doPairs=False, withFlag="",
93                      useFlagLike=False, enforceChr=False, allChrom=True,
94                      doCache=False, cachePages=100000, chromList=[], fastaFileName=""):
95
96     if not withUniqs and not withMulti and not doSplices:
97         print "must be outputting at least one of uniqs, multi, or -splices - exiting"
98         sys.exit(1)
99
100     print "\nsample:"
101     RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
102
103     if cachePages > RDS.getDefaultCacheSize():
104         RDS.setDBcache(cachePages)
105
106     readlength = RDS.getReadSize()
107
108     if allChrom:
109         if withUniqs:
110             chromList = RDS.getChromosomes()
111         elif withMulti:
112             chromList = RDS.getChromosomes(table="multi")
113         else:
114             chromList = RDS.getChromosomes(table="splices")
115
116         chromList.sort()
117
118     fastaSequenceDict = {}
119     if fastaFileName:
120         fastafile = open(fastaFileName)
121         fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)
122         fastafile.close()
123
124     referenceSequenceList = []
125     chromRemoveList = []
126     for chromosome in chromList:
127         if doNotOutputChromosome(chromosome, enforceChr):
128             chromRemoveList.append(chromosome)
129         else:
130             chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices)
131             referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)}
132             referenceSequenceList.append(referenceDataDict)
133
134     for chrom in chromRemoveList:
135         chromList.remove(chrom)
136
137     header = {"HD": {"VN": VERSION}}
138     if referenceSequenceList:
139         header["SQ"] = referenceSequenceList
140
141     outfile = pysam.Samfile(outfilename, "wb", header=header)
142
143     totalWrites = 0
144     noncanonicalSpliceCount = 0
145     for chrom in chromList:
146         index = 0
147         print "chromosome %s" % (chrom)
148         if withUniqs or withMulti:
149             hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
150                                        doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
151                                        flagLike=useFlagLike, withMismatch=True)
152
153             for read in hitDict[chrom]:
154                 writeBAMEntry(outfile, chrom, read, readlength)
155                 index += 1
156
157         if doSplices:
158             numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict)
159             index += numSpliceReadsWritten
160             noncanonicalSpliceCount += noncanonical
161
162         print index
163         totalWrites += index
164
165     outfile.close()
166     print "%d total reads written" % totalWrites
167     print "%d non-canonical splices" % noncanonicalSpliceCount
168
169
170 def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}):
171     index = 0
172     noncanonicalSplices = 0
173     spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True,
174                                     withMismatch=True)
175     if chrom not in spliceDict:
176         pass
177     else:
178         for read in spliceDict[chrom]:
179             if chrom in fastaSequenceDict:
180                 read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
181                 noncanonicalSplices += noncanonical
182
183             writeBAMEntry(outfile, chrom, read, readlength)
184             index += 1
185
186     return index, noncanonicalSplices
187
188
189 def writeBAMEntry(outfile, chrom, outputDict, readlength):
190     """ We need to subtract 1 from the position because rds is 1 based and
191         most of the rest of the entire world is 0 based.
192     """
193     tagList = []
194     alignedRead = pysam.AlignedRead()
195     try:
196         (readID, pairID) = outputDict["readID"].split("/")
197         paired = True
198     except ValueError:
199         readID = outputDict["readID"]
200         paired = False
201
202     alignedRead.qname = readID
203     if outputDict["sense"] == "-":
204         alignedRead.is_reverse = True
205
206     alignedRead.rname = outfile.references.index(chrom)
207
208     if "startL" in outputDict:
209         startL = outputDict["startL"] - 1
210         stopL = outputDict["stopL"] - 1
211         startR = outputDict["startR"] - 1
212         stopR = outputDict["stopR"] - 1
213         alignedRead.pos = startL
214         alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)]
215         tagList.append(("XS", str(outputDict["sense"])))
216     else:
217         alignedRead.pos = outputDict["start"] - 1
218         alignedRead.cigar = [(0, readlength)]
219
220     #TODO: add sequence info
221     #cistematic.core.retrieveSequence(genome, chrom, start, stop, sense="F", db="")
222     #ooooo. let's do this with PyGr
223
224     #TODO: add quality - max for each base, but default is to not include just like current
225     # rdsToFastq() sets everything to max
226
227     if paired:
228         if pairID == "1":
229             alignedRead.is_read1 = True
230             alignedRead.is_proper_pair = True
231         elif pairID == "2":
232             alignedRead.is_read2 = True
233             alignedRead.is_proper_pair = True
234         else:
235             pass
236
237     if "mismatch" in outputDict:
238         mismatchTag = getMismatches(outputDict["mismatch"])
239         if mismatchTag:
240             tagList.append(("MD", str(mismatchTag)))
241
242     if tagList:
243         alignedRead.tags = tuple(tagList)
244
245     outfile.write(alignedRead)
246
247
248 def getMismatches(mismatchString):
249     mismatchList = []
250     positions = re.findall("\d+", mismatchString)
251     nucleotides = re.findall("([ACGTN])\d+", mismatchString)
252     for index in range(0, len(positions)):
253         mismatchList.append("%s%s" % (positions[index], nucleotides[index]))
254
255     mismatch = string.join(mismatchList, "")
256
257     return mismatch
258
259
260 def doNotOutputChromosome(chrom, enforceChr):
261     result = False
262
263     if chrom == "chrM":
264         result = True
265
266     if enforceChr and ("chr" not in chrom):
267         result = True
268
269     return result
270
271
272 def getFastaSequenceDictionary(fastaFileName):
273     fastaSeqDict = {}
274     fchrom = ""
275     fseq = ""
276
277     fastafile = open(fastaFileName)
278     for line in fastafile:
279         if line[0] == ">":
280             if fchrom != "":
281                 fastaSeqDict[fchrom] = fseq
282
283             fseq = ""
284             fchrom = line[1:-1]
285         else:
286             fseq += line.strip()
287
288     fastafile.close()
289
290     return fastaSeqDict
291
292
293 def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
294     spliceSense = {"GTAG": "+",
295                    "GCAG": "+",
296                    "ATAC": "+",
297                    "CTAC": "-",
298                    "CTGC": "-",
299                    "GTAT": "-"
300     }
301
302     noncanonical = 0
303     intronstart = stopLeft
304     intronlen = startRight - stopLeft
305     leftJunctionSig =fastaSequence[intronstart:intronstart+2]
306     rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
307     spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
308     spliceJunction = spliceJunction.upper()
309     print spliceJunction
310     if spliceJunction in spliceSense:
311         sense = spliceSense[spliceJunction]
312     else:
313         noncanonical += 1
314         senses = ["+", "-"]
315         random.shuffle(senses)
316         sense = senses[0]
317
318     return sense, noncanonical
319
320
321 if __name__ == "__main__":
322     main(sys.argv)