4 Converts ERANGE RDS zero based file to Bam zero based format.
6 Usage: python MakeBamFromRDS.py rdsFile bamFile [options]
23 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
31 print "makeBamFromRds: version %s" % VERSION
35 usage = "usage: python %prog rdsFile bamFile [options]"
37 parser = getParser(usage)
38 (options, args) = parser.parse_args(argv[1:])
51 makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti,
52 options.doSplices, doPairs, options.withFlag, options.useFlagLike,
53 options.enforceChr, allChrom, options.doCache, options.cachePages,
54 options.chromList, options.fastaFileName)
58 parser = optparse.OptionParser(usage=usage)
59 parser.add_option("--nouniq", action="store_false", dest="withUniqs")
60 parser.add_option("--nomulti", action="store_false", dest="withMulti")
61 parser.add_option("--splices", action="store_true", dest="doSplices")
62 parser.add_option("--flag", dest="withFlag")
63 parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
64 parser.add_option("--pairs", action="store_true", dest="doPairs")
65 parser.add_option("--cache", type="int", dest="cachePages")
66 parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
67 parser.add_option("--chrom", action="append", dest="chromList")
68 parser.add_option("--fasta", dest="fastaFileName")
70 configParser = getConfigParser()
71 section = "MakeBamFromRds"
72 withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
73 withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
74 doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
75 doPairs = getConfigBoolOption(configParser, section, "doPairs", False)
76 withFlag = getConfigOption(configParser, section, "withFlag", "")
77 useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
78 enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
79 doCache = getConfigBoolOption(configParser, section, "doCache", False)
80 cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
81 fastaFileName = getConfigOption(configParser, section, "fastaFileName", "")
83 parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices,
84 doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
85 doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName,
91 def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
92 doSplices=False, doPairs=False, withFlag="",
93 useFlagLike=False, enforceChr=False, allChrom=True,
94 doCache=False, cachePages=100000, chromList=[], fastaFileName=""):
96 if not withUniqs and not withMulti and not doSplices:
97 print "must be outputting at least one of uniqs, multi, or -splices - exiting"
101 RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
103 if cachePages > RDS.getDefaultCacheSize():
104 RDS.setDBcache(cachePages)
106 readlength = RDS.getReadSize()
110 chromList = RDS.getChromosomes()
112 chromList = RDS.getChromosomes(table="multi")
114 chromList = RDS.getChromosomes(table="splices")
118 fastaSequenceDict = {}
120 fastafile = open(fastaFileName)
121 fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)
124 referenceSequenceList = []
126 for chromosome in chromList:
127 if doNotOutputChromosome(chromosome, enforceChr):
128 chromRemoveList.append(chromosome)
130 chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices)
131 referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)}
132 referenceSequenceList.append(referenceDataDict)
134 for chrom in chromRemoveList:
135 chromList.remove(chrom)
137 header = {"HD": {"VN": VERSION}}
138 if referenceSequenceList:
139 header["SQ"] = referenceSequenceList
141 outfile = pysam.Samfile(outfilename, "wb", header=header)
144 noncanonicalSpliceCount = 0
145 for chrom in chromList:
147 print "chromosome %s" % (chrom)
148 if withUniqs or withMulti:
149 hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
150 doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
151 flagLike=useFlagLike, withMismatch=True)
153 for read in hitDict[chrom]:
154 writeBAMEntry(outfile, chrom, read, readlength)
158 numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict)
159 index += numSpliceReadsWritten
160 noncanonicalSpliceCount += noncanonical
166 print "%d total reads written" % totalWrites
167 print "%d non-canonical splices" % noncanonicalSpliceCount
170 def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}):
172 noncanonicalSplices = 0
173 spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True,
175 if chrom not in spliceDict:
178 for read in spliceDict[chrom]:
179 if chrom in fastaSequenceDict:
180 read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
181 noncanonicalSplices += noncanonical
183 writeBAMEntry(outfile, chrom, read, readlength)
186 return index, noncanonicalSplices
189 def writeBAMEntry(outfile, chrom, outputDict, readlength):
190 """ We need to subtract 1 from the position because rds is 1 based and
191 most of the rest of the entire world is 0 based.
194 alignedRead = pysam.AlignedRead()
196 (readID, pairID) = outputDict["readID"].split("/")
199 readID = outputDict["readID"]
202 alignedRead.qname = readID
203 if outputDict["sense"] == "-":
204 alignedRead.is_reverse = True
206 alignedRead.rname = outfile.references.index(chrom)
208 if "startL" in outputDict:
209 startL = outputDict["startL"] - 1
210 stopL = outputDict["stopL"] - 1
211 startR = outputDict["startR"] - 1
212 stopR = outputDict["stopR"] - 1
213 alignedRead.pos = startL
214 alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)]
215 tagList.append(("XS", str(outputDict["sense"])))
217 alignedRead.pos = outputDict["start"] - 1
218 alignedRead.cigar = [(0, readlength)]
220 #TODO: add sequence info
221 #cistematic.core.retrieveSequence(genome, chrom, start, stop, sense="F", db="")
222 #ooooo. let's do this with PyGr
224 #TODO: add quality - max for each base, but default is to not include just like current
225 # rdsToFastq() sets everything to max
229 alignedRead.is_read1 = True
230 alignedRead.is_proper_pair = True
232 alignedRead.is_read2 = True
233 alignedRead.is_proper_pair = True
237 if "mismatch" in outputDict:
238 mismatchTag = getMismatches(outputDict["mismatch"])
240 tagList.append(("MD", str(mismatchTag)))
243 alignedRead.tags = tuple(tagList)
245 outfile.write(alignedRead)
248 def getMismatches(mismatchString):
250 positions = re.findall("\d+", mismatchString)
251 nucleotides = re.findall("([ACGTN])\d+", mismatchString)
252 for index in range(0, len(positions)):
253 mismatchList.append("%s%s" % (positions[index], nucleotides[index]))
255 mismatch = string.join(mismatchList, "")
260 def doNotOutputChromosome(chrom, enforceChr):
266 if enforceChr and ("chr" not in chrom):
272 def getFastaSequenceDictionary(fastaFileName):
277 fastafile = open(fastaFileName)
278 for line in fastafile:
281 fastaSeqDict[fchrom] = fseq
293 def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
294 spliceSense = {"GTAG": "+",
303 intronstart = stopLeft
304 intronlen = startRight - stopLeft
305 leftJunctionSig =fastaSequence[intronstart:intronstart+2]
306 rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
307 spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
308 spliceJunction = spliceJunction.upper()
310 if spliceJunction in spliceSense:
311 sense = spliceSense[spliceJunction]
315 random.shuffle(senses)
318 return sense, noncanonical
321 if __name__ == "__main__":