4 Converts ERANGE RDS zero based file to Bam zero based format.
6 Usage: python MakeBamFromRDS.py rdsFile bamFile [options]
21 from commoncode import readDataset
28 verstring = "MakeBamFromRds: version 1.0"
33 usage = "usage: python %prog rdsFile bamFile [options]"
35 parser = optparse.OptionParser(usage=usage)
36 parser.add_option("--nouniq", action="store_false", dest="withUniqs")
37 parser.add_option("--nomulti", action="store_false", dest="withMulti")
38 parser.add_option("--splices", action="store_true", dest="doSplices")
39 parser.add_option("--flag", dest="withFlag")
40 parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
41 parser.add_option("--pairs", action="store_true", dest="doPairs")
42 parser.add_option("--cache", type="int", dest="cachePages")
43 parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
44 parser.add_option("--chrom", action="append", dest="chromList")
45 parser.add_option("--fasta", dest="fastaFileName")
46 parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False,
47 doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
48 doCache=False, cachePages=100000, fastaFileName="",
51 (options, args) = parser.parse_args(argv[1:])
64 makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti,
65 options.doSplices, doPairs, options.withFlag, options.useFlagLike,
66 options.enforceChr, allChrom, options.doCache, options.cachePages,
67 options.chromList, options.fastaFileName)
70 def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
71 doSplices=False, doPairs=False, withFlag="",
72 useFlagLike=False, enforceChr=False, allChrom=True,
73 doCache=False, cachePages=100000, chromList=[], fastaFileName=""):
75 if not withUniqs and not withMulti and not doSplices:
76 print "must be outputting at least one of uniqs, multi, or -splices - exiting"
80 RDS = readDataset(rdsfile, verbose = True, cache=doCache)
82 if cachePages > RDS.getDefaultCacheSize():
83 RDS.setDBcache(cachePages)
85 readlength = RDS.getReadSize()
89 chromList = RDS.getChromosomes()
91 chromList = RDS.getChromosomes(table="multi")
93 chromList = RDS.getChromosomes(table="splices")
97 fastaSequenceDict = {}
99 fastafile = open(fastaFileName)
100 fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)
103 referenceSequenceList = []
105 for chromosome in chromList:
106 if doNotOutputChromosome(chromosome, enforceChr):
107 chromRemoveList.append(chromosome)
109 chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices)
110 referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)}
111 referenceSequenceList.append(referenceDataDict)
113 for chrom in chromRemoveList:
114 chromList.remove(chrom)
116 header = {"HD": {"VN": "1.0"}}
117 if referenceSequenceList:
118 header["SQ"] = referenceSequenceList
120 outfile = pysam.Samfile(outfilename, "wb", header=header)
123 noncanonicalSplices = 0
124 for chrom in chromList:
126 print "chromosome %s" % (chrom)
127 if withUniqs or withMulti:
128 hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
129 withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
130 flagLike=useFlagLike, entryDict=True)
132 for read in hitDict[chrom]:
133 writeBAMEntry(outfile, chrom, read, readlength)
137 numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict)
138 index += numSpliceReadsWritten
139 noncanonicalSplices += noncanonical
145 print "%d total reads written" % totalWrites
146 print "%d non-canonical splices" % noncanonicalSplices
149 def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}):
151 noncanonicalSplices = 0
152 spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True)
153 if chrom not in spliceDict:
156 for read in spliceDict[chrom]:
157 if fastaSequenceDict.has_key(chrom):
158 read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"])
159 noncanonicalSplices += noncanonical
161 writeBAMEntry(outfile, chrom, read, readlength)
164 return index, noncanonicalSplices
167 def writeBAMEntry(outfile, chrom, outputDict, readlength):
169 alignedRead = pysam.AlignedRead()
170 alignedRead.qname = outputDict["readID"]
171 if outputDict["sense"] == "-":
172 alignedRead.is_reverse = True
174 alignedRead.rname = outfile.references.index(chrom)
176 if outputDict.has_key("startL"):
177 startL = outputDict["startL"]
178 stopL = outputDict["stopL"]
179 startR = outputDict["startR"]
180 stopR = outputDict["stopR"]
181 alignedRead.pos = startL
182 alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)]
183 tagList.append(("XS", outputDict["sense"]))
185 alignedRead.pos = outputDict["start"]
186 alignedRead.cigar = [(0, readlength)]
188 if outputDict.has_key("pairID"):
189 pairID = outputDict["pairID"]
191 alignedRead.is_read1 = True
192 alignedRead.is_proper_pair = True
194 alignedRead.is_read2 = True
195 alignedRead.is_proper_pair = True
199 if outputDict.has_key("mismatch"):
200 mismatchTag = getMismatches(outputDict["mismatch"])
202 tagList.append(("MD", mismatchTag))
205 alignedRead.tags = tagList
207 outfile.write(alignedRead)
210 def getMismatches(mismatchString):
212 positions = re.findall("\d+", mismatchString)
213 nucleotides = re.findall("([ACGTN])\d+", mismatchString)
214 for index in range(0, len(positions)):
215 mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index])
220 def doNotOutputChromosome(chrom, enforceChr):
226 if enforceChr and ("chr" not in chrom):
232 def getFastaSequenceDictionary(fastaFileName):
237 fastafile = open(fastaFileName)
238 for line in fastafile:
241 fastaSeqDict[fchrom] = fseq
253 def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
254 spliceSense = {"GTAG": "+",
263 intronstart = stopLeft
264 intronlen = startRight - stopLeft
265 leftJunctionSig =fastaSequence[intronstart:intronstart+2]
266 rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
267 spliceJunction = leftJunctionSig + rightJunctionSig
268 spliceJunction = spliceJunction.upper()
269 if spliceSense.has_key(spliceJunction):
270 sense = spliceSense[spliceJunction]
274 random.shuffle(senses)
277 return sense, noncanonical
280 if __name__ == "__main__":