MakeBamFromRds.py

   1 """
   2 MakeBamFromRds
   3
   4 Converts ERANGE RDS zero based file to Bam zero based format.
   5
   6 Usage: python MakeBamFromRDS.py rdsFile bamFile [options]
   7
   8 """
   9
  10 try:
  11     import psyco
  12     psyco.full()
  13 except:
  14     pass
  15
  16 import sys
  17 import re
  18 import optparse
  19 import random
  20 import string
  21 import pysam
  22 import ReadDataset
  23 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
  24
  25 VERSION = "1.0"
  26
  27 def main(argv=None):
  28     if not argv:
  29         argv = sys.argv
  30
  31     print "makeBamFromRds: version %s" % VERSION
  32
  33     doPairs = False
  34
  35     usage = "usage: python %prog rdsFile bamFile [options]"
  36
  37     parser = getParser(usage)
  38     (options, args) = parser.parse_args(argv[1:])
  39
  40     if len(args) < 2:
  41         print usage
  42         sys.exit(1)
  43
  44     rdsfile = args[0]
  45     outfilename = args[1]
  46
  47     allChrom = True
  48     if options.chromList:
  49         allChrom = False
  50
  51     makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti,
  52                      options.doSplices, doPairs, options.withFlag, options.useFlagLike,
  53                      options.enforceChr, allChrom, options.doCache, options.cachePages,
  54                      options.chromList, options.fastaFileName)
  55
  56
  57 def getParser(usage):
  58     parser = optparse.OptionParser(usage=usage)
  59     parser.add_option("--nouniq", action="store_false", dest="withUniqs")
  60     parser.add_option("--nomulti", action="store_false", dest="withMulti")
  61     parser.add_option("--splices", action="store_true", dest="doSplices")
  62     parser.add_option("--flag", dest="withFlag")
  63     parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
  64     parser.add_option("--pairs", action="store_true", dest="doPairs")
  65     parser.add_option("--cache", type="int", dest="cachePages")
  66     parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
  67     parser.add_option("--chrom", action="append", dest="chromList")
  68     parser.add_option("--fasta", dest="fastaFileName")
  69
  70     configParser = getConfigParser()
  71     section = "MakeBamFromRds"
  72     withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
  73     withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
  74     doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
  75     doPairs = getConfigBoolOption(configParser, section, "doPairs", False)
  76     withFlag = getConfigOption(configParser, section, "withFlag", "")
  77     useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
  78     enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
  79     doCache = getConfigBoolOption(configParser, section, "doCache", False)
  80     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
  81     fastaFileName = getConfigOption(configParser, section, "fastaFileName", "")
  82
  83     parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices,
  84                         doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
  85                         doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName,
  86                         chromList=[])
  87
  88     return parser
  89
  90
  91 def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
  92                      doSplices=False, doPairs=False, withFlag="",
  93                      useFlagLike=False, enforceChr=False, allChrom=True,
  94                      doCache=False, cachePages=100000, chromList=[], fastaFileName=""):
  95
  96     if not withUniqs and not withMulti and not doSplices:
  97         print "must be outputting at least one of uniqs, multi, or -splices - exiting"
  98         sys.exit(1)
  99
 100     print "\nsample:"
 101     RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
 102
 103     if cachePages > RDS.getDefaultCacheSize():
 104         RDS.setDBcache(cachePages)
 105
 106     readlength = RDS.getReadSize()
 107
 108     if allChrom:
 109         if withUniqs:
 110             chromList = RDS.getChromosomes()
 111         elif withMulti:
 112             chromList = RDS.getChromosomes(table="multi")
 113         else:
 114             chromList = RDS.getChromosomes(table="splices")
 115
 116         chromList.sort()
 117
 118     fastaSequenceDict = {}
 119     if fastaFileName:
 120         fastafile = open(fastaFileName)
 121         fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)
 122         fastafile.close()
 123
 124     referenceSequenceList = []
 125     chromRemoveList = []
 126     for chromosome in chromList:
 127         if doNotOutputChromosome(chromosome, enforceChr):
 128             chromRemoveList.append(chromosome)
 129         else:
 130             chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices)
 131             referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)}
 132             referenceSequenceList.append(referenceDataDict)
 133
 134     for chrom in chromRemoveList:
 135         chromList.remove(chrom)
 136
 137     header = {"HD": {"VN": VERSION}}
 138     if referenceSequenceList:
 139         header["SQ"] = referenceSequenceList
 140
 141     outfile = pysam.Samfile(outfilename, "wb", header=header)
 142
 143     totalWrites = 0
 144     noncanonicalSpliceCount = 0
 145     for chrom in chromList:
 146         index = 0
 147         print "chromosome %s" % (chrom)
 148         if withUniqs or withMulti:
 149             hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
 150                                        doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
 151                                        flagLike=useFlagLike, withMismatch=True)
 152
 153             for read in hitDict[chrom]:
 154                 writeBAMEntry(outfile, chrom, read, readlength)
 155                 index += 1
 156
 157         if doSplices:
 158             numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict)
 159             index += numSpliceReadsWritten
 160             noncanonicalSpliceCount += noncanonical
 161
 162         print index
 163         totalWrites += index
 164
 165     outfile.close()
 166     print "%d total reads written" % totalWrites
 167     print "%d non-canonical splices" % noncanonicalSpliceCount
 168
 169
 170 def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}):
 171     index = 0
 172     noncanonicalSplices = 0
 173     spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True,
 174                                     withMismatch=True)
 175     if chrom not in spliceDict:
 176         pass
 177     else:
 178         for read in spliceDict[chrom]:
 179             if chrom in fastaSequenceDict:
 180                 read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
 181                 noncanonicalSplices += noncanonical
 182
 183             writeBAMEntry(outfile, chrom, read, readlength)
 184             index += 1
 185
 186     return index, noncanonicalSplices
 187
 188
 189 def writeBAMEntry(outfile, chrom, outputDict, readlength):
 190     """ We need to subtract 1 from the position because rds is 1 based and
 191         most of the rest of the entire world is 0 based.
 192     """
 193     tagList = []
 194     alignedRead = pysam.AlignedRead()
 195     try:
 196         (readID, pairID) = outputDict["readID"].split("/")
 197         paired = True
 198     except ValueError:
 199         readID = outputDict["readID"]
 200         paired = False
 201
 202     alignedRead.qname = readID
 203     if outputDict["sense"] == "-":
 204         alignedRead.is_reverse = True
 205
 206     alignedRead.rname = outfile.references.index(chrom)
 207
 208     if "startL" in outputDict:
 209         startL = outputDict["startL"] - 1
 210         stopL = outputDict["stopL"] - 1
 211         startR = outputDict["startR"] - 1
 212         stopR = outputDict["stopR"] - 1
 213         alignedRead.pos = startL
 214         alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)]
 215         tagList.append(("XS", str(outputDict["sense"])))
 216     else:
 217         alignedRead.pos = outputDict["start"] - 1
 218         alignedRead.cigar = [(0, readlength)]
 219
 220     #TODO: add sequence info
 221     #cistematic.core.retrieveSequence(genome, chrom, start, stop, sense="F", db="")
 222     #ooooo. let's do this with PyGr
 223
 224     #TODO: add quality - max for each base, but default is to not include just like current
 225     # rdsToFastq() sets everything to max
 226
 227     if paired:
 228         if pairID == "1":
 229             alignedRead.is_read1 = True
 230             alignedRead.is_proper_pair = True
 231         elif pairID == "2":
 232             alignedRead.is_read2 = True
 233             alignedRead.is_proper_pair = True
 234         else:
 235             pass
 236
 237     if "mismatch" in outputDict:
 238         mismatchTag = getMismatches(outputDict["mismatch"])
 239         if mismatchTag:
 240             tagList.append(("MD", str(mismatchTag)))
 241
 242     if tagList:
 243         alignedRead.tags = tuple(tagList)
 244
 245     outfile.write(alignedRead)
 246
 247
 248 def getMismatches(mismatchString):
 249     mismatchList = []
 250     positions = re.findall("\d+", mismatchString)
 251     nucleotides = re.findall("([ACGTN])\d+", mismatchString)
 252     for index in range(0, len(positions)):
 253         mismatchList.append("%s%s" % (positions[index], nucleotides[index]))
 254
 255     mismatch = string.join(mismatchList, "")
 256
 257     return mismatch
 258
 259
 260 def doNotOutputChromosome(chrom, enforceChr):
 261     result = False
 262
 263     if chrom == "chrM":
 264         result = True
 265
 266     if enforceChr and ("chr" not in chrom):
 267         result = True
 268
 269     return result
 270
 271
 272 def getFastaSequenceDictionary(fastaFileName):
 273     fastaSeqDict = {}
 274     fchrom = ""
 275     fseq = ""
 276
 277     fastafile = open(fastaFileName)
 278     for line in fastafile:
 279         if line[0] == ">":
 280             if fchrom != "":
 281                 fastaSeqDict[fchrom] = fseq
 282
 283             fseq = ""
 284             fchrom = line[1:-1]
 285         else:
 286             fseq += line.strip()
 287
 288     fastafile.close()
 289
 290     return fastaSeqDict
 291
 292
 293 def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
 294     spliceSense = {"GTAG": "+",
 295                    "GCAG": "+",
 296                    "ATAC": "+",
 297                    "CTAC": "-",
 298                    "CTGC": "-",
 299                    "GTAT": "-"
 300     }
 301
 302     noncanonical = 0
 303     intronstart = stopLeft
 304     intronlen = startRight - stopLeft
 305     leftJunctionSig =fastaSequence[intronstart:intronstart+2]
 306     rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
 307     spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
 308     spliceJunction = spliceJunction.upper()
 309     print spliceJunction
 310     if spliceJunction in spliceSense:
 311         sense = spliceSense[spliceJunction]
 312     else:
 313         noncanonical += 1
 314         senses = ["+", "-"]
 315         random.shuffle(senses)
 316         sense = senses[0]
 317
 318     return sense, noncanonical
 319
 320
 321 if __name__ == "__main__":
 322     main(sys.argv)