MakeBamFromRds.py

   1 """
   2 MakeBamFromRds
   3
   4 Converts ERANGE RDS zero based file to Bam zero based format.
   5
   6 Usage: python MakeBamFromRDS.py rdsFile bamFile [options]
   7
   8 """
   9
  10 try:
  11     import psyco
  12     psyco.full()
  13 except:
  14     pass
  15
  16 import sys
  17 import re
  18 import optparse
  19 import random
  20 import string
  21 import pysam
  22 import ReadDataset
  23 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
  24
  25
  26 def main(argv=None):
  27     if not argv:
  28         argv = sys.argv
  29
  30     verstring = "makeBamFromRds: version 1.0"
  31     print verstring
  32
  33     doPairs = False
  34
  35     usage = "usage: python %prog rdsFile bamFile [options]"
  36
  37     parser = getParser(usage)
  38     (options, args) = parser.parse_args(argv[1:])
  39
  40     if len(args) < 2:
  41         print usage
  42         sys.exit(1)
  43
  44     rdsfile = args[0]
  45     outfilename = args[1]
  46
  47     allChrom = True
  48     if options.chromList:
  49         allChrom = False
  50
  51     makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti,
  52                      options.doSplices, doPairs, options.withFlag, options.useFlagLike,
  53                      options.enforceChr, allChrom, options.doCache, options.cachePages,
  54                      options.chromList, options.fastaFileName)
  55
  56
  57 def getParser(usage):
  58     parser = optparse.OptionParser(usage=usage)
  59     parser.add_option("--nouniq", action="store_false", dest="withUniqs")
  60     parser.add_option("--nomulti", action="store_false", dest="withMulti")
  61     parser.add_option("--splices", action="store_true", dest="doSplices")
  62     parser.add_option("--flag", dest="withFlag")
  63     parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
  64     parser.add_option("--pairs", action="store_true", dest="doPairs")
  65     parser.add_option("--cache", type="int", dest="cachePages")
  66     parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
  67     parser.add_option("--chrom", action="append", dest="chromList")
  68     parser.add_option("--fasta", dest="fastaFileName")
  69
  70     configParser = getConfigParser()
  71     section = "MakeBamFromRds"
  72     withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
  73     withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
  74     doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
  75     doPairs = getConfigBoolOption(configParser, section, "doPairs", False)
  76     withFlag = getConfigOption(configParser, section, "withFlag", "")
  77     useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
  78     enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
  79     doCache = getConfigBoolOption(configParser, section, "doCache", False)
  80     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
  81     fastaFileName = getConfigOption(configParser, section, "fastaFileName", "")
  82
  83     parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices,
  84                         doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
  85                         doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName,
  86                         chromList=[])
  87
  88     return parser
  89
  90
  91 def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
  92                      doSplices=False, doPairs=False, withFlag="",
  93                      useFlagLike=False, enforceChr=False, allChrom=True,
  94                      doCache=False, cachePages=100000, chromList=[], fastaFileName=""):
  95
  96     if not withUniqs and not withMulti and not doSplices:
  97         print "must be outputting at least one of uniqs, multi, or -splices - exiting"
  98         sys.exit(1)
  99
 100     print "\nsample:"
 101     RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
 102
 103     if cachePages > RDS.getDefaultCacheSize():
 104         RDS.setDBcache(cachePages)
 105
 106     readlength = RDS.getReadSize()
 107
 108     if allChrom:
 109         if withUniqs:
 110             chromList = RDS.getChromosomes()
 111         elif withMulti:
 112             chromList = RDS.getChromosomes(table="multi")
 113         else:
 114             chromList = RDS.getChromosomes(table="splices")
 115
 116         chromList.sort()
 117
 118     fastaSequenceDict = {}
 119     if fastaFileName:
 120         fastafile = open(fastaFileName)
 121         fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)
 122         fastafile.close()
 123
 124     referenceSequenceList = []
 125     chromRemoveList = []
 126     for chromosome in chromList:
 127         if doNotOutputChromosome(chromosome, enforceChr):
 128             chromRemoveList.append(chromosome)
 129         else:
 130             chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices)
 131             referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)}
 132             referenceSequenceList.append(referenceDataDict)
 133
 134     for chrom in chromRemoveList:
 135         chromList.remove(chrom)
 136
 137     header = {"HD": {"VN": "1.0"}}
 138     if referenceSequenceList:
 139         header["SQ"] = referenceSequenceList
 140
 141     outfile = pysam.Samfile(outfilename, "wb", header=header)
 142
 143     totalWrites = 0
 144     noncanonicalSpliceCount = 0
 145     for chrom in chromList:
 146         index = 0
 147         print "chromosome %s" % (chrom)
 148         if withUniqs or withMulti:
 149             hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
 150                                        doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
 151                                        flagLike=useFlagLike, withMismatch=True)
 152
 153             for read in hitDict[chrom]:
 154                 writeBAMEntry(outfile, chrom, read, readlength)
 155                 index += 1
 156
 157         if doSplices:
 158             numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict)
 159             index += numSpliceReadsWritten
 160             noncanonicalSpliceCount += noncanonical
 161
 162         print index
 163         totalWrites += index
 164
 165     outfile.close()
 166     print "%d total reads written" % totalWrites
 167     print "%d non-canonical splices" % noncanonicalSpliceCount
 168
 169
 170 def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}):
 171     index = 0
 172     noncanonicalSplices = 0
 173     spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True,
 174                                     withMismatch=True)
 175     if chrom not in spliceDict:
 176         pass
 177     else:
 178         for read in spliceDict[chrom]:
 179             if fastaSequenceDict.has_key(chrom):
 180                 read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
 181                 noncanonicalSplices += noncanonical
 182
 183             writeBAMEntry(outfile, chrom, read, readlength)
 184             index += 1
 185
 186     return index, noncanonicalSplices
 187
 188
 189 def writeBAMEntry(outfile, chrom, outputDict, readlength):
 190     """ We need to subtract 1 from the position because rds is 1 based and
 191         most of the rest of the entire world is 0 based.
 192     """
 193     tagList = []
 194     alignedRead = pysam.AlignedRead()
 195     try:
 196         (readID, pairID) = outputDict["readID"].split("/")
 197         paired = True
 198     except ValueError:
 199         readID = outputDict["readID"]
 200         paired = False
 201
 202     alignedRead.qname = readID
 203     if outputDict["sense"] == "-":
 204         alignedRead.is_reverse = True
 205
 206     alignedRead.rname = outfile.references.index(chrom)
 207
 208     if outputDict.has_key("startL"):
 209         startL = outputDict["startL"] - 1
 210         stopL = outputDict["stopL"] - 1
 211         startR = outputDict["startR"] - 1
 212         stopR = outputDict["stopR"] - 1
 213         alignedRead.pos = startL
 214         alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)]
 215         tagList.append(("XS", str(outputDict["sense"])))
 216     else:
 217         alignedRead.pos = outputDict["start"] - 1
 218         alignedRead.cigar = [(0, readlength)]
 219
 220     if paired:
 221         if pairID == "1":
 222             alignedRead.is_read1 = True
 223             alignedRead.is_proper_pair = True
 224         elif pairID == "2":
 225             alignedRead.is_read2 = True
 226             alignedRead.is_proper_pair = True
 227         else:
 228             pass
 229
 230     if outputDict.has_key("mismatch"):
 231         mismatchTag = getMismatches(outputDict["mismatch"])
 232         if mismatchTag:
 233             tagList.append(("MD", str(mismatchTag)))
 234
 235     if tagList:
 236         alignedRead.tags = tuple(tagList)
 237
 238     outfile.write(alignedRead)
 239
 240
 241 def getMismatches(mismatchString):
 242     mismatchList = []
 243     positions = re.findall("\d+", mismatchString)
 244     nucleotides = re.findall("([ACGTN])\d+", mismatchString)
 245     for index in range(0, len(positions)):
 246         mismatchList.append("%s%s" % (positions[index], nucleotides[index]))
 247
 248     mismatch = string.join(mismatchList, "")
 249
 250     return mismatch
 251
 252
 253 def doNotOutputChromosome(chrom, enforceChr):
 254     result = False
 255
 256     if chrom == "chrM":
 257         result = True
 258
 259     if enforceChr and ("chr" not in chrom):
 260         result = True
 261
 262     return result
 263
 264
 265 def getFastaSequenceDictionary(fastaFileName):
 266     fastaSeqDict = {}
 267     fchrom = ""
 268     fseq = ""
 269
 270     fastafile = open(fastaFileName)
 271     for line in fastafile:
 272         if line[0] == ">":
 273             if fchrom != "":
 274                 fastaSeqDict[fchrom] = fseq
 275
 276             fseq = ""
 277             fchrom = line[1:-1]
 278         else:
 279             fseq += line.strip()
 280
 281     fastafile.close()
 282
 283     return fastaSeqDict
 284
 285
 286 def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
 287     spliceSense = {"GTAG": "+",
 288                    "GCAG": "+",
 289                    "ATAC": "+",
 290                    "CTAC": "-",
 291                    "CTGC": "-",
 292                    "GTAT": "-"
 293     }
 294
 295     noncanonical = 0
 296     intronstart = stopLeft
 297     intronlen = startRight - stopLeft
 298     leftJunctionSig =fastaSequence[intronstart:intronstart+2]
 299     rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
 300     spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
 301     spliceJunction = spliceJunction.upper()
 302     print spliceJunction
 303     if spliceSense.has_key(spliceJunction):
 304         sense = spliceSense[spliceJunction]
 305     else:
 306         noncanonical += 1
 307         senses = ["+", "-"]
 308         random.shuffle(senses)
 309         sense = senses[0]
 310
 311     return sense, noncanonical
 312
 313
 314 if __name__ == "__main__":
 315     main(sys.argv)