MakeBamFromRds.py

   1 """
   2 MakeBamFromRds
   3
   4 Converts ERANGE RDS zero based file to Bam zero based format.
   5
   6 Usage: python MakeBamFromRDS.py rdsFile bamFile [options]
   7
   8 """
   9
  10 try:
  11     import psyco
  12     psyco.full()
  13 except:
  14     pass
  15
  16 import sys
  17 import re
  18 import optparse
  19 import random
  20 import pysam
  21 from commoncode import readDataset
  22
  23
  24 def main(argv=None):
  25     if not argv:
  26         argv = sys.argv
  27
  28     verstring = "MakeBamFromRds: version 1.0"
  29     print verstring
  30
  31     doPairs = False
  32
  33     usage = "usage: python %prog rdsFile bamFile [options]"
  34
  35     parser = optparse.OptionParser(usage=usage)
  36     parser.add_option("--nouniq", action="store_false", dest="withUniqs")
  37     parser.add_option("--nomulti", action="store_false", dest="withMulti")
  38     parser.add_option("--splices", action="store_true", dest="doSplices")
  39     parser.add_option("--flag", dest="withFlag")
  40     parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
  41     parser.add_option("--pairs", action="store_true", dest="doPairs")
  42     parser.add_option("--cache", type="int", dest="cachePages")
  43     parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
  44     parser.add_option("--chrom", action="append", dest="chromList")
  45     parser.add_option("--fasta", dest="fastaFileName")
  46     parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False,
  47                         doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
  48                         doCache=False, cachePages=100000, fastaFileName="",
  49                         chromList=[])
  50
  51     (options, args) = parser.parse_args(argv[1:])
  52
  53     if len(args) < 2:
  54         print usage
  55         sys.exit(1)
  56
  57     rdsfile = args[0]
  58     outfilename = args[1]
  59
  60     allChrom = True
  61     if options.chromList:
  62         allChrom = False
  63
  64     makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti,
  65                      options.doSplices, doPairs, options.withFlag, options.useFlagLike,
  66                      options.enforceChr, allChrom, options.doCache, options.cachePages,
  67                      options.chromList, options.fastaFileName)
  68
  69
  70 def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
  71                      doSplices=False, doPairs=False, withFlag="",
  72                      useFlagLike=False, enforceChr=False, allChrom=True,
  73                      doCache=False, cachePages=100000, chromList=[], fastaFileName=""):
  74
  75     if not withUniqs and not withMulti and not doSplices:
  76         print "must be outputting at least one of uniqs, multi, or -splices - exiting"
  77         sys.exit(1)
  78
  79     print "\nsample:"
  80     RDS = readDataset(rdsfile, verbose = True, cache=doCache)
  81
  82     if cachePages > RDS.getDefaultCacheSize():
  83         RDS.setDBcache(cachePages)
  84
  85     readlength = RDS.getReadSize()
  86
  87     if allChrom:
  88         if withUniqs:
  89             chromList = RDS.getChromosomes()
  90         elif withMulti:
  91             chromList = RDS.getChromosomes(table="multi")
  92         else:
  93             chromList = RDS.getChromosomes(table="splices")
  94
  95         chromList.sort()
  96
  97     fastaSequenceDict = {}
  98     if fastaFileName:
  99         fastafile = open(fastaFileName)
 100         fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)
 101         fastafile.close()
 102
 103     referenceSequenceList = []
 104     chromRemoveList = []
 105     for chromosome in chromList:
 106         if doNotOutputChromosome(chromosome, enforceChr):
 107             chromRemoveList.append(chromosome)
 108         else:
 109             chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices)
 110             referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)}
 111             referenceSequenceList.append(referenceDataDict)
 112
 113     for chrom in chromRemoveList:
 114         chromList.remove(chrom)
 115
 116     header = {"HD": {"VN": "1.0"}}
 117     if referenceSequenceList:
 118         header["SQ"] = referenceSequenceList
 119
 120     outfile = pysam.Samfile(outfilename, "wb", header=header)
 121
 122     totalWrites = 0
 123     noncanonicalSplices = 0
 124     for chrom in chromList:
 125         index = 0
 126         print "chromosome %s" % (chrom)
 127         if withUniqs or withMulti:
 128             hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
 129                                        withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
 130                                        flagLike=useFlagLike, entryDict=True)
 131
 132             for read in hitDict[chrom]:
 133                 writeBAMEntry(outfile, chrom, read, readlength)
 134                 index += 1
 135
 136         if doSplices:
 137             numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict)
 138             index += numSpliceReadsWritten
 139             noncanonicalSplices += noncanonical
 140
 141         print index
 142         totalWrites += index
 143
 144     outfile.close()
 145     print "%d total reads written" % totalWrites
 146     print "%d non-canonical splices" % noncanonicalSplices
 147
 148
 149 def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}):
 150     index = 0
 151     noncanonicalSplices = 0
 152     spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True)
 153     if chrom not in spliceDict:
 154         pass
 155     else:
 156         for read in spliceDict[chrom]:
 157             if fastaSequenceDict.has_key(chrom):
 158                 read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"])
 159                 noncanonicalSplices += noncanonical
 160
 161             writeBAMEntry(outfile, chrom, read, readlength)
 162             index += 1
 163
 164     return index, noncanonicalSplices
 165
 166
 167 def writeBAMEntry(outfile, chrom, outputDict, readlength):
 168     tagList = []
 169     alignedRead = pysam.AlignedRead()
 170     alignedRead.qname = outputDict["readID"]
 171     if outputDict["sense"] == "-":
 172         alignedRead.is_reverse = True
 173
 174     alignedRead.rname = outfile.references.index(chrom)
 175
 176     if outputDict.has_key("startL"):
 177         startL = outputDict["startL"]
 178         stopL = outputDict["stopL"]
 179         startR = outputDict["startR"]
 180         stopR = outputDict["stopR"]
 181         alignedRead.pos = startL
 182         alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)]
 183         tagList.append(("XS", outputDict["sense"]))
 184     else:
 185         alignedRead.pos = outputDict["start"]
 186         alignedRead.cigar = [(0, readlength)]
 187
 188     if outputDict.has_key("pairID"):
 189         pairID = outputDict["pairID"]
 190         if pairID == "1":
 191             alignedRead.is_read1 = True
 192             alignedRead.is_proper_pair = True
 193         elif pairID == "2":
 194             alignedRead.is_read2 = True
 195             alignedRead.is_proper_pair = True
 196         else:
 197             pass
 198
 199     if outputDict.has_key("mismatch"):
 200         mismatchTag = getMismatches(outputDict["mismatch"])
 201         if mismatchTag:
 202             tagList.append(("MD", mismatchTag))
 203
 204     if tagList:
 205         alignedRead.tags = tagList
 206
 207     outfile.write(alignedRead)
 208
 209
 210 def getMismatches(mismatchString):
 211     mismatch = ""
 212     positions = re.findall("\d+", mismatchString)
 213     nucleotides = re.findall("([ACGTN])\d+", mismatchString)
 214     for index in range(0, len(positions)):
 215         mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index])
 216
 217     return mismatch
 218
 219
 220 def doNotOutputChromosome(chrom, enforceChr):
 221     result = False
 222
 223     if chrom == "chrM":
 224         result = True
 225
 226     if enforceChr and ("chr" not in chrom):
 227         result = True
 228
 229     return result
 230
 231
 232 def getFastaSequenceDictionary(fastaFileName):
 233     fastaSeqDict = {}
 234     fchrom = ""
 235     fseq = ""
 236
 237     fastafile = open(fastaFileName)
 238     for line in fastafile:
 239         if line[0] == ">":
 240             if fchrom != "":
 241                 fastaSeqDict[fchrom] = fseq
 242
 243             fseq = ""
 244             fchrom = line[1:-1]
 245         else:
 246             fseq += line.strip()
 247
 248     fastafile.close()
 249
 250     return fastaSeqDict
 251
 252
 253 def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
 254     spliceSense = {"GTAG": "+",
 255                    "GCAG": "+",
 256                    "ATAC": "+",
 257                    "CTAC": "-",
 258                    "CTGC": "-",
 259                    "GTAT": "-"
 260     }
 261
 262     noncanonical = 0
 263     intronstart = stopLeft
 264     intronlen = startRight - stopLeft
 265     leftJunctionSig =fastaSequence[intronstart:intronstart+2]
 266     rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
 267     spliceJunction = leftJunctionSig + rightJunctionSig
 268     spliceJunction = spliceJunction.upper()
 269     if spliceSense.has_key(spliceJunction):
 270         sense = spliceSense[spliceJunction]
 271     else:
 272         noncanonical += 1
 273         senses = ["+", "-"]
 274         random.shuffle(senses)
 275         sense = senses[0]
 276
 277     return sense, noncanonical
 278
 279
 280 if __name__ == "__main__":
 281     main(sys.argv)