bamPreprocessing.py

   1 '''
   2 User specs:
   3 1. Checks if there are NH tags presents
   4 2. If there aren't, it goes through the BAM file, calculates the read multiplicity
   5 3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written but this time with the NH tags
   6 4. Then deletes the original file and replaces with the temp file / renames the temp file to have the name of the original file
   7 5. Also, if we can write the number of reads into the header and check for its presence and if present, use it, if not replace
   8 the file with one that has that information in the same manner, it will save the need to count all the reads every time.
   9 It will have to be several numbers - total reads, unique reads, multi reads, reads mapping on the plus and minus strand.
  10
  11 The only place to have the flag is going to be to attach them to the individual reads.  We'll use the 'ZG' tag and have it
  12 flagged here if the tag will be populated.  Read will be tagged with the feature label or 'NM' if it is not in a feature.
  13 '''
  14 import sys
  15 import pysam
  16 import optparse
  17 from cistematic.genomes import Genome
  18 from commoncode import isSpliceEntry
  19
  20
  21 def main(argv=None):
  22
  23     usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
  24     if len(sys.argv) < 3:
  25         print 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
  26         print '       BAM file has to be indexed'
  27         sys.exit(1)
  28
  29     parser = getParser(usage)
  30     (options, args) = parser.parse_args(argv[1:])
  31
  32     BAM = args[0]
  33     outputfilename = args[1]
  34
  35     samfile = pysam.Samfile(BAM, "rb" )
  36     readMultiplicityDict = getReadMultiplicity(samfile)
  37
  38     counts, readLength = getReadCounts(samfile, readMultiplicityDict)
  39     outheader = buildHeader(samfile.header, counts)
  40     if options.markGID:
  41         genome = Genome(options.genomeName, inRAM=True)
  42         if options.extendGenome != "":
  43             genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
  44
  45         print "getting gene features...."
  46         outheader['CO'].append(options.genomeComment)
  47
  48     outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
  49     for alignedread in samfile.fetch(until_eof=True):
  50         geneModelFlag = ""
  51         if options.markGID:
  52             chrom = samfile.getrname(alignedread.tid)
  53             chromNum = chrom[3:]
  54             #new direct query
  55             geneFeatures = genome.getFeaturesIntersecting(chromNum, alignedread.pos, readLength)
  56             try:
  57                 (name, version, chromosome, start, stop, orientation, atype) = geneFeatures[0]
  58                 geneModelFlag = name
  59             except IndexError:
  60                 geneModelFlag = "NM"
  61
  62         ID = getReadID(alignedread)
  63         try:
  64             scaleby = alignedread.opt('NH')
  65             outfile.write(alignedread)
  66         except KeyError:
  67             scaleby = readMultiplicityDict[ID]
  68             writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag)
  69
  70     outfile.close()
  71
  72
  73 def getParser(usage):
  74     parser = optparse.OptionParser(usage=usage)
  75     parser.add_option("--models", dest="extendGenome")
  76     parser.add_option("--genome", dest="genomeName")
  77     parser.add_option("--genomeName", dest="genomeComment")
  78     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
  79     parser.add_option("--markGID", action="store_true", dest="markGID")
  80
  81     parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID="")
  82
  83     return parser
  84
  85
  86 def getReadMultiplicity(samfile):
  87     """ There is an assumption here that the set of all reads will not have a mixture of
  88         reads bearing the NH tag and reads without.
  89     """
  90     readMultiplicityDict = {}
  91     processedReads = 0
  92     for alignedread in samfile.fetch(until_eof=True):
  93         try:
  94             readMultiplicity = alignedread.opt('NH')
  95             return {}
  96         except KeyError:
  97             ID = getReadID(alignedread)
  98             try:
  99                 readMultiplicityDict[ID] += 1
 100             except KeyError:
 101                 readMultiplicityDict[ID] = 1
 102
 103         processedReads += 1
 104         if processedReads % 5000000 == 0:
 105             print str(processedReads/1000000) + 'M alignments processed in multiplicity examination'
 106
 107     return readMultiplicityDict
 108
 109
 110 def getReadCounts(samfile, readMultiplicityDict):
 111     uniques = 0
 112     multis = 0.0
 113     minusUnique = 0
 114     uniqueSplice = 0
 115     multiSplice = 0.0
 116     minusMulti = 0.0
 117     readLength = 0
 118     for alignedread in samfile.fetch(until_eof=True):
 119         if readLength == 0:
 120             readLength = getReadLength(alignedread.cigar)
 121
 122         try:
 123             readMultiplicity = alignedread.opt('NH')
 124         except KeyError:
 125             ID = getReadID(alignedread)
 126             readMultiplicity = readMultiplicityDict[ID]
 127
 128         if readMultiplicity == 1:
 129             if alignedread.is_reverse:
 130                 minusUnique += 1
 131
 132             if isSpliceEntry(alignedread.cigar):
 133                 uniqueSplice += 1
 134             else:
 135                 uniques += 1
 136         else:
 137             if alignedread.is_reverse:
 138                 minusMulti += 1.0/readMultiplicity
 139
 140             if isSpliceEntry(alignedread.cigar):
 141                 multiSplice += 1.0/readMultiplicity
 142             else:
 143                 multis += 1.0/readMultiplicity
 144
 145     totalReads = uniques + multis + uniqueSplice + multiSplice
 146     plusUnique = uniques + uniqueSplice - minusUnique
 147     plusMulti = multis + multiSplice - minusMulti
 148     counts = ["Total\t%d" % totalReads,
 149               "Unique\t%d" % uniques,
 150               "Multis\t%d" % multis,
 151               "UniqueSplices\t%d" % uniqueSplice,
 152               "Multisplices\t%d" % multiSplice,
 153               "PlusUnique\t%d" % plusUnique,
 154               "PlusMulti\t%d" % plusMulti,
 155               "MinusUnique\t%d" % minusUnique,
 156               "MinusMulti\t%d" % minusMulti,
 157               "ReadLength\t%d" % readLength
 158     ]
 159
 160     return counts, readLength
 161
 162
 163 def buildHeader(templateheader, commentList):
 164     header = templateheader
 165     header["CO"] = commentList
 166
 167     return header
 168
 169
 170 def getReadID(alignedread):
 171     ID = alignedread.qname
 172     if alignedread.is_read1:
 173         ID = ID + '/1'
 174
 175     if alignedread.is_read2:
 176         ID = ID + '/2'
 177
 178     return ID
 179
 180
 181 def getReadLength(cigar):
 182     take = (0, 1) # CIGAR operation (M/match, I/insertion)
 183
 184     return sum([length for op,length in cigar if op in take])
 185
 186
 187 def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag):
 188     newAlignedRead = originalRead
 189     newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity), ("ZG", geneModelFlag)]
 190     outfile.write(newAlignedRead)
 191
 192
 193 if __name__ == "__main__":
 194     main(sys.argv)