bamPreprocessing.py

   1 '''
   2 User specs:
   3 1. Checks if there are NH tags presents
   4 2. If there aren't, it goes through the BAM file, calculates the read multiplicity
   5 3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written but this time with the NH tags
   6 4. Then deletes the original file and replaces with the temp file / renames the temp file to have the name of the original file
   7 5. Also, if we can write the number of reads into the header and check for its presence and if present, use it, if not replace
   8 the file with one that has that information in the same manner, it will save the need to count all the reads every time.
   9 It will have to be several numbers - total reads, unique reads, multi reads, reads mapping on the plus and minus strand.
  10
  11 The only place to have the flag is going to be to attach them to the individual reads.  We'll use the 'ZG' tag and have it
  12 flagged here if the tag will be populated.  Read will be tagged with the feature label or 'NM' if it is not in a feature.
  13 '''
  14 import sys
  15 import pysam
  16 import optparse
  17 from cistematic.genomes import Genome
  18 from commoncode import isSpliceEntry, getFeaturesByChromDict
  19
  20
  21 def main(argv=None):
  22
  23     usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
  24     if len(sys.argv) < 3:
  25         print 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
  26         print '       BAM file has to be indexed'
  27         sys.exit(1)
  28
  29     parser = getParser(usage)
  30     (options, args) = parser.parse_args(argv[1:])
  31
  32     BAM = args[0]
  33     outputfilename = args[1]
  34
  35     samfile = pysam.Samfile(BAM, "rb" )
  36     readMultiplicityDict = getReadMultiplicity(samfile)
  37
  38     counts = getReadCounts(samfile, readMultiplicityDict)
  39     outheader = buildHeader(samfile.header, counts)
  40     if options.markGID:
  41         genome = Genome(options.genomeName, inRAM=True)
  42         if options.extendGenome != "":
  43             genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
  44
  45         print "getting gene features...."
  46         featuresByChromDict = getFeaturesByChromDict(genome)
  47         outheader['CO'].append(options.genomeComment)
  48
  49     outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
  50     for alignedread in samfile.fetch(until_eof=True):
  51         geneModelFlag = ""
  52         if options.markGID:
  53             chrom = samfile.getrname(alignedread.tid)
  54             chromNum = chrom[3:]
  55             geneModelFlag = "NM"
  56             for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
  57                 if start < alignedread.pos and stop > alignedread.pos:
  58                     geneModelFlag = gid
  59                     continue
  60
  61         ID = getReadID(alignedread)
  62         try:
  63             scaleby = alignedread.opt('NH')
  64             outfile.write(alignedread)
  65         except KeyError:
  66             scaleby = readMultiplicityDict[ID]
  67             writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag)
  68
  69     outfile.close()
  70
  71
  72 def getParser(usage):
  73     parser = optparse.OptionParser(usage=usage)
  74     parser.add_option("--models", dest="extendGenome")
  75     parser.add_option("--genome", dest="genomeName")
  76     parser.add_option("--genomeName", dest="genomeComment")
  77     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
  78     parser.add_option("--markGID", action="store_true", dest="markGID")
  79
  80     parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID="")
  81
  82     return parser
  83
  84
  85 def getReadMultiplicity(samfile):
  86     """ There is an assumption here that the set of all reads will not have a mixture of
  87         reads bearing the NH tag and reads without.
  88     """
  89     readMultiplicityDict = {}
  90     processedReads = 0
  91     for alignedread in samfile.fetch(until_eof=True):
  92         try:
  93             readMultiplicity = alignedread.opt('NH')
  94             return {}
  95         except KeyError:
  96             ID = getReadID(alignedread)
  97             try:
  98                 readMultiplicityDict[ID] += 1
  99             except KeyError:
 100                 readMultiplicityDict[ID] = 1
 101
 102         processedReads += 1
 103         if processedReads % 5000000 == 0:
 104             print str(processedReads/1000000) + 'M alignments processed in multiplicity examination'
 105
 106     return readMultiplicityDict
 107
 108
 109 def getReadCounts(samfile, readMultiplicityDict):
 110     uniques = 0
 111     multis = 0.0
 112     minusUnique = 0
 113     uniqueSplice = 0
 114     multiSplice = 0.0
 115     minusMulti = 0.0
 116     readLength = 0
 117     for alignedread in samfile.fetch(until_eof=True):
 118         if readLength == 0:
 119             readLength = getReadLength(alignedread.cigar)
 120
 121         try:
 122             readMultiplicity = alignedread.opt('NH')
 123         except KeyError:
 124             ID = getReadID(alignedread)
 125             readMultiplicity = readMultiplicityDict[ID]
 126
 127         if readMultiplicity == 1:
 128             if alignedread.is_reverse:
 129                 minusUnique += 1
 130
 131             if isSpliceEntry(alignedread.cigar):
 132                 uniqueSplice += 1
 133             else:
 134                 uniques += 1
 135         else:
 136             if alignedread.is_reverse:
 137                 minusMulti += 1.0/readMultiplicity
 138
 139             if isSpliceEntry(alignedread.cigar):
 140                 multiSplice += 1.0/readMultiplicity
 141             else:
 142                 multis += 1.0/readMultiplicity
 143
 144     totalReads = uniques + multis + uniqueSplice + multiSplice
 145     plusUnique = uniques + uniqueSplice - minusUnique
 146     plusMulti = multis + multiSplice - minusMulti
 147     counts = ["Total\t%d" % totalReads,
 148               "Unique\t%d" % uniques,
 149               "Multis\t%d" % multis,
 150               "UniqueSplices\t%d" % uniqueSplice,
 151               "Multisplices\t%d" % multiSplice,
 152               "PlusUnique\t%d" % plusUnique,
 153               "PlusMulti\t%d" % plusMulti,
 154               "MinusUnique\t%d" % minusUnique,
 155               "MinusMulti\t%d" % minusMulti,
 156               "ReadLength\t%d" % readLength
 157     ]
 158
 159     return counts
 160
 161
 162 def buildHeader(templateheader, commentList):
 163     header = templateheader
 164     header["CO"] = commentList
 165
 166     return header
 167
 168
 169 def getReadID(alignedread):
 170     ID = alignedread.qname
 171     if alignedread.is_read1:
 172         ID = ID + '/1'
 173
 174     if alignedread.is_read2:
 175         ID = ID + '/2'
 176
 177     return ID
 178
 179
 180 def getReadLength(cigar):
 181     take = (0, 1) # CIGAR operation (M/match, I/insertion)
 182
 183     return sum([length for op,length in cigar if op in take])
 184
 185
 186 def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag):
 187     newAlignedRead = originalRead
 188     newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity), ("ZG", geneModelFlag)]
 189     outfile.write(newAlignedRead)
 190
 191
 192 if __name__ == "__main__":
 193     main(sys.argv)