+'''
+User specs:
+1. Checks if there are NH tags presents
+2. If there aren't, it goes through the BAM file, calculates the read multiplicity
+3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written but this time with the NH tags
+4. Then deletes the original file and replaces with the temp file / renames the temp file to have the name of the original file
+5. Also, if we can write the number of reads into the header and check for its presence and if present, use it, if not replace
+the file with one that has that information in the same manner, it will save the need to count all the reads every time.
+It will have to be several numbers - total reads, unique reads, multi reads, reads mapping on the plus and minus strand.
+
+The only place to have the flag is going to be to attach them to the individual reads. We'll use the 'ZG' tag and have it
+flagged here if the tag will be populated. Read will be tagged with the feature label or 'NM' if it is not in a feature.
+'''
+import sys
+import pysam
+import optparse
+from cistematic.genomes import Genome
+from commoncode import isSpliceEntry, getFeaturesByChromDict
+
+
+def main(argv=None):
+
+ usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
+ if len(sys.argv) < 3:
+ print 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
+ print ' BAM file has to be indexed'
+ sys.exit(1)
+
+ parser = getParser(usage)
+ (options, args) = parser.parse_args(argv[1:])
+
+ BAM = args[0]
+ outputfilename = args[1]
+
+ samfile = pysam.Samfile(BAM, "rb" )
+ readMultiplicityDict = getReadMultiplicity(samfile)
+
+ counts = getReadCounts(samfile, readMultiplicityDict)
+ outheader = buildHeader(samfile.header, counts)
+ if options.markGID:
+ genome = Genome(options.genomeName, inRAM=True)
+ if options.extendGenome != "":
+ genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
+
+ print "getting gene features...."
+ featuresByChromDict = getFeaturesByChromDict(genome)
+ outheader['CO'].append(options.genomeComment)
+
+ outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
+ for alignedread in samfile.fetch(until_eof=True):
+ geneModelFlag = ""
+ if options.markGID:
+ chrom = samfile.getrname(alignedread.tid)
+ chromNum = chrom[3:]
+ geneModelFlag = "NM"
+ for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
+ if start < alignedread.pos and stop > alignedread.pos:
+ geneModelFlag = gid
+ continue
+
+ ID = getReadID(alignedread)
+ try:
+ scaleby = alignedread.opt('NH')
+ outfile.write(alignedread)
+ except KeyError:
+ scaleby = readMultiplicityDict[ID]
+ writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag)
+
+ outfile.close()
+
+
+def getParser(usage):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--models", dest="extendGenome")
+ parser.add_option("--genome", dest="genomeName")
+ parser.add_option("--genomeName", dest="genomeComment")
+ parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+ parser.add_option("--markGID", action="store_true", dest="markGID")
+
+ parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID="")
+
+ return parser
+
+
+def getReadMultiplicity(samfile):
+ """ There is an assumption here that the set of all reads will not have a mixture of
+ reads bearing the NH tag and reads without.
+ """
+ readMultiplicityDict = {}
+ processedReads = 0
+ for alignedread in samfile.fetch(until_eof=True):
+ try:
+ readMultiplicity = alignedread.opt('NH')
+ return {}
+ except KeyError:
+ ID = getReadID(alignedread)
+ try:
+ readMultiplicityDict[ID] += 1
+ except KeyError:
+ readMultiplicityDict[ID] = 1
+
+ processedReads += 1
+ if processedReads % 5000000 == 0:
+ print str(processedReads/1000000) + 'M alignments processed in multiplicity examination'
+
+ return readMultiplicityDict
+
+
+def getReadCounts(samfile, readMultiplicityDict):
+ uniques = 0
+ multis = 0.0
+ minusUnique = 0
+ uniqueSplice = 0
+ multiSplice = 0.0
+ minusMulti = 0.0
+ readLength = 0
+ for alignedread in samfile.fetch(until_eof=True):
+ if readLength == 0:
+ readLength = getReadLength(alignedread.cigar)
+
+ try:
+ readMultiplicity = alignedread.opt('NH')
+ except KeyError:
+ ID = getReadID(alignedread)
+ readMultiplicity = readMultiplicityDict[ID]
+
+ if readMultiplicity == 1:
+ if alignedread.is_reverse:
+ minusUnique += 1
+
+ if isSpliceEntry(alignedread.cigar):
+ uniqueSplice += 1
+ else:
+ uniques += 1
+ else:
+ if alignedread.is_reverse:
+ minusMulti += 1.0/readMultiplicity
+
+ if isSpliceEntry(alignedread.cigar):
+ multiSplice += 1.0/readMultiplicity
+ else:
+ multis += 1.0/readMultiplicity
+
+ totalReads = uniques + multis + uniqueSplice + multiSplice
+ plusUnique = uniques + uniqueSplice - minusUnique
+ plusMulti = multis + multiSplice - minusMulti
+ counts = ["Total\t%d" % totalReads,
+ "Unique\t%d" % uniques,
+ "Multis\t%d" % multis,
+ "UniqueSplices\t%d" % uniqueSplice,
+ "Multisplices\t%d" % multiSplice,
+ "PlusUnique\t%d" % plusUnique,
+ "PlusMulti\t%d" % plusMulti,
+ "MinusUnique\t%d" % minusUnique,
+ "MinusMulti\t%d" % minusMulti,
+ "ReadLength\t%d" % readLength
+ ]
+
+ return counts
+
+
+def buildHeader(templateheader, commentList):
+ header = templateheader
+ header["CO"] = commentList
+
+ return header
+
+
+def getReadID(alignedread):
+ ID = alignedread.qname
+ if alignedread.is_read1:
+ ID = ID + '/1'
+
+ if alignedread.is_read2:
+ ID = ID + '/2'
+
+ return ID
+
+
+def getReadLength(cigar):
+ take = (0, 1) # CIGAR operation (M/match, I/insertion)
+
+ return sum([length for op,length in cigar if op in take])
+
+
+def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag):
+ newAlignedRead = originalRead
+ newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity), ("ZG", geneModelFlag)]
+ outfile.write(newAlignedRead)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file