X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=bamPreprocessing.py;fp=bamPreprocessing.py;h=dd5e8612001dd31f19d49470c956205c789f016c;hp=0000000000000000000000000000000000000000;hb=4ad5495359e4322da39868020a7398676261679e;hpb=cfc5602b26323ad2365295145e3f6c622d912eb4 diff --git a/bamPreprocessing.py b/bamPreprocessing.py new file mode 100644 index 0000000..dd5e861 --- /dev/null +++ b/bamPreprocessing.py @@ -0,0 +1,193 @@ +''' +User specs: +1. Checks if there are NH tags presents +2. If there aren't, it goes through the BAM file, calculates the read multiplicity +3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written but this time with the NH tags +4. Then deletes the original file and replaces with the temp file / renames the temp file to have the name of the original file +5. Also, if we can write the number of reads into the header and check for its presence and if present, use it, if not replace +the file with one that has that information in the same manner, it will save the need to count all the reads every time. +It will have to be several numbers - total reads, unique reads, multi reads, reads mapping on the plus and minus strand. + +The only place to have the flag is going to be to attach them to the individual reads. We'll use the 'ZG' tag and have it +flagged here if the tag will be populated. Read will be tagged with the feature label or 'NM' if it is not in a feature. +''' +import sys +import pysam +import optparse +from cistematic.genomes import Genome +from commoncode import isSpliceEntry, getFeaturesByChromDict + + +def main(argv=None): + + usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0] + if len(sys.argv) < 3: + print 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0] + print ' BAM file has to be indexed' + sys.exit(1) + + parser = getParser(usage) + (options, args) = parser.parse_args(argv[1:]) + + BAM = args[0] + outputfilename = args[1] + + samfile = pysam.Samfile(BAM, "rb" ) + readMultiplicityDict = getReadMultiplicity(samfile) + + counts = getReadCounts(samfile, readMultiplicityDict) + outheader = buildHeader(samfile.header, counts) + if options.markGID: + genome = Genome(options.genomeName, inRAM=True) + if options.extendGenome != "": + genome.extendFeatures(options.extendGenome, replace=options.replaceModels) + + print "getting gene features...." + featuresByChromDict = getFeaturesByChromDict(genome) + outheader['CO'].append(options.genomeComment) + + outfile = pysam.Samfile(outputfilename, "wb", header=outheader) + for alignedread in samfile.fetch(until_eof=True): + geneModelFlag = "" + if options.markGID: + chrom = samfile.getrname(alignedread.tid) + chromNum = chrom[3:] + geneModelFlag = "NM" + for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]: + if start < alignedread.pos and stop > alignedread.pos: + geneModelFlag = gid + continue + + ID = getReadID(alignedread) + try: + scaleby = alignedread.opt('NH') + outfile.write(alignedread) + except KeyError: + scaleby = readMultiplicityDict[ID] + writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag) + + outfile.close() + + +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--models", dest="extendGenome") + parser.add_option("--genome", dest="genomeName") + parser.add_option("--genomeName", dest="genomeComment") + parser.add_option("--replacemodels", action="store_true", dest="replaceModels") + parser.add_option("--markGID", action="store_true", dest="markGID") + + parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID="") + + return parser + + +def getReadMultiplicity(samfile): + """ There is an assumption here that the set of all reads will not have a mixture of + reads bearing the NH tag and reads without. + """ + readMultiplicityDict = {} + processedReads = 0 + for alignedread in samfile.fetch(until_eof=True): + try: + readMultiplicity = alignedread.opt('NH') + return {} + except KeyError: + ID = getReadID(alignedread) + try: + readMultiplicityDict[ID] += 1 + except KeyError: + readMultiplicityDict[ID] = 1 + + processedReads += 1 + if processedReads % 5000000 == 0: + print str(processedReads/1000000) + 'M alignments processed in multiplicity examination' + + return readMultiplicityDict + + +def getReadCounts(samfile, readMultiplicityDict): + uniques = 0 + multis = 0.0 + minusUnique = 0 + uniqueSplice = 0 + multiSplice = 0.0 + minusMulti = 0.0 + readLength = 0 + for alignedread in samfile.fetch(until_eof=True): + if readLength == 0: + readLength = getReadLength(alignedread.cigar) + + try: + readMultiplicity = alignedread.opt('NH') + except KeyError: + ID = getReadID(alignedread) + readMultiplicity = readMultiplicityDict[ID] + + if readMultiplicity == 1: + if alignedread.is_reverse: + minusUnique += 1 + + if isSpliceEntry(alignedread.cigar): + uniqueSplice += 1 + else: + uniques += 1 + else: + if alignedread.is_reverse: + minusMulti += 1.0/readMultiplicity + + if isSpliceEntry(alignedread.cigar): + multiSplice += 1.0/readMultiplicity + else: + multis += 1.0/readMultiplicity + + totalReads = uniques + multis + uniqueSplice + multiSplice + plusUnique = uniques + uniqueSplice - minusUnique + plusMulti = multis + multiSplice - minusMulti + counts = ["Total\t%d" % totalReads, + "Unique\t%d" % uniques, + "Multis\t%d" % multis, + "UniqueSplices\t%d" % uniqueSplice, + "Multisplices\t%d" % multiSplice, + "PlusUnique\t%d" % plusUnique, + "PlusMulti\t%d" % plusMulti, + "MinusUnique\t%d" % minusUnique, + "MinusMulti\t%d" % minusMulti, + "ReadLength\t%d" % readLength + ] + + return counts + + +def buildHeader(templateheader, commentList): + header = templateheader + header["CO"] = commentList + + return header + + +def getReadID(alignedread): + ID = alignedread.qname + if alignedread.is_read1: + ID = ID + '/1' + + if alignedread.is_read2: + ID = ID + '/2' + + return ID + + +def getReadLength(cigar): + take = (0, 1) # CIGAR operation (M/match, I/insertion) + + return sum([length for op,length in cigar if op in take]) + + +def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag): + newAlignedRead = originalRead + newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity), ("ZG", geneModelFlag)] + outfile.write(newAlignedRead) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file