convert standard analysis pipelines to use bam format natively
[erange.git] / bamPreprocessing.py
diff --git a/bamPreprocessing.py b/bamPreprocessing.py
new file mode 100644 (file)
index 0000000..dd5e861
--- /dev/null
@@ -0,0 +1,193 @@
+'''
+User specs:
1. Checks if there are NH tags present
+2. If there aren't, it goes through the BAM file, calculates the read multiplicity
+3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written but this time with the NH tags
+4. Then deletes the original file and replaces with the temp file / renames the temp file to have the name of the original file
+5. Also, if we can write the number of reads into the header and check for its presence and if present, use it, if not replace
+the file with one that has that information in the same manner, it will save the need to count all the reads every time.
+It will have to be several numbers - total reads, unique reads, multi reads, reads mapping on the plus and minus strand.
+
+The only place to have the flag is going to be to attach them to the individual reads.  We'll use the 'ZG' tag and have it
+flagged here if the tag will be populated.  Read will be tagged with the feature label or 'NM' if it is not in a feature.
+'''
+import sys
+import pysam
+import optparse
+from cistematic.genomes import Genome
+from commoncode import isSpliceEntry, getFeaturesByChromDict
+
+
+def main(argv=None):
+
+    usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
+    if len(sys.argv) < 3:
+        print 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
+        print '       BAM file has to be indexed'
+        sys.exit(1)
+
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    BAM = args[0]
+    outputfilename = args[1]
+
+    samfile = pysam.Samfile(BAM, "rb" )
+    readMultiplicityDict = getReadMultiplicity(samfile)
+
+    counts = getReadCounts(samfile, readMultiplicityDict)
+    outheader = buildHeader(samfile.header, counts)
+    if options.markGID:
+        genome = Genome(options.genomeName, inRAM=True)
+        if options.extendGenome != "":
+            genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
+
+        print "getting gene features...."
+        featuresByChromDict = getFeaturesByChromDict(genome)    
+        outheader['CO'].append(options.genomeComment)
+
+    outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
+    for alignedread in samfile.fetch(until_eof=True):
+        geneModelFlag = ""
+        if options.markGID:
+            chrom = samfile.getrname(alignedread.tid)
+            chromNum = chrom[3:]
+            geneModelFlag = "NM"
+            for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
+                if start < alignedread.pos and stop > alignedread.pos:
+                    geneModelFlag = gid
+                    continue
+
+        ID = getReadID(alignedread)
+        try:
+            scaleby = alignedread.opt('NH')
+            outfile.write(alignedread)
+        except KeyError:
+            scaleby = readMultiplicityDict[ID]
+            writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag)
+
+    outfile.close()
+
+
def getParser(usage):
    """Build the command-line option parser for the preprocessing script.

    --models        path to extended gene models to load into the genome
    --genome        cistematic genome name
    --genomeName    comment recorded in the output header's CO lines
    --replacemodels replace (rather than extend) existing gene models
    --markGID       tag each read with the gene feature it overlaps (ZG tag)
    """
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--models", dest="extendGenome")
    parser.add_option("--genome", dest="genomeName")
    parser.add_option("--genomeName", dest="genomeComment")
    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
    parser.add_option("--markGID", action="store_true", dest="markGID")

    # markGID previously defaulted to "" (a string) despite its store_true
    # action; False matches the option type and is equally falsy
    parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID=False)

    return parser
+
+
+def getReadMultiplicity(samfile):
+    """ There is an assumption here that the set of all reads will not have a mixture of
+        reads bearing the NH tag and reads without.
+    """
+    readMultiplicityDict = {}
+    processedReads = 0
+    for alignedread in samfile.fetch(until_eof=True):
+        try:
+            readMultiplicity = alignedread.opt('NH')
+            return {}
+        except KeyError:
+            ID = getReadID(alignedread)
+            try:
+                readMultiplicityDict[ID] += 1
+            except KeyError:
+                readMultiplicityDict[ID] = 1
+
+        processedReads += 1
+        if processedReads % 5000000 == 0:
+            print str(processedReads/1000000) + 'M alignments processed in multiplicity examination'
+
+    return readMultiplicityDict
+
+
def getReadCounts(samfile, readMultiplicityDict):
    """Tally reads by class and return the tallies as header comment lines.

    Unique reads count as 1; multireads contribute 1/multiplicity so the
    totals reflect fractional read weight.  Spliced and minus-strand reads
    are tracked separately, and the read length is taken from the first
    alignment seen.
    """
    uniques = 0
    uniqueSplice = 0
    minusUnique = 0
    multis = 0.0
    multiSplice = 0.0
    minusMulti = 0.0
    readLength = 0
    for alignedread in samfile.fetch(until_eof=True):
        if readLength == 0:
            readLength = getReadLength(alignedread.cigar)

        try:
            multiplicity = alignedread.opt('NH')
        except KeyError:
            multiplicity = readMultiplicityDict[getReadID(alignedread)]

        spliced = isSpliceEntry(alignedread.cigar)
        if multiplicity == 1:
            if alignedread.is_reverse:
                minusUnique += 1

            if spliced:
                uniqueSplice += 1
            else:
                uniques += 1
        else:
            weight = 1.0/multiplicity
            if alignedread.is_reverse:
                minusMulti += weight

            if spliced:
                multiSplice += weight
            else:
                multis += weight

    totalReads = uniques + multis + uniqueSplice + multiSplice
    plusUnique = uniques + uniqueSplice - minusUnique
    plusMulti = multis + multiSplice - minusMulti
    return ["Total\t%d" % totalReads,
            "Unique\t%d" % uniques,
            "Multis\t%d" % multis,
            "UniqueSplices\t%d" % uniqueSplice,
            "Multisplices\t%d" % multiSplice,
            "PlusUnique\t%d" % plusUnique,
            "PlusMulti\t%d" % plusMulti,
            "MinusUnique\t%d" % minusUnique,
            "MinusMulti\t%d" % minusMulti,
            "ReadLength\t%d" % readLength
    ]
+
+
def buildHeader(templateheader, commentList):
    """Return a copy of templateheader with commentList installed as its CO lines.

    Copying (instead of assigning the 'CO' key on the caller's dict, as the
    original did) avoids mutating the template passed in.
    """
    header = dict(templateheader)
    header["CO"] = commentList

    return header
+
+
def getReadID(alignedread):
    """Return the read's name, suffixed with /1 or /2 for paired-end mates."""
    suffix = ""
    if alignedread.is_read1:
        suffix += "/1"

    if alignedread.is_read2:
        suffix += "/2"

    return alignedread.qname + suffix
+
+
def getReadLength(cigar):
    """Sum the lengths of the M (0, match) and I (1, insertion) CIGAR
    operations, i.e. the number of read bases consumed by the alignment."""
    total = 0
    for op, length in cigar:
        if op in (0, 1):
            total += length

    return total
+
+
def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag):
    """Write originalRead to outfile with NH (multiplicity) and ZG
    (gene-model) tags appended to its existing tags.

    NOTE(review): newAlignedRead is an alias, not a copy, so the extra tags
    are also visible on the caller's read object as a side effect - confirm
    that is intended.  geneModelFlag may be "" when gene marking is off,
    which still emits an empty-valued ZG tag; presumably harmless, but
    worth verifying downstream consumers tolerate it.
    """
    newAlignedRead = originalRead
    newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity), ("ZG", geneModelFlag)]
    outfile.write(newAlignedRead)
+
+
# Script entry point: pass the full argv so main() can slice off the
# program name with argv[1:].
if __name__ == "__main__":
    main(sys.argv)
\ No newline at end of file