3 1. Checks if there are NH tags presents
4 2. If there aren't, it goes through the BAM file, calculates the read multiplicity
5 3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written but this time with the NH tags
6 4. Then deletes the original file and replaces with the temp file / renames the temp file to have the name of the original file
7 5. Also, if we can write the number of reads into the header and check for its presence and if present, use it, if not replace
8 the file with one that has that information in the same manner, it will save the need to count all the reads every time.
9 It will have to be several numbers - total reads, unique reads, multi reads, reads mapping on the plus and minus strand.
11 The only place to have the flag is going to be to attach them to the individual reads. We'll use the 'ZG' tag and have it
12 flagged here if the tag will be populated. Read will be tagged with the feature label or 'NM' if it is not in a feature.
17 from cistematic.genomes import Genome
18 from commoncode import isSpliceEntry
23 usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
25 print 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
26 print ' BAM file has to be indexed'
29 parser = getParser(usage)
30 (options, args) = parser.parse_args(argv[1:])
33 outputfilename = args[1]
35 samfile = pysam.Samfile(BAM, "rb" )
36 readMultiplicityDict = getReadMultiplicity(samfile)
38 counts, readLength = getReadCounts(samfile, readMultiplicityDict)
39 outheader = buildHeader(samfile.header, counts)
41 genome = Genome(options.genomeName, inRAM=True)
42 if options.extendGenome != "":
43 genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
45 print "getting gene features...."
46 outheader['CO'].append(options.genomeComment)
48 outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
49 for alignedread in samfile.fetch(until_eof=True):
52 chrom = samfile.getrname(alignedread.tid)
55 geneFeatures = genome.getFeaturesIntersecting(chromNum, alignedread.pos, readLength)
57 (name, version, chromosome, start, stop, orientation, atype) = geneFeatures[0]
62 ID = getReadID(alignedread)
64 scaleby = alignedread.opt('NH')
65 outfile.write(alignedread)
67 scaleby = readMultiplicityDict[ID]
68 writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag)
74 parser = optparse.OptionParser(usage=usage)
75 parser.add_option("--models", dest="extendGenome")
76 parser.add_option("--genome", dest="genomeName")
77 parser.add_option("--genomeName", dest="genomeComment")
78 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
79 parser.add_option("--markGID", action="store_true", dest="markGID")
81 parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID="")
86 def getReadMultiplicity(samfile):
87 """ There is an assumption here that the set of all reads will not have a mixture of
88 reads bearing the NH tag and reads without.
90 readMultiplicityDict = {}
92 for alignedread in samfile.fetch(until_eof=True):
94 readMultiplicity = alignedread.opt('NH')
97 ID = getReadID(alignedread)
99 readMultiplicityDict[ID] += 1
101 readMultiplicityDict[ID] = 1
104 if processedReads % 5000000 == 0:
105 print str(processedReads/1000000) + 'M alignments processed in multiplicity examination'
107 return readMultiplicityDict
110 def getReadCounts(samfile, readMultiplicityDict):
118 for alignedread in samfile.fetch(until_eof=True):
120 readLength = getReadLength(alignedread.cigar)
123 readMultiplicity = alignedread.opt('NH')
125 ID = getReadID(alignedread)
126 readMultiplicity = readMultiplicityDict[ID]
128 if readMultiplicity == 1:
129 if alignedread.is_reverse:
132 if isSpliceEntry(alignedread.cigar):
137 if alignedread.is_reverse:
138 minusMulti += 1.0/readMultiplicity
140 if isSpliceEntry(alignedread.cigar):
141 multiSplice += 1.0/readMultiplicity
143 multis += 1.0/readMultiplicity
145 totalReads = uniques + multis + uniqueSplice + multiSplice
146 plusUnique = uniques + uniqueSplice - minusUnique
147 plusMulti = multis + multiSplice - minusMulti
148 counts = ["Total\t%d" % totalReads,
149 "Unique\t%d" % uniques,
150 "Multis\t%d" % multis,
151 "UniqueSplices\t%d" % uniqueSplice,
152 "Multisplices\t%d" % multiSplice,
153 "PlusUnique\t%d" % plusUnique,
154 "PlusMulti\t%d" % plusMulti,
155 "MinusUnique\t%d" % minusUnique,
156 "MinusMulti\t%d" % minusMulti,
157 "ReadLength\t%d" % readLength
160 return counts, readLength
163 def buildHeader(templateheader, commentList):
164 header = templateheader
165 header["CO"] = commentList
170 def getReadID(alignedread):
171 ID = alignedread.qname
172 if alignedread.is_read1:
175 if alignedread.is_read2:
181 def getReadLength(cigar):
182 take = (0, 1) # CIGAR operation (M/match, I/insertion)
184 return sum([length for op,length in cigar if op in take])
187 def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag):
188 newAlignedRead = originalRead
189 newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity), ("ZG", geneModelFlag)]
190 outfile.write(newAlignedRead)
193 if __name__ == "__main__":