3 1. Checks if there are NH tags presents
4 2. If there aren't, it goes through the BAM file, calculates the read multiplicity
5 3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written but this time with the NH tags
6 4. Then deletes the original file and replaces with the temp file / renames the temp file to have the name of the original file
7 5. Also, if we can write the number of reads into the header and check for its presence and if present, use it, if not replace
8 the file with one that has that information in the same manner, it will save the need to count all the reads every time.
9 It will have to be several numbers - total reads, unique reads, multi reads, reads mapping on the plus and minus strand.
11 The only place to have the flag is going to be to attach them to the individual reads. We'll use the 'ZG' tag and have it
12 flagged here if the tag will be populated. Read will be tagged with the feature label or 'NM' if it is not in a feature.
17 from cistematic.genomes import Genome
18 from commoncode import isSpliceEntry, getFeaturesByChromDict
23 usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
25 print 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
26 print ' BAM file has to be indexed'
29 parser = getParser(usage)
30 (options, args) = parser.parse_args(argv[1:])
33 outputfilename = args[1]
35 samfile = pysam.Samfile(BAM, "rb" )
36 readMultiplicityDict = getReadMultiplicity(samfile)
38 counts = getReadCounts(samfile, readMultiplicityDict)
39 outheader = buildHeader(samfile.header, counts)
41 genome = Genome(options.genomeName, inRAM=True)
42 if options.extendGenome != "":
43 genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
45 print "getting gene features...."
46 featuresByChromDict = getFeaturesByChromDict(genome)
47 outheader['CO'].append(options.genomeComment)
49 outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
50 for alignedread in samfile.fetch(until_eof=True):
53 chrom = samfile.getrname(alignedread.tid)
56 for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
57 if start < alignedread.pos and stop > alignedread.pos:
61 ID = getReadID(alignedread)
63 scaleby = alignedread.opt('NH')
64 outfile.write(alignedread)
66 scaleby = readMultiplicityDict[ID]
67 writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag)
73 parser = optparse.OptionParser(usage=usage)
74 parser.add_option("--models", dest="extendGenome")
75 parser.add_option("--genome", dest="genomeName")
76 parser.add_option("--genomeName", dest="genomeComment")
77 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
78 parser.add_option("--markGID", action="store_true", dest="markGID")
80 parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID="")
85 def getReadMultiplicity(samfile):
86 """ There is an assumption here that the set of all reads will not have a mixture of
87 reads bearing the NH tag and reads without.
89 readMultiplicityDict = {}
91 for alignedread in samfile.fetch(until_eof=True):
93 readMultiplicity = alignedread.opt('NH')
96 ID = getReadID(alignedread)
98 readMultiplicityDict[ID] += 1
100 readMultiplicityDict[ID] = 1
103 if processedReads % 5000000 == 0:
104 print str(processedReads/1000000) + 'M alignments processed in multiplicity examination'
106 return readMultiplicityDict
109 def getReadCounts(samfile, readMultiplicityDict):
117 for alignedread in samfile.fetch(until_eof=True):
119 readLength = getReadLength(alignedread.cigar)
122 readMultiplicity = alignedread.opt('NH')
124 ID = getReadID(alignedread)
125 readMultiplicity = readMultiplicityDict[ID]
127 if readMultiplicity == 1:
128 if alignedread.is_reverse:
131 if isSpliceEntry(alignedread.cigar):
136 if alignedread.is_reverse:
137 minusMulti += 1.0/readMultiplicity
139 if isSpliceEntry(alignedread.cigar):
140 multiSplice += 1.0/readMultiplicity
142 multis += 1.0/readMultiplicity
144 totalReads = uniques + multis + uniqueSplice + multiSplice
145 plusUnique = uniques + uniqueSplice - minusUnique
146 plusMulti = multis + multiSplice - minusMulti
147 counts = ["Total\t%d" % totalReads,
148 "Unique\t%d" % uniques,
149 "Multis\t%d" % multis,
150 "UniqueSplices\t%d" % uniqueSplice,
151 "Multisplices\t%d" % multiSplice,
152 "PlusUnique\t%d" % plusUnique,
153 "PlusMulti\t%d" % plusMulti,
154 "MinusUnique\t%d" % minusUnique,
155 "MinusMulti\t%d" % minusMulti,
156 "ReadLength\t%d" % readLength
162 def buildHeader(templateheader, commentList):
163 header = templateheader
164 header["CO"] = commentList
169 def getReadID(alignedread):
170 ID = alignedread.qname
171 if alignedread.is_read1:
174 if alignedread.is_read2:
180 def getReadLength(cigar):
181 take = (0, 1) # CIGAR operation (M/match, I/insertion)
183 return sum([length for op,length in cigar if op in take])
186 def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag):
187 newAlignedRead = originalRead
188 newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity), ("ZG", geneModelFlag)]
189 outfile.write(newAlignedRead)
192 if __name__ == "__main__":