From: Sean Upchurch
Date: Wed, 19 Jun 2013 18:56:48 +0000 (-0700)
Subject: convert standard analysis pipelines to use bam format natively
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=commitdiff_plain;h=4ad5495359e4322da39868020a7398676261679e

convert standard analysis pipelines to use bam format natively
---

diff --git a/ReadDataset.py b/ReadDataset.py
index 9a795c8..dffa76d 100644
--- a/ReadDataset.py
+++ b/ReadDataset.py
@@ -6,11 +6,10 @@ import re
 import sys
 import pysam
 from array import array
-from commoncode import getReverseComplement
+from commoncode import getReverseComplement, isSpliceEntry
 
 currentRDSVersion = "3.0"
 
-
 class ReadDatasetError(Exception):
     pass
 
@@ -68,7 +67,7 @@ class MaxCoordFinder(ReadCounter):
 
 
 
-class BamReadDataset():
+class ReadDataset():
     """ Class for storing reads from experiments. Assumes that custom scripts
     will translate incoming data into a format that can be inserted into the
     class using the insert* methods. Default class subtype ('DNA') includes
@@ -454,7 +453,7 @@ class BamReadDataset():
                 newrow["chrom"] = chrom
 
                 if withPairID:
-                    newrow["pairID"] = pairID
+                    newrow["pairID"] = pairReadSuffix[1:]
 
                 try:
                     resultsDict[dictKey].append(newrow)
@@ -973,16 +972,6 @@ def getReadSense(reverse):
     return sense
 
 
-def isSpliceEntry(cigarTupleList):
-    isSplice = False
-    for operation,length in cigarTupleList:
-        if operation == 3:
-            isSplice = True
-            break
-
-    return isSplice
-
-
 def getSpliceRightStart(start, cigarTupleList):
     offset = 0
 
diff --git a/Region.py b/Region.py
index 0d0a15b..39ba811 100644
--- a/Region.py
+++ b/Region.py
@@ -6,7 +6,7 @@ class Region(object):
     """
 
-    def __init__(self, start, stop, label="", index=0, chrom="", numReads=0, foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
+    def __init__(self, start, stop, label="", index=0, chrom="", numReads=0., foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
         self.label = label
         self.index = index
         self.chrom = chrom
diff --git a/bamPreprocessing.py b/bamPreprocessing.py
new file mode 100644
index 0000000..dd5e861
--- /dev/null
+++ b/bamPreprocessing.py
@@ -0,0 +1,193 @@
+'''
+User specs:
+1. Check whether NH tags are present.
+2. If they are not, go through the BAM file and calculate the read multiplicity.
+3. Then make a temp BAM file into which all alignments (and unaligned reads) are written, this time with the NH tags.
+4. Then delete the original file and replace it with the temp file, i.e. rename the temp file to the name of the original file.
+5. Also write the read counts into the header as comments and check for their presence; if present, use them, otherwise
+replace the file with one that has that information, in the same manner as above. This saves having to count all the
+reads every time. Several numbers are needed: total reads, unique reads, multi reads, and reads mapping on the plus
+and minus strands.
+
+The only practical place for the gene flag is on the individual reads. We'll use the 'ZG' tag and populate it here
+when requested. Each read will be tagged with the feature label, or 'NM' if it is not in a feature.
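A minimal sketch of the NH-tag check in spec step 1, using the same pysam calls as the script itself; the file name, function name, and sample size are illustrative, and the spot check leans on the no-mixture assumption noted in getReadMultiplicity below:

    import pysam

    def fileHasNHTags(bamFileName, sampleSize=1000):
        # spot-check the first sampleSize alignments for the NH tag;
        # files are assumed not to mix tagged and untagged reads
        samfile = pysam.Samfile(bamFileName, "rb")
        for index, alignedread in enumerate(samfile.fetch(until_eof=True)):
            if index >= sampleSize:
                break

            try:
                alignedread.opt("NH")
            except KeyError:
                return False

        return True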
+''' +import sys +import pysam +import optparse +from cistematic.genomes import Genome +from commoncode import isSpliceEntry, getFeaturesByChromDict + + +def main(argv=None): + + usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0] + if len(sys.argv) < 3: + print 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0] + print ' BAM file has to be indexed' + sys.exit(1) + + parser = getParser(usage) + (options, args) = parser.parse_args(argv[1:]) + + BAM = args[0] + outputfilename = args[1] + + samfile = pysam.Samfile(BAM, "rb" ) + readMultiplicityDict = getReadMultiplicity(samfile) + + counts = getReadCounts(samfile, readMultiplicityDict) + outheader = buildHeader(samfile.header, counts) + if options.markGID: + genome = Genome(options.genomeName, inRAM=True) + if options.extendGenome != "": + genome.extendFeatures(options.extendGenome, replace=options.replaceModels) + + print "getting gene features...." + featuresByChromDict = getFeaturesByChromDict(genome) + outheader['CO'].append(options.genomeComment) + + outfile = pysam.Samfile(outputfilename, "wb", header=outheader) + for alignedread in samfile.fetch(until_eof=True): + geneModelFlag = "" + if options.markGID: + chrom = samfile.getrname(alignedread.tid) + chromNum = chrom[3:] + geneModelFlag = "NM" + for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]: + if start < alignedread.pos and stop > alignedread.pos: + geneModelFlag = gid + continue + + ID = getReadID(alignedread) + try: + scaleby = alignedread.opt('NH') + outfile.write(alignedread) + except KeyError: + scaleby = readMultiplicityDict[ID] + writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag) + + outfile.close() + + +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--models", dest="extendGenome") + parser.add_option("--genome", dest="genomeName") + parser.add_option("--genomeName", dest="genomeComment") + parser.add_option("--replacemodels", action="store_true", dest="replaceModels") + parser.add_option("--markGID", action="store_true", dest="markGID") + + parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID="") + + return parser + + +def getReadMultiplicity(samfile): + """ There is an assumption here that the set of all reads will not have a mixture of + reads bearing the NH tag and reads without. 
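Concretely, a read's multiplicity is just the number of alignment records that share its pair-aware ID (getReadID below appends /1 or /2 for paired ends); a toy illustration with made-up IDs:

    from collections import defaultdict

    def toyMultiplicity(alignmentIDs):
        # count how many alignments share each pair-aware read ID, e.g.
        # ["r1/1", "r1/1", "r2/1"] -> {"r1/1": 2, "r2/1": 1}
        multiplicity = defaultdict(int)
        for readID in alignmentIDs:
            multiplicity[readID] += 1

        return dict(multiplicity)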
+ """ + readMultiplicityDict = {} + processedReads = 0 + for alignedread in samfile.fetch(until_eof=True): + try: + readMultiplicity = alignedread.opt('NH') + return {} + except KeyError: + ID = getReadID(alignedread) + try: + readMultiplicityDict[ID] += 1 + except KeyError: + readMultiplicityDict[ID] = 1 + + processedReads += 1 + if processedReads % 5000000 == 0: + print str(processedReads/1000000) + 'M alignments processed in multiplicity examination' + + return readMultiplicityDict + + +def getReadCounts(samfile, readMultiplicityDict): + uniques = 0 + multis = 0.0 + minusUnique = 0 + uniqueSplice = 0 + multiSplice = 0.0 + minusMulti = 0.0 + readLength = 0 + for alignedread in samfile.fetch(until_eof=True): + if readLength == 0: + readLength = getReadLength(alignedread.cigar) + + try: + readMultiplicity = alignedread.opt('NH') + except KeyError: + ID = getReadID(alignedread) + readMultiplicity = readMultiplicityDict[ID] + + if readMultiplicity == 1: + if alignedread.is_reverse: + minusUnique += 1 + + if isSpliceEntry(alignedread.cigar): + uniqueSplice += 1 + else: + uniques += 1 + else: + if alignedread.is_reverse: + minusMulti += 1.0/readMultiplicity + + if isSpliceEntry(alignedread.cigar): + multiSplice += 1.0/readMultiplicity + else: + multis += 1.0/readMultiplicity + + totalReads = uniques + multis + uniqueSplice + multiSplice + plusUnique = uniques + uniqueSplice - minusUnique + plusMulti = multis + multiSplice - minusMulti + counts = ["Total\t%d" % totalReads, + "Unique\t%d" % uniques, + "Multis\t%d" % multis, + "UniqueSplices\t%d" % uniqueSplice, + "Multisplices\t%d" % multiSplice, + "PlusUnique\t%d" % plusUnique, + "PlusMulti\t%d" % plusMulti, + "MinusUnique\t%d" % minusUnique, + "MinusMulti\t%d" % minusMulti, + "ReadLength\t%d" % readLength + ] + + return counts + + +def buildHeader(templateheader, commentList): + header = templateheader + header["CO"] = commentList + + return header + + +def getReadID(alignedread): + ID = alignedread.qname + if alignedread.is_read1: + ID = ID + '/1' + + if alignedread.is_read2: + ID = ID + '/2' + + return ID + + +def getReadLength(cigar): + take = (0, 1) # CIGAR operation (M/match, I/insertion) + + return sum([length for op,length in cigar if op in take]) + + +def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag): + newAlignedRead = originalRead + newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity), ("ZG", geneModelFlag)] + outfile.write(newAlignedRead) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/checkrmask.py b/checkrmask.py index 457e4c1..8ae30ee 100755 --- a/checkrmask.py +++ b/checkrmask.py @@ -55,6 +55,27 @@ def makeParser(usage=""): def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None): + if os.path.isfile(dbfile): + checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField, cachePages, logfilename) + else: + outfile = open(outFileName, "w") + goodfile = open(goodFileName, "w") + infile = open(filename) + print "No database - passing through" + if logfilename is not None: + writeLog(logfilename, versionString, string.join(sys.argv[1:])) + writeLog(logfilename, versionString, "No database - passing through") + + for line in infile: + outfile.write("%s\tNR\tNR\t0.00\n" % line) + goodfile.write(line) + + outfile.close() + goodfile.close() + + +def checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None): + outfile = open(outFileName, "w") goodfile = 
open(goodFileName, "w") if startField < 0: @@ -63,29 +84,14 @@ def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cacheP if cachePages < 250000: cachePages = 250000 - doLog = False if logfilename is not None: writeLog(logfilename, versionString, string.join(sys.argv[1:])) - doLog = True infile = open(filename) - if os.path.isfile(dbfile): - db = sqlite.connect(dbfile) - sql = db.cursor() - sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages) - sql.execute("PRAGMA temp_store = MEMORY") - else: - print "No database - passing through" - if doLog: - writeLog(logfilename, versionString, "No database - passing through") - - for line in infile: - outfile.write("%s\tNR\tNR\t0.00\n" % line) - goodfile.write(line) - - outfile.close() - goodfile.close() - sys.exit(0) + db = sqlite.connect(dbfile) + sql = db.cursor() + sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages) + sql.execute("PRAGMA temp_store = MEMORY") featureList = [] featureDict = {} diff --git a/combineRPKMs.py b/combineRPKMs.py index 88b57ab..1e2d2aa 100755 --- a/combineRPKMs.py +++ b/combineRPKMs.py @@ -63,7 +63,7 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do outfile.write(header) finalfile = open(finalfileName) - #TODO: QForAli - the output lines are driven by finalfile. If there are genes in the first 2 that + #TODO: the output lines are driven by finalfile. If there are genes in the first 2 that # are not in the finalfile then they will be lost. for line in finalfile: fields = line.strip().split() diff --git a/combinerds.py b/combinerds.py index 2695e6b..d6960c2 100755 --- a/combinerds.py +++ b/combinerds.py @@ -21,7 +21,7 @@ def main(argv=None): if not argv: argv = sys.argv - usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0] + usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] 
[--table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0] parser = makeParser(usage) (options, args) = parser.parse_args(argv[1:]) @@ -98,9 +98,7 @@ def combinerds(datafile, infileList, tableList=[], withFlag="", doIndex=False, c for table in tableList: print "importing table %s from file %s" % (table, inputfile) dbColumns = "*" - if table == "uniqs": - dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName - elif table == "multi": + if table in ["uniqs", "multi"]: dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName elif table == "splices": dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName diff --git a/commoncode.py b/commoncode.py index cd44962..0e4fdbb 100755 --- a/commoncode.py +++ b/commoncode.py @@ -10,8 +10,8 @@ from cistematic.genomes import Genome import Region commoncodeVersion = 5.6 -currentRDSversion = 2.0 +#TODO: This is a function dumping ground - needs to be reworked class ErangeError(Exception): pass @@ -49,6 +49,78 @@ def writeLog(logFile, messenger, message): logfile.close() +def getChromInfoList(chrominfo): + chromInfoList = [] + linelist = open(chrominfo) + for line in linelist: + fields = line.strip().split('\t') + chr = fields[0] + start = 0 + end = int(fields[1]) + chromInfoList.append((chr,start,end)) + + linelist.close() + + return chromInfoList + + +# BAM file support functions +def isSpliceEntry(cigarTupleList): + isSplice = False + if cigarTupleList is not None: + for operation,length in cigarTupleList: + if operation == 3: + isSplice = True + break + + return isSplice + + +def addPairIDtoReadID(alignedread): + ID = alignedread.qname + if alignedread.is_read1: + ID = ID + '/1' + + if alignedread.is_read2: + ID = ID + '/2' + + return ID + + +def getHeaderComment(bamHeader, commentKey): + for comment in bamHeader["CO"]: + fields = comment.split("\t") + if fields[0] == commentKey: + return fields[1] + + raise KeyError + + +def getReadSense(read): + if read.is_reverse: + sense = "-" + else: + sense = "+" + + return sense + +#TODO: is this where we need to take into account the NH > 10 restriction? 
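These helpers, and the countThisRead filter that follows, expect the tab-separated CO header comments written by bamPreprocessing.py; a short usage sketch (the file name is illustrative):

    import pysam
    from commoncode import getHeaderComment

    samfile = pysam.Samfile("preprocessed.bam", "rb")
    totalReads = int(getHeaderComment(samfile.header, "Total"))
    readLength = int(getHeaderComment(samfile.header, "ReadLength"))
    print "%d reads of length %d" % (totalReads, readLength)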
+def countThisRead(read, countUniqs, countMulti, countSplices, sense): + countRead = True + + if sense in ['-', '+'] and sense != getReadSense(read): + countRead = False + elif not countSplices and isSpliceEntry(read.cigar): + countRead = False + elif not countUniqs and read.opt('NH') == 1: + countRead = False + elif not countMulti and read.opt('NH') > 1: + countRead = False + + return countRead + + +# Cistematic functions def getGeneInfoDict(genome, cache=False): idb = geneinfoDB(cache=cache) if genome == "dmelanogaster": @@ -73,6 +145,7 @@ def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRA return geneannotDict +# Configuration File Support def getConfigParser(fileList=[]): configFiles = ["erange.config", os.path.expanduser("~/.erange.config")] for filename in fileList: @@ -129,6 +202,7 @@ def getAllConfigSectionOptions(parser, section): return setting +# All the Other Stuff from before def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False, fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False, doMerge=True, keepPeak=False, returnTop=0): @@ -192,7 +266,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will # be merged with A but that will exit the loop and the merge with C will be missed. - #TODO: QForAli - Are we going to require sorted region files to have this even work? + #TODO: Are we going to require sorted region files to have this even work? for regionEntry in regionList: if regionEntry[0] == "#": if "pvalue" in regionEntry: @@ -341,10 +415,6 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, # implementing a triangular smooth smoothArray = array("f", [0.] * length) - #TODO: QForAli - really? We're going to insert MANUFACTURED data (0.0's) and use them in the calculation - # (probably) just to avoid IndexError exceptions. Then we are going to completely ignore adjusting the - # last 2 elements of the array for (probably) the same issue. Is this how erange is dealing with the - # edge cases? for pos in range(2,length -2): smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0 diff --git a/docs/README.chip-seq b/docs/README.chip-seq index 846a441..ea7b0a3 100644 --- a/docs/README.chip-seq +++ b/docs/README.chip-seq @@ -57,18 +57,24 @@ options are case sensitive and that they could well fail silently. -3. MAKING THE NECESSARY INPUT (RDS) FILES - -You will want to first convert your read mappings to the -native ERANGE read store. Please see the file -README.build-rds for instructions on how to do this. - -Build an RDS file for both the ChIP, and if available and -appropriate, the control. Note that we *HIGHLY* recommend -the use of a matched control sample to account for some -of the general background artifacts that can be present -in ChIP-seq samples (e.g. DNAse hypersensitivity, -assembly collapse of some sattelite repeats, etc....). +3. MAKING THE NECESSARY INPUT FILES + +Erange uses BAM format files, but there are a couple of +modifications that need to be made to the header and +individual entries. The python script bamPreprocessing.py +will do the following: +1. Count the reads by type and write these counts to the +header as comments. +2. 
Verify that every read has a value in the NH tag or add
+it if needed.
+3. Optionally annotate the reads with the geneID using the
+ZG flag.
+
+Note that we *HIGHLY* recommend the use of a matched
+control sample to account for some of the general
+background artifacts that can be present in ChIP-seq
+samples (e.g. DNAse hypersensitivity, assembly collapse
+of some satellite repeats, etc....).
 
 
 4. WEIGHING MULTIREADS
 
diff --git a/docs/RNA-seq.analysisSteps.txt b/docs/RNA-seq.analysisSteps.txt
index b0a1c0f..96ea795 100644
--- a/docs/RNA-seq.analysisSteps.txt
+++ b/docs/RNA-seq.analysisSteps.txt
@@ -1,5 +1,5 @@
 # analysis steps for an ERANGE analysis of RNA-seq data
-# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.sh
+# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.py
 
 # preliminary: set PYTHONPATH to point to the parent directory of the Cistematic, e.g.
 # export PYTHONPATH=/my/path/to/cistematic
diff --git a/docs/runRNAPairedAnalysis.sh b/docs/runRNAPairedAnalysis.sh
index cf75b6a..bd82901 100755
--- a/docs/runRNAPairedAnalysis.sh
+++ b/docs/runRNAPairedAnalysis.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported. It is recommended that the python script of the same name be used instead.
+#
 # runRNAPairedAnalysis.sh
 # ENRAGE
 #
diff --git a/docs/runSNPAnalysis.sh b/docs/runSNPAnalysis.sh
index 103b576..51f2e4c 100755
--- a/docs/runSNPAnalysis.sh
+++ b/docs/runSNPAnalysis.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported. It is recommended that the python script of the same name be used instead.
+#
 # runSNPAnalysis.sh
 #
 # Usages: $ERANGEPATH/runSNPAnalysis.sh mouse rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages
@@ -59,4 +61,4 @@ python $ERANGEPATH/getNovelSNPs.py $1 $3.nr_dbsnp_geneinfo.txt $3.nr.final.txt
 
 # make bed file for displaying the snps on UCSC genome browser
 python $ERANGEPATH/makeSNPtrack.py $3.nr_snps.txt $3 $3.nr_snps.bed
-fi
\ No newline at end of file
+fi
diff --git a/docs/runStandardAnalysis.sh b/docs/runStandardAnalysis.sh
index aa5fe60..830abea 100755
--- a/docs/runStandardAnalysis.sh
+++ b/docs/runStandardAnalysis.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported. It is recommended that the python script of the same name be used instead.
+#
 # runStandardAnalysis.sh
 # ENRAGE
 #
@@ -88,4 +90,4 @@ python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi
 
 echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache"
 python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
-fi
\ No newline at end of file
+fi
diff --git a/docs/runStrandedAnalysis.sh b/docs/runStrandedAnalysis.sh
index 4445901..ee9d115 100755
--- a/docs/runStrandedAnalysis.sh
+++ b/docs/runStrandedAnalysis.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported. It is recommended that the python script of the same name be used instead.
+# # runStrandedAnalysis.sh # ENRAGE # diff --git a/findall.py b/findall.py index 9dd85df..f8907a6 100755 --- a/findall.py +++ b/findall.py @@ -1,6 +1,6 @@ """ - usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile - [--control controlrdsfile] [--minimum minHits] [--ratio minRatio] + usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile + [--control controlbamfile] [--minimum minHits] [--ratio minRatio] [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num] [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction] [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw] @@ -50,8 +50,8 @@ import math import string import optparse import operator -from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption -import ReadDataset +import pysam +from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry import Region @@ -65,7 +65,7 @@ class RegionDirectionError(Exception): class RegionFinder(): def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="", minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="", - normalize=True, listPeak=False, reportshift=False, stringency=1.0): + normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False): self.statistics = {"index": 0, "total": 0, @@ -77,8 +77,8 @@ class RegionFinder(): self.regionLabel = label self.rnaSettings = False - self.controlRDSsize = 1 - self.sampleRDSsize = 1 + self.controlRDSsize = 1.0 + self.sampleRDSsize = 1.0 self.minRatio = minRatio self.minPeak = minPeak self.leftPlusRatio = leftPlusRatio @@ -115,6 +115,10 @@ class RegionFinder(): self.listPeak = listPeak self.reportshift = reportshift self.stringency = max(stringency, 1.0) + self.controlfile = controlfile + self.doControl = self.controlfile is not None + self.doPvalue = False + self.doRevBackground = doRevBackground def useRNASettings(self, readlen): @@ -125,7 +129,7 @@ class RegionFinder(): self.maxSpacing = readlen - def getHeader(self, doPvalue): + def getHeader(self): if self.normalize: countType = "RPM" else: @@ -142,33 +146,33 @@ class RegionFinder(): if self.reportshift: headerFields.append("readShift") - if doPvalue: + if self.doPvalue: headerFields.append("pValue") return string.join(headerFields, "\t") - def printSettings(self, doRevBackground, ptype, doControl, useMulti, doCache, pValueType): + def printSettings(self, ptype, useMulti, pValueType): print - self.printStatusMessages(doRevBackground, ptype, doControl, useMulti) - self.printOptionsSummary(useMulti, doCache, pValueType) + self.printStatusMessages(ptype, useMulti) + self.printOptionsSummary(useMulti, pValueType) - def printStatusMessages(self, doRevBackground, ptype, doControl, useMulti): + def printStatusMessages(self, ptype, useMulti): if self.shiftValue == "learn": print "Will try to learn shift" if self.normalize: print "Normalizing to RPM" - if doRevBackground: + if self.doRevBackground: print "Swapping IP and background to calculate FDR" if ptype != "": if ptype in ["NONE", "SELF"]: pass elif ptype == "BACK": - if doControl and doRevBackground: + if self.doControl and self.doRevBackground: pass else: print "must have a control dataset and -revbackground for pValue 
type 'back'" @@ -190,9 +194,9 @@ class RegionFinder(): print "only analyzing reads on the minus strand" - def printOptionsSummary(self, useMulti, doCache, pValueType): + def printOptionsSummary(self, useMulti, pValueType): - print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, self.listPeak, not useMulti, doCache) + print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti) print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded) try: print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType) @@ -200,18 +204,18 @@ class RegionFinder(): print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType) - def getAnalysisDescription(self, hitfile, useMulti, doCache, pValueType, controlfile, doControl): + def getAnalysisDescription(self, hitfile, useMulti, pValueType): description = ["#ERANGE %s" % versionString] - if doControl: - description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize)) + if self.doControl: + description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize)) else: description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize)) if self.withFlag != "": description.append("#restrict to Flag = %s" % self.withFlag) - description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, self.listPeak, not useMulti, doCache)) + description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti)) description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)) try: description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)) @@ -221,6 +225,31 @@ class RegionFinder(): return string.join(description, "\n") + def getFooter(self, bestShift): + index = self.statistics["index"] + mIndex = self.statistics["mIndex"] + footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)] + if self.doDirectionality: + footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"]) + + if self.doRevBackground: + try: + percent = min(100. * (float(mIndex)/index), 100.) + except ZeroDivisionError: + percent = 0. 
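+            # mIndex/mTotal were accumulated with IP and background swapped;
+            # the background-to-sample region ratio is the FDR, capped at 100%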
+ + footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent)) + + if self.shiftValue == "auto" and self.reportshift: + + footerLines.append("#mode of shift values: %d" % bestShift) + + if self.statistics["badRegionTrim"] > 0: + footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"]) + + return string.join(footerLines, "\n") + + def updateControlStatistics(self, peak, sumAll, peakScore): plusRatio = float(peak.numPlus)/peak.numHits @@ -281,11 +310,11 @@ def main(argv=None): minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim, doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing, withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak, - reportshift=options.reportshift, stringency=options.stringency) + reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile, + doRevBackground=options.doRevBackground) findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings, - options.cachePages, options.ptype, options.controlfile, options.doRevBackground, - options.useMulti, options.combine5p) + options.ptype, options.useMulti, options.combine5p) def makeParser(): @@ -363,51 +392,67 @@ def makeParser(): return parser -def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False, cachePages=None, - ptype="", controlfile=None, doRevBackground=False, useMulti=True, combine5p=False): +def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False, + ptype="", useMulti=True, combine5p=False): writeLog(logfilename, versionString, string.join(sys.argv[1:])) - doCache = cachePages is not None - controlRDS = None - doControl = controlfile is not None - if doControl: + controlBAM = None + if regionFinder.doControl: print "\ncontrol:" - controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache) - regionFinder.controlRDSsize = len(controlRDS) / 1000000. + controlBAM = pysam.Samfile(regionFinder.controlfile, "rb") + regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000. print "\nsample:" - hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache) - regionFinder.sampleRDSsize = len(hitRDS) / 1000000. - pValueType = getPValueType(ptype, doControl, doRevBackground) - doPvalue = not pValueType == "none" - regionFinder.readlen = hitRDS.getReadSize() + sampleBAM = pysam.Samfile(hitfile, "rb") + regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000. 
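+    # "Total" and "ReadLength" are tab-separated CO comments written into the
+    # BAM header by bamPreprocessing.py; getHeaderComment() raises KeyError if
+    # the file has not been preprocessed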
+ pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground) + regionFinder.doPvalue = not pValueType == "none" + regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength")) if rnaSettings: regionFinder.useRNASettings(regionFinder.readlen) - regionFinder.printSettings(doRevBackground, ptype, doControl, useMulti, doCache, pValueType) + regionFinder.printSettings(ptype, useMulti, pValueType) outfile = open(outfilename, outputMode) - header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl) + header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType) shiftDict = {} - chromosomeList = getChromosomeListToProcess(hitRDS, controlRDS, doControl) - for chromosome in chromosomeList: - #TODO: QForAli -Really? Use first chr shift value for all of them + chromList = getChromosomeListToProcess(sampleBAM, controlBAM) + for chromosome in chromList: + #TODO: Really? Use first chr shift value for all of them + maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti) if regionFinder.shiftValue == "learn": - learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p) + regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p) - allregions, outregions = findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p) - if doRevBackground: - backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti) - writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType) + allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p) + if regionFinder.doRevBackground: + maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti) + backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti) else: - writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header) + backregions = [] + pValueType = "self" + + writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType) - footer = getFooter(regionFinder, shiftDict, doRevBackground) + try: + bestShift = getBestShiftInDict(shiftDict) + except ValueError: + bestShift = 0 + + footer = regionFinder.getFooter(bestShift) print footer print >> outfile, footer outfile.close() writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1]) +def getHeaderComment(bamHeader, commentKey): + for comment in bamHeader["CO"]: + fields = comment.split("\t") + if fields[0] == commentKey: + return fields[1] + + raise KeyError + + def getPValueType(ptype, doControl, doRevBackground): pValueType = "self" if ptype in ["NONE", "SELF", "BACK"]: @@ -424,52 +469,41 @@ def getPValueType(ptype, doControl, doRevBackground): return pValueType -def openRDSFile(filename, cachePages=None, doCache=False): - rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache) - if cachePages > rds.getDefaultCacheSize(): - rds.setDBcache(cachePages) - - return rds - - -def writeOutputFileHeader(regionFinder, outfile, 
hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl): - print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, doCache, pValueType, controlfile, doControl) - header = regionFinder.getHeader(doPvalue) +def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType): + print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType) + header = regionFinder.getHeader() print >> outfile, header return header -def getChromosomeListToProcess(hitRDS, controlRDS=None, doControl=False): - hitChromList = hitRDS.getChromosomes() - if doControl: - controlChromList = controlRDS.getChromosomes() - chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"] +def getChromosomeListToProcess(sampleBAM, controlBAM=None): + if controlBAM is not None: + chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"] else: - chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"] + chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"] return chromosomeList -def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename, - outfile, useMulti, doControl, controlRDS, combine5p): +def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename, + outfile, useMulti, controlBAM, combine5p): outregions = [] allregions = [] print "chromosome %s" % (chromosome) previousHit = - 1 * regionFinder.maxSpacing readStartPositions = [-1] - totalWeight = 0 + totalWeight = 0.0 uniqueReadCount = 0 reads = [] - numStarts = 0 - badRegion = False - hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, - strand=regionFinder.stranded, combine5p=combine5p) + numStartsInRegion = 0 - maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti) - for read in hitDict[chromosome]: - pos = read["start"] + for alignedread in sampleBAM.fetch(chromosome): + if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p): + continue + + pos = alignedread.pos if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): lastReadPos = readStartPositions[-1] lastBasePosition = lastReadPos + regionFinder.readlen - 1 @@ -485,8 +519,8 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename, allregions.append(int(region.numReads)) regionLength = lastReadPos - region.start - if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength): - region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl) + if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength): + region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti) if region.foldRatio >= regionFinder.minRatio: # first pass, with absolute numbers @@ -495,25 +529,24 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename, try: lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize) except IndexError: - badRegion = True + regionFinder.statistics["badRegionTrim"] += 1 continue - region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl) + 
region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti) - # just in case it changed, use latest data try: bestPos = peak.topPos[0] peakScore = peak.smoothArray[bestPos] if regionFinder.normalize: peakScore /= regionFinder.sampleRDSsize - except: + except (IndexError, AttributeError, ZeroDivisionError): continue if regionFinder.listPeak: - region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore) + region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore) if useMulti: - setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos, + setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos, regionFinder.normalize, regionFinder.doTrim) region.shift = peak.shift @@ -530,43 +563,75 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename, regionFinder.statistics["failed"] += 1 readStartPositions = [] - totalWeight = 0 + totalWeight = 0.0 uniqueReadCount = 0 reads = [] - numStarts = 0 - if badRegion: - badRegion = False - regionFinder.statistics["badRegionTrim"] += 1 + numStartsInRegion = 0 if pos not in readStartPositions: - numStarts += 1 + numStartsInRegion += 1 readStartPositions.append(pos) - weight = read["weight"] + weight = 1.0/alignedread.opt('NH') totalWeight += weight if weight == 1.0: uniqueReadCount += 1 - reads.append({"start": pos, "sense": read["sense"], "weight": weight}) + reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight}) previousHit = pos return allregions, outregions -def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti): +def getReadSense(read): + if read.is_reverse: + sense = "-" + else: + sense = "+" + + return sense + + +def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False): + if read.opt('NH') > 1 and not doMulti: + return True + + if strand == "+" and read.is_reverse: + return True + + if strand == "-" and not read.is_reverse: + return True + + return False + + +def getMaxCoordinate(samfile, chr, doMulti=False): + maxCoord = 0 + for alignedread in samfile.fetch(chr): + if alignedread.opt('NH') > 1: + if doMulti: + maxCoord = max(maxCoord, alignedread.pos) + else: + maxCoord = max(maxCoord, alignedread.pos) + + return maxCoord + + +def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti): #TODO: this is *almost* the same calculation - there are small yet important differences print "calculating background..." 
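+    # roles are reversed here: candidate regions are built from the control
+    # reads, while numMock below is counted from the sample reads to get the
+    # swapped-background region count used for the FDR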
previousHit = - 1 * regionFinder.maxSpacing currentHitList = [-1] - currentTotalWeight = 0 + currentTotalWeight = 0.0 currentReadList = [] backregions = [] numStarts = 0 badRegion = False - hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True) - maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti) - for read in hitDict[chromosome]: - pos = read["start"] + for alignedread in controlBAM.fetch(chromosome): + if doNotProcessRead(alignedread, doMulti=useMulti): + continue + + pos = alignedread.pos if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): lastReadPos = currentHitList[-1] lastBasePosition = lastReadPos + regionFinder.readlen - 1 @@ -578,7 +643,7 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight) regionLength = lastReadPos - region.start if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength): - numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True) + numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti) if regionFinder.normalize: numMock /= regionFinder.sampleRDSsize @@ -595,13 +660,12 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti badRegion = True continue - numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True) + numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti) if regionFinder.normalize: numMock /= regionFinder.sampleRDSsize foldRatio = region.numReads / numMock - # just in case it changed, use latest data try: bestPos = peak.topPos[0] peakScore = peak.smoothArray[bestPos] @@ -618,7 +682,7 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti regionFinder.updateControlStatistics(peak, region.numReads, peakScore) currentHitList = [] - currentTotalWeight = 0 + currentTotalWeight = 0.0 currentReadList = [] numStarts = 0 if badRegion: @@ -629,32 +693,31 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti numStarts += 1 currentHitList.append(pos) - weight = read["weight"] + weight = 1.0/alignedread.opt('NH') currentTotalWeight += weight - currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight}) + currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight}) previousHit = pos return backregions -def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename, - outfile, useMulti, doControl, controlRDS, combine5p): - - hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, - strand=regionFinder.stranded, combine5p=combine5p) +def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename, + outfile, useMulti, controlBAM, combine5p): - maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti) print "learning shift.... 
will need at least 30 training sites"
     stringency = regionFinder.stringency
     previousHit = -1 * regionFinder.maxSpacing
     positionList = [-1]
-    totalWeight = 0
+    totalWeight = 0.0
     readList = []
     shiftDict = {}
     count = 0
     numStarts = 0
-    for read in hitDict[chromosome]:
-        pos = read["start"]
+    for alignedread in sampleBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+            continue
+
+        pos = alignedread.pos
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             if regionFinder.normalize:
                 totalWeight /= regionFinder.sampleRDSsize
@@ -663,22 +726,22 @@ def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
             regionStop = positionList[-1]
             regionLength = regionStop - regionStart
             if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
-                foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl)
+                foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti)
                 if foldRatio >= regionFinder.minRatio:
                     updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
                     count += 1
 
             positionList = []
-            totalWeight = 0
+            totalWeight = 0.0
             readList = []
 
         if pos not in positionList:
             numStarts += 1
 
         positionList.append(pos)
-        weight = read["weight"]
+        weight = 1.0/alignedread.opt('NH')
         totalWeight += weight
-        readList.append({"start": pos, "sense": read["sense"], "weight": weight})
+        readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
     outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
@@ -689,12 +752,14 @@ def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
     print outline
     writeLog(logfilename, versionString, outfilename + outline)
-    regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
-    outline = "#picked shiftValue to be %d" % regionFinder.shiftValue
+    shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
+    outline = "#picked shiftValue to be %d" % shiftValue
     print outline
     print >> outfile, outline
     writeLog(logfilename, versionString, outfilename + outline)
+    return shiftValue
+
 
 def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
     return abs(pos - previousHit) > maxSpacing or pos == maxCoord
@@ -705,7 +770,7 @@ def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringen
     minNumReadStarts = stringency * regionFinder.minRatio
     minRegionLength = stringency * regionFinder.readlen
 
-    return sumAll >= minTotalReads and numStarts > minNumReadStarts and regionLength > minRegionLength
+    return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength
 
 
 def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
@@ -758,12 +823,12 @@ def peakEdgeLocated(peak, position, minSignalThresh):
     return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
 
 
-def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl):
+def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti):
     """ Fold ratio calculated is total read weight over control
     """
     #TODO: this needs to be generalized as there is a point at which we want to use the
sampleRDS instead of controlRDS - if doControl: - numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True) + if regionFinder.doControl: + numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti) if regionFinder.normalize: numMock /= regionFinder.controlRDSsize @@ -774,6 +839,25 @@ def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regi return foldRatio +def countReadsInRegion(bamfile, chr, start, end, uniqs=True, countMulti=False, countSplices=False): + count = 0.0 + for alignedread in bamfile.fetch(chr, start, end): + if alignedread.opt('NH') > 1: + if countMulti: + if isSpliceEntry(alignedread.cigar): + if countSplices: + count += 1.0/alignedread.opt('NH') + else: + count += 1.0/alignedread.opt('NH') + elif uniqs: + if isSpliceEntry(alignedread.cigar): + if countSplices: + count += 1.0 + else: + count += 1.0 + + return count + def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen): peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto") try: @@ -819,9 +903,12 @@ def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRa return region -def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim): +def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim): if doTrim: - sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos) + sumMulti = 0.0 + for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos): + if alignedread.opt('NH') > 1: + sumMulti += 1.0/alignedread.opt('NH') else: sumMulti = currentTotalWeight - currentUniqueCount @@ -856,25 +943,33 @@ def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plus raise RegionDirectionError -def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict, - allregions, header): - - writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict, - allregions, header, backregions=[], pValueType="self") - - -def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict, +def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=[], pValueType="none"): print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"] - if doPvalue: + if regionFinder.doPvalue: if pValueType == "self": poissonmean = calculatePoissonMean(allregions) else: poissonmean = calculatePoissonMean(backregions) print header - writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=regionFinder.reportshift, shiftDict=shiftDict) + for region in outregions: + if regionFinder.shiftValue == "auto" and regionFinder.reportshift: + try: + shiftDict[region.shift] += 1 + except KeyError: + shiftDict[region.shift] = 1 + + outline = getRegionString(region, regionFinder.reportshift) + + # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest + if regionFinder.doPvalue: + pValue = calculatePValue(int(region.numReads), poissonmean) + outline += "\t%1.2g" % pValue + + print outline + print >> outfile, outline def calculatePoissonMean(dataList): @@ -890,29 +985,8 @@ def calculatePoissonMean(dataList): return poissonmean -def writeRegions(outregions, outfile, doPvalue, 
poissonmean, shiftValue=0, reportshift=False, shiftDict={}): - for region in outregions: - if shiftValue == "auto" and reportshift: - try: - shiftDict[region.shift] += 1 - except KeyError: - shiftDict[region.shift] = 1 - - outline = getRegionString(region, reportshift) - - # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest - if doPvalue: - sumAll = int(region.numReads) - pValue = calculatePValue(sumAll, poissonmean) - outline += "\t%1.2g" % pValue - - print outline - print >> outfile, outline - - def calculatePValue(sum, poissonmean): pValue = math.exp(-poissonmean) - #TODO: 798: DeprecationWarning: integer argument expected, got float - for i in xrange(sum) for i in xrange(sum): pValue *= poissonmean pValue /= i+1 @@ -929,34 +1003,9 @@ def getRegionString(region, reportShift): return outline -def getFooter(regionFinder, shiftDict, doRevBackground): - index = regionFinder.statistics["index"] - mIndex = regionFinder.statistics["mIndex"] - footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)] - if regionFinder.doDirectionality: - footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"]) - - if doRevBackground: - try: - percent = min(100. * (float(mIndex)/index), 100.) - except ZeroDivisionError: - percent = 0. - - footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent)) - - if regionFinder.shiftValue == "auto" and regionFinder.reportshift: - bestShift = getBestShiftInDict(shiftDict) - footerLines.append("#mode of shift values: %d" % bestShift) - - if regionFinder.statistics["badRegionTrim"] > 0: - footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"]) - - return string.join(footerLines, "\n") - - def getBestShiftInDict(shiftDict): return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0] if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/geneLocusBins.py b/geneLocusBins.py index 03cade2..14cb2b5 100755 --- a/geneLocusBins.py +++ b/geneLocusBins.py @@ -154,7 +154,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins try: normalizedValue = 100. * binAmount / tagCount except ZeroDivisionError: - #TODO: QForALi - this is the right way to refactor the original code, but I don't think this is the right answer + #TODO: this is the right way to refactor the original code, but I don't think this is the right answer normalizedValue = 100. 
* binAmount
             binAmount = normalizedValue
diff --git a/geneMrnaCounts.py b/geneMrnaCounts.py
index 7b4a2cc..10ccc83 100755
--- a/geneMrnaCounts.py
+++ b/geneMrnaCounts.py
@@ -6,12 +6,12 @@ except:
 
 import sys
 import optparse
-from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, countThisRead
 from cistematic.genomes import Genome
 from cistematic.core.geneinfo import geneinfoDB
 
-print "geneMrnaCounts: version 5.2"
+print "geneMrnaCounts: version 6.0"
 
 
 def main(argv=None):
@@ -33,7 +32,8 @@ def main(argv=None):
 
-    geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
+    bamfile = pysam.Samfile(hitfile, "rb")
+    geneMrnaCounts(genomeName, bamfile, outfilename, options.trackStrand, options.doSplices,
                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
-                   options.searchGID, options.countFeats, options.cachePages, options.markGID)
+                   options.searchGID, options.countFeats)
 
 
 def getParser(usage):
@@ -47,7 +46,6 @@ def getParser(usage):
     parser.add_option("--searchGID", action="store_true", dest="searchGID")
     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
     parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--markGID", action="store_true", dest="markGID")
 
     configParser = getConfigParser()
     section = "geneMrnaCounts"
@@ -60,17 +58,16 @@ def getParser(usage):
     searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
     countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
-    markGID = getConfigBoolOption(configParser, section, "markGID", False)
     parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs,
                         doMulti=doMulti, extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
-                        countFeats=countFeats, cachePages=cachePages, markGID=markGID)
+                        countFeats=countFeats, cachePages=cachePages)
 
     return parser
 
 
-def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
+def geneMrnaCounts(genomeName, bamfile, outfilename, trackStrand=False, doSplices=False,
                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
-                   searchGID=False, countFeats=False, cachePages=None, markGID=False):
+                   searchGID=False, countFeats=False):
 
     if trackStrand:
         print "will track strandedness"
@@ -86,16 +83,6 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
     else:
         replaceModels = False
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        cachePages = 100000
-        doCache = False
-
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     genome = Genome(genomeName, inRAM=True)
     if extendGenome != "":
         genome.extendFeatures(extendGenome, replace=replaceModels)
@@ -111,24 +98,15 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
     for gid in gidList:
         gidCount[gid] = 0
 
-    chromList = hitRDS.getChromosomes(fullChrom=False)
-    if len(chromList) == 0 and doSplices:
-        chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
-
-    if markGID:
-        print "Flagging all reads as NM"
-        hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-    for chrom in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chrom in chromosomeList:
         if chrom not in featuresByChromDict:
             continue
 
         if countFeats:
            seenFeaturesByChromDict[chrom] = set([])
 
-        print "\nchr%s" % chrom
-        fullchrom = "chr%s" % chrom
-        regionList = []
+        print "\nchr%s" % chrom
 
         print "counting GIDs"
         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
             try:
@@ -137,34 +115,31 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
                     if featureSense == "R":
                         checkSense = "-"
 
-                    regionData = (gid, fullchrom, start, stop, checkSense)
-                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
+                    count = getCounts(bamfile, chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
                 else:
-                    regionData = (gid, fullchrom, start, stop)
-                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+                    count = getCounts(bamfile, chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 
                 gidCount[gid] += count
-                if markGID:
-                    regionList.append(regionData)
-
                 if countFeats:
                     seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
             except:
                 print "problem with %s - skipping" % gid
 
-        if markGID:
-            print "marking GIDs"
-            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
-            print "finished marking"
-
     print " "
     if countFeats:
         numFeatures = countFeatures(seenFeaturesByChromDict)
         print "saw %d features" % numFeatures
 
-    writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
-    if markGID and doCache:
-        hitRDS.saveCacheDB(hitfile)
+    writeOutputFile(outfilename, genome, gidCount, searchGID)
+
+
+def getCounts(bamfile, chrom, start, stop, uniqs=True, multi=False, splices=False, sense=''):
+    count = 0.0
+    for alignedread in bamfile.fetch(chrom, start, stop):
+        if countThisRead(alignedread, uniqs, multi, splices, sense):
+            count += 1.0/alignedread.opt('NH')
+
+    return count
 
 
 def countFeatures(seenFeaturesByChromDict):
@@ -178,18 +153,15 @@ def countFeatures(seenFeaturesByChromDict):
     return count
 
 
-def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
+def writeOutputFile(outfilename, genome, gidCount, searchGID):
    geneAnnotDict = genome.allAnnotInfo()
    genomeName = genome.genome
    outfile = open(outfilename, "w")
    idb = geneinfoDB(cache=True)
    geneInfoDict = idb.getallGeneInfo(genomeName)
-    for gid in gidList:
+    for gid in gidCount:
        symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
-        if gid in gidCount:
-            outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
-        else:
-            outfile.write("%s\t%s\t0\n" % (gid, symbol))
+        outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
 
     outfile.close()
 
diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py
index 31213c3..3d02c67 100755
--- a/geneMrnaCountsWeighted.py
+++ b/geneMrnaCountsWeighted.py
@@ -6,13 +6,13 @@ except:
 
 import sys
 import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
 
 
 def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
     hitfile = args[1]
     countfile = args[2]
     outfilename =
diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py
index 31213c3..3d02c67 100755
--- a/geneMrnaCountsWeighted.py
+++ b/geneMrnaCountsWeighted.py
@@ -6,13 +6,13 @@ except:

 import sys
 import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB

-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"


 def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
     hitfile = args[1]
     countfile = args[2]
     outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")

-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                            options.withUniqs, options.withMulti, options.acceptfile, options.cachePages,
                            options.doVerbose, options.extendGenome, options.replaceModels)

@@ -68,13 +69,7 @@ def makeParser(usage=""):
     return parser


-#TODO: Reported user performance issue. Long run times in conditions:
-#      small number of reads ~40-50M
-#      all features on single chromosome
-#
-#      User states has been a long time problem.
-
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None, cachePages=None,
                            doVerbose=False, extendGenome="", replaceModels=False):

@@ -89,7 +84,6 @@ def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
         doCache = True
     else:
         doCache = False
-        cachePages = 0

     hg = Genome(genome, inRAM=True)
     if extendGenome:
@@ -100,10 +94,6 @@ def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,

         hg.extendFeatures(extendGenome, replace=replaceModels)

-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -121,22 +111,15 @@ def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
         gidReadDict[gid] = []

     index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
        if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
            continue

        print "\n%s " % chromosome,
-       fullchrom = "chr%s" % chromosome
-       hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
        featureList = featuresByChromDict[chromosome]
-       readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+       readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
        index = totalProcessedReads
        for (tagReadID, gid) in readGidList:
            try:
@@ -157,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
    return chromosome not in chromosomeList


-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
     startFeature = 0
     readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if "sense" in read:
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"

         index += 1
         if index % 100000 == 0:
@@ -199,6 +188,18 @@ def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):

     return readGidList, index


+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict,
                       doVerbose=False, doCache=False):

     uniqueCountDict = {}
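doNotProcessRead drops spliced alignments outright and then uses the NH tag to select either unique (NH == 1) or multi (NH > 1) alignments, mirroring the old uniqs/multi RDS tables. A small runnable demonstration, with the filter and a splice check restated locally so the example is self-contained (the real isSpliceEntry lives in commoncode; CIGAR operation 3 is the SAM 'N' skip that marks a splice):

    class FakeRead(object):
        """Stand-in for a pysam aligned read; only what the filter touches."""
        def __init__(self, cigar, multiplicity):
            self.cigar = cigar
            self.multiplicity = multiplicity

        def opt(self, tag):
            return self.multiplicity

    def isSpliceEntry(cigarTupleList):
        return any(operation == 3 for operation, length in cigarTupleList)

    def doNotProcessRead(read, doUniqs, doMulti):
        if isSpliceEntry(read.cigar):
            return True
        elif doUniqs and read.opt('NH') > 1:
            return True
        elif doMulti and read.opt('NH') == 1:
            return True

        return False

    spliced = FakeRead([(0, 30), (3, 500), (0, 46)], 1)
    unique = FakeRead([(0, 76)], 1)
    multi = FakeRead([(0, 76)], 4)
    print doNotProcessRead(spliced, True, False)   # True: splices are always skipped here
    print doNotProcessRead(unique, True, False)    # False: unique reads pass withUniqs
    print doNotProcessRead(multi, True, False)     # True: NH=4 fails withUniqs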
diff --git a/geneStartBins.py b/geneStartBins.py
index d929d41..3c51837 100755
--- a/geneStartBins.py
+++ b/geneStartBins.py
@@ -1,8 +1,3 @@
-#
-#  geneStartBins.py
-#  ENRAGE
-#
-
 try:
     import psyco
     psyco.full()
@@ -10,11 +5,10 @@ except:
     pass

 import sys
-from commoncode import *
-import ReadDataset
+from commoncode import getReadSense, getGeneInfoDict, getHeaderComment
+import pysam
 from cistematic.genomes import Genome

-
 print "geneStartBins: version 2.1"
 if len(sys.argv) < 4:
     print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
@@ -24,7 +18,6 @@ if len(sys.argv) < 4:
 genome = sys.argv[1]
 hitfile = sys.argv[2]
 outfilename = sys.argv[3]
-
 standardMinDist = 3000
 if '-max' in sys.argv:
     standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
@@ -42,23 +35,21 @@ if '-cache' in sys.argv:

 bins = 10
 standardMinThresh = standardMinDist / bins
-
-hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
-readlen = hitRDS.getReadSize()
+bamfile = pysam.Samfile(hitfile, "rb")
+readlen = int(getHeaderComment(bamfile.header, "ReadLength"))
 normalizationFactor = 1.0
 if normalize:
-    totalCount = len(hitRDS)
+    totalCount = int(getHeaderComment(bamfile.header, "Total"))
     normalizationFactor = totalCount / 1000000.

 hg = Genome(genome)
 gidDict = {}
 geneinfoDict = getGeneInfoDict(genome, cache=True)
 featuresDict = hg.getallGeneFeatures()
-
 outfile = open(outfilename,'w')
-
 gidList = hg.allGIDs()
 gidList.sort()
+chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
 for gid in gidList:
     symbol = 'LOC' + gid
     geneinfo = ''
@@ -78,7 +69,7 @@ for gid in gidList:
             if (start, stop) not in newfeatureList:
                 newfeatureList.append((start, stop))

-    if chrom not in hitDict:
+    if chrom not in chromosomeList:
         continue

     newfeatureList.sort()
@@ -109,15 +100,16 @@ for gid in gidList:
         gstart = newfeatureList[-1][1] - glen
         gstop = newfeatureList[-1][1] + glen

+    tagCount = 0
     if glen < standardMinDist / 2:
         continue

     binList = [0] * bins
-    for read in hitDict[chrom]:
-        tagStart = read["start"] - gstart
-        sense = read["sense"]
-        weight = read["weight"]
+    for alignedread in bamfile.fetch(chrom):
+        tagStart = alignedread.pos - gstart
+        sense = getReadSense(alignedread)
+        weight = 1.0/alignedread.opt('NH')
         if tagStart >= 2 * glen:
             break
diff --git a/getsplicefa.py b/getsplicefa.py
index 197800e..819675f 100755
--- a/getsplicefa.py
+++ b/getsplicefa.py
@@ -20,6 +20,7 @@ def main(argv=None):
     delimiter = "|"
     usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
+            \n\twhere ucscModels is the gene model file in same format as the UCSC Table Browser\
             \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
             \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
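geneStartBins above, like the other converted scripts, now pulls dataset-wide statistics (Total, Unique, UniqueSplices, Multis, ReadLength) out of the BAM header instead of re-counting reads on every run. The getHeaderComment helper lives in commoncode and is not part of this patch; a plausible minimal sketch, assuming the preprocessor records the statistics as 'Key:value' CO comment lines:

    def getHeaderComment(bamHeader, commentKey):
        """Return the value of a 'Key:value' CO entry from a BAM header.

        Hypothetical reimplementation for illustration only; the real
        helper in commoncode may differ in format and error handling.
        """
        for comment in bamHeader.get("CO", []):
            fields = comment.split(":")
            if fields[0] == commentKey:
                return fields[1]

        raise KeyError("%s not found in BAM header comments" % commentKey)

    # e.g. totalCount = int(getHeaderComment(bamfile.header, "Total"))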
diff --git a/normalizeExpandedExonic.py b/normalizeExpandedExonic.py
index cf02cb3..e4eb12e 100644
--- a/normalizeExpandedExonic.py
+++ b/normalizeExpandedExonic.py
@@ -6,10 +6,10 @@ except:

 import sys
 import optparse
-import ReadDataset
+import pysam
 from cistematic.genomes import Genome
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
-from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment

 print "normalizeExpandedExonic: version 5.7"

@@ -18,7 +18,7 @@ def main(argv=None):
     if not argv:
         argv = sys.argv

-    usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
+    usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"

     parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
@@ -34,22 +34,19 @@ def main(argv=None):
     splicecountfile = args[3]
     outfile = args[4]

-    candidateLines = []
+    candidatefilename = ""
     acceptedfilename = ""
-    if len(args) > 5:
-        try:
-            candidatefile = open(args[5])
-            candidateLines = candidatefile.readlines()
-            candidatefile.close()
-            acceptedfilename = args[6]
-        except IndexError:
-            pass
-
-    RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False)
-    uniqcount = RDS.getUniqsCount()
+    try:
+        candidatefilename = args[5]
+        acceptedfilename = args[6]
+    except IndexError:
+        pass
+
+    bamfile = pysam.Samfile(hitfile, "rb")
+    uniqcount = int(getHeaderComment(bamfile.header, "Unique"))

     normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
-                            candidateLines, acceptedfilename, options.fieldID,
+                            candidatefilename, acceptedfilename, options.fieldID,
                             options.maxLength, options.doCache, options.extendGenome,
                             options.replaceModels)

@@ -77,102 +74,22 @@ def makeParser(usage=""):

 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
-                            outfilename, candidateLines=[], acceptedfilename="",
+                            outfilename, candidatefilename="", acceptedfilename="",
                             fieldID=0, maxLength=1000000000., doCache=False, extendGenome="",
                             replaceModels=False):
-
-    uniquecountfile = open(uniquecountfilename)
-
+
+    print "%d unique reads" % uniqcount
+    gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=candidatefilename)
+    totalCount = (uniqcount + splicecount) / 1000000.
+    uniqScale = uniqcount / 1000000.
     if acceptedfilename:
         acceptedfile = open(acceptedfilename, "w")

-    dosplicecount = False
-    if splicecountfilename != "none":
-        dosplicecount = True
-        splicecountfile = open(splicecountfilename)
-
-    if extendGenome:
-        if replaceModels:
-            print "will replace gene models with %s" % extendGenome
-        else:
-            print "will extend gene models with %s" % extendGenome
-
-    if doCache:
-        cacheGeneDB(genome)
-        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
-        print "%s cached" % genome
-    else:
-        hg = Genome(genome, inRAM=True)
-
-    if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
-
-
-    print "%d unique reads" % uniqcount
-
-    splicecount = 0
-    countDict = {}
-    gidList = []
-    farList = []
-    candidateDict = {}
-
-    gidToGeneDict = {}
-
-    featuresDict = hg.getallGeneFeatures()
-    print "got featuresDict"
-
     outfile = open(outfilename, "w")
-
-    for line in uniquecountfile:
-        fields = line.strip().split()
-        gid = fields[fieldID]
-        gene = fields[1]
-        countDict[gid] = float(fields[-1])
-        gidList.append(gid)
-        gidToGeneDict[gid] = gene
-
-    uniquecountfile.close()
-
-    if dosplicecount:
-        for line in splicecountfile:
-            fields = line.strip().split()
-            gid = fields[fieldID]
-            try:
-                countDict[gid] += float(fields[-1])
-            except:
-                print fields
-                continue
-
-            splicecount += float(fields[-1])
-
-        splicecountfile.close()
-
-    for line in candidateLines:
-        if "#" in line:
-            continue
-
-        fields = line.strip().split()
-        gid = fields[1]
-        gene = fields[0]
-        if gid not in gidList:
-            if gid not in farList:
-                farList.append(gid)
-                gidToGeneDict[gid] = gene
-
-            if gid not in countDict:
-                countDict[gid] = 0
-
-            countDict[gid] += float(fields[6])
-
-        if gid not in candidateDict:
-            candidateDict[gid] = []
-
-        candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
-
-    totalCount = (uniqcount + splicecount) / 1000000.
-    uniqScale = uniqcount / 1000000.
+    featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels)
+    print "got featuresDict"
     for gid in gidList:
-        gene = gidToGeneDict[gid]
+        gene = gidToGeneDict[gid]["name"]
         featureList = []
         try:
             featureList = featuresDict[gid]
@@ -194,38 +111,41 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
         elif geneLength > maxLength:
             geneLength = maxLength

-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         if gid in candidateDict:
             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-                cratio = cCount / (cLength / 1000.)
-                cratio = (uniqScale * cratio) / totalCount
+                kilobaseLength = cLength / 1000.
+                cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount
                 if 10. * cratio < rpkm:
                     continue

-                countDict[gid] += cCount
-                geneLength += cLength / 1000.
-                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                gidToGeneDict[gid]["count"] += cCount
+                geneLength += kilobaseLength
+                try:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                except TypeError:
+                    pass

-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))

-    for gid in farList:
-        gene = gidToGeneDict[gid]
-        geneLength = 0
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            geneLength += cLength / 1000.
-
+    for (gid, geneLength) in farList:
         if geneLength < 0.1:
             continue

-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            cratio = cCount / (cLength / 1000.)
-            cratio = cratio / totalCount
-            acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
-
-        rpm = countDict[gid] / totalCount
+        gene = gidToGeneDict[gid]["name"]
+        if acceptedfilename:
+            for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+                kilobaseLength = cLength / 1000.
+                cratio = (cCount / kilobaseLength) / totalCount
+                try:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+                except TypeError:
+                    pass
+
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         outfile.write('%s\t%s\t%.4f\t%.2f\n' % (gene, gene, geneLength, rpkm))

@@ -235,9 +155,115 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
         except:
             pass


+def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""):
+
+    gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID)
+    uniqueGidList = gidToGeneDict.keys()
+    if candidatefilename:
+        candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList)
+    else:
+        candidateDict = {}
+        geneDictUpdates = {}
+
+    farList = []
+    for gid in geneDictUpdates:
+        gidToGeneDict[gid] = geneDictUpdates[gid]
+        geneLength = 0.
+        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+            geneLength += cLength / 1000.
+
+        farList.append((gid, geneLength))
+
+    return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount
+
+
+def getGeneDict(uniquecountfilename, splicecountfilename, fieldID):
+
+    gidToGeneDict = {}
+    uniquecountfile = open(uniquecountfilename)
+    for line in uniquecountfile:
+        fields = line.strip().split()
+        gid = fields[fieldID]
+        gene = fields[1]
+        gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])}
+
+    uniquecountfile.close()
+    splicecount = 0.
+    if splicecountfilename != "none":
+        splicecountfile = open(splicecountfilename)
+        for line in splicecountfile:
+            fields = line.strip().split()
+            gid = fields[fieldID]
+            try:
+                gidToGeneDict[gid]["count"] += float(fields[-1])
+            except:
+                print fields
+                continue
+
+            splicecount += float(fields[-1])
+
+        splicecountfile.close()
+
+    return gidToGeneDict, splicecount
+
+
+def processCandidatesFile(candidatefilename, fieldID, gidList):
+    """ Reads entries from the candidates file
+        gid entries not in gidList are returned for inclusion
+        creates and returns a dictionary keyed off gid with a single list of
+        tuples (count, length, chrom, start, stop) for each gid
+    """
+    geneDictUpdates = {}
+    candidateDict = {}
+    candidatefile = open(candidatefilename)
+    for line in candidatefile.readlines():
+        if "#" in line:
+            continue
+
+        fields = line.strip().split()
+        gid = fields[1]
+        gene = fields[0]
+        if gid not in gidList:
+            try:
+                geneDictUpdates[gid]["count"] += float(fields[6])
+            except KeyError:
+                geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])}
+
+        try:
+            candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
+        except KeyError:
+            candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])]
+
+    candidatefile.close()
+
+    return candidateDict, geneDictUpdates
+
+
+def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False):
+    if extendGenome:
+        if replaceModels:
+            print "will replace gene models with %s" % extendGenome
+        else:
+            print "will extend gene models with %s" % extendGenome
+
+    if doCache:
+        cacheGeneDB(genome)
+        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+        print "%s cached" % genome
+    else:
+        hg = Genome(genome, inRAM=True)
+
+    if extendGenome != "":
+        hg.extendFeatures(extendGenome, replace=replaceModels)
+
+    featuresDict = hg.getallGeneFeatures()
+    if doCache:
+        uncacheGeneDB(genome)
+
+    return featuresDict
+

 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
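The RPKM arithmetic itself is unchanged by the BAM conversion: rpm is the gene's weighted count divided by total mapped reads in millions (unique plus splice), and rpkm divides rpm by the gene length in kilobases. A worked example with made-up numbers:

    uniqcount = 18500000            # unique reads, from the BAM header
    splicecount = 1500000.          # spliced reads summed from the splice count file
    totalCount = (uniqcount + splicecount) / 1000000.   # 20.0 M mapped reads

    geneCount = 840.                # weighted reads assigned to one gene
    geneLength = 2.1                # summed exon length, in kilobases

    rpm = geneCount / totalCount    # 42.0 reads per million
    rpkm = rpm / geneLength         # 20.0 reads per kilobase per million
    print "%.1f RPKM" % rpkm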
diff --git a/regionCounts.py b/regionCounts.py
index ae005cb..e11508e 100755
--- a/regionCounts.py
+++ b/regionCounts.py
@@ -12,8 +12,8 @@ except:

 import sys
 import string
 import optparse
-from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getHeaderComment, isSpliceEntry, getReadSense

 versionString = "regionCounts: version 3.10"
 print versionString

@@ -32,10 +32,12 @@ def main(argv=None):
         sys.exit(1)

     regionfilename = args[0]
-    hitfile = args[1]
+    bamfilename = args[1]
     outfilename = args[2]

-    regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
+    bamfile = pysam.Samfile(bamfilename, "rb")
+
+    regionCounts(regionfilename, bamfile, outfilename, options.flagRDS, options.cField,
                  options.useFullchrom, options.normalize, options.padregion,
                  options.mergeregion, options.merging, options.doUniqs, options.doMulti,
                  options.doSplices, options.usePeak, options.cachePages, options.logfilename,
@@ -89,7 +91,7 @@ def getParser(usage):
     return parser


-def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
+def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
                  useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
                  merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
                  cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
@@ -99,11 +101,6 @@ def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
     print "merging regions closer than %d bp" % mergeregion
     print "will use peak values"

-    if cachePages != -1:
-        doCache = True
-    else:
-        doCache = False
-
     normalize = True
     doRPKM = False
     if doRPKM == True:
@@ -119,24 +116,13 @@ def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
     labeltoRegionDict = {}
     regionCount = {}

-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    readlen = hitRDS.getReadSize()
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
-    totalCount = len(hitRDS)
+    readlen = int(getHeaderComment(bamfile.header, "ReadLength"))
+    totalCount = int(getHeaderComment(bamfile.header, "Total"))
     if normalize:
         normalizationFactor = totalCount / 1000000.

-    chromList = hitRDS.getChromosomes(fullChrom=useFullchrom)
-    if len(chromList) == 0 and doSplices:
-        chromList = hitRDS.getChromosomes(table="splices", fullChrom=useFullchrom)
-
+    chromList = [chrom for chrom in bamfile.references if chrom != "chrM"]
     chromList.sort()
-
-    if flagRDS:
-        hitRDS.setSynchronousPragma("OFF")
-
     for rchrom in regionDict:
         if forceRegion and rchrom not in chromList:
             print rchrom
@@ -146,7 +132,7 @@ def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
             labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)

     for rchrom in chromList:
-        regionList = []
         if rchrom not in regionDict:
             continue

@@ -156,11 +142,6 @@ def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
         else:
             fullchrom = "chr%s" % rchrom

-        if usePeak:
-            readDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True, findallOptimize=True)
-            rindex = 0
-            dictLen = len(readDict[fullchrom])
-
         for region in regionDict[rchrom]:
             label = region.label
             start = region.start
@@ -168,17 +149,12 @@ def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
             regionCount[label] = 0
             labelList.append(label)
             labeltoRegionDict[label] = (rchrom, start, stop)
-            regionList.append((label, fullchrom, start, stop))
             if usePeak:
                 readList = []
-                for localIndex in xrange(rindex, dictLen):
-                    read = readDict[fullchrom][localIndex]
-                    if read["start"] < start:
-                        rindex += 1
-                    elif start <= read["start"] <= stop:
-                        readList.append(read)
-                    else:
-                        break
+                for alignedread in bamfile.fetch(fullchrom, start, stop):
+                    weight = 1.0/alignedread.opt('NH')
+                    readList.append({"start": alignedread.pos, "sense": getReadSense(alignedread), "weight": weight})

                 if len(readList) < 1:
                     continue
@@ -193,13 +169,7 @@ def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
                 regionCount[label] += topValue
             else:
-                regionCount[label] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-        if flagRDS:
-            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-    if flagRDS:
-        hitRDS.setSynchronousPragma("ON")
+                regionCount[label] += getRegionReadCounts(bamfile, fullchrom, start, stop, doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices)

     if normalize:
         for label in regionCount:
@@ -235,10 +205,29 @@ def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
             outfile.write("\n")

     outfile.close()
-    if doCache and flagRDS:
-        hitRDS.saveCacheDB(hitfile)
-
-    writeLog(logfilename, versionString, "returned %d region counts for %s (%.2f M reads)" % (len(labelList), hitfile, totalCount / 1000000.))
+    writeLog(logfilename, versionString, "returned %d region counts (%.2f M reads)" % (len(labelList), totalCount / 1000000.))
+
+
+def getRegionReadCounts(bamfile, chrom, start, end, doUniqs=True, doMulti=False, doSplices=False):
+    uniques = 0
+    multis = 0.0
+    uniqueSplice = 0
+    multiSplice = 0.0
+    for alignedread in bamfile.fetch(chrom, start, end):
+        readMultiplicity = alignedread.opt('NH')
+        if doSplices and isSpliceEntry(alignedread.cigar):
+            if readMultiplicity == 1 and doUniqs:
+                uniqueSplice += 1
+            elif doMulti:
+                multiSplice += 1.0/readMultiplicity
+        elif readMultiplicity == 1 and doUniqs:
+            uniques += 1
+        elif doMulti:
+            multis += 1.0/readMultiplicity
+
+    totalReads = uniques + multis + uniqueSplice + multiSplice
+
+    return totalReads


 if __name__ == "__main__":
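getRegionReadCounts replaces both the RDS getCounts call and the separate splice table: spliced and unspliced alignments are tallied in one pass, with multireads contributing 1/NH. A hypothetical usage sketch (file name and coordinates invented), normalizing one region's count to reads per million with the header Total comment:

    import pysam
    from regionCounts import getRegionReadCounts
    from commoncode import getHeaderComment

    bamfile = pysam.Samfile("example.bam", "rb")  # assumed preprocessed, indexed BAM
    count = getRegionReadCounts(bamfile, "chr4", 150000, 162000,
                                doUniqs=True, doMulti=True, doSplices=True)
    normalizationFactor = int(getHeaderComment(bamfile.header, "Total")) / 1000000.
    print "%.2f RPM" % (count / normalizationFactor)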
diff --git a/rnafarPairs.py b/rnafarPairs.py
index 4d70c49..2c5aff4 100755
--- a/rnafarPairs.py
+++ b/rnafarPairs.py
@@ -1,9 +1,3 @@
-#
-#  RNAFARpairs.py
-#  ENRAGE
-#
-#  Created by Ali Mortazavi on 11/2/08.
-#
 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
            looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
 """
@@ -16,8 +10,8 @@ except:
 import sys
 import time
 import optparse
-import ReadDataset
-from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
+import pysam
+from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption, isSpliceEntry


 def main(argv=None):
@@ -36,10 +30,12 @@ def main(argv=None):

     genome = args[0]
     goodfilename = args[1]
-    rdsfile = args[2]
+    bamfilename = args[2]
     outfilename = args[3]

-    rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
+    bamfile = pysam.Samfile(bamfilename, "rb")
+
+    rnaFarPairs(genome, goodfilename, bamfile, outfilename, options.doVerbose, options.doCache, options.maxDist)


 def makeParser(usage=""):
@@ -62,16 +58,26 @@ def makeParser(usage=""):
     return parser


-def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+def rnaFarPairs(genome, goodfilename, bamfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+    """ map all candidate regions that have paired ends overlapping with known genes
+    """
+
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    regions = {}
+    for chromosome in chromosomeList:
+        regions[chromosome] = []
+
     goodDict = {}
     goodfile = open(goodfilename)
     for line in goodfile:
         fields = line.split()
-        goodDict[fields[0]] = line
+        label = fields[0]
+        chromosome = fields[1]
+        start = int(fields[2])
+        stop = int(fields[3])
+        goodDict[label] = line
+        regions[chromosome].append((start, stop, label))

     goodfile.close()
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
-    chromosomeList = RDS.getChromosomes()
     if doVerbose:
         print time.ctime()

@@ -83,23 +89,18 @@ def rnaFarPairs(genome, goodfilename, bamfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
     assigned = {}
     farConnected = {}
     for chromosome in chromosomeList:
-        if doNotProcessChromosome(chromosome):
-            continue
-
         print chromosome
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+        regions[chromosome].sort()
+        uniqDict, pairCount = getUniqueReadIDFlags(bamfile, regions[chromosome], chromosome, maxDist)
         if doVerbose:
             print len(uniqDict), time.ctime()

-        for readID in uniqDict:
-            readList = uniqDict[readID]
-            if len(readList) == 2:
-                total += 1
-                if processReads(readList[:2], maxDist):
-                    flags = (readList[0]["flag"], readList[1]["flag"])
-                    processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
-                    total += processed
-                    distinct += distinctPairs
+        total += pairCount
+        for readID, readList in uniqDict.items():
+            flags = (readList[0]["flag"], readList[1]["flag"])
+            processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+            total += processed
+            distinct += distinctPairs

     entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
     distinct += entriesWritten
@@ -109,15 +110,55 @@ def rnaFarPairs(genome, goodfilename, bamfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
     print time.ctime()


-def doNotProcessChromosome(chromosome):
-    return chromosome == "chrM"
+def getUniqueReadIDFlags(bamfile, regions, chromosome, maxDist):
+    """ Returns dictionary of readsIDs with each entry consisting of a list of dictionaries of read start and read flag.
+        Only returns unique non-spliced read pairs matching the criteria given in processReads().
+    """
+    start = 1
+    readDict = {}
+    for regionstart, regionstop, regionname in regions:
+        for alignedread in bamfile.fetch(chromosome, start, regionstop):
+            if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+                if alignedread.pos >= regionstart:
+                    flag = regionname
+                else:
+                    flag = alignedread.opt("ZG")
+
+                try:
+                    readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+                except KeyError:
+                    readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
+
+        start = regionstop + 1
+
+    for alignedread in bamfile.fetch(chromosome, start):
+        if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+            flag = alignedread.opt("ZG")
+
+            try:
+                readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+            except KeyError:
+                readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
+
+    pairCount = len(readDict.keys())
+    for readID, readList in readDict.items():
+        if len(readList) != 2:
+            del readDict[readID]
+            pairCount -= 1
+        elif not processReads(readList, maxDist):
+            del readDict[readID]
+
+    return readDict, pairCount


-def processReads(reads, maxDist):
+def processReads(reads, maxDist=500000):
+    """ For a pair of readID's to be acceptable:
+        - flags must be different
+        - neither flag can be 'NM'
+        - the read starts have to be within maxDist
+    """
     process = False
-    start1 = reads[0]["start"]
-    start2 = reads[1]["start"]
-    dist = abs(start1 - start2)
+    dist = abs(reads[0]["start"] - reads[1]["start"])
     flag1 = reads[0]["flag"]
     flag2 = reads[1]["flag"]

@@ -128,6 +169,11 @@ def processReads(reads, maxDist=500000):

 def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+    """ Writes out the region information along with symbol and geneID for paired reads where one read
+        of the pair is in a known gene and the other is in a new region.  If both reads in the pair are
+        in new regions the region is added to farConnected.  No action is taken if both reads in the
+        pair are in known genes.
+    """
     flag1, flag2 = flags
     total = 0
     distinct = 0
@@ -147,7 +193,7 @@ def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
         except KeyError:
             farConnected[geneID] = [farFlag]
     elif read1IsGood or read2IsGood:
-        total += 1
+        total = 1
         if read2IsGood:
             farFlag = flag2
             geneID = flag1
@@ -155,30 +201,35 @@ def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
             farFlag = flag1
             geneID = flag2

-        try:
-            if genome == "dmelanogaster":
-                symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
-            else:
-                symbol = geneInfoDict[geneID][0][0]
-        except (KeyError, IndexError):
-            try:
-                symbol = geneAnnotDict[(genome, geneID)][0]
-            except (KeyError, IndexError):
-                symbol = "LOC%s" % geneID
-
-        symbol = symbol.strip()
-        symbol = symbol.replace(" ","|")
-        symbol = symbol.replace("\t","|")
-
+        symbol = getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict)
         if farFlag not in assigned:
             assigned[farFlag] = (symbol, geneID)
             print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
             outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
-            distinct += 1
+            distinct = 1

     return total, distinct


+def getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict):
+    try:
+        if genome == "dmelanogaster":
+            symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
+        else:
+            symbol = geneInfoDict[geneID][0][0]
+    except (KeyError, IndexError):
+        try:
+            symbol = geneAnnotDict[(genome, geneID)][0]
+        except (KeyError, IndexError):
+            symbol = "LOC%s" % geneID
+
+    symbol = symbol.strip()
+    symbol = symbol.replace(" ","|")
+    symbol = symbol.replace("\t","|")
+
+    return symbol
+
+
 def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
     total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
     writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
diff --git a/runRNAPairedAnalysis.py b/runRNAPairedAnalysis.py
index ebc25a5..0c01de4 100644
--- a/runRNAPairedAnalysis.py
+++ b/runRNAPairedAnalysis.py
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -50,84 +50,78 @@ def getParser(usage):
     return parser


-def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False):
+def runRNAPairedAnalysis(genome, fileprefix, repeatmaskdb, modelfile="", replacemodels=False):
     """ based on original script runRNAPairedAnalysis.sh
-        usage: runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]
-               where rdsprefix is the name of the rds file without the .rds extension
+        usage: runRNAPairedAnalysis.sh genome fileprefix repeatmaskdb [modelfile] [--replacemodels]
+               where fileprefix is the name of the bam file without the .bam extension
                use "none" for the repeatmaskdb if you do not have one
     """
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"

     # log the parameters
-    message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb)
+    message = "with parameters: %s %s %s" % (genome, fileprefix, repeatmaskdb)
     writeLog(logfile, "runRNAPairedAnalysis.py", message)

     # count the unique reads falling on the gene models ; the nomatch files are
     # mappable reads that fell outside of the Cistematic gene models and not the
     # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    #
+    # These will need to be marked by running the BAM preprocessor with the markGID option
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)

     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)

     # recount the unique reads with weights calculated during the first pass
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)

     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)

     # find new regions outside of gene models with reads piled up
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)

     # filter out new regions that overlap repeats more than a certain fraction
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
-    checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    regionfilename = "%s.newregions.checked" % fileprefix
+    checkrmask(repeatmaskdb, newregionfilename, outFileName, regionfilename, startField=1, cachePages=1, logfilename=logfile)

     # calculate the read densities
-    regionfilename = "%s.newregions.checked" % rdsprefix
-    regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
+    goodFileName = "%s.newregions.good" % fileprefix
+    regionCounts(regionfilename, bamfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)

     # map all candidate regions that have paired ends overlapping with known genes
-    candidatefilename = "%s.candidates.txt" % rdsprefix
-    rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True)
+    candidatefilename = "%s.candidates.txt" % fileprefix
+    rnaFarPairs(genome, goodFileName, bamfile, candidatefilename, doCache=True)

-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines,
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
                             acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)

     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    acceptfile = "%s.accepted.rpkm" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
+    multicountfilename = "%s.multi.count" % fileprefix
+    acceptfile = "%s.accepted.rpkm" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
                            extendGenome=modelfile, replaceModels=replacemodels)

     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
diff --git a/runStandardAnalysis.py b/runStandardAnalysis.py
index adfa375..d20ef64 100644
--- a/runStandardAnalysis.py
+++ b/runStandardAnalysis.py
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -17,7 +17,7 @@ def main(argv=None):
         argv = sys.argv

     print "runStandardAnalysis: version %s" % VERSION
-    usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
+    usage = "usage: python runStandardAnalysis.py genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"

     parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
@@ -27,15 +27,15 @@ def main(argv=None):
         sys.exit(1)

     genome = args[0]
-    rdsprefix = args[1]
+    fileprefix = args[1]
     repeatmaskdb = args[2]
-    bpradius = args[3]
+    bpradius = int(args[3])
     try:
         modelfile = args[4]
     except IndexError:
         modelfile = ""

-    runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
+    runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)


 def getParser(usage):
@@ -50,82 +50,78 @@ def getParser(usage):
     return parser


-def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
+def runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
     """ based on original script runStandardAnalysis.sh
-        usage: runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
-               where rdsprefix is the name of the rds file without the .rds extension
+        usage: runStandardAnalysis.sh genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
+               where fileprefix is the name of the bam file without the .bam extension
               use "none" for the repeatmaskdb if you do not have one
     """
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"

     # log the parameters
-    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
     writeLog(logfile, "runStandardAnalysis.py", message)

     # count the unique reads falling on the gene models ; the nomatch files are
     # mappable reads that fell outside of the Cistematic gene models and not the
     # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    #
+    # These will need to be marked by running the BAM preprocessor with the markGID option
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)

     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
-    splicecountfilename = "none"
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)

     # recount the unique reads with weights calculated during the first pass
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)

     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)

     # Alternative 1: find new regions outside of gene models with reads piled up
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)

     # Alternative 1: filter out new regions that overlap repeats more than a certain fraction
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    goodFileName = "%s.newregions.good" % fileprefix
     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)

     # map all candidate regions that are within a given radius of a gene in bp
-    candidatefilename = "%s.candidates.txt" % rdsprefix
+    candidatefilename = "%s.candidates.txt" % fileprefix
     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)

-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
-                            doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+                            acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)

     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
+    multicountfilename = "%s.multi.count" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
                            replaceModels=replacemodels)

     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)

+    #TODO: remove when not tracking
+    writeLog(logfile, "runStandardAnalysis.py", "analysis complete")
+

 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
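Both pipeline drivers now assume the BAM file already carries Unique, UniqueSplices, Multis (and, in the counting scripts, Total and ReadLength) as header comments, and that gene-model flags were added by running the preprocessor with the markGID option. A defensive sketch (a hypothetical helper, not part of this patch) that fails fast when a file has not been preprocessed:

    import sys
    import pysam

    REQUIRED = ("Total", "Unique", "UniqueSplices", "Multis", "ReadLength")

    def checkPreprocessed(bamfilename):
        """Exit early if the expected CO header comments are missing."""
        bamfile = pysam.Samfile(bamfilename, "rb")
        comments = bamfile.header.get("CO", [])
        keys = set(comment.split(":")[0] for comment in comments)
        missing = [key for key in REQUIRED if key not in keys]
        bamfile.close()
        if missing:
            print "%s is missing header counts %s - run the BAM preprocessor first" % (bamfilename, missing)
            sys.exit(1)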
diff --git a/runStrandedAnalysis.py b/runStrandedAnalysis.py
index 55c87ca..35a6700 100644
--- a/runStrandedAnalysis.py
+++ b/runStrandedAnalysis.py
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import writeLog
+import pysam
+from commoncode import writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -40,58 +40,57 @@ def getParser(usage):
     return parser


-def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
+def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius):
     """ based on original script runStrandedAnalysis.sh
-        usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
-               where rdsprefix is the name of the rds file without the .rds extension
+        usage: runStrandedAnalysis.sh genome fileprefix repeatmaskdb bpradius
+               where fileprefix is the name of the bam file without the .bam extension
               use "none" for the repeatmaskdb if you do not have one
     """
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"

     # log the parameters
-    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
     writeLog(logfile, "runStrandedAnalysis.py", message)

     # count the unique reads falling on the gene models ; the nomatch files are
     # mappable reads that fell outside of the Cistematic gene models and not the
     # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True)
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, trackStrand=True)

     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)

     # recount the unique reads with weights calculated during the first pass
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)

     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False)

     # find new regions outside of gene models with reads piled up
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
     regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a")
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False, outputMode="a")

     # filter out new regions that overlap repeats more than a certain fraction
-    newregionfilename = "%s.newregions.txt" % rdsprefix
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    goodFileName = "%s.newregions.good" % fileprefix
     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)

     #TODO: these calls look wrong
@@ -100,28 +99,21 @@ def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius):
     #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good

     # map all candidate regions that are within a given radius of a gene in bp
-    candidatefilename = "%s.candidates.txt" % rdsprefix
+    candidatefilename = "%s.candidates.txt" % fileprefix
     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)

-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
-                            doCache=True)
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+                            acceptedfilename=acceptedfilename, doCache=True)

     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
+    multicountfilename = "%s.multi.count" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)

     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
diff --git a/siteintersects.py b/siteintersects.py
index 58d9e30..9c0c38f 100755
--- a/siteintersects.py
+++ b/siteintersects.py
@@ -127,7 +127,7 @@ def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict):
             siteNotCommon = True
             #TODO: as handed to me this is O(n^m) - probably could be better...
-            #TODO: QForAli - if there are multiple matches will count them but only write the first match to the file
+            #TODO: if there are multiple matches, all are counted but only the first match is written to the file
             for (rstart, rstop, rline) in siteDict[chrom]:
                 if regionsOverlap(start, stop, rstart, rstop):
                     commonSites += 1
diff --git a/weighMultireads.py b/weighMultireads.py
index 3fd6bd8..f4b1691 100755
--- a/weighMultireads.py
+++ b/weighMultireads.py
@@ -168,7 +168,7 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))

-        #TODO: QForAli - Is this right? If match index is 0 are we doing nothing?
+        #TODO: Is this right? If the match index is 0 we do nothing and the pair stays unweighted.
         if theMatch > 0:
             RDS.reweighMultireads(reweighList)
             fixedPair += 1
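On the O(n*m) scan flagged in siteintersects.py: since the per-chromosome site lists can be kept sorted by start, a bisect lookup can skip most non-candidates. A hedged sketch of one possible improvement (not part of this patch), assuming siteDict values are (start, stop, line) tuples sorted by start and that no site exceeds maxSiteLength:

    import bisect

    def findOverlaps(start, stop, sortedSites, maxSiteLength=10000):
        """Yield sites overlapping [start, stop] from a start-sorted list.

        Sketch only: assumes no site is longer than maxSiteLength, so the
        scan can begin at the first site that could still reach 'start'
        and stop as soon as site starts pass 'stop'.
        """
        starts = [site[0] for site in sortedSites]
        index = bisect.bisect_left(starts, start - maxSiteLength)
        for rstart, rstop, rline in sortedSites[index:]:
            if rstart > stop:
                break

            if rstart <= stop and start <= rstop:
                yield (rstart, rstop, rline)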