import sys
import pysam
from array import array
-from commoncode import getReverseComplement
+from commoncode import getReverseComplement, isSpliceEntry
currentRDSVersion = "3.0"
-
class ReadDatasetError(Exception):
pass
-class BamReadDataset():
+class ReadDataset():
""" Class for storing reads from experiments. Assumes that custom scripts
will translate incoming data into a format that can be inserted into the
class using the insert* methods. Default class subtype ('DNA') includes
newrow["chrom"] = chrom
if withPairID:
- newrow["pairID"] = pairID
+ newrow["pairID"] = pairReadSuffix[1:]
try:
resultsDict[dictKey].append(newrow)
return sense
-def isSpliceEntry(cigarTupleList):
- isSplice = False
- for operation,length in cigarTupleList:
- if operation == 3:
- isSplice = True
- break
-
- return isSplice
-
-
def getSpliceRightStart(start, cigarTupleList):
offset = 0
"""
- def __init__(self, start, stop, label="", index=0, chrom="", numReads=0, foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
+ def __init__(self, start, stop, label="", index=0, chrom="", numReads=0., foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
self.label = label
self.index = index
self.chrom = chrom
--- /dev/null
+++ b/bamPreprocessing.py
+'''
+User specs:
+1. Checks if there are NH tags present
+2. If there aren't, it goes through the BAM file, calculates the read multiplicity
+3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written but this time with the NH tags
+4. Then deletes the original file and replaces with the temp file / renames the temp file to have the name of the original file
+5. Also, write the read counts into the header as comments and check for their presence; if present, use them, otherwise
+replace the file with one that has that information in the same manner. This saves having to count all the reads every time.
+It will have to be several numbers - total reads, unique reads, multi reads, reads mapping on the plus and minus strand.
+
+The only place to store the gene flag is on the individual reads. We'll use the 'ZG' tag and populate it here when gene
+marking is requested. Each read will be tagged with the feature label, or 'NM' if it is not in a feature.
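+
+For reference, the count comments written into the header are tab-separated
+key/value pairs, one comment per count; the values below are illustrative only:
+
+    Total	25000000
+    Unique	18000000
+    Multis	2500000
+    ReadLength	50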
+'''
+import sys
+import pysam
+import optparse
+from cistematic.genomes import Genome
+from commoncode import isSpliceEntry, getFeaturesByChromDict
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = 'usage: python %s BAMfilename outputfilename [options]' % argv[0]
+    if len(argv) < 3:
+        print usage
+        print '    BAM file has to be indexed'
+        sys.exit(1)
+
+ parser = getParser(usage)
+ (options, args) = parser.parse_args(argv[1:])
+
+ BAM = args[0]
+ outputfilename = args[1]
+
+    samfile = pysam.Samfile(BAM, "rb")
+ readMultiplicityDict = getReadMultiplicity(samfile)
+
+    samfile.reset()  # rewind after the multiplicity pass so the counting pass sees every read
+    counts = getReadCounts(samfile, readMultiplicityDict)
+ outheader = buildHeader(samfile.header, counts)
+ if options.markGID:
+ genome = Genome(options.genomeName, inRAM=True)
+ if options.extendGenome != "":
+ genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
+
+ print "getting gene features...."
+ featuresByChromDict = getFeaturesByChromDict(genome)
+ outheader['CO'].append(options.genomeComment)
+
+ outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
+    samfile.reset()  # rewind once more before the writing pass
+    for alignedread in samfile.fetch(until_eof=True):
+ geneModelFlag = ""
+        # unmapped reads (tid == -1) have no reference and cannot be gene-flagged
+        if options.markGID and alignedread.tid != -1:
+            chrom = samfile.getrname(alignedread.tid)
+            chromNum = chrom[3:]
+            geneModelFlag = "NM"
+            for (start, stop, gid, featureSense, featureType) in featuresByChromDict.get(chromNum, []):
+                if start < alignedread.pos and stop > alignedread.pos:
+                    geneModelFlag = gid
+
+        ID = getReadID(alignedread)
+        try:
+            alignedread.opt('NH')
+            # NH already present; just add the gene flag if requested
+            if options.markGID:
+                alignedread.tags = alignedread.tags + [("ZG", geneModelFlag)]
+
+            outfile.write(alignedread)
+        except KeyError:
+            scaleby = readMultiplicityDict[ID]
+            writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag)
+
+ outfile.close()
+
+
+def getParser(usage):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--models", dest="extendGenome")
+ parser.add_option("--genome", dest="genomeName")
+ parser.add_option("--genomeName", dest="genomeComment")
+ parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+ parser.add_option("--markGID", action="store_true", dest="markGID")
+
+    parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID=False)
+
+ return parser
+
+
+def getReadMultiplicity(samfile):
+ """ There is an assumption here that the set of all reads will not have a mixture of
+ reads bearing the NH tag and reads without.
+ """
+ readMultiplicityDict = {}
+ processedReads = 0
+ for alignedread in samfile.fetch(until_eof=True):
+ try:
+            alignedread.opt('NH')
+            # NH tags are already present; no multiplicity map is needed
+            return {}
+ except KeyError:
+ ID = getReadID(alignedread)
+ try:
+ readMultiplicityDict[ID] += 1
+ except KeyError:
+ readMultiplicityDict[ID] = 1
+
+ processedReads += 1
+ if processedReads % 5000000 == 0:
+ print str(processedReads/1000000) + 'M alignments processed in multiplicity examination'
+
+ return readMultiplicityDict
+
+
+def getReadCounts(samfile, readMultiplicityDict):
+ uniques = 0
+ multis = 0.0
+ minusUnique = 0
+ uniqueSplice = 0
+ multiSplice = 0.0
+ minusMulti = 0.0
+ readLength = 0
+ for alignedread in samfile.fetch(until_eof=True):
+ if readLength == 0:
+ readLength = getReadLength(alignedread.cigar)
+
+ try:
+ readMultiplicity = alignedread.opt('NH')
+ except KeyError:
+ ID = getReadID(alignedread)
+ readMultiplicity = readMultiplicityDict[ID]
+
+ if readMultiplicity == 1:
+ if alignedread.is_reverse:
+ minusUnique += 1
+
+ if isSpliceEntry(alignedread.cigar):
+ uniqueSplice += 1
+ else:
+ uniques += 1
+ else:
+ if alignedread.is_reverse:
+ minusMulti += 1.0/readMultiplicity
+
+ if isSpliceEntry(alignedread.cigar):
+ multiSplice += 1.0/readMultiplicity
+ else:
+ multis += 1.0/readMultiplicity
+
+ totalReads = uniques + multis + uniqueSplice + multiSplice
+ plusUnique = uniques + uniqueSplice - minusUnique
+ plusMulti = multis + multiSplice - minusMulti
+ counts = ["Total\t%d" % totalReads,
+ "Unique\t%d" % uniques,
+ "Multis\t%d" % multis,
+ "UniqueSplices\t%d" % uniqueSplice,
+ "Multisplices\t%d" % multiSplice,
+ "PlusUnique\t%d" % plusUnique,
+ "PlusMulti\t%d" % plusMulti,
+ "MinusUnique\t%d" % minusUnique,
+ "MinusMulti\t%d" % minusMulti,
+ "ReadLength\t%d" % readLength
+ ]
+
+ return counts
+
+
+def buildHeader(templateheader, commentList):
+ header = templateheader
+ header["CO"] = commentList
+
+ return header
+
+
+def getReadID(alignedread):
+ ID = alignedread.qname
+ if alignedread.is_read1:
+ ID = ID + '/1'
+
+ if alignedread.is_read2:
+ ID = ID + '/2'
+
+ return ID
+
+
+def getReadLength(cigar):
+ take = (0, 1) # CIGAR operation (M/match, I/insertion)
+
+ return sum([length for op,length in cigar if op in take])
+
+
+def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag=""):
+    newAlignedRead = originalRead
+    newTags = [("NH", multiplicity)]
+    if geneModelFlag != "":
+        newTags.append(("ZG", geneModelFlag))
+
+    newAlignedRead.tags = newAlignedRead.tags + newTags
+    outfile.write(newAlignedRead)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
+ if os.path.isfile(dbfile):
+ checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField, cachePages, logfilename)
+ else:
+ outfile = open(outFileName, "w")
+ goodfile = open(goodFileName, "w")
+ infile = open(filename)
+ print "No database - passing through"
+ if logfilename is not None:
+ writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+ writeLog(logfilename, versionString, "No database - passing through")
+
+ for line in infile:
+ outfile.write("%s\tNR\tNR\t0.00\n" % line)
+ goodfile.write(line)
+
+ outfile.close()
+ goodfile.close()
+
+
+def checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
+
outfile = open(outFileName, "w")
goodfile = open(goodFileName, "w")
if startField < 0:
if cachePages < 250000:
cachePages = 250000
- doLog = False
if logfilename is not None:
writeLog(logfilename, versionString, string.join(sys.argv[1:]))
- doLog = True
infile = open(filename)
- if os.path.isfile(dbfile):
- db = sqlite.connect(dbfile)
- sql = db.cursor()
- sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
- sql.execute("PRAGMA temp_store = MEMORY")
- else:
- print "No database - passing through"
- if doLog:
- writeLog(logfilename, versionString, "No database - passing through")
-
- for line in infile:
- outfile.write("%s\tNR\tNR\t0.00\n" % line)
- goodfile.write(line)
-
- outfile.close()
- goodfile.close()
- sys.exit(0)
+ db = sqlite.connect(dbfile)
+ sql = db.cursor()
+ sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
+ sql.execute("PRAGMA temp_store = MEMORY")
featureList = []
featureDict = {}
outfile.write(header)
finalfile = open(finalfileName)
- #TODO: QForAli - the output lines are driven by finalfile. If there are genes in the first 2 that
+ #TODO: the output lines are driven by finalfile. If there are genes in the first 2 that
# are not in the finalfile then they will be lost.
for line in finalfile:
fields = line.strip().split()
if not argv:
argv = sys.argv
- usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
+ usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [--table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
for table in tableList:
print "importing table %s from file %s" % (table, inputfile)
dbColumns = "*"
- if table == "uniqs":
- dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
- elif table == "multi":
+ if table in ["uniqs", "multi"]:
dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
elif table == "splices":
dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName
import Region
commoncodeVersion = 5.6
-currentRDSversion = 2.0
+#TODO: This is a function dumping ground - needs to be reworked
class ErangeError(Exception):
pass
logfile.close()
+def getChromInfoList(chrominfo):
+    chromInfoList = []
+    infile = open(chrominfo)
+    for line in infile:
+        fields = line.strip().split('\t')
+        chrom = fields[0]
+        start = 0
+        end = int(fields[1])
+        chromInfoList.append((chrom, start, end))
+
+    infile.close()
+
+    return chromInfoList
+
+
+# BAM file support functions
+def isSpliceEntry(cigarTupleList):
+ isSplice = False
+ if cigarTupleList is not None:
+ for operation,length in cigarTupleList:
+            # CIGAR operation 3 ("N") is a skipped region, i.e. a splice junction
+            if operation == 3:
+ isSplice = True
+ break
+
+ return isSplice
+
+
+def addPairIDtoReadID(alignedread):
+ ID = alignedread.qname
+ if alignedread.is_read1:
+ ID = ID + '/1'
+
+ if alignedread.is_read2:
+ ID = ID + '/2'
+
+ return ID
+
+
+def getHeaderComment(bamHeader, commentKey):
+ for comment in bamHeader["CO"]:
+ fields = comment.split("\t")
+ if fields[0] == commentKey:
+ return fields[1]
+
+ raise KeyError
+
+
+def getReadSense(read):
+ if read.is_reverse:
+ sense = "-"
+ else:
+ sense = "+"
+
+ return sense
+
+#TODO: is this where we need to take into account the NH > 10 restriction?
+def countThisRead(read, countUniqs, countMulti, countSplices, sense):
+ countRead = True
+
+ if sense in ['-', '+'] and sense != getReadSense(read):
+ countRead = False
+ elif not countSplices and isSpliceEntry(read.cigar):
+ countRead = False
+ elif not countUniqs and read.opt('NH') == 1:
+ countRead = False
+ elif not countMulti and read.opt('NH') > 1:
+ countRead = False
+
+ return countRead
+
+
+# Cistematic functions
def getGeneInfoDict(genome, cache=False):
idb = geneinfoDB(cache=cache)
if genome == "dmelanogaster":
return geneannotDict
+# Configuration File Support
def getConfigParser(fileList=[]):
configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
for filename in fileList:
return setting
+# All the Other Stuff from before
def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
doMerge=True, keepPeak=False, returnTop=0):
# the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
# will be no merge with A as B is needed to bridge the two. When it comes time to process B it will
# be merged with A but that will exit the loop and the merge with C will be missed.
- #TODO: QForAli - Are we going to require sorted region files to have this even work?
+ #TODO: Are we going to require sorted region files to have this even work?
for regionEntry in regionList:
if regionEntry[0] == "#":
if "pvalue" in regionEntry:
# implementing a triangular smooth
smoothArray = array("f", [0.] * length)
- #TODO: QForAli - really? We're going to insert MANUFACTURED data (0.0's) and use them in the calculation
- # (probably) just to avoid IndexError exceptions. Then we are going to completely ignore adjusting the
- # last 2 elements of the array for (probably) the same issue. Is this how erange is dealing with the
- # edge cases?
for pos in range(2,length -2):
smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
fail silently.
-3. MAKING THE NECESSARY INPUT (RDS) FILES
-
-You will want to first convert your read mappings to the
-native ERANGE read store. Please see the file
-README.build-rds for instructions on how to do this.
-
-Build an RDS file for both the ChIP, and if available and
-appropriate, the control. Note that we *HIGHLY* recommend
-the use of a matched control sample to account for some
-of the general background artifacts that can be present
-in ChIP-seq samples (e.g. DNAse hypersensitivity,
-assembly collapse of some sattelite repeats, etc....).
+3. MAKING THE NECESSARY INPUT FILES
+
+Erange uses BAM format files, but there are a couple of
+modifications that need to be made to the header and
+individual entries. The python script bamPreprocessing.py
+will do the following:
+1. Count the reads by type and write these counts to the
+header as comments.
+2. Verify that every read has a value in the NH tag or add
+it if needed.
+3. Optionally annotate the reads with the geneID using the
+ZG flag.
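+
+For example, assuming an indexed BAM file named sample.bam, a minimal
+preprocessing run (counts and NH tags only) would be:
+
+python bamPreprocessing.py sample.bam sample.preprocessed.bam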
+
+Note that we *HIGHLY* recommend the use of a matched
+control sample to account for some of the general
+background artifacts that can be present in ChIP-seq
+samples (e.g. DNAse hypersensitivity, assembly collapse
+of some satellite repeats, etc....).
4. WEIGHING MULTIREADS
# analysis steps for an ERANGE analysis of RNA-seq data
-# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.sh
+# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.py
# preliminary: set PYTHONPATH to point to the parent directory of the Cistematic, e.g.
# export PYTHONPATH=/my/path/to/cistematic
#!/bin/bash
#
+# This is no longer supported. It is recommended that the python script of the same name be used instead.
+#
# runRNAPairedAnalysis.sh
# ENRAGE
#
#!/bin/bash
#
+# This is no longer supported. It is recommended that the python script of the same name be used instead.
+#
# runSNPAnalysis.sh
#
# Usages: $ERANGEPATH/runSNPAnalysis.sh mouse rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages
# make bed file for displaying the snps on UCSC genome browser
python $ERANGEPATH/makeSNPtrack.py $3.nr_snps.txt $3 $3.nr_snps.bed
-fi
\ No newline at end of file
+fi
#!/bin/bash
#
+# This is no longer supported. It is recommended that the python script of the same name be used instead.
+#
# runStandardAnalysis.sh
# ENRAGE
#
echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache"
python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
-fi
\ No newline at end of file
+fi
#!/bin/bash
#
+# This is no longer supported. It is recommended that the python script of the same name be used instead.
+#
# runStrandedAnalysis.sh
# ENRAGE
#
"""
- usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
- [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
+ usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile
+ [--control controlbamfile] [--minimum minHits] [--ratio minRatio]
[--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
[--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
[--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
import string
import optparse
import operator
-from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry
import Region
class RegionFinder():
def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="",
minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="",
- normalize=True, listPeak=False, reportshift=False, stringency=1.0):
+ normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False):
self.statistics = {"index": 0,
"total": 0,
self.regionLabel = label
self.rnaSettings = False
- self.controlRDSsize = 1
- self.sampleRDSsize = 1
+ self.controlRDSsize = 1.0
+ self.sampleRDSsize = 1.0
self.minRatio = minRatio
self.minPeak = minPeak
self.leftPlusRatio = leftPlusRatio
self.listPeak = listPeak
self.reportshift = reportshift
self.stringency = max(stringency, 1.0)
+ self.controlfile = controlfile
+ self.doControl = self.controlfile is not None
+ self.doPvalue = False
+ self.doRevBackground = doRevBackground
def useRNASettings(self, readlen):
self.maxSpacing = readlen
- def getHeader(self, doPvalue):
+ def getHeader(self):
if self.normalize:
countType = "RPM"
else:
if self.reportshift:
headerFields.append("readShift")
- if doPvalue:
+ if self.doPvalue:
headerFields.append("pValue")
return string.join(headerFields, "\t")
- def printSettings(self, doRevBackground, ptype, doControl, useMulti, doCache, pValueType):
+ def printSettings(self, ptype, useMulti, pValueType):
print
- self.printStatusMessages(doRevBackground, ptype, doControl, useMulti)
- self.printOptionsSummary(useMulti, doCache, pValueType)
+ self.printStatusMessages(ptype, useMulti)
+ self.printOptionsSummary(useMulti, pValueType)
- def printStatusMessages(self, doRevBackground, ptype, doControl, useMulti):
+ def printStatusMessages(self, ptype, useMulti):
if self.shiftValue == "learn":
print "Will try to learn shift"
if self.normalize:
print "Normalizing to RPM"
- if doRevBackground:
+ if self.doRevBackground:
print "Swapping IP and background to calculate FDR"
if ptype != "":
if ptype in ["NONE", "SELF"]:
pass
elif ptype == "BACK":
- if doControl and doRevBackground:
+ if self.doControl and self.doRevBackground:
pass
else:
print "must have a control dataset and -revbackground for pValue type 'back'"
print "only analyzing reads on the minus strand"
- def printOptionsSummary(self, useMulti, doCache, pValueType):
+ def printOptionsSummary(self, useMulti, pValueType):
- print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, self.listPeak, not useMulti, doCache)
+ print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti)
print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
try:
print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
- def getAnalysisDescription(self, hitfile, useMulti, doCache, pValueType, controlfile, doControl):
+ def getAnalysisDescription(self, hitfile, useMulti, pValueType):
description = ["#ERANGE %s" % versionString]
- if doControl:
- description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize))
+ if self.doControl:
+ description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize))
else:
description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
if self.withFlag != "":
description.append("#restrict to Flag = %s" % self.withFlag)
- description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, self.listPeak, not useMulti, doCache))
+ description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti))
description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
try:
description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
return string.join(description, "\n")
+ def getFooter(self, bestShift):
+ index = self.statistics["index"]
+ mIndex = self.statistics["mIndex"]
+ footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)]
+ if self.doDirectionality:
+ footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"])
+
+ if self.doRevBackground:
+ try:
+ percent = min(100. * (float(mIndex)/index), 100.)
+ except ZeroDivisionError:
+ percent = 0.
+
+ footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent))
+
+        if self.shiftValue == "auto" and self.reportshift:
+            footerLines.append("#mode of shift values: %d" % bestShift)
+
+ if self.statistics["badRegionTrim"] > 0:
+ footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"])
+
+ return string.join(footerLines, "\n")
+
+
def updateControlStatistics(self, peak, sumAll, peakScore):
plusRatio = float(peak.numPlus)/peak.numHits
minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim,
doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing,
withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak,
- reportshift=options.reportshift, stringency=options.stringency)
+ reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile,
+ doRevBackground=options.doRevBackground)
findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings,
- options.cachePages, options.ptype, options.controlfile, options.doRevBackground,
- options.useMulti, options.combine5p)
+ options.ptype, options.useMulti, options.combine5p)
def makeParser():
return parser
-def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False, cachePages=None,
- ptype="", controlfile=None, doRevBackground=False, useMulti=True, combine5p=False):
+def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False,
+ ptype="", useMulti=True, combine5p=False):
writeLog(logfilename, versionString, string.join(sys.argv[1:]))
- doCache = cachePages is not None
- controlRDS = None
- doControl = controlfile is not None
- if doControl:
+ controlBAM = None
+ if regionFinder.doControl:
print "\ncontrol:"
- controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache)
- regionFinder.controlRDSsize = len(controlRDS) / 1000000.
+ controlBAM = pysam.Samfile(regionFinder.controlfile, "rb")
+ regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000.
print "\nsample:"
- hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache)
- regionFinder.sampleRDSsize = len(hitRDS) / 1000000.
- pValueType = getPValueType(ptype, doControl, doRevBackground)
- doPvalue = not pValueType == "none"
- regionFinder.readlen = hitRDS.getReadSize()
+ sampleBAM = pysam.Samfile(hitfile, "rb")
+ regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000.
+ pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground)
+ regionFinder.doPvalue = not pValueType == "none"
+ regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength"))
if rnaSettings:
regionFinder.useRNASettings(regionFinder.readlen)
- regionFinder.printSettings(doRevBackground, ptype, doControl, useMulti, doCache, pValueType)
+ regionFinder.printSettings(ptype, useMulti, pValueType)
outfile = open(outfilename, outputMode)
- header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl)
+ header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType)
shiftDict = {}
- chromosomeList = getChromosomeListToProcess(hitRDS, controlRDS, doControl)
- for chromosome in chromosomeList:
- #TODO: QForAli -Really? Use first chr shift value for all of them
+ chromList = getChromosomeListToProcess(sampleBAM, controlBAM)
+ for chromosome in chromList:
+ #TODO: Really? Use first chr shift value for all of them
+ maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti)
if regionFinder.shiftValue == "learn":
- learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
+ regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
- allregions, outregions = findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
- if doRevBackground:
- backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti)
- writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
+ allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
+ if regionFinder.doRevBackground:
+ maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti)
+ backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti)
else:
- writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header)
+ backregions = []
+ pValueType = "self"
+
+ writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
- footer = getFooter(regionFinder, shiftDict, doRevBackground)
+ try:
+ bestShift = getBestShiftInDict(shiftDict)
+ except ValueError:
+ bestShift = 0
+
+ footer = regionFinder.getFooter(bestShift)
print footer
print >> outfile, footer
outfile.close()
writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
+def getHeaderComment(bamHeader, commentKey):
+ for comment in bamHeader["CO"]:
+ fields = comment.split("\t")
+ if fields[0] == commentKey:
+ return fields[1]
+
+ raise KeyError
+
+
def getPValueType(ptype, doControl, doRevBackground):
pValueType = "self"
if ptype in ["NONE", "SELF", "BACK"]:
return pValueType
-def openRDSFile(filename, cachePages=None, doCache=False):
- rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache)
- if cachePages > rds.getDefaultCacheSize():
- rds.setDBcache(cachePages)
-
- return rds
-
-
-def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl):
- print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, doCache, pValueType, controlfile, doControl)
- header = regionFinder.getHeader(doPvalue)
+def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType):
+ print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType)
+ header = regionFinder.getHeader()
print >> outfile, header
return header
-def getChromosomeListToProcess(hitRDS, controlRDS=None, doControl=False):
- hitChromList = hitRDS.getChromosomes()
- if doControl:
- controlChromList = controlRDS.getChromosomes()
- chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"]
+def getChromosomeListToProcess(sampleBAM, controlBAM=None):
+ if controlBAM is not None:
+ chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"]
else:
- chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"]
+ chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"]
return chromosomeList
-def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
- outfile, useMulti, doControl, controlRDS, combine5p):
+def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+ outfile, useMulti, controlBAM, combine5p):
outregions = []
allregions = []
print "chromosome %s" % (chromosome)
previousHit = - 1 * regionFinder.maxSpacing
readStartPositions = [-1]
- totalWeight = 0
+ totalWeight = 0.0
uniqueReadCount = 0
reads = []
- numStarts = 0
- badRegion = False
- hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
- strand=regionFinder.stranded, combine5p=combine5p)
+ numStartsInRegion = 0
- maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
- for read in hitDict[chromosome]:
- pos = read["start"]
+ for alignedread in sampleBAM.fetch(chromosome):
+ if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+ continue
+
+ pos = alignedread.pos
if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
lastReadPos = readStartPositions[-1]
lastBasePosition = lastReadPos + regionFinder.readlen - 1
allregions.append(int(region.numReads))
regionLength = lastReadPos - region.start
- if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
- region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl)
+ if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength):
+ region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
if region.foldRatio >= regionFinder.minRatio:
# first pass, with absolute numbers
try:
lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize)
except IndexError:
- badRegion = True
+ regionFinder.statistics["badRegionTrim"] += 1
continue
- region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl)
+ region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
- # just in case it changed, use latest data
try:
bestPos = peak.topPos[0]
peakScore = peak.smoothArray[bestPos]
if regionFinder.normalize:
peakScore /= regionFinder.sampleRDSsize
- except:
+ except (IndexError, AttributeError, ZeroDivisionError):
continue
if regionFinder.listPeak:
- region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore)
+ region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore)
if useMulti:
- setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
+ setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
regionFinder.normalize, regionFinder.doTrim)
region.shift = peak.shift
regionFinder.statistics["failed"] += 1
readStartPositions = []
- totalWeight = 0
+ totalWeight = 0.0
uniqueReadCount = 0
reads = []
- numStarts = 0
- if badRegion:
- badRegion = False
- regionFinder.statistics["badRegionTrim"] += 1
+ numStartsInRegion = 0
if pos not in readStartPositions:
- numStarts += 1
+ numStartsInRegion += 1
readStartPositions.append(pos)
- weight = read["weight"]
+ weight = 1.0/alignedread.opt('NH')
totalWeight += weight
if weight == 1.0:
uniqueReadCount += 1
- reads.append({"start": pos, "sense": read["sense"], "weight": weight})
+ reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
previousHit = pos
return allregions, outregions
-def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti):
+def getReadSense(read):
+ if read.is_reverse:
+ sense = "-"
+ else:
+ sense = "+"
+
+ return sense
+
+
+def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False):
+ if read.opt('NH') > 1 and not doMulti:
+ return True
+
+ if strand == "+" and read.is_reverse:
+ return True
+
+ if strand == "-" and not read.is_reverse:
+ return True
+
+ return False
+
+
+def getMaxCoordinate(samfile, chrom, doMulti=False):
+    maxCoord = 0
+    for alignedread in samfile.fetch(chrom):
+        # unique reads always count; multireads only when doMulti is set
+        if alignedread.opt('NH') == 1 or doMulti:
+            maxCoord = max(maxCoord, alignedread.pos)
+
+ return maxCoord
+
+
+def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti):
#TODO: this is *almost* the same calculation - there are small yet important differences
print "calculating background..."
previousHit = - 1 * regionFinder.maxSpacing
currentHitList = [-1]
- currentTotalWeight = 0
+ currentTotalWeight = 0.0
currentReadList = []
backregions = []
numStarts = 0
badRegion = False
- hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True)
- maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
- for read in hitDict[chromosome]:
- pos = read["start"]
+ for alignedread in controlBAM.fetch(chromosome):
+ if doNotProcessRead(alignedread, doMulti=useMulti):
+ continue
+
+ pos = alignedread.pos
if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
lastReadPos = currentHitList[-1]
lastBasePosition = lastReadPos + regionFinder.readlen - 1
region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
regionLength = lastReadPos - region.start
if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
- numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+ numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
if regionFinder.normalize:
numMock /= regionFinder.sampleRDSsize
badRegion = True
continue
- numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+ numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
if regionFinder.normalize:
numMock /= regionFinder.sampleRDSsize
foldRatio = region.numReads / numMock
- # just in case it changed, use latest data
try:
bestPos = peak.topPos[0]
peakScore = peak.smoothArray[bestPos]
regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
currentHitList = []
- currentTotalWeight = 0
+ currentTotalWeight = 0.0
currentReadList = []
numStarts = 0
if badRegion:
numStarts += 1
currentHitList.append(pos)
- weight = read["weight"]
+ weight = 1.0/alignedread.opt('NH')
currentTotalWeight += weight
- currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight})
+ currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
previousHit = pos
return backregions
-def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
- outfile, useMulti, doControl, controlRDS, combine5p):
-
- hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
- strand=regionFinder.stranded, combine5p=combine5p)
+def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+ outfile, useMulti, controlBAM, combine5p):
- maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
print "learning shift.... will need at least 30 training sites"
stringency = regionFinder.stringency
previousHit = -1 * regionFinder.maxSpacing
positionList = [-1]
- totalWeight = 0
+ totalWeight = 0.0
readList = []
shiftDict = {}
count = 0
numStarts = 0
- for read in hitDict[chromosome]:
- pos = read["start"]
+ for alignedread in sampleBAM.fetch(chromosome):
+ if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+ continue
+
+ pos = alignedread.pos
if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
if regionFinder.normalize:
totalWeight /= regionFinder.sampleRDSsize
regionStop = positionList[-1]
regionLength = regionStop - regionStart
if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
- foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl)
+ foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti)
if foldRatio >= regionFinder.minRatio:
updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
count += 1
positionList = []
- totalWeight = 0
+ totalWeight = 0.0
readList = []
if pos not in positionList:
numStarts += 1
positionList.append(pos)
- weight = read["weight"]
+ weight = 1.0/alignedread.opt('NH')
totalWeight += weight
- readList.append({"start": pos, "sense": read["sense"], "weight": weight})
+ readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
previousHit = pos
outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
print outline
writeLog(logfilename, versionString, outfilename + outline)
- regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
- outline = "#picked shiftValue to be %d" % regionFinder.shiftValue
+ shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
+ outline = "#picked shiftValue to be %d" % shiftValue
print outline
print >> outfile, outline
writeLog(logfilename, versionString, outfilename + outline)
+    return shiftValue
+
def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
return abs(pos - previousHit) > maxSpacing or pos == maxCoord
minNumReadStarts = stringency * regionFinder.minRatio
minRegionLength = stringency * regionFinder.readlen
- return sumAll >= minTotalReads and numStarts > minNumReadStarts and regionLength > minRegionLength
+ return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength
def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
-def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl):
+def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti):
""" Fold ratio calculated is total read weight over control
"""
#TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
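+    # i.e. foldRatio = region read weight / (1 + control count in the region),
+    # with both terms scaled to reads-per-million when normalize is set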
- if doControl:
- numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+ if regionFinder.doControl:
+ numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti)
if regionFinder.normalize:
numMock /= regionFinder.controlRDSsize
return foldRatio
+def countReadsInRegion(bamfile, chrom, start, end, uniqs=True, countMulti=False, countSplices=False):
+    count = 0.0
+    for alignedread in bamfile.fetch(chrom, start, end):
+        # spliced reads are excluded unless explicitly requested
+        if isSpliceEntry(alignedread.cigar) and not countSplices:
+            continue
+
+        multiplicity = alignedread.opt('NH')
+        if multiplicity > 1:
+            if countMulti:
+                count += 1.0/multiplicity
+        elif uniqs:
+            count += 1.0
+
+ return count
+
def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
try:
return region
-def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
+def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
if doTrim:
- sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos)
+ sumMulti = 0.0
+ for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos):
+ if alignedread.opt('NH') > 1:
+ sumMulti += 1.0/alignedread.opt('NH')
else:
sumMulti = currentTotalWeight - currentUniqueCount
raise RegionDirectionError
-def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
- allregions, header):
-
- writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
- allregions, header, backregions=[], pValueType="self")
-
-
-def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
+def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict,
allregions, header, backregions=[], pValueType="none"):
print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
- if doPvalue:
+ if regionFinder.doPvalue:
if pValueType == "self":
poissonmean = calculatePoissonMean(allregions)
else:
poissonmean = calculatePoissonMean(backregions)
print header
- writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=regionFinder.reportshift, shiftDict=shiftDict)
+ for region in outregions:
+ if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
+ try:
+ shiftDict[region.shift] += 1
+ except KeyError:
+ shiftDict[region.shift] = 1
+
+ outline = getRegionString(region, regionFinder.reportshift)
+
+ # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+ if regionFinder.doPvalue:
+ pValue = calculatePValue(int(region.numReads), poissonmean)
+ outline += "\t%1.2g" % pValue
+
+ print outline
+ print >> outfile, outline
def calculatePoissonMean(dataList):
return poissonmean
-def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict={}):
- for region in outregions:
- if shiftValue == "auto" and reportshift:
- try:
- shiftDict[region.shift] += 1
- except KeyError:
- shiftDict[region.shift] = 1
-
- outline = getRegionString(region, reportshift)
-
- # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
- if doPvalue:
- sumAll = int(region.numReads)
- pValue = calculatePValue(sumAll, poissonmean)
- outline += "\t%1.2g" % pValue
-
- print outline
- print >> outfile, outline
-
-
def calculatePValue(sum, poissonmean):
pValue = math.exp(-poissonmean)
- #TODO: 798: DeprecationWarning: integer argument expected, got float - for i in xrange(sum)
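+    # iteratively computes the Poisson pmf P(X = sum) = exp(-mean) * mean**sum / sum!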
for i in xrange(sum):
pValue *= poissonmean
pValue /= i+1
return outline
-def getFooter(regionFinder, shiftDict, doRevBackground):
- index = regionFinder.statistics["index"]
- mIndex = regionFinder.statistics["mIndex"]
- footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)]
- if regionFinder.doDirectionality:
- footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"])
-
- if doRevBackground:
- try:
- percent = min(100. * (float(mIndex)/index), 100.)
- except ZeroDivisionError:
- percent = 0.
-
- footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent))
-
- if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
- bestShift = getBestShiftInDict(shiftDict)
- footerLines.append("#mode of shift values: %d" % bestShift)
-
- if regionFinder.statistics["badRegionTrim"] > 0:
- footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"])
-
- return string.join(footerLines, "\n")
-
-
def getBestShiftInDict(shiftDict):
return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
if __name__ == "__main__":
- main(sys.argv)
\ No newline at end of file
+ main(sys.argv)
try:
normalizedValue = 100. * binAmount / tagCount
except ZeroDivisionError:
- #TODO: QForALi - this is the right way to refactor the original code, but I don't think this is the right answer
+ #TODO: this is the right way to refactor the original code, but I don't think this is the right answer
normalizedValue = 100. * binAmount
binAmount = normalizedValue
import sys
import optparse
-from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, countThisRead
from cistematic.genomes import Genome
from cistematic.core.geneinfo import geneinfoDB
-print "geneMrnaCounts: version 5.2"
+print "geneMrnaCounts: version 6.0"
def main(argv=None):
-    geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
-                   options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
-                   options.searchGID, options.countFeats, options.cachePages, options.markGID)
+    bamfile = pysam.Samfile(hitfile, "rb")
+    geneMrnaCounts(genomeName, bamfile, outfilename, options.trackStrand, options.doSplices,
+                   options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
+                   options.searchGID, options.countFeats)
def getParser(usage):
parser.add_option("--searchGID", action="store_true", dest="searchGID")
parser.add_option("--countfeatures", action="store_true", dest="countFeats")
parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--markGID", action="store_true", dest="markGID")
configParser = getConfigParser()
section = "geneMrnaCounts"
searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
cachePages = getConfigOption(configParser, section, "cachePages", None)
- markGID = getConfigBoolOption(configParser, section, "markGID", False)
parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
- countFeats=countFeats, cachePages=cachePages, markGID=markGID)
+ countFeats=countFeats, cachePages=cachePages)
return parser
-def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
+def geneMrnaCounts(genomeName, bamfile, outfilename, trackStrand=False, doSplices=False,
doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
- searchGID=False, countFeats=False, cachePages=None, markGID=False):
+ searchGID=False, countFeats=False):
if trackStrand:
print "will track strandedness"
else:
replaceModels = False
- if cachePages is not None:
- doCache = True
- else:
- cachePages = 100000
- doCache = False
-
- hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
- if cachePages > hitRDS.getDefaultCacheSize():
- hitRDS.setDBcache(cachePages)
-
genome = Genome(genomeName, inRAM=True)
if extendGenome != "":
genome.extendFeatures(extendGenome, replace=replaceModels)
for gid in gidList:
gidCount[gid] = 0
- chromList = hitRDS.getChromosomes(fullChrom=False)
- if len(chromList) == 0 and doSplices:
- chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
-
- if markGID:
- print "Flagging all reads as NM"
- hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
- for chrom in chromList:
+    # featuresByChromDict is keyed without the "chr" prefix, so strip it from the BAM references
+    chromosomeList = [chrom[3:] for chrom in bamfile.references if chrom != "chrM"]
+ for chrom in chromosomeList:
if chrom not in featuresByChromDict:
continue
if countFeats:
seenFeaturesByChromDict[chrom] = set([])
- print "\nchr%s" % chrom
- fullchrom = "chr%s" % chrom
- regionList = []
+ print "\nchr%s" % chrom
print "counting GIDs"
for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
try:
if featureSense == "R":
checkSense = "-"
-                regionData = (gid, fullchrom, start, stop, checkSense)
-                count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
+                count = getCounts(bamfile, "chr%s" % chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
else:
-                regionData = (gid, fullchrom, start, stop)
-                count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+                count = getCounts(bamfile, "chr%s" % chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
gidCount[gid] += count
- if markGID:
- regionList.append(regionData)
-
if countFeats:
seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
except:
print "problem with %s - skipping" % gid
- if markGID:
- print "marking GIDs"
- hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
- print "finished marking"
-
print " "
if countFeats:
numFeatures = countFeatures(seenFeaturesByChromDict)
print "saw %d features" % numFeatures
- writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
- if markGID and doCache:
- hitRDS.saveCacheDB(hitfile)
+ writeOutputFile(outfilename, genome, gidCount, searchGID)
+
+
+def getCounts(bamfile, chrom, start, stop, uniqs=True, multi=False, splices=False, sense=''):
+ count = 0.0
+ for alignedread in bamfile.fetch(chrom, start, stop):
+ if countThisRead(alignedread, uniqs, multi, splices, sense):
+ count += 1.0/alignedread.opt('NH')
+
+ return count
def countFeatures(seenFeaturesByChromDict):
return count
-def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
+def writeOutputFile(outfilename, genome, gidCount, searchGID):
geneAnnotDict = genome.allAnnotInfo()
genomeName = genome.genome
outfile = open(outfilename, "w")
idb = geneinfoDB(cache=True)
geneInfoDict = idb.getallGeneInfo(genomeName)
- for gid in gidList:
+ for gid in gidCount:
symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
- if gid in gidCount:
- outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
- else:
- outfile.write("%s\t%s\t0\n" % (gid, symbol))
+ outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
outfile.close()
import sys
import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
from cistematic.genomes import Genome
from commoncode import getGeneInfoDict
from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
def main(argv=None):
hitfile = args[1]
countfile = args[2]
outfilename = args[3]
+ bamfile = pysam.Samfile(hitfile, "rb")
- geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+ geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
options.withUniqs, options.withMulti,
options.acceptfile, options.cachePages, options.doVerbose,
options.extendGenome, options.replaceModels)
return parser
-#TODO: Reported user performance issue. Long run times in conditions:
-# small number of reads ~40-50M
-# all features on single chromosome
-#
-# User states has been a long time problem.
-
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
withUniqs=False, withMulti=False, acceptfile=None,
cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
doCache = True
else:
doCache = False
- cachePages = 0
hg = Genome(genome, inRAM=True)
if extendGenome:
hg.extendFeatures(extendGenome, replace=replaceModels)
- hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
- if cachePages > hitRDS.getDefaultCacheSize():
- hitRDS.setDBcache(cachePages)
-
allGIDs = set(hg.allGIDs())
if acceptfile is not None:
regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
gidReadDict[gid] = []
index = 0
- if withMulti and not withUniqs:
- chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
- else:
- chromList = hitRDS.getChromosomes(fullChrom=False)
-
- readlen = hitRDS.getReadSize()
- for chromosome in chromList:
+    # featuresByChromDict is keyed without the "chr" prefix, so strip it from the BAM references
+    chromosomeList = [chrom[3:] for chrom in bamfile.references if chrom != "chrM"]
+ for chromosome in chromosomeList:
if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
continue
print "\n%s " % chromosome,
- fullchrom = "chr%s" % chromosome
- hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
featureList = featuresByChromDict[chromosome]
- readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, "chr%s" % chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
index = totalProcessedReads
for (tagReadID, gid) in readGidList:
try:
return chromosome not in chromosomeList
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
startFeature = 0
readGidList = []
- ignoreSense = True
- for read in hitDict[fullchrom]:
- tagStart = read["start"]
- tagReadID = read["readID"]
- if "sense" in read:
- tagSense = read["sense"]
- ignoreSense = False
+    readlen = int(getHeaderComment(bamfile.header, "ReadLength"))
+ for alignedread in bamfile.fetch(chromosome):
+ if doNotProcessRead(alignedread, doUniqs, doMulti):
+ continue
+
+ tagStart = alignedread.pos
+ tagReadID = addPairIDtoReadID(alignedread)
+ tagSense = ""
+ if not ignoreSense:
+ if alignedread.is_reverse:
+ tagSense = "-"
+ else:
+ tagSense = "+"
index += 1
if index % 100000 == 0:
return readGidList, index
+def doNotProcessRead(read, doUniqs, doMulti):
+    # the previous logic skipped multireads whenever doUniqs was set, so asking for
+    # both uniques and multis filtered out every read
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif read.opt('NH') == 1 and not doUniqs:
+        doNotProcess = True
+    elif read.opt('NH') > 1 and not doMulti:
+        doNotProcess = True
+
+    return doNotProcess
+
+
def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
uniqueCountDict = {}
-#
-# geneStartBins.py
-# ENRAGE
-#
-
try:
import psyco
psyco.full()
pass
import sys
-from commoncode import *
-import ReadDataset
+from commoncode import getReadSense, getGeneInfoDict, getHeaderComment
+import pysam
from cistematic.genomes import Genome
-
print "geneStartBins: version 2.1"
if len(sys.argv) < 4:
-print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
+print 'usage: python %s genome bamfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
genome = sys.argv[1]
hitfile = sys.argv[2]
outfilename = sys.argv[3]
-
standardMinDist = 3000
if '-max' in sys.argv:
standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
bins = 10
standardMinThresh = standardMinDist / bins
-
-hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
-readlen = hitRDS.getReadSize()
+bamfile = pysam.Samfile(hitfile, "rb")
+readlen = int(getHeaderComment(bamfile.header, "ReadLength"))
normalizationFactor = 1.0
if normalize:
- totalCount = len(hitRDS)
+    totalCount = int(getHeaderComment(bamfile.header, "Total"))
normalizationFactor = totalCount / 1000000.
hg = Genome(genome)
gidDict = {}
geneinfoDict = getGeneInfoDict(genome, cache=True)
featuresDict = hg.getallGeneFeatures()
-
outfile = open(outfilename,'w')
-
gidList = hg.allGIDs()
gidList.sort()
+chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
for gid in gidList:
symbol = 'LOC' + gid
geneinfo = ''
if (start, stop) not in newfeatureList:
newfeatureList.append((start, stop))
- if chrom not in hitDict:
+ if chrom not in chromosomeList:
continue
newfeatureList.sort()
gstart = newfeatureList[-1][1] - glen
gstop = newfeatureList[-1][1] + glen
+
tagCount = 0
if glen < standardMinDist / 2:
continue
binList = [0] * bins
- for read in hitDict[chrom]:
- tagStart = read["start"] - gstart
- sense = read["sense"]
- weight = read["weight"]
+ for alignedread in bamfile.fetch(chrom):
+ tagStart = alignedread.pos - gstart
+ sense = getReadSense(alignedread)
+ weight = 1.0/alignedread.opt('NH')
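+        # weight a multi-read by 1/NH so its alignments sum to a single read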
if tagStart >= 2 * glen:
break
delimiter = "|"
usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
+ \n\twhere ucscModels is the gene model file in same format as the UCSC Table Browser\
\n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
\n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
import sys
import optparse
-import ReadDataset
+import pysam
from cistematic.genomes import Genome
from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
-from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment
print "normalizeExpandedExonic: version 5.7"
if not argv:
argv = sys.argv
- usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
+ usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
splicecountfile = args[3]
outfile = args[4]
- candidateLines = []
+ candidatefilename = ""
acceptedfilename = ""
- if len(args) > 5:
- try:
- candidatefile = open(args[5])
- candidateLines = candidatefile.readlines()
- candidatefile.close()
- acceptedfilename = args[6]
- except IndexError:
- pass
-
- RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False)
- uniqcount = RDS.getUniqsCount()
+ try:
+ candidatefilename = args[5]
+ acceptedfilename = args[6]
+ except IndexError:
+ pass
+
+ bamfile = pysam.Samfile(hitfile, "rb")
+    uniqcount = int(getHeaderComment(bamfile.header, "Unique"))
normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
- candidateLines, acceptedfilename, options.fieldID,
+ candidatefilename, acceptedfilename, options.fieldID,
options.maxLength, options.doCache, options.extendGenome,
options.replaceModels)
def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
- outfilename, candidateLines=[], acceptedfilename="",
+ outfilename, candidatefilename="", acceptedfilename="",
fieldID=0, maxLength=1000000000., doCache=False,
extendGenome="", replaceModels=False):
-
- uniquecountfile = open(uniquecountfilename)
-
+
+ print "%d unique reads" % uniqcount
+    gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=candidatefilename)
+ totalCount = (uniqcount + splicecount) / 1000000.
+ uniqScale = uniqcount / 1000000.
if acceptedfilename:
acceptedfile = open(acceptedfilename, "w")
- dosplicecount = False
- if splicecountfilename != "none":
- dosplicecount = True
- splicecountfile = open(splicecountfilename)
-
- if extendGenome:
- if replaceModels:
- print "will replace gene models with %s" % extendGenome
- else:
- print "will extend gene models with %s" % extendGenome
-
- if doCache:
- cacheGeneDB(genome)
- hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
- print "%s cached" % genome
- else:
- hg = Genome(genome, inRAM=True)
-
- if extendGenome != "":
- hg.extendFeatures(extendGenome, replace=replaceModels)
-
-
- print "%d unique reads" % uniqcount
-
- splicecount = 0
- countDict = {}
- gidList = []
- farList = []
- candidateDict = {}
-
- gidToGeneDict = {}
-
- featuresDict = hg.getallGeneFeatures()
- print "got featuresDict"
-
outfile = open(outfilename, "w")
-
- for line in uniquecountfile:
- fields = line.strip().split()
- gid = fields[fieldID]
- gene = fields[1]
- countDict[gid] = float(fields[-1])
- gidList.append(gid)
- gidToGeneDict[gid] = gene
-
- uniquecountfile.close()
-
- if dosplicecount:
- for line in splicecountfile:
- fields = line.strip().split()
- gid = fields[fieldID]
- try:
- countDict[gid] += float(fields[-1])
- except:
- print fields
- continue
-
- splicecount += float(fields[-1])
-
- splicecountfile.close()
-
- for line in candidateLines:
- if "#" in line:
- continue
-
- fields = line.strip().split()
- gid = fields[1]
- gene = fields[0]
- if gid not in gidList:
- if gid not in farList:
- farList.append(gid)
- gidToGeneDict[gid] = gene
-
- if gid not in countDict:
- countDict[gid] = 0
-
- countDict[gid] += float(fields[6])
-
- if gid not in candidateDict:
- candidateDict[gid] = []
-
- candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
-
- totalCount = (uniqcount + splicecount) / 1000000.
- uniqScale = uniqcount / 1000000.
+ featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels)
+ print "got featuresDict"
for gid in gidList:
- gene = gidToGeneDict[gid]
+ gene = gidToGeneDict[gid]["name"]
featureList = []
try:
featureList = featuresDict[gid]
elif geneLength > maxLength:
geneLength = maxLength
- rpm = countDict[gid] / totalCount
+ rpm = gidToGeneDict[gid]["count"] / totalCount
rpkm = rpm / geneLength
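+        # RPKM = (reads per million mapped) / (gene length in kb); totalCount is
+        # already in millions and geneLength in kilobases at this point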
if gid in candidateDict:
for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
- cratio = cCount / (cLength / 1000.)
- cratio = (uniqScale * cratio) / totalCount
+ kilobaseLength = cLength / 1000.
+ cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount
if 10. * cratio < rpkm:
continue
- countDict[gid] += cCount
- geneLength += cLength / 1000.
- acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+ gidToGeneDict[gid]["count"] += cCount
+ geneLength += kilobaseLength
+            if acceptedfilename:
+                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
- rpm = countDict[gid] / totalCount
+ rpm = gidToGeneDict[gid]["count"] / totalCount
rpkm = rpm / geneLength
outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))
- for gid in farList:
- gene = gidToGeneDict[gid]
- geneLength = 0
- for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
- geneLength += cLength / 1000.
-
+ for (gid, geneLength) in farList:
if geneLength < 0.1:
continue
- for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
- cratio = cCount / (cLength / 1000.)
- cratio = cratio / totalCount
- acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
-
- rpm = countDict[gid] / totalCount
+ gene = gidToGeneDict[gid]["name"]
+ if acceptedfilename:
+ for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+ kilobaseLength = cLength / 1000.
+ cratio = (cCount / kilobaseLength) / totalCount
+                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+
+ rpm = gidToGeneDict[gid]["count"] / totalCount
rpkm = rpm / geneLength
outfile.write('%s\t%s\t%.4f\t%.2f\n' % (gene, gene, geneLength, rpkm))
except:
pass
+
+def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""):
+
+ gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID)
+ uniqueGidList = gidToGeneDict.keys()
+ if candidatefilename:
+ candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList)
+ else:
+ candidateDict = {}
+ geneDictUpdates = {}
+
+ farList = []
+ for gid in geneDictUpdates:
+ gidToGeneDict[gid] = geneDictUpdates[gid]
+ geneLength = 0.
+ for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+ geneLength += cLength / 1000.
+
+ farList.append((gid, geneLength))
+
+ return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount
+
+
+def getGeneDict(uniquecountfilename, splicecountfilename, fieldID):
+
+ gidToGeneDict = {}
+ uniquecountfile = open(uniquecountfilename)
+ for line in uniquecountfile:
+ fields = line.strip().split()
+ gid = fields[fieldID]
+ gene = fields[1]
+ gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])}
+
+ uniquecountfile.close()
+ splicecount = 0.
+ if splicecountfilename != "none":
+ splicecountfile = open(splicecountfilename)
+ for line in splicecountfile:
+ fields = line.strip().split()
+ gid = fields[fieldID]
+ try:
+ gidToGeneDict[gid]["count"] += float(fields[-1])
+            except (KeyError, ValueError):
+ print fields
+ continue
+
+ splicecount += float(fields[-1])
+
+ splicecountfile.close()
+
+ return gidToGeneDict, splicecount
+
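+# Example entry shape (gid and gene name hypothetical):
+#   gidToGeneDict["12345"] == {"name": "MYGENE", "count": 42.0}
+# where "count" already folds in splice counts when a splice count file is given.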
+
+def processCandidatesFile(candidatefilename, fieldID, gidList):
+ """ Reads entries from the candiates file
+ gid entries not in gidList are returned for inclusion
+ creates and returns a dictionary keyed off gid with a single list of
+ tuples (count, length, chrom, start, stop) for each gid
+ """
+ geneDictUpdates = {}
+ candidateDict = {}
+ candidatefile = open(candidatefilename)
+ for line in candidatefile.readlines():
+ if "#" in line:
+ continue
+
+ fields = line.strip().split()
+ gid = fields[1]
+ gene = fields[0]
+ if gid not in gidList:
+ try:
+ geneDictUpdates[gid]["count"] += float(fields[6])
+ except KeyError:
+ geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])}
+
+ try:
+ candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
+ except KeyError:
+ candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])]
+
+ candidatefile.close()
+
+ return candidateDict, geneDictUpdates
+
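+# Example candidateDict entry (values hypothetical):
+#   candidateDict["98765"] == [(12.0, 1500, "chr2", "5000", "6500")]
+# i.e. (count, length, chrom, start, stop); start and stop stay strings here.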
+
+def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False):
+ if extendGenome:
+ if replaceModels:
+ print "will replace gene models with %s" % extendGenome
+ else:
+ print "will extend gene models with %s" % extendGenome
+
+ if doCache:
+ cacheGeneDB(genome)
+ hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+ print "%s cached" % genome
+ else:
+ hg = Genome(genome, inRAM=True)
+
+ if extendGenome != "":
+ hg.extendFeatures(extendGenome, replace=replaceModels)
+
+ featuresDict = hg.getallGeneFeatures()
+
if doCache:
uncacheGeneDB(genome)
+ return featuresDict
+
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file
import sys
import string
import optparse
-from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getHeaderComment, isSpliceEntry, getReadSense
versionString = "regionCounts: version 3.10"
print versionString
sys.exit(1)
regionfilename = args[0]
- hitfile = args[1]
+ bamfilename = args[1]
outfilename = args[2]
- regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
+ bamfile = pysam.Samfile(bamfilename, "rb")
+
+ regionCounts(regionfilename, bamfile, outfilename, options.flagRDS, options.cField,
options.useFullchrom, options.normalize, options.padregion,
options.mergeregion, options.merging, options.doUniqs, options.doMulti,
options.doSplices, options.usePeak, options.cachePages, options.logfilename,
return parser
-def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
+def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
print "merging regions closer than %d bp" % mergeregion
print "will use peak values"
- if cachePages != -1:
- doCache = True
- else:
- doCache = False
-
normalize = True
doRPKM = False
if doRPKM == True:
labeltoRegionDict = {}
regionCount = {}
- hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
- readlen = hitRDS.getReadSize()
- if cachePages > hitRDS.getDefaultCacheSize():
- hitRDS.setDBcache(cachePages)
-
- totalCount = len(hitRDS)
+    readlen = int(getHeaderComment(bamfile.header, "ReadLength"))
+    totalCount = int(getHeaderComment(bamfile.header, "Total"))
if normalize:
normalizationFactor = totalCount / 1000000.
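+        # totalCount is read from the "Total" comment in the BAM header, so
+        # per-million normalization does not require rescanning the file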
- chromList = hitRDS.getChromosomes(fullChrom=useFullchrom)
- if len(chromList) == 0 and doSplices:
- chromList = hitRDS.getChromosomes(table="splices", fullChrom=useFullchrom)
-
+ chromList = [chrom for chrom in bamfile.references if chrom != "chrM"]
chromList.sort()
-
- if flagRDS:
- hitRDS.setSynchronousPragma("OFF")
-
for rchrom in regionDict:
if forceRegion and rchrom not in chromList:
print rchrom
labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)
for rchrom in chromList:
- regionList = []
if rchrom not in regionDict:
continue
else:
fullchrom = "chr%s" % rchrom
- if usePeak:
- readDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True, findallOptimize=True)
- rindex = 0
- dictLen = len(readDict[fullchrom])
-
for region in regionDict[rchrom]:
label = region.label
start = region.start
regionCount[label] = 0
labelList.append(label)
labeltoRegionDict[label] = (rchrom, start, stop)
- regionList.append((label, fullchrom, start, stop))
if usePeak:
readList = []
- for localIndex in xrange(rindex, dictLen):
- read = readDict[fullchrom][localIndex]
- if read["start"] < start:
- rindex += 1
- elif start <= read["start"] <= stop:
- readList.append(read)
- else:
- break
+ for alignedread in bamfile.fetch(fullchrom, start, stop):
+ weight = 1.0/alignedread.opt('NH')
+ readList.append({"start": alignedread.pos, "sense": getReadSense(alignedread), "weight": weight})
if len(readList) < 1:
continue
regionCount[label] += topValue
else:
- regionCount[label] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
- if flagRDS:
- hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
- if flagRDS:
- hitRDS.setSynchronousPragma("ON")
+ regionCount[label] += getRegionReadCounts(bamfile, fullchrom, start, stop, doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices)
if normalize:
for label in regionCount:
outfile.write("\n")
outfile.close()
- if doCache and flagRDS:
- hitRDS.saveCacheDB(hitfile)
-
- writeLog(logfilename, versionString, "returned %d region counts for %s (%.2f M reads)" % (len(labelList), hitfile, totalCount / 1000000.))
+ writeLog(logfilename, versionString, "returned %d region counts (%.2f M reads)" % (len(labelList), totalCount / 1000000.))
+
+
+def getRegionReadCounts(bamfile, chrom, start, end, doUniqs=True, doMulti=False, doSplices=False):
+ uniques = 0
+ multis = 0.0
+ uniqueSplice = 0
+ multiSplice = 0.0
+    for alignedread in bamfile.fetch(chrom, start, end):
+ readMultiplicity = alignedread.opt('NH')
+ if doSplices and isSpliceEntry(alignedread.cigar):
+ if readMultiplicity == 1 and doUniqs:
+ uniqueSplice += 1
+ elif doMulti:
+ multiSplice += 1.0/readMultiplicity
+ elif readMultiplicity == 1 and doUniqs:
+ uniques += 1
+ elif doMulti:
+ multis += 1.0/readMultiplicity
+
+ totalReads = uniques + multis + uniqueSplice + multiSplice
+
+ return totalReads
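+
+# Illustrative call (coordinates hypothetical; indexed BAM with NH tags assumed):
+#   getRegionReadCounts(bamfile, "chr1", 10000, 20000, doUniqs=True, doMulti=True)
+# may return a fractional total, since each multi-read contributes 1/NH.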
if __name__ == "__main__":
-#
-# RNAFARpairs.py
-# ENRAGE
-#
-# Created by Ali Mortazavi on 11/2/08.
-#
""" usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
"""
import sys
import time
import optparse
-import ReadDataset
-from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
+import pysam
+from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption, isSpliceEntry
def main(argv=None):
genome = args[0]
goodfilename = args[1]
- rdsfile = args[2]
+ bamfilename = args[2]
outfilename = args[3]
- rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
+ bamfile = pysam.Samfile(bamfilename, "rb")
+
+ rnaFarPairs(genome, goodfilename, bamfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
def makeParser(usage=""):
return parser
-def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+def rnaFarPairs(genome, goodfilename, bamfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+ """ map all candidate regions that have paired ends overlapping with known genes
+ """
+
+ chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+ regions = {}
+ for chromosome in chromosomeList:
+ regions[chromosome] = []
+
goodDict = {}
goodfile = open(goodfilename)
for line in goodfile:
fields = line.split()
- goodDict[fields[0]] = line
+        label = fields[0]
+        chrom = fields[1]  # chromosome column of the candidate-region file
+        start = int(fields[2])
+        stop = int(fields[3])
+        goodDict[label] = line
+        if chrom in regions:
+            regions[chrom].append((start, stop, label))
goodfile.close()
- RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
- chromosomeList = RDS.getChromosomes()
if doVerbose:
print time.ctime()
assigned = {}
farConnected = {}
for chromosome in chromosomeList:
- if doNotProcessChromosome(chromosome):
- continue
-
print chromosome
- uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+        regionList = sorted(regions[chromosome])
+ uniqDict, pairCount = getUniqueReadIDFlags(bamfile, regionList, chromosome, maxDist)
if doVerbose:
print len(uniqDict), time.ctime()
- for readID in uniqDict:
- readList = uniqDict[readID]
- if len(readList) == 2:
- total += 1
- if processReads(readList[:2], maxDist):
- flags = (readList[0]["flag"], readList[1]["flag"])
- processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
- total += processed
- distinct += distinctPairs
+ total += pairCount
+ for readID, readList in uniqDict.items():
+ flags = (readList[0]["flag"], readList[1]["flag"])
+ processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+ total += processed
+ distinct += distinctPairs
entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
distinct += entriesWritten
print time.ctime()
-def doNotProcessChromosome(chromosome):
- return chromosome == "chrM"
+def getUniqueReadIDFlags(bamfile, regions, chromosome, maxDist):
+ """ Returns dictionary of readsIDs with each entry consisting of a list of dictionaries of read start and read flag.
+ Only returns unique non-spliced read pairs matching the criteria given in processReads().
+ """
+ start = 1
+ readDict = {}
+ for regionstart, regionstop, regionname in regions:
+ for alignedread in bamfile.fetch(chromosome, start, regionstop):
+ if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+ if alignedread.pos >= regionstart:
+ flag = regionname
+ else:
+ flag = alignedread.opt("ZG")
+ try:
+ readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+ except KeyError:
+ readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
-def processReads(reads, maxDist):
+ start = regionstop + 1
+
+ for alignedread in bamfile.fetch(chromosome, start):
+ if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+ flag = alignedread.opt("ZG")
+
+ try:
+ readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+ except KeyError:
+ readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
+
+    pairCount = len(readDict)
+ for readID, readList in readDict.items():
+ if len(readList) != 2:
+ del readDict[readID]
+ pairCount -= 1
+ elif not processReads(readList, maxDist):
+ del readDict[readID]
+
+ return readDict, pairCount
+
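+# Shape of the returned readDict (readID and flags hypothetical):
+#   readDict["r1"] == [{"start": 1200, "flag": "RNAFAR.7"},
+#                      {"start": 52900, "flag": "LOC12345"}]
+# pairCount is the number of readIDs with exactly two unique, unspliced alignments.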
+
+def processReads(reads, maxDist=500000):
+ """ For a pair of readID's to be acceptable:
+ - flags must be different
+ - neither flag can be 'NM'
+ - the read starts have to be within maxDist
+ """
process = False
- start1 = reads[0]["start"]
- start2 = reads[1]["start"]
- dist = abs(start1 - start2)
+ dist = abs(reads[0]["start"] - reads[1]["start"])
flag1 = reads[0]["flag"]
flag2 = reads[1]["flag"]
def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+ """ Writes out the region information along with symbol and geneID for paired reads where one read
+ of the pair is in a known gene and the other is in a new region. If both reads in the pair are
+ in new regions the region is added to farConnected. No action is taken if both reads in the
+ pair are in known genes.
+ """
flag1, flag2 = flags
total = 0
distinct = 0
except KeyError:
farConnected[geneID] = [farFlag]
elif read1IsGood or read2IsGood:
- total += 1
+ total = 1
if read2IsGood:
farFlag = flag2
geneID = flag1
farFlag = flag1
geneID = flag2
- try:
- if genome == "dmelanogaster":
- symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
- else:
- symbol = geneInfoDict[geneID][0][0]
- except (KeyError, IndexError):
- try:
- symbol = geneAnnotDict[(genome, geneID)][0]
- except (KeyError, IndexError):
- symbol = "LOC%s" % geneID
-
- symbol = symbol.strip()
- symbol = symbol.replace(" ","|")
- symbol = symbol.replace("\t","|")
-
+ symbol = getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict)
if farFlag not in assigned:
assigned[farFlag] = (symbol, geneID)
print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
- distinct += 1
+ distinct = 1
return total, distinct
+def getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict):
+ try:
+ if genome == "dmelanogaster":
+ symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
+ else:
+ symbol = geneInfoDict[geneID][0][0]
+ except (KeyError, IndexError):
+ try:
+ symbol = geneAnnotDict[(genome, geneID)][0]
+ except (KeyError, IndexError):
+ symbol = "LOC%s" % geneID
+
+ symbol = symbol.strip()
+ symbol = symbol.replace(" ","|")
+ symbol = symbol.replace("\t","|")
+
+ return symbol
+
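+# e.g. an annotation like "hypothetical protein 123" is written out as
+# "hypothetical|protein|123" so the space-delimited output stays parseable.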
+
def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
import sys
import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
from checkrmask import checkrmask
from geneMrnaCounts import geneMrnaCounts
from geneMrnaCountsWeighted import geneMrnaCountsWeighted
return parser
-def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False):
+def runRNAPairedAnalysis(genome, fileprefix, repeatmaskdb, modelfile="", replacemodels=False):
""" based on original script runRNAPairedAnalysis.sh
- usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]
- where rdsprefix is the name of the rds file without the .rds extension
+ usage:runRNAPairedAnalysis.sh genome fileprefix repeatmaskdb [modelfile] [--replacemodels]
+ where fileprefix is the name of the bam file without the .bam extension
use "none" for the repeatmaskdb if you do not have one
"""
- rdsfile = "%s.rds" % rdsprefix
+ bamfilename = "%s.bam" % fileprefix
+ bamfile = pysam.Samfile(bamfilename, "rb")
logfile = "rna.log"
# log the parameters
- message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb)
+ message = "with parameters: %s %s %s" % (genome, fileprefix, repeatmaskdb)
writeLog(logfile, "runRNAPairedAnalysis.py", message)
# count the unique reads falling on the gene models ; the nomatch files are
# mappable reads that fell outside of the Cistematic gene models and not the
# unmappable of Eland (i.e, the "NM" reads)
- uniquecountfilename = "%s.uniqs.count" % rdsprefix
- geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+ #
+ # These will need to be marked by running the BAM preprocessor with the markGID option
+ uniquecountfilename = "%s.uniqs.count" % fileprefix
+ geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
# calculate a first-pass RPKM to re-weigh the unique reads,
# using 'none' for the splice count
- initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
- RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
- (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+ initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
readCounts = {}
- readCounts["uniq"] = ucount
- readCounts["splice"] = mcount
- readCounts["multi"] = scount
+ readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+ readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+ readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
# recount the unique reads with weights calculated during the first pass
- uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
- geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+ uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+ geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
# count splice reads
- splicecountfilename = "%s.splices.count" % rdsprefix
- geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+ splicecountfilename = "%s.splices.count" % fileprefix
+ geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
# find new regions outside of gene models with reads piled up
- newregionfilename = "%s.newregions.txt" % rdsprefix
+ newregionfilename = "%s.newregions.txt" % fileprefix
regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
- findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+ findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
# filter out new regions that overlap repeats more than a certain fraction
- outFileName = "%s.newregions.repstatus" % rdsprefix
- goodFileName = "%s.newregions.good" % rdsprefix
- checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+ outFileName = "%s.newregions.repstatus" % fileprefix
+ regionfilename = "%s.newregions.checked" % fileprefix
+ checkrmask(repeatmaskdb, newregionfilename, outFileName, regionfilename, startField=1, cachePages=1, logfilename=logfile)
# calculate the read densities
- regionfilename = "%s.newregions.checked" % rdsprefix
- regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
+ goodFileName = "%s.newregions.good" % fileprefix
+ regionCounts(regionfilename, bamfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
# map all candidate regions that have paired ends overlapping with known genes
- candidatefilename = "%s.candidates.txt" % rdsprefix
- rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True)
+ candidatefilename = "%s.candidates.txt" % fileprefix
+ rnaFarPairs(genome, goodFileName, bamfile, candidatefilename, doCache=True)
- expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+ expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
# calculate expanded exonic read density
- acceptedfilename = "%s.accepted.rpkm" % rdsprefix
- try:
- candidatefile = open(candidatefilename)
- candidateLines = candidatefile.readlines()
- candidatefile.close()
- except IOError:
- candidateLines = []
-
- normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines,
+ acceptedfilename = "%s.accepted.rpkm" % fileprefix
+ normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
# weigh multi-reads
- multicountfilename = "%s.multi.count" % rdsprefix
- acceptfile = "%s.accepted.rpkm" % rdsprefix
- geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
+ multicountfilename = "%s.multi.count" % fileprefix
+ acceptfile = "%s.accepted.rpkm" % fileprefix
+ geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
extendGenome=modelfile, replaceModels=replacemodels)
# calculate final exonic read density
- outfilename = "%s.final.rpkm" % rdsprefix
+ outfilename = "%s.final.rpkm" % fileprefix
normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
import sys
import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
from checkrmask import checkrmask
from geneMrnaCounts import geneMrnaCounts
from geneMrnaCountsWeighted import geneMrnaCountsWeighted
argv = sys.argv
print "runStandardAnalysis: version %s" % VERSION
- usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
+ usage = "usage: python runStandardAnalysis.py genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
sys.exit(1)
genome = args[0]
- rdsprefix = args[1]
+ fileprefix = args[1]
repeatmaskdb = args[2]
- bpradius = args[3]
+ bpradius = int(args[3])
try:
modelfile = args[4]
except IndexError:
modelfile = ""
- runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
+ runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
def getParser(usage):
return parser
-def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
+def runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
""" based on original script runStandardAnalysis.sh
- usage: runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
- where rdsprefix is the name of the rds file without the .rds extension
+ usage: runStandardAnalysis.sh genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
+ where fileprefix is the name of the bam file without the .bam extension
use "none" for the repeatmaskdb if you do not have one
"""
- rdsfile = "%s.rds" % rdsprefix
+ bamfilename = "%s.bam" % fileprefix
+ bamfile = pysam.Samfile(bamfilename, "rb")
logfile = "rna.log"
# log the parameters
- message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+ message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
writeLog(logfile, "runStandardAnalysis.py", message)
# count the unique reads falling on the gene models ; the nomatch files are
# mappable reads that fell outside of the Cistematic gene models and not the
# unmappable of Eland (i.e, the "NM" reads)
- uniquecountfilename = "%s.uniqs.count" % rdsprefix
- geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+ #
+ # These will need to be marked by running the BAM preprocessor with the markGID option
+ uniquecountfilename = "%s.uniqs.count" % fileprefix
+ geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
# calculate a first-pass RPKM to re-weigh the unique reads,
# using 'none' for the splice count
- splicecountfilename = "none"
- initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
- RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
- (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+ initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
readCounts = {}
- readCounts["uniq"] = ucount
- readCounts["splice"] = mcount
- readCounts["multi"] = scount
- normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+ readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+ readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+ readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
+ normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
# recount the unique reads with weights calculated during the first pass
- uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
- geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+ uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+ geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
# count splice reads
- splicecountfilename = "%s.splices.count" % rdsprefix
- geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1)
+ splicecountfilename = "%s.splices.count" % fileprefix
+ geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
# Alternative 1: find new regions outside of gene models with reads piled up
- newregionfilename = "%s.newregions.txt" % rdsprefix
+ newregionfilename = "%s.newregions.txt" % fileprefix
regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
- findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+ findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
# Alternative 1: filter out new regions that overlap repeats more than a certain fraction
- outFileName = "%s.newregions.repstatus" % rdsprefix
- goodFileName = "%s.newregions.good" % rdsprefix
+ outFileName = "%s.newregions.repstatus" % fileprefix
+ goodFileName = "%s.newregions.good" % fileprefix
checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
# map all candidate regions that are within a given radius of a gene in bp
- candidatefilename = "%s.candidates.txt" % rdsprefix
+ candidatefilename = "%s.candidates.txt" % fileprefix
getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
- expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+ expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
# calculate expanded exonic read density
- acceptedfilename = "%s.accepted.rpkm" % rdsprefix
- try:
- candidatefile = open(candidatefilename)
- candidateLines = candidatefile.readlines()
- candidatefile.close()
- except IOError:
- candidateLines = []
-
- normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
- doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+ acceptedfilename = "%s.accepted.rpkm" % fileprefix
+ normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+ acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
# weigh multi-reads
- multicountfilename = "%s.multi.count" % rdsprefix
- geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
+ multicountfilename = "%s.multi.count" % fileprefix
+ geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
replaceModels=replacemodels)
# calculate final exonic read density
- outfilename = "%s.final.rpkm" % rdsprefix
+ outfilename = "%s.final.rpkm" % fileprefix
normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
+ #TODO: remove when not tracking
+ writeLog(logfile, "runStandardAnalysis.py", "analysis complete")
+
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file
import sys
import optparse
-import ReadDataset
-from commoncode import writeLog
+import pysam
+from commoncode import writeLog, getHeaderComment
from checkrmask import checkrmask
from geneMrnaCounts import geneMrnaCounts
from geneMrnaCountsWeighted import geneMrnaCountsWeighted
return parser
-def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
+def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius):
""" based on original script runStrandedAnalysis.sh
-    usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
-    where rdsprefix is the name of the rds file without the .rds extension
+    usage: runStrandedAnalysis.sh genome fileprefix repeatmaskdb bpradius
+    where fileprefix is the name of the bam file without the .bam extension
use "none" for the repeatmaskdb if you do not have one
"""
- rdsfile = "%s.rds" % rdsprefix
+ bamfilename = "%s.bam" % fileprefix
+ bamfile = pysam.Samfile(bamfilename, "rb")
logfile = "rna.log"
# log the parameters
- message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+ message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
writeLog(logfile, "runStrandedAnalysis.py", message)
# count the unique reads falling on the gene models ; the nomatch files are
# mappable reads that fell outside of the Cistematic gene models and not the
# unmappable of Eland (i.e, the "NM" reads)
- uniquecountfilename = "%s.uniqs.count" % rdsprefix
- geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True)
+ uniquecountfilename = "%s.uniqs.count" % fileprefix
+ geneMrnaCounts(genome, bamfile, uniquecountfilename, trackStrand=True, markGID=True)
# calculate a first-pass RPKM to re-weigh the unique reads,
# using 'none' for the splice count
- initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
- RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
- (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+ initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
readCounts = {}
- readCounts["uniq"] = ucount
- readCounts["splice"] = mcount
- readCounts["multi"] = scount
+ readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+ readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+ readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
# recount the unique reads with weights calculated during the first pass
- initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
- uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
- geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
+ initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
+ uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+ geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
# count splice reads
- splicecountfilename = "%s.splices.count" % rdsprefix
- geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1)
+ splicecountfilename = "%s.splices.count" % fileprefix
+ geneMrnaCounts(genome, bamfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False)
# find new regions outside of gene models with reads piled up
- newregionfilename = "%s.newregions.txt" % rdsprefix
+ newregionfilename = "%s.newregions.txt" % fileprefix
regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
- findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+ findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
- findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a")
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False, outputMode="a")
# filter out new regions that overlap repeats more than a certain fraction
- newregionfilename = "%s.newregions.txt" % rdsprefix
- outFileName = "%s.newregions.repstatus" % rdsprefix
- goodFileName = "%s.newregions.good" % rdsprefix
+ newregionfilename = "%s.newregions.txt" % fileprefix
+ outFileName = "%s.newregions.repstatus" % fileprefix
+ goodFileName = "%s.newregions.good" % fileprefix
checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
#TODO: these calls look wrong
#python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
# map all candidate regions that are within a given radius of a gene in bp
- candidatefilename = "%s.candidates.txt" % rdsprefix
+ candidatefilename = "%s.candidates.txt" % fileprefix
getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
- expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+ expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
# calculate expanded exonic read density
- acceptedfilename = "%s.accepted.rpkm" % rdsprefix
- try:
- candidatefile = open(candidatefilename)
- candidateLines = candidatefile.readlines()
- candidatefile.close()
- except IOError:
- candidateLines = []
-
- normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
- doCache=True)
+ acceptedfilename = "%s.accepted.rpkm" % fileprefix
+ normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+ acceptedfilename=acceptedfilename, doCache=True)
# weigh multi-reads
- multicountfilename = "%s.multi.count" % rdsprefix
- geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
+ multicountfilename = "%s.multi.count" % fileprefix
+ geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
# calculate final exonic read density
- outfilename = "%s.final.rpkm" % rdsprefix
+ outfilename = "%s.final.rpkm" % fileprefix
normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
siteNotCommon = True
#TODO: as handed to me this is O(n^m) - probably could be better...
- #TODO: QForAli - if there are multiple matches will count them but only write the first match to the file
+ #TODO: if there are multiple matches will count them but only write the first match to the file
for (rstart, rstop, rline) in siteDict[chrom]:
if regionsOverlap(start, stop, rstart, rstop):
commonSites += 1
chrom = "chr%s" % multiReadList[index][1]
reweighList.append((round(score,3), chrom, start, theID))
- #TODO: QForAli - Is this right? If match index is 0 are we doing nothing?
+ #TODO: Is this right? If match index is 0 are we doing nothing?
if theMatch > 0:
RDS.reweighMultireads(reweighList)
fixedPair += 1