X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=runRNAPairedAnalysis.py;fp=runRNAPairedAnalysis.py;h=0c01de47d6a4b0aa7a1f4476ad82c024da669809;hp=ebc25a530af4166ea69b17a8228a6943b19960c6;hb=4ad5495359e4322da39868020a7398676261679e;hpb=cfc5602b26323ad2365295145e3f6c622d912eb4 diff --git a/runRNAPairedAnalysis.py b/runRNAPairedAnalysis.py index ebc25a5..0c01de4 100644 --- a/runRNAPairedAnalysis.py +++ b/runRNAPairedAnalysis.py @@ -1,7 +1,7 @@ import sys import optparse -import ReadDataset -from commoncode import getConfigParser, getConfigBoolOption, writeLog +import pysam +from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment from checkrmask import checkrmask from geneMrnaCounts import geneMrnaCounts from geneMrnaCountsWeighted import geneMrnaCountsWeighted @@ -50,84 +50,78 @@ def getParser(usage): return parser -def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False): +def runRNAPairedAnalysis(genome, fileprefix, repeatmaskdb, modelfile="", replacemodels=False): """ based on original script runRNAPairedAnalysis.sh - usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels] - where rdsprefix is the name of the rds file without the .rds extension + usage:runRNAPairedAnalysis.sh genome fileprefix repeatmaskdb [modelfile] [--replacemodels] + where fileprefix is the name of the bam file without the .bam extension use "none" for the repeatmaskdb if you do not have one """ - rdsfile = "%s.rds" % rdsprefix + bamfilename = "%s.bam" % fileprefix + bamfile = pysam.Samfile(bamfilename, "rb") logfile = "rna.log" # log the parameters - message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb) + message = "with parameters: %s %s %s" % (genome, fileprefix, repeatmaskdb) writeLog(logfile, "runRNAPairedAnalysis.py", message) # count the unique reads falling on the gene models ; the nomatch files are # mappable reads that fell outside of the Cistematic gene models and not the # unmappable of Eland (i.e, the "NM" reads) - uniquecountfilename = "%s.uniqs.count" % rdsprefix - geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True) + # + # These will need to be marked by running the BAM preprocessor with the markGID option + uniquecountfilename = "%s.uniqs.count" % fileprefix + geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels) # calculate a first-pass RPKM to re-weigh the unique reads, # using 'none' for the splice count - initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix - RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False) - (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False) + initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix readCounts = {} - readCounts["uniq"] = ucount - readCounts["splice"] = mcount - readCounts["multi"] = scount + readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique")) + readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices")) + readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis")) normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) # recount the unique reads with weights calculated during the first pass - uniquerecountfilename = "%s.uniqs.recount" % rdsprefix - geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels) + uniquerecountfilename = "%s.uniqs.recount" % fileprefix + geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels) # count splice reads - splicecountfilename = "%s.splices.count" % rdsprefix - geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True) + splicecountfilename = "%s.splices.count" % fileprefix + geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels) # find new regions outside of gene models with reads piled up - newregionfilename = "%s.newregions.txt" % rdsprefix + newregionfilename = "%s.newregions.txt" % fileprefix regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM") - findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False) + findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False) # filter out new regions that overlap repeats more than a certain fraction - outFileName = "%s.newregions.repstatus" % rdsprefix - goodFileName = "%s.newregions.good" % rdsprefix - checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile) + outFileName = "%s.newregions.repstatus" % fileprefix + regionfilename = "%s.newregions.checked" % fileprefix + checkrmask(repeatmaskdb, newregionfilename, outFileName, regionfilename, startField=1, cachePages=1, logfilename=logfile) # calculate the read densities - regionfilename = "%s.newregions.checked" % rdsprefix - regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile) + goodFileName = "%s.newregions.good" % fileprefix + regionCounts(regionfilename, bamfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile) # map all candidate regions that have paired ends overlapping with known genes - candidatefilename = "%s.candidates.txt" % rdsprefix - rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True) + candidatefilename = "%s.candidates.txt" % fileprefix + rnaFarPairs(genome, goodFileName, bamfile, candidatefilename, doCache=True) - expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix + expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix # calculate expanded exonic read density - acceptedfilename = "%s.accepted.rpkm" % rdsprefix - try: - candidatefile = open(candidatefilename) - candidateLines = candidatefile.readlines() - candidatefile.close() - except IOError: - candidateLines = [] - - normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, + acceptedfilename = "%s.accepted.rpkm" % fileprefix + normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename, acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) # weigh multi-reads - multicountfilename = "%s.multi.count" % rdsprefix - acceptfile = "%s.accepted.rpkm" % rdsprefix - geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1, + multicountfilename = "%s.multi.count" % fileprefix + acceptfile = "%s.accepted.rpkm" % fileprefix + geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels) # calculate final exonic read density - outfilename = "%s.final.rpkm" % rdsprefix + outfilename = "%s.final.rpkm" % fileprefix normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)