X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=runStandardAnalysis.py;fp=runStandardAnalysis.py;h=d20ef640899392ec1f08f474328298d1274e69c9;hp=adfa37522a5f2c9af6d70b2a117102a6ab4163c8;hb=4ad5495359e4322da39868020a7398676261679e;hpb=cfc5602b26323ad2365295145e3f6c622d912eb4 diff --git a/runStandardAnalysis.py b/runStandardAnalysis.py index adfa375..d20ef64 100644 --- a/runStandardAnalysis.py +++ b/runStandardAnalysis.py @@ -1,7 +1,7 @@ import sys import optparse -import ReadDataset -from commoncode import getConfigParser, getConfigBoolOption, writeLog +import pysam +from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment from checkrmask import checkrmask from geneMrnaCounts import geneMrnaCounts from geneMrnaCountsWeighted import geneMrnaCountsWeighted @@ -17,7 +17,7 @@ def main(argv=None): argv = sys.argv print "runStandardAnalysis: version %s" % VERSION - usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]" + usage = "usage: python runStandardAnalysis.py genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]" parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) @@ -27,15 +27,15 @@ def main(argv=None): sys.exit(1) genome = args[0] - rdsprefix = args[1] + fileprefix = args[1] repeatmaskdb = args[2] - bpradius = args[3] + bpradius = int(args[3]) try: modelfile = args[4] except IndexError: modelfile = "" - runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels) + runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels) def getParser(usage): @@ -50,82 +50,78 @@ def getParser(usage): return parser -def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False): +def runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False): """ based on original script runStandardAnalysis.sh - usage: runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels] - where rdsprefix is the name of the rds file without the .rds extension + usage: runStandardAnalysis.sh genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels] + where fileprefix is the name of the bam file without the .bam extension use "none" for the repeatmaskdb if you do not have one """ - rdsfile = "%s.rds" % rdsprefix + bamfilename = "%s.bam" % fileprefix + bamfile = pysam.Samfile(bamfilename, "rb") logfile = "rna.log" # log the parameters - message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius) + message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius) writeLog(logfile, "runStandardAnalysis.py", message) # count the unique reads falling on the gene models ; the nomatch files are # mappable reads that fell outside of the Cistematic gene models and not the # unmappable of Eland (i.e, the "NM" reads) - uniquecountfilename = "%s.uniqs.count" % rdsprefix - geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True) + # + # These will need to be marked by running the BAM preprocessor with the markGID option + uniquecountfilename = "%s.uniqs.count" % fileprefix + geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels) # calculate a first-pass RPKM to re-weigh the unique reads, # using 'none' for the splice count - splicecountfilename = "none" - initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix - RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False) - (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False) + initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix readCounts = {} - readCounts["uniq"] = ucount - readCounts["splice"] = mcount - readCounts["multi"] = scount - normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) + readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique")) + readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices")) + readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis")) + normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) # recount the unique reads with weights calculated during the first pass - uniquerecountfilename = "%s.uniqs.recount" % rdsprefix - geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels) + uniquerecountfilename = "%s.uniqs.recount" % fileprefix + geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels) # count splice reads - splicecountfilename = "%s.splices.count" % rdsprefix - geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1) + splicecountfilename = "%s.splices.count" % fileprefix + geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels) # Alternative 1: find new regions outside of gene models with reads piled up - newregionfilename = "%s.newregions.txt" % rdsprefix + newregionfilename = "%s.newregions.txt" % fileprefix regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM") - findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False) + findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False) # Alternative 1: filter out new regions that overlap repeats more than a certain fraction - outFileName = "%s.newregions.repstatus" % rdsprefix - goodFileName = "%s.newregions.good" % rdsprefix + outFileName = "%s.newregions.repstatus" % fileprefix + goodFileName = "%s.newregions.good" % fileprefix checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile) # map all candidate regions that are within a given radius of a gene in bp - candidatefilename = "%s.candidates.txt" % rdsprefix + candidatefilename = "%s.candidates.txt" % fileprefix getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) - expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix + expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix # calculate expanded exonic read density - acceptedfilename = "%s.accepted.rpkm" % rdsprefix - try: - candidatefile = open(candidatefilename) - candidateLines = candidatefile.readlines() - candidatefile.close() - except IOError: - candidateLines = [] - - normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename, - doCache=True, extendGenome=modelfile, replaceModels=replacemodels) + acceptedfilename = "%s.accepted.rpkm" % fileprefix + normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename, + acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) # weigh multi-reads - multicountfilename = "%s.multi.count" % rdsprefix - geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile, + multicountfilename = "%s.multi.count" % fileprefix + geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels) # calculate final exonic read density - outfilename = "%s.final.rpkm" % rdsprefix + outfilename = "%s.final.rpkm" % fileprefix normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True) + #TODO: remove when not tracking + writeLog(logfile, "runStandardAnalysis.py", "analysis complete") + if __name__ == "__main__": main(sys.argv) \ No newline at end of file