X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=runRNAPairedAnalysis.py;fp=runRNAPairedAnalysis.py;h=ebc25a530af4166ea69b17a8228a6943b19960c6;hp=0000000000000000000000000000000000000000;hb=cfc5602b26323ad2365295145e3f6c622d912eb4;hpb=c4561c55cfa9726530c6777b6515c4ef66306b2f diff --git a/runRNAPairedAnalysis.py b/runRNAPairedAnalysis.py new file mode 100644 index 0000000..ebc25a5 --- /dev/null +++ b/runRNAPairedAnalysis.py @@ -0,0 +1,135 @@ +import sys +import optparse +import ReadDataset +from commoncode import getConfigParser, getConfigBoolOption, writeLog +from checkrmask import checkrmask +from geneMrnaCounts import geneMrnaCounts +from geneMrnaCountsWeighted import geneMrnaCountsWeighted +from regionCounts import regionCounts +from rnafarPairs import rnaFarPairs +from normalizeFinalExonic import normalizeFinalExonic +from normalizeExpandedExonic import normalizeExpandedExonic +from findall import findall, RegionFinder + +VERSION = "1.0" + +def main(argv=None): + if not argv: + argv = sys.argv + + print "runRNAPairedAnalysis: version %s" % VERSION + usage = "usage: python runRNAPairedAnalysis.py genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]" + + parser = getParser(usage) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + genome = args[0] + rdsprefix = args[1] + repeatmaskdb = args[2] + try: + modelfile = args[3] + except IndexError: + modelfile = "" + + runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile=modelfile, replacemodels=options.replacemodels) + + +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--replacemodels", action="store_true", dest="replacemodels") + + configParser = getConfigParser() + section = "RunRNAPairedAnalysis" + replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False) + parser.set_defaults(replacemodels=replacemodels) + + return parser + + +def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False): + """ based on original script runRNAPairedAnalysis.sh + usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels] + where rdsprefix is the name of the rds file without the .rds extension + use "none" for the repeatmaskdb if you do not have one + """ + + rdsfile = "%s.rds" % rdsprefix + logfile = "rna.log" + + # log the parameters + message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb) + writeLog(logfile, "runRNAPairedAnalysis.py", message) + + # count the unique reads falling on the gene models ; the nomatch files are + # mappable reads that fell outside of the Cistematic gene models and not the + # unmappable of Eland (i.e, the "NM" reads) + uniquecountfilename = "%s.uniqs.count" % rdsprefix + geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True) + + # calculate a first-pass RPKM to re-weigh the unique reads, + # using 'none' for the splice count + initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix + RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False) + (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False) + readCounts = {} + readCounts["uniq"] = ucount + readCounts["splice"] = mcount + readCounts["multi"] = scount + normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) + + # recount the unique reads with weights calculated during the first pass + uniquerecountfilename = "%s.uniqs.recount" % rdsprefix + geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels) + + # count splice reads + splicecountfilename = "%s.splices.count" % rdsprefix + geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True) + + # find new regions outside of gene models with reads piled up + newregionfilename = "%s.newregions.txt" % rdsprefix + regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM") + findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False) + + # filter out new regions that overlap repeats more than a certain fraction + outFileName = "%s.newregions.repstatus" % rdsprefix + goodFileName = "%s.newregions.good" % rdsprefix + checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile) + + # calculate the read densities + regionfilename = "%s.newregions.checked" % rdsprefix + regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile) + + # map all candidate regions that have paired ends overlapping with known genes + candidatefilename = "%s.candidates.txt" % rdsprefix + rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True) + + expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix + # calculate expanded exonic read density + acceptedfilename = "%s.accepted.rpkm" % rdsprefix + try: + candidatefile = open(candidatefilename) + candidateLines = candidatefile.readlines() + candidatefile.close() + except IOError: + candidateLines = [] + + normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, + acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) + + # weigh multi-reads + multicountfilename = "%s.multi.count" % rdsprefix + acceptfile = "%s.accepted.rpkm" % rdsprefix + geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1, + extendGenome=modelfile, replaceModels=replacemodels) + + # calculate final exonic read density + outfilename = "%s.final.rpkm" % rdsprefix + normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file