+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigBoolOption, writeLog
+from checkrmask import checkrmask
+from geneMrnaCounts import geneMrnaCounts
+from geneMrnaCountsWeighted import geneMrnaCountsWeighted
+from regionCounts import regionCounts
+from rnafarPairs import rnaFarPairs
+from normalizeFinalExonic import normalizeFinalExonic
+from normalizeExpandedExonic import normalizeExpandedExonic
+from findall import findall, RegionFinder
+
+VERSION = "1.0"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ print "runRNAPairedAnalysis: version %s" % VERSION
+ usage = "usage: python runRNAPairedAnalysis.py genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]"
+
+ parser = getParser(usage)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+ rdsprefix = args[1]
+ repeatmaskdb = args[2]
+ try:
+ modelfile = args[3]
+ except IndexError:
+ modelfile = ""
+
+ runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile=modelfile, replacemodels=options.replacemodels)
+
+
+def getParser(usage):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--replacemodels", action="store_true", dest="replacemodels")
+
+ configParser = getConfigParser()
+ section = "RunRNAPairedAnalysis"
+ replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False)
+ parser.set_defaults(replacemodels=replacemodels)
+
+ return parser
+
+
+def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False):
+ """ based on original script runRNAPairedAnalysis.sh
+ usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]
+ where rdsprefix is the name of the rds file without the .rds extension
+ use "none" for the repeatmaskdb if you do not have one
+ """
+
+ rdsfile = "%s.rds" % rdsprefix
+ logfile = "rna.log"
+
+ # log the parameters
+ message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb)
+ writeLog(logfile, "runRNAPairedAnalysis.py", message)
+
+ # count the unique reads falling on the gene models ; the nomatch files are
+ # mappable reads that fell outside of the Cistematic gene models and not the
+ # unmappable of Eland (i.e, the "NM" reads)
+ uniquecountfilename = "%s.uniqs.count" % rdsprefix
+ geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+
+ # calculate a first-pass RPKM to re-weigh the unique reads,
+ # using 'none' for the splice count
+ initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
+ RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
+ (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+ readCounts = {}
+ readCounts["uniq"] = ucount
+ readCounts["splice"] = mcount
+ readCounts["multi"] = scount
+ normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+
+ # recount the unique reads with weights calculated during the first pass
+ uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
+ geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+
+ # count splice reads
+ splicecountfilename = "%s.splices.count" % rdsprefix
+ geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+
+ # find new regions outside of gene models with reads piled up
+ newregionfilename = "%s.newregions.txt" % rdsprefix
+ regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
+ findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+
+ # filter out new regions that overlap repeats more than a certain fraction
+ outFileName = "%s.newregions.repstatus" % rdsprefix
+ goodFileName = "%s.newregions.good" % rdsprefix
+ checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+
+ # calculate the read densities
+ regionfilename = "%s.newregions.checked" % rdsprefix
+ regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
+
+ # map all candidate regions that have paired ends overlapping with known genes
+ candidatefilename = "%s.candidates.txt" % rdsprefix
+ rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True)
+
+ expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+ # calculate expanded exonic read density
+ acceptedfilename = "%s.accepted.rpkm" % rdsprefix
+ try:
+ candidatefile = open(candidatefilename)
+ candidateLines = candidatefile.readlines()
+ candidatefile.close()
+ except IOError:
+ candidateLines = []
+
+ normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines,
+ acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+
+ # weigh multi-reads
+ multicountfilename = "%s.multi.count" % rdsprefix
+ acceptfile = "%s.accepted.rpkm" % rdsprefix
+ geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
+ extendGenome=modelfile, replaceModels=replacemodels)
+
+ # calculate final exonic read density
+ outfilename = "%s.final.rpkm" % rdsprefix
+ normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file