X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=runStandardAnalysis.py;fp=runStandardAnalysis.py;h=adfa37522a5f2c9af6d70b2a117102a6ab4163c8;hp=0000000000000000000000000000000000000000;hb=cfc5602b26323ad2365295145e3f6c622d912eb4;hpb=c4561c55cfa9726530c6777b6515c4ef66306b2f

diff --git a/runStandardAnalysis.py b/runStandardAnalysis.py
new file mode 100644
index 0000000..adfa375
--- /dev/null
+++ b/runStandardAnalysis.py
@@ -0,0 +1,131 @@
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigBoolOption, writeLog
+from checkrmask import checkrmask
+from geneMrnaCounts import geneMrnaCounts
+from geneMrnaCountsWeighted import geneMrnaCountsWeighted
+from getallgenes import getallgenes
+from normalizeFinalExonic import normalizeFinalExonic
+from normalizeExpandedExonic import normalizeExpandedExonic
+from findall import findall, RegionFinder
+
+VERSION = "1.0"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "runStandardAnalysis: version %s" % VERSION
+    usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
+
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    rdsprefix = args[1]
+    repeatmaskdb = args[2]
+    bpradius = args[3]
+    try:
+        modelfile = args[4]
+    except IndexError:
+        modelfile = ""
+
+    runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
+
+
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--replacemodels", action="store_true", dest="replacemodels")
+
+    configParser = getConfigParser()
+    section = "RunStandardAnalysis"
+    replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False)
+    parser.set_defaults(replacemodels=replacemodels)
+
+    return parser
+
+
+def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
+    """ based on original script runStandardAnalysis.sh
+        usage: runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
+               where rdsprefix is the name of the rds file without the .rds extension
+               use "none" for the repeatmaskdb if you do not have one
+    """
+
+    rdsfile = "%s.rds" % rdsprefix
+    logfile = "rna.log"
+
+    # log the parameters
+    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    writeLog(logfile, "runStandardAnalysis.py", message)
+
+    # count the unique reads falling on the gene models ; the nomatch files are
+    # mappable reads that fell outside of the Cistematic gene models and not the
+    # unmappable of Eland (i.e, the "NM" reads)
+    uniquecountfilename = "%s.uniqs.count" % rdsprefix
+    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+
+    # calculate a first-pass RPKM to re-weigh the unique reads,
+    # using 'none' for the splice count
+    splicecountfilename = "none"
+    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
+    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    readCounts = {}
+    readCounts["uniq"] = ucount
+    readCounts["splice"] = mcount
+    readCounts["multi"] = scount
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+
+    # recount the unique reads with weights calculated during the first pass
+    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
+    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+
+    # count splice reads
+    splicecountfilename = "%s.splices.count" % rdsprefix
+    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1)
+
+    # Alternative 1: find new regions outside of gene models with reads piled up
+    newregionfilename = "%s.newregions.txt" % rdsprefix
+    regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
+    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+
+    # Alternative 1: filter out new regions that overlap repeats more than a certain fraction
+    outFileName = "%s.newregions.repstatus" % rdsprefix
+    goodFileName = "%s.newregions.good" % rdsprefix
+    checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+
+    # map all candidate regions that are within a given radius of a gene in bp
+    candidatefilename = "%s.candidates.txt" % rdsprefix
+    getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+
+    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    # calculate expanded exonic read density
+    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
+    try:
+        candidatefile = open(candidatefilename)
+        candidateLines = candidatefile.readlines()
+        candidatefile.close()
+    except IOError:
+        candidateLines = []
+
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
+                            doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+
+    # weigh multi-reads
+    multicountfilename = "%s.multi.count" % rdsprefix
+    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
+                           replaceModels=replacemodels)
+
+    # calculate final exonic read density
+    outfilename = "%s.final.rpkm" % rdsprefix
+    normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file