4 from commoncode import getConfigParser, getConfigBoolOption, writeLog
5 from checkrmask import checkrmask
6 from geneMrnaCounts import geneMrnaCounts
7 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
8 from regionCounts import regionCounts
9 from rnafarPairs import rnaFarPairs
10 from normalizeFinalExonic import normalizeFinalExonic
11 from normalizeExpandedExonic import normalizeExpandedExonic
12 from findall import findall, RegionFinder
20 print "runRNAPairedAnalysis: version %s" % VERSION
21 usage = "usage: python runRNAPairedAnalysis.py genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]"
23 parser = getParser(usage)
24 (options, args) = parser.parse_args(argv[1:])
32 repeatmaskdb = args[2]
38 runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile=modelfile, replacemodels=options.replacemodels)
# NOTE(review): option-parser setup.  The call ``getParser(usage)`` elsewhere
# in the file suggests these statements are the body of ``getParser``, but its
# ``def`` line and return statement are not among the visible lines -- confirm.

# Read the configured value for "replacemodels" from the
# "RunRNAPairedAnalysis" section; False is passed as the last argument
# (presumably the fallback when the option is absent -- confirm).
configParser = getConfigParser()
section = "RunRNAPairedAnalysis"
replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False)

# Build the parser with the single visible flag and seed its default with
# the value read above.
parser = optparse.OptionParser(usage=usage)
parser.add_option("--replacemodels", action="store_true", dest="replacemodels")
parser.set_defaults(replacemodels=replacemodels)
def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False):
    """ Run the paired-read RNA analysis steps for one RDS dataset.

        based on original script runRNAPairedAnalysis.sh
        usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]
            where rdsprefix is the name of the rds file without the .rds extension
            use "none" for the repeatmaskdb if you do not have one

        genome        -- forwarded to the gene-model counting, normalization
                         and rnaFarPairs steps
        rdsprefix     -- prefix of every file named here; the dataset itself
                         is "<rdsprefix>.rds"
        repeatmaskdb  -- forwarded to checkrmask
        modelfile     -- forwarded as extendGenome to the counting and
                         normalization steps
        replacemodels -- forwarded as replaceModels to the same steps

        Files named along the way (all "<rdsprefix>" + suffix): .uniqs.count,
        .firstpass.rpkm, .uniqs.recount, .splices.count, .newregions.txt,
        .newregions.repstatus, .newregions.good, .newregions.checked,
        .candidates.txt, .expanded.rpkm, .accepted.rpkm, .multi.count and
        .final.rpkm.
    """

    rdsfile = "%s.rds" % rdsprefix

    # NOTE(review): ``logfile`` is used throughout this function but is not
    # assigned in any visible line -- presumably it is set on an elided line
    # or at module level; confirm before relying on this block.
    message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb)
    writeLog(logfile, "runRNAPairedAnalysis.py", message)

    # count the unique reads falling on the gene models ; the nomatch files are
    # mappable reads that fell outside of the Cistematic gene models and not the
    # unmappable of Eland (i.e, the "NM" reads)
    uniquecountfilename = "%s.uniqs.count" % rdsprefix
    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile,
                   replaceModels=replacemodels, cachePages=1, markGID=True)

    # calculate a first-pass RPKM to re-weigh the unique reads,
    # using 'none' for the splice count
    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)

    # The tuple is unpacked as (ucount, mcount, scount) from a call made with
    # multi=True, splices=True, i.e. unique / multi / splice by name.  The
    # previous code stored mcount under "splice" and scount under "multi";
    # each count is now filed under the key matching its name.
    # NOTE(review): confirm the return order against ReadDataset.getCounts.
    readCounts = {
        "uniq": ucount,
        "multi": mcount,
        "splice": scount,
    }
    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none",
                            initialrpkmfilename, doCache=True, extendGenome=modelfile,
                            replaceModels=replacemodels)

    # recount the unique reads with weights calculated during the first pass
    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename,
                           withUniqs=True, cachePages=1, extendGenome=modelfile,
                           replaceModels=replacemodels)

    # count the splice reads only (doSplices=True, doUniqs=False)
    splicecountfilename = "%s.splices.count" % rdsprefix
    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False,
                   extendGenome=modelfile, replaceModels=replacemodels, cachePages=1,
                   markGID=True)

    # find new regions outside of gene models with reads piled up
    newregionfilename = "%s.newregions.txt" % rdsprefix
    regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile,
            rnaSettings=True, cachePages=1, useMulti=False)

    # filter out new regions that overlap repeats more than a certain fraction
    outFileName = "%s.newregions.repstatus" % rdsprefix
    goodFileName = "%s.newregions.good" % rdsprefix
    checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName,
               startField=1, cachePages=1, logfilename=logfile)

    # calculate the read densities
    # NOTE(review): no visible step is given the ".newregions.checked" name
    # before it is passed here as the first argument, while goodFileName is
    # passed to both checkrmask and regionCounts.  Left exactly as in the
    # original -- verify the argument order of checkrmask / regionCounts.
    regionfilename = "%s.newregions.checked" % rdsprefix
    regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1,
                 logfilename=logfile)

    # map all candidate regions that have paired ends overlapping with known genes
    candidatefilename = "%s.candidates.txt" % rdsprefix
    rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True)

    # calculate expanded exonic read density
    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
    acceptedfilename = "%s.accepted.rpkm" % rdsprefix

    # Read the candidate lines inside a context manager so the handle is
    # released even if reading fails (requires Python >= 2.6).
    with open(candidatefilename) as candidatefile:
        candidateLines = candidatefile.readlines()

    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename,
                            splicecountfilename, expandedRPKMfilename,
                            candidateLines=candidateLines,
                            acceptedfilename=acceptedfilename, doCache=True,
                            extendGenome=modelfile, replaceModels=replacemodels)

    # weighted count with withMulti=True; the accept file is the same
    # "<rdsprefix>.accepted.rpkm" path named above, so the name is reused
    # instead of formatting the identical string a second time
    multicountfilename = "%s.multi.count" % rdsprefix
    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename,
                           withMulti=True, acceptfile=acceptedfilename, cachePages=1,
                           extendGenome=modelfile, replaceModels=replacemodels)

    # calculate final exonic read density
    outfilename = "%s.final.rpkm" % rdsprefix
    normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename,
                         reportFraction=True, doCache=True, writeGID=True)
134 if __name__ == "__main__":