4 from commoncode import writeLog
5 from checkrmask import checkrmask
6 from geneMrnaCounts import geneMrnaCounts
7 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
8 from getallgenes import getallgenes
9 from normalizeFinalExonic import normalizeFinalExonic
10 from normalizeExpandedExonic import normalizeExpandedExonic
11 from findall import findall, RegionFinder
19 print "runStrandedAnalysis: version %s" % VERSION
20 usage = "usage: python runStrandedAnalysis.py genome rdsprefix repeatmaskdb bpradius"
22 parser = getParser(usage)
23 (options, args) = parser.parse_args(argv[1:])
31 repeatmaskdb = args[2]
34 runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius)
38 parser = optparse.OptionParser(usage=usage)
43 def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
44 """ based on original script runStrandedAnalysis.sh
45 usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
46 where rdsprefix is the name of the rds file without the .rds extension
47 use "none" for the repeatmaskdb if you do not have one
50 rdsfile = "%s.rds" % rdsprefix
54 message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
55 writeLog(logfile, "runStrandedAnalysis.py", message)
57 # count the unique reads falling on the gene models ; the nomatch files are
58 # mappable reads that fell outside of the Cistematic gene models and not the
59 # unmappable of Eland (i.e, the "NM" reads)
60 uniquecountfilename = "%s.uniqs.count" % rdsprefix
61 geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True)
63 # calculate a first-pass RPKM to re-weigh the unique reads,
64 # using 'none' for the splice count
65 initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
66 RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
67 (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
69 readCounts["uniq"] = ucount
70 readCounts["splice"] = mcount
71 readCounts["multi"] = scount
72 normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
74 # recount the unique reads with weights calculated during the first pass
75 initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
76 uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
77 geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
80 splicecountfilename = "%s.splices.count" % rdsprefix
81 geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1)
83 # find new regions outside of gene models with reads piled up
84 newregionfilename = "%s.newregions.txt" % rdsprefix
85 regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
86 findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
88 regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
89 findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a")
91 # filter out new regions that overlap repeats more than a certain fraction
92 newregionfilename = "%s.newregions.txt" % rdsprefix
93 outFileName = "%s.newregions.repstatus" % rdsprefix
94 goodFileName = "%s.newregions.good" % rdsprefix
95 checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
97 #TODO: these calls look wrong
98 # Alternative 2: use a precomputed list of "new" regions (outside of gene models)
99 #python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed
100 #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
102 # map all candidate regions that are within a given radius of a gene in bp
103 candidatefilename = "%s.candidates.txt" % rdsprefix
104 getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
106 expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
107 # calculate expanded exonic read density
108 acceptedfilename = "%s.accepted.rpkm" % rdsprefix
110 candidatefile = open(candidatefilename)
111 candidateLines = candidatefile.readlines()
112 candidatefile.close()
116 normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
120 multicountfilename = "%s.multi.count" % rdsprefix
121 geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
123 # calculate final exonic read density
124 outfilename = "%s.final.rpkm" % rdsprefix
125 normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
128 if __name__ == "__main__":