4 from commoncode import writeLog, getHeaderComment
5 from checkrmask import checkrmask
6 from geneMrnaCounts import geneMrnaCounts
7 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
8 from getallgenes import getallgenes
9 from normalizeFinalExonic import normalizeFinalExonic
10 from normalizeExpandedExonic import normalizeExpandedExonic
11 from findall import findall, RegionFinder
19 print "runStrandedAnalysis: version %s" % VERSION
20 usage = "usage: python runStrandedAnalysis.py genome rdsprefix repeatmaskdb bpradius"
22 parser = getParser(usage)
23 (options, args) = parser.parse_args(argv[1:])
31 repeatmaskdb = args[2]
34 runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius)
38 parser = optparse.OptionParser(usage=usage)
43 def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius):
44 """ based on original script runStrandedAnalysis.sh
45 usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
46 where rdsprefix is the name of the rds file without the .rds extension
47 use "none" for the repeatmaskdb if you do not have one
50 bamfilename = "%s.bam" % fileprefix
51 bamfile = pysam.Samfile(bamfilename, "rb")
55 message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
56 writeLog(logfile, "runStrandedAnalysis.py", message)
58 # count the unique reads falling on the gene models ; the nomatch files are
59 # mappable reads that fell outside of the Cistematic gene models and not the
60 # unmappable of Eland (i.e, the "NM" reads)
61 uniquecountfilename = "%s.uniqs.count" % fileprefix
62 geneMrnaCounts(genome, bamfile, uniquecountfilename, trackStrand=True, markGID=True)
64 # calculate a first-pass RPKM to re-weigh the unique reads,
65 # using 'none' for the splice count
66 initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
68 readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
69 readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
70 readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
71 normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
73 # recount the unique reads with weights calculated during the first pass
74 initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
75 uniquerecountfilename = "%s.uniqs.recount" % fileprefix
76 geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
79 splicecountfilename = "%s.splices.count" % fileprefix
80 geneMrnaCounts(genome, bamfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False)
82 # find new regions outside of gene models with reads piled up
83 newregionfilename = "%s.newregions.txt" % fileprefix
84 regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
85 findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
87 regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
88 findall(regionFinder, bamfile, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False, outputMode="a")
90 # filter out new regions that overlap repeats more than a certain fraction
91 newregionfilename = "%s.newregions.txt" % fileprefix
92 outFileName = "%s.newregions.repstatus" % fileprefix
93 goodFileName = "%s.newregions.good" % fileprefix
94 checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
96 #TODO: these calls look wrong
97 # Alternative 2: use a precomputed list of "new" regions (outside of gene models)
98 #python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed
99 #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
101 # map all candidate regions that are within a given radius of a gene in bp
102 candidatefilename = "%s.candidates.txt" % fileprefix
103 getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
105 expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
106 # calculate expanded exonic read density
107 acceptedfilename = "%s.accepted.rpkm" % fileprefix
108 normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
109 acceptedfilename=acceptedfilename, doCache=True)
112 multicountfilename = "%s.multi.count" % fileprefix
113 geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
115 # calculate final exonic read density
116 outfilename = "%s.final.rpkm" % fileprefix
117 normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
120 if __name__ == "__main__":