runStrandedAnalysis.py

   1 import sys
   2 import optparse
   3 import pysam
   4 from commoncode import writeLog, getHeaderComment
   5 from checkrmask import checkrmask
   6 from geneMrnaCounts import geneMrnaCounts
   7 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
   8 from getallgenes import getallgenes
   9 from normalizeFinalExonic import normalizeFinalExonic
  10 from normalizeExpandedExonic import normalizeExpandedExonic
  11 from findall import findall, RegionFinder
  12
  13 VERSION = "1.0"
  14
  15 def main(argv=None):
  16     if not argv:
  17         argv = sys.argv
  18
  19     print "runStrandedAnalysis: version %s" % VERSION
  20     usage = "usage: python runStrandedAnalysis.py genome rdsprefix repeatmaskdb bpradius"
  21
  22     parser = getParser(usage)
  23     (options, args) = parser.parse_args(argv[1:])
  24
  25     if len(args) < 4:
  26         print usage
  27         sys.exit(1)
  28
  29     genome = args[0]
  30     rdsprefix = args[1]
  31     repeatmaskdb = args[2]
  32     bpradius = args[3]
  33
  34     runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius)
  35
  36
  37 def getParser(usage):
  38     parser = optparse.OptionParser(usage=usage)
  39
  40     return parser
  41
  42
  43 def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius):
  44     """ based on original script runStrandedAnalysis.sh
  45         usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
  46                where rdsprefix is the name of the rds file without the .rds extension
  47                use "none" for the repeatmaskdb if you do not have one
  48     """
  49
  50     bamfilename = "%s.bam" % fileprefix
  51     bamfile = pysam.Samfile(bamfilename, "rb")
  52     logfile = "rna.log"
  53
  54     # log the parameters
  55     message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
  56     writeLog(logfile, "runStrandedAnalysis.py", message)
  57
  58     # count the unique reads falling on the gene models ; the nomatch files are
  59     # mappable reads that fell outside of the Cistematic gene models and not the
  60     # unmappable of Eland (i.e, the "NM" reads)
  61     uniquecountfilename = "%s.uniqs.count" % fileprefix
  62     geneMrnaCounts(genome, bamfile, uniquecountfilename, trackStrand=True, markGID=True)
  63
  64     # calculate a first-pass RPKM to re-weigh the unique reads,
  65     # using 'none' for the splice count
  66     initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
  67     readCounts = {}
  68     readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
  69     readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
  70     readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
  71     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
  72
  73     # recount the unique reads with weights calculated during the first pass
  74     initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
  75     uniquerecountfilename = "%s.uniqs.recount" % fileprefix
  76     geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
  77
  78     # count splice reads
  79     splicecountfilename = "%s.splices.count" % fileprefix
  80     geneMrnaCounts(genome, bamfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False)
  81
  82     # find new regions outside of gene models with reads piled up
  83     newregionfilename = "%s.newregions.txt" % fileprefix
  84     regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
  85     findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
  86
  87     regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
  88     findall(regionFinder, bamfile, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False, outputMode="a")
  89
  90     # filter out new regions that overlap repeats more than a certain fraction
  91     newregionfilename = "%s.newregions.txt" % fileprefix
  92     outFileName = "%s.newregions.repstatus" % fileprefix
  93     goodFileName = "%s.newregions.good" % fileprefix
  94     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
  95
  96     #TODO: these calls look wrong
  97     # Alternative 2: use a precomputed list of "new" regions (outside of gene models)
  98     #python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed
  99     #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
 100
 101     # map all candidate regions that are within a given radius of a gene in bp
 102     candidatefilename = "%s.candidates.txt" % fileprefix
 103     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
 104
 105     expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
 106     # calculate expanded exonic read density
 107     acceptedfilename = "%s.accepted.rpkm" % fileprefix
 108     normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
 109                             acceptedfilename=acceptedfilename, doCache=True)
 110
 111     # weigh multi-reads
 112     multicountfilename = "%s.multi.count" % fileprefix
 113     geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
 114
 115     # calculate final exonic read density
 116     outfilename = "%s.final.rpkm" % fileprefix
 117     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 118
 119
 120 if __name__ == "__main__":
 121     main(sys.argv)