docs/runStrandedAnalysis.sh

   1 #!/bin/bash
   2 #
   3 # runStrandedAnalysis.sh
   4 # ENRAGE
   5 #
   6 # example: . ../commoncode/runStrandedAnalysis.sh mouse c2c12rna ../mm9repeats/rmask.db 20000
   7 #
   8 #          assuming that we have rds database with the prefix c2c12rna.24R.
   9
  10 # set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment
  11
  12 if [ -z "$ERANGEPATH" ]
  13 then
  14     ERANGEPATH='../erange'
  15 fi
  16
  17 echo 'runStrandedAnalysis.sh: version 4.2'
  18
  19 if [ -z "$1" ]
  20 then
  21     echo
  22     echo 'usage:runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius'
  23     echo
  24     echo 'where rdsprefix is the name of the rds file without the .rds extension'
  25     echo 'use "none" for the repeatmaskdb if you do not have one'
  26     echo
  27 else
  28
  29 # log the parameters
  30 arguments=$1' '$2' '$3' '$4
  31 echo 'running with settings: ' $arguments
  32 python $ERANGEPATH/recordLog.py rna.log runStrandedAnalysis.sh "with parameters: $arguments"
  33
  34 # count the unique reads falling on the gene models ; the nomatch files are
  35 # mappable reads that fell outside of the Cistematic gene models and not the
  36 # unmappable of Eland (i.e, the "NM" reads)
  37 python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count --stranded --markGID --cache 1
  38
  39 # calculate a first-pass RPKM to re-weigh the unique reads,
  40 # using 'none' for the splice count
  41 python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm --cache
  42
  43 # recount the unique reads with weights calculated during the first pass
  44 python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount --stranded --uniq --cache 1
  45
  46 # count splice reads
  47 python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count --stranded --splices --noUniqs --cache 1
  48
  49 # find new regions outside of gene models with reads piled up
  50 python $ERANGEPATH/findall.py RNAFARP $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --strandfilter plus --log rna.log --cache 1
  51 python $ERANGEPATH/findall.py RNAFARM $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --strandfilter minus --log rna.log --cache 1 --append
  52
  53 # filter out new regions that overlap repeats more than a certain fraction
  54 python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good --startField 1 --log rna.log --cache 1
  55
  56 # Alternative 2: use a precomputed list of "new" regions (outside of gene models)
  57 #python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed
  58 #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
  59
  60 # map all candidate regions that are within a given radius of a gene in bp
  61 python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt --radius $4 --trackfar --stranded --cache
  62
  63 # calculate expanded exonic read density
  64 python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm --cache
  65
  66 # weigh multi-reads
  67 python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count --accept $2.accepted.rpkm --stranded --multi --cache 1
  68
  69 # calculate final exonic read density
  70 python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
  71
  72 fi