docs/RNA-seq.analysisSteps.txt

   1 # analysis steps for an ERANGE analysis of RNA-seq data
   2 # This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.sh
   3
   4 # preliminary: set PYTHONPATH to point to the parent directory of the Cistematic, e.g.
   5 #              export PYTHONPATH=/my/path/to/cistematic
   6 #
   7 # preliminary: set CISTEMATIC_ROOT to the directory that contains the genome directories (such as H_sapiens or M_musculus), e.g.
   8 #              export CISTEMATIC_ROOT=/my/path/to/cistematic_genomes
   9 #
  10 # preliminary: set ERANGEPATH, e.g.
  11 #              export ERANGEPATH=/my/path/to/erange
  12 #
  13 # preliminary: set CISTEMATIC_TEMP to a local directory with ample space (default is /tmp), e.g.
  14 #              export CISTEMATIC_TEMP=/any/local/dir
  15 #
  16 # preliminary: create splice file using getsplicefa.py with maxBorder set to 4 bp shorter than the read length, e.g.
  17 #              python $ERANGEPATH/getsplicefa.py hsapiens /my/path/to/human/knownGene.txt hg18splice32.fa 28
  18 #
  19 # preliminary: build expanded genome using Eland's squashGenome or Bowtie's bowtie-build (see README.build-rds)
  20 #              a slower alternative is to use blat just on the genome.
  21 #
  22 # preliminary: build repeatmask database using buildrmaskdb.py, e.g.
  23 #              python $ERANGEPATH/buildrmaskdb.py /path/to/hg19repeats /path/to/hg18repeats/rmask.db
  24 #              if you don't have an repeatmask database, just use "none" for the rmask database below
  25
  26 # run bowtie on expanded genome or just blat on the regular genome
  27 # as described in README.build-rds
  28 #
  29
  30 # create rds file with one lane's worth of data (add -index if using only one lane)
  31 # The example below sets the default cache to 1000000
  32 # The name::value pairs are optional documentart metadata, and can be set to any desired name or value
  33 python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs7.hg19.psl LHCN10213.rds --strict 5 --cache 1000000 library::10213 cellLine::LHCN genome::hg18v2 cellState::confluent flowcell::200GFAAXX
  34
  35 # can change a database cache size using rdsmetadata.py to speed up indexing and index-based lookups
  36 # rule of thumb for RNA-seq: set the cache size to half of the RAM on the computer
  37 #python $ERANGEPATH/rdsmetadata.py LHCN10213.rds --defaultcache 2000000 --nocount
  38
  39 # append more data (only add -index when adding last lane)
  40 python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs6.hg19.psl LHCN10213.rds --strict 5 --cache 1000000 --append --index
  41
  42 # count the unique reads falling on the gene models ; the nomatch files are
  43 # mappable reads that fell outside of the Cistematic gene models and not the
  44 # unmappable of Eland (i.e, the "NM" reads)
  45 python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.uniqs.count --markGID --cache 1
  46
  47 # count splice reads
  48 python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.splices.count --splices --noUniqs --cache 1
  49
  50 # calculate a first-pass RPKM to re-weigh the unique reads,
  51 # using 'none' for the splice count
  52 python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.count none LHCN10213.firstpass.rpkm --cache
  53
  54 # recount the unique reads with weights calculated during the first pass
  55 python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.firstpass.rpkm LHCN10213.uniqs.recount --uniq --cache 1
  56
  57 # There is a choice of either identifying new regions from the data alone
  58 # (Alternative 1), or using a pre-computed list of new regions (presumably
  59 # pooled from multiple nomatch.bed files, or literature) against the nomatch.bed
  60 # file (Alternative 2)
  61
  62 # Alternative 1: find new regions outside of gene models with reads piled up
  63 python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1
  64
  65 # Alternative 1: filter out new regions that overlap repeats more than a certain fraction
  66 #                use "none" if you don't have a repeatmask database
  67 python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good --log rna.log --startField 1 --cache 1
  68
  69 # Alternative 2: use a precomputed list of "new" regions (outside of gene models)
  70 #python2.5 $ERANGEPATH/regionCounts.py ../RNAFAR/all.newregions.good LHCN10213.rds LHCN10213.newregions.good
  71
  72 # map all candidate regions that are within a 20kb radius of a gene in bp
  73 # take out -cache if running locally
  74 python $ERANGEPATH/getallgenes.py hsapiens LHCN10213.newregions.good LHCN10213 --radius 20001 --trackfar --cache
  75
  76 # calculate expanded exonic read density
  77 python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.recount LHCN10213.splices.count LHCN10213.expanded.rpkm LHCN10213.candidates.txt LHCN10213.accepted.rpkm --cache
  78
  79 # create bed file of accepted candidate regions
  80 python2.5 $ERANGEPATH/regiontobed.py RNAFAR LHCN10213.accepted.rpkm RNAFAR.bed 255,0,0
  81
  82 # weigh multi-reads
  83 python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count --accept LHCN10213.accepted.rpkm --multi --cache 1
  84
  85 # calculate final exonic read density
  86 python $ERANGEPATH/normalizeFinalExonic.py LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count LHCN10213.final.rpkm --multifraction --withGID --cache
  87