X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=docs%2FRNA-seq.analysisSteps.txt;h=b0a1c0f09e270ad2cc47c9af05c84564b5325aaf;hp=e9a5213ce8b9e4d139bd7c719b83b1e8bc487b70;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/docs/RNA-seq.analysisSteps.txt b/docs/RNA-seq.analysisSteps.txt index e9a5213..b0a1c0f 100644 --- a/docs/RNA-seq.analysisSteps.txt +++ b/docs/RNA-seq.analysisSteps.txt @@ -8,7 +8,7 @@ # export CISTEMATIC_ROOT=/my/path/to/cistematic_genomes # # preliminary: set ERANGEPATH, e.g. -# export ERANGEPATH=/proj/genome/experiments/commoncode +# export ERANGEPATH=/my/path/to/erange # # preliminary: set CISTEMATIC_TEMP to a local directory with ample space (default is /tmp), e.g. # export CISTEMATIC_TEMP=/any/local/dir @@ -30,29 +30,29 @@ # create rds file with one lane's worth of data (add -index if using only one lane) # The example below sets the default cache to 1000000 # The name::value pairs are optional documentart metadata, and can be set to any desired name or value -python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs7.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 library::10213 cellLine::LHCN genome::hg18v2 cellState::confluent flowcell::200GFAAXX +python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs7.hg19.psl LHCN10213.rds --strict 5 --cache 1000000 library::10213 cellLine::LHCN genome::hg18v2 cellState::confluent flowcell::200GFAAXX # can change a database cache size using rdsmetadata.py to speed up indexing and index-based lookups # rule of thumb for RNA-seq: set the cache size to half of the RAM on the computer -#python $ERANGEPATH/rdsmetadata.py LHCN10213.rds -defaultcache 2000000 -nocount +#python $ERANGEPATH/rdsmetadata.py LHCN10213.rds --defaultcache 2000000 --nocount # append more data (only add -index when adding last lane) -python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs6.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 -append -index +python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs6.hg19.psl LHCN10213.rds --strict 5 --cache 1000000 --append --index # count the unique reads falling on the gene models ; the nomatch files are # mappable reads that fell outside of the Cistematic gene models and not the # unmappable of Eland (i.e, the "NM" reads) -python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.uniqs.count -markGID -cache 1 +python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.uniqs.count --markGID --cache 1 # count splice reads -python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.splices.count -splices -noUniqs -cache 1 +python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.splices.count --splices --noUniqs --cache 1 # calculate a first-pass RPKM to re-weigh the unique reads, # using 'none' for the splice count -python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.count none LHCN10213.firstpass.rpkm -cache +python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.count none LHCN10213.firstpass.rpkm --cache # recount the unique reads with weights calculated during the first pass -python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.firstpass.rpkm LHCN10213.uniqs.recount -uniq -cache 1 +python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.firstpass.rpkm LHCN10213.uniqs.recount --uniq --cache 1 # There is a choice of either identifying new regions from the data alone # (Alternative 1), or using a pre-computed list of new regions (presumably @@ -60,28 +60,28 @@ python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.fi # file (Alternative 2) # Alternative 1: find new regions outside of gene models with reads piled up -python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1 +python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1 # Alternative 1: filter out new regions that overlap repeats more than a certain fraction # use "none" if you don't have a repeatmask database -python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1 +python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good --log rna.log --startField 1 --cache 1 # Alternative 2: use a precomputed list of "new" regions (outside of gene models) #python2.5 $ERANGEPATH/regionCounts.py ../RNAFAR/all.newregions.good LHCN10213.rds LHCN10213.newregions.good # map all candidate regions that are within a 20kb radius of a gene in bp # take out -cache if running locally -python $ERANGEPATH/getallgenes.py hsapiens LHCN10213.newregions.good LHCN10213 -radius 20001 -trackfar -cache +python $ERANGEPATH/getallgenes.py hsapiens LHCN10213.newregions.good LHCN10213 --radius 20001 --trackfar --cache # calculate expanded exonic read density -python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.recount LHCN10213.splices.count LHCN10213.expanded.rpkm LHCN10213.candidates.txt LHCN10213.accepted.rpkm -cache +python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.recount LHCN10213.splices.count LHCN10213.expanded.rpkm LHCN10213.candidates.txt LHCN10213.accepted.rpkm --cache # create bed file of accepted candidate regions python2.5 $ERANGEPATH/regiontobed.py RNAFAR LHCN10213.accepted.rpkm RNAFAR.bed 255,0,0 # weigh multi-reads -python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count -accept LHCN10213.accepted.rpkm -multi -cache 1 +python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count --accept LHCN10213.accepted.rpkm --multi --cache 1 # calculate final exonic read density -python $ERANGEPATH/normalizeFinalExonic.py LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count LHCN10213.final.rpkm -multifraction -withGID -cache +python $ERANGEPATH/normalizeFinalExonic.py LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count LHCN10213.final.rpkm --multifraction --withGID --cache