# analysis steps for an ERANGE analysis of RNA-seq data
# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.sh

# preliminary: create splice file using getsplicefa.py
# preliminary: build expanded genome using Eland's squashGenome
# preliminary: build repeatmask database using buildrmaskdb.py

# Run Eland (the eland_25 binary) against the expanded genome, with --multi.
# NOTE(review): the result is written as FC11048.s1.25mer.mm9.eland2, while the
# later steps read c2c12rna24R.comb.eland2 -- presumably a combined file built
# from several such runs; confirm before running this file end to end.
/proj/genome/experiments/eland64/ElandPL03/Eland/eland_25 \
  FC11048.s1.25mer.txt \
  /woldlab/myod/data1/alim/genomes/mm9sp25 \
  FC11048.s1.25mer.mm9.eland2 \
  --multi

# Build the BED file of unique reads from the Eland output.
python2.5 ../commoncode/maketrackfromeland2.py \
  c2c12rna24R.comb.eland2 \
  c2c12rna24R \
  c2c12rna.24R.uniqs.bed

# Build the BED file of multi reads from the same Eland output.
python2.5 ../commoncode/maketrackmulti.py \
  c2c12rna24R.comb.eland2 \
  c2c12rna24Rmulti \
  c2c12rna.24R.multi.bed

# Build the BED file of spliced reads; the knownGene.txt table from the
# mm9splices directory is passed alongside the Eland output.
python2.5 ../commoncode/remapSplicesEland2.py \
  ../mm9splices/knownGene.txt \
  c2c12rna24R.comb.eland2 \
  c2c12rna24Rsplices \
  c2c12rna.24R.splices.bed

# Count the unique reads that fall on the gene models. The nomatch file holds
# mappable reads that fell outside of the Cistematic gene models; these are
# not the reads Eland could not map (i.e., the "NM" reads).
python2.5 ../commoncode/geneMrnaCounts.py \
  mouse \
  c2c12rna.24R.uniqs.bed \
  c2c12rna.24R.uniqs.count \
  c2c12rna.24R.nomatch.bed

# Count the splice reads, using the same counting script as for unique reads.
python2.5 ../commoncode/geneMrnaCounts.py \
  mouse \
  c2c12rna.24R.splices.bed \
  c2c12rna.24R.splices.count \
  c2c12rna.24R.nomatchsplices.bed

# Compute a first-pass RPKM that is used to re-weight the unique reads.
# The literal 'none' is given in place of the splice count.
python2.5 ../commoncode/normalizeExpandedExonic.py \
  mouse \
  c2c12rna.24R.uniqs.bed \
  c2c12rna.24R.uniqs.count \
  none \
  c2c12rna.24R.firstpass.rpkm \
  -cache

# Recount the unique reads, weighted by the first-pass RPKM values.
python2.5 ../commoncode/geneMrnaCountsWeighted.py \
  mouse \
  c2c12rna.24R.uniqs.bed \
  c2c12rna.24R.firstpass.rpkm \
  c2c12rna.24R.uniqs.recount \
  -cache

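# There is a choice of either identifying new regions from the data alone
# (Alternative 1), or using a pre-computed list of new regions (presumably
# pooled from multiple nomatch.bed files, or literature) against the nomatch.bed
# file (Alternative 2). Run only one of the two alternatives: both name
# c2c12rna.24R.newregions.good on their command lines, and that is the file
# the candidate-mapping step (getallgenes.py) reads next.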

# Alternative 1, step 1: look outside of the gene models for new regions
# where reads pile up, working from the nomatch BED file.
python2.5 ../commoncode/findallnocontrol.py \
  RNA24R \
  c2c12rna.24R.nomatch.bed \
  c2c12rna.24R.newregions.txt \
  25 \
  40

# Alternative 1, step 2: discard new regions whose overlap with repeats
# exceeds a certain fraction, checking against the repeatmask database.
python2.5 ../commoncode/checkrmask.py \
  ../mm9repeats/rmask.db \
  c2c12rna.24R.newregions.txt \
  c2c12rna.24R.newregions.repstatus \
  c2c12rna.24R.newregions.good \
  1

# Alternative 2: start from a precomputed list of "new" regions (regions
# outside of the gene models) and run it against the nomatch BED file.
python2.5 ../commoncode/regionCounts.py \
  ../RNAFAR/all.newregions.good \
  c2c12rna.24R.nomatch.bed \
  c2c12rna.24R.newregions.good \
  c2c12rna.24R.stillnomatch.bed

# Map every candidate region lying within a 20kb radius of a gene; the radius
# is given in bp (20001 here).
# Remove -cache when running locally.
python2.5 ../commoncode/getallgenes.py \
  mouse \
  c2c12rna.24R.newregions.good \
  c2c12rna.24R.candidates.txt \
  20001 \
  -trackfar \
  -cache

# Compute the expanded exonic read density. This run is given the recounted
# unique reads, the splice count (rather than 'none') and the candidate
# regions file.
python2.5 ../commoncode/normalizeExpandedExonic.py \
  mouse \
  c2c12rna.24R.uniqs.bed \
  c2c12rna.24R.uniqs.recount \
  c2c12rna.24R.splices.count \
  c2c12rna.24R.expanded.rpkm \
  c2c12rna.24R.candidates.txt \
  c2c12rna.24R.accepted.rpkm \
  -cache

# Turn the accepted candidate regions into a BED file (RNAFAR.bed).
python2.5 ../commoncode/regiontobed.py \
  RNAFAR \
  c2c12rna.24R.accepted.rpkm \
  RNAFAR.bed \
  255,0,0

# Weight the multi-reads; both the expanded and the accepted RPKM files are
# passed to the weighting script.
python2.5 ../commoncode/geneMrnaCountsWeighted.py \
  mouse \
  c2c12rna.24R.multi.bed \
  c2c12rna.24R.expanded.rpkm \
  c2c12rna.24R.accepted.rpkm \
  c2c12rna.24R.multi.count \
  -cache

# Compute the final exonic read density from the unique, splice and multi
# BED files together with the expanded RPKM and the multi-read counts.
python2.5 ../commoncode/normalizeFinalExonic.py \
  mouse \
  c2c12rna.24R.uniqs.bed \
  c2c12rna.24R.splices.bed \
  c2c12rna.24R.multi.bed \
  c2c12rna.24R.expanded.rpkm \
  c2c12rna.24R.multi.count \
  c2c12rna.24R.final.rpkm