X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=runSNPAnalysis.py;fp=runSNPAnalysis.py;h=e511d806f197ebb8a83846b39e88b94fc5544674;hp=0000000000000000000000000000000000000000;hb=cfc5602b26323ad2365295145e3f6c622d912eb4;hpb=c4561c55cfa9726530c6777b6515c4ef66306b2f diff --git a/runSNPAnalysis.py b/runSNPAnalysis.py new file mode 100644 index 0000000..e511d80 --- /dev/null +++ b/runSNPAnalysis.py @@ -0,0 +1,100 @@ +import sys +import optparse +from commoncode import writeLog +from makeSNPtrack import makeSNPtrack +from getNovelSNPs import getNovelSNPsFromFile +from getSNPGeneInfo import writeSNPGeneInfo +from chksnp import chkSNPFile +from chkSNPrmask import chkSNPrmask +from getSNPs import writeSNPsToFile + +VERSION = "1.0" + +def main(argv=None): + if not argv: + argv = sys.argv + + print "runSNPAnalysis: version %s" % VERSION + usage = "usage: python runSNPAnalysis.py genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages]" + + parser = getParser(usage) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(1) + + genome = args[0] + rdsfile = args[1] + label = args[2] + rmaskdbfile = args[3] + dbsnpfile = args[4] + uniqStartMin = args[5] + totalRatio = args[6] + rpkmfile = args[7] + try: + cachepages = args[8] + except IndexError: + cachepages = None + + runSNPAnalysis(genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile, cachepages) + + +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + + return parser + + +def runSNPAnalysis(genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile, cachepages=None): + """ based on original script runSNPAnalysis.sh + usage: runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages] + where for each position S: + uniqStartMin = # independent reads supporting base change at S + totalRatio = total # reads supporting base change at S / total # reads that pass through S + """ + + # log the parameters + message = "with parameters: %s %s %s %s %s %s %s %s" % (genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile) + writeLog("rna.log", "runStandardAnalysis.py", message) + + # get all SNPs by extracting it from the RDS + fullsnpfilename = "%s.snps.txt" % label + try: + snpCache = int(cachepages) + except TypeError, ValueError: + snpCache = 0 + + doCache = cachepages is not None + writeSNPsToFile(rdsfile, uniqStartMin, totalRatio, fullsnpfilename, doCache, cachePages=snpCache, forceChr=True) + + # get SNPs in non-repeat regions only + snpfilename = "%s.nr_snps.txt" % label + chkSNPrmask(rmaskdbfile, fullsnpfilename, snpfilename, repeats=False, cachePages=cachepages) + + # Check to see if SNPs are found in dbSNP + # if dbSNP128.db is not built yet, build it by running buildsnpdb.py - build snp database using the dbSNP database file downloaded from UCSC + # usage: python2.5 buildsnpdb.py snpdbdir snpdbname + # the database flat file must be in the snpdbdir directory + # To build dbSNP database file, run the following command + # python2.5 buildsnpdb.py snp128.txt dbSNP128 + + # get dbSNP info for SNPs that are found in the dbSNP database + dbsnpFileName = "%s.nr_dbsnp.txt" % label + chkSNPFile(dbsnpfile, snpfilename, dbsnpFileName, cachePages=cachepages) + + # get gene info for the snps found in dbSNP + genefilename = "%s.nr_dbsnp_geneinfo.txt" % label + writeSNPGeneInfo(genome, dbsnpFileName, rpkmfile, genefilename, doCache=doCache) + + # get gene info for snps that are not found in dbSNP + outfilename = "%s.nr_final.txt" % label + getNovelSNPsFromFile(genome, genefilename, outfilename) + + # make bed file for displaying the snps on UCSC genome browser + bedfilename = "%s.nr_snps.bed" % label + makeSNPtrack(snpfilename, label, bedfilename) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file