runSNPAnalysis.py

   1 import sys
   2 import optparse
   3 from commoncode import writeLog
   4 from makeSNPtrack import makeSNPtrack
   5 from getNovelSNPs import getNovelSNPsFromFile
   6 from getSNPGeneInfo import writeSNPGeneInfo
   7 from chksnp import chkSNPFile
   8 from chkSNPrmask import chkSNPrmask
   9 from getSNPs import writeSNPsToFile
  10
  11 VERSION = "1.0"
  12
  13 def main(argv=None):
  14     if not argv:
  15         argv = sys.argv
  16
  17     print "runSNPAnalysis: version %s" % VERSION
  18     usage = "usage: python runSNPAnalysis.py genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages]"
  19
  20     parser = getParser(usage)
  21     (options, args) = parser.parse_args(argv[1:])
  22
  23     if len(args) < 4:
  24         print usage
  25         sys.exit(1)
  26
  27     genome = args[0]
  28     rdsfile = args[1]
  29     label = args[2]
  30     rmaskdbfile = args[3]
  31     dbsnpfile = args[4]
  32     uniqStartMin = args[5]
  33     totalRatio = args[6]
  34     rpkmfile = args[7]
  35     try:
  36         cachepages = args[8]
  37     except IndexError:
  38         cachepages = None
  39
  40     runSNPAnalysis(genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile, cachepages)
  41
  42
  43 def getParser(usage):
  44     parser = optparse.OptionParser(usage=usage)
  45
  46     return parser
  47
  48
  49 def runSNPAnalysis(genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile, cachepages=None):
  50     """ based on original script runSNPAnalysis.sh
  51         usage: runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages]
  52                where for each position S:
  53                    uniqStartMin = # independent reads supporting base change at S
  54                    totalRatio = total # reads supporting base change at S / total # reads that pass through S
  55     """
  56
  57     # log the parameters
  58     message = "with parameters: %s %s %s %s %s %s %s %s" % (genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile)
  59     writeLog("rna.log", "runStandardAnalysis.py", message)
  60
  61     # get all SNPs by extracting it from the RDS
  62     fullsnpfilename = "%s.snps.txt" % label
  63     try:
  64         snpCache = int(cachepages)
  65     except TypeError, ValueError:
  66         snpCache = 0
  67
  68     doCache = cachepages is not None
  69     writeSNPsToFile(rdsfile, uniqStartMin, totalRatio, fullsnpfilename, doCache, cachePages=snpCache, forceChr=True)
  70
  71     # get SNPs in non-repeat regions only
  72     snpfilename = "%s.nr_snps.txt" % label
  73     chkSNPrmask(rmaskdbfile, fullsnpfilename, snpfilename, repeats=False, cachePages=cachepages)
  74
  75     # Check to see if SNPs are found in dbSNP
  76     # if dbSNP128.db is not built yet, build it by running buildsnpdb.py - build snp database using the dbSNP database file downloaded from UCSC
  77     # usage: python2.5 buildsnpdb.py snpdbdir snpdbname
  78     # the database flat file must be in the snpdbdir directory
  79     # To build dbSNP database file, run the following command
  80     # python2.5 buildsnpdb.py snp128.txt dbSNP128
  81
  82     # get dbSNP info for SNPs that are found in the dbSNP database
  83     dbsnpFileName = "%s.nr_dbsnp.txt" % label
  84     chkSNPFile(dbsnpfile, snpfilename, dbsnpFileName, cachePages=cachepages)
  85
  86     # get gene info for the snps found in dbSNP
  87     genefilename = "%s.nr_dbsnp_geneinfo.txt" % label
  88     writeSNPGeneInfo(genome, dbsnpFileName, rpkmfile, genefilename, doCache=doCache)
  89
  90     # get gene info for snps that are not found in dbSNP
  91     outfilename = "%s.nr_final.txt" % label
  92     getNovelSNPsFromFile(genome, genefilename, outfilename)
  93
  94     # make bed file for displaying the snps on UCSC genome browser
  95     bedfilename = "%s.nr_snps.bed" % label
  96     makeSNPtrack(snpfilename, label, bedfilename)
  97
  98
  99 if __name__ == "__main__":
 100     main(sys.argv)