fix bug in RegionFinder.updateControlStatistics that was causing crash
[erange.git] / getNovelSNPs.py
1 #
2 #  getNovelSNPs.py
3 #  ENRAGE
4 #
5 # This script attempts to annotate the novel sncs/snps from the snp summary file 
6 # Written by: Wendy Lee
7 # Written on: Aug 7th, 2008
8
9 import sys
10 import string
11 from cistematic.genomes import Genome 
12 from commoncode import writeLog
13
14 print "getNovelSNPs: version 1.6"
15
16 try:
17     import psyco
18     psyco.full()
19 except:
20     pass
21
22
23 def main(argv=None):
24     if not argv:
25         argv = sys.argv
26
27     usage = "usage: python %s genome snpsfile nondbsnp_geneinfo_outfile" % argv[0]
28
29     if len(argv) < 4:
30         print usage
31         sys.exit(2)
32
33     genome = argv[1]
34     snpfile = argv[2]
35     outfilename = argv[3]
36
37     getNovelSNPsFromFile(genome, snpfile, outfilename)
38
39
40 def getNovelSNPsFromFile(genome, snpfile, outfilename): 
41     infile = file(snpfile, "r")
42     writeNovelSNPFile(genome, infile, outfilename)
43     writeLog("snp.log", sys.argv[0], "outputfile: %s" % outfilename)
44     infile.close()
45
46
47 def writeNovelSNPFile(genome, snpPropertiesList, outfilename):
48     hg = Genome(genome)
49     outString = ""
50     outfile  = open(outfilename, "w")
51     outfile.write("#Sl\tCl\tchrom\tmis pos\t\tmatch\tuniq_mis\ttot_mis\tbase_chg\tknown_snp\tfunction\tgene\tgeneId\trpkm\n") 
52     for line in snpPropertiesList:
53         if doNotProcessLine(line):
54             continue
55
56         outString = getNovelSNPInfo(genome, line, hg)
57         if outString == line:
58             outfile.write(outString)
59         else:
60             outfile.write("%s\n" % outString)
61
62     outfile.close()
63
64
65 def doNotProcessLine(line):
66     return line[0] == "#"
67
68
69 def getNovelSNPInfo(genome, snpEntry, hg):
70     fields = snpEntry.split()
71     #TODO: refactor naming. is fields[8] rpkm?
72     if fields[8].find("N\A") == -1: 
73         return snpEntry
74     else:
75         snpInfo = ""
76         gid = fields[11]
77         snc_start = int(fields[3])
78         featuresList = hg.getGeneFeatures((genome, gid))
79         func = "N\A"
80         for (ftype, chromosome, start, stop, orientation) in featuresList:
81             if int(start) <= snc_start <= int(stop):
82                 func = ftype
83                 break 
84  
85         for i in range (0, 9):
86             snpInfo = string.join([snpInfo, fields[i]], "\t")
87
88         snpInfo = string.join([snpInfo, func], "\t")
89         for i in range (10, 13):
90             snpInfo = string.join([snpInfo, fields[i]], "\t")
91
92     return snpInfo
93
94
95 if __name__ == "__main__":
96     main(sys.argv)