erange 4.0a dev release with integrated cistematic
[erange.git] / analyzego.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print "psyco not running"
6
7 import sys
8 import optparse
9 from commoncode import getGeneInfoDict, getConfigParser, getConfigOption
10 from cistematic.cisstat.analyzego import calculateGOStats
11
12 print "analyzego: version 2.2"
13
14 def main(argv=None):
15     if not argv:
16         argv = sys.argv
17
18     usage = "usage: python %prog genome infilename prefix [--geneName] [--field fieldID]"
19
20     parser = makeParser(usage)
21     (options, args) = parser.parse_args(argv[1:])
22
23     if len(args) < 3:
24         print usage
25         sys.exit(1)
26
27     fieldID = 1
28     if options.translateGene:
29         fieldID = 0
30
31     if options.fieldID is not None:
32         fieldID = options.fieldID
33
34     genome = args[0]
35     infilename = args[1]
36     prefix = args[2]
37
38     analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID)
39
40
41 def makeParser(usage=""):
42     parser = optparse.OptionParser(usage=usage)
43     parser.add_option("--geneName", action="store_true", dest="translateGene",
44                       help="translate gene")
45     parser.add_option("--field", type="int", dest="fieldID",
46                       help="column containing gene ID/Name")
47
48     configParser = getConfigParser()
49     section = "analyzego"
50     translateGene = getConfigOption(configParser, section, "translateGene", False)
51     fieldID = getConfigOption(configParser, section, "fieldID", None)
52
53     parser.set_defaults(translateGene=translateGene, fieldID=fieldID)
54
55     return parser
56
57
58 def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1):
59     infile = open(infilename)
60     analyzeGO(genome, infile, prefix, translateGene=False, fieldID=1)
61     infile.close()
62
63
64 def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
65     if translateGene:
66         symbolToGidDict = getSymbolDict(genome)
67
68     locusList = []
69     for line in geneInfoList:
70         fields = line.split()
71         if translateGene:
72             gene = fields[fieldID]
73             if "LOC" in gene:
74                 gID = gene[3:]
75             elif "FAR" in gene:
76                 print "ignoring %s" % gene
77                 continue
78             else:
79                 try:
80                     gID = symbolToGidDict[gene]
81                 except KeyError:
82                     print "ignoring %s" % gene
83                     continue
84         else:
85             gID = fields[fieldID]
86
87         if (genome, gID) not in locusList:
88             locusList.append((genome, gID))
89
90     if len(locusList) > 0:
91         calculateGOStats(locusList, prefix)
92
93
94 def getSymbolDict(genome):
95     geneinfoDict = getGeneInfoDict(genome, cache=True)
96     symbolToGidDict = {}
97     for gid in geneinfoDict:
98         symbol = geneinfoDict[gid][0][0].strip()
99         symbolToGidDict[symbol] = gid
100
101     return symbolToGidDict
102
103
104 if __name__ == "__main__":
105     main(sys.argv)