first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / analyzego.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print "psyco not running"
6
7 import sys
8 import optparse
9 from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigIntOption
10 from cistematic.cisstat.analyzego import calculateGOStats
11
12 print "analyzego: version 2.2"
13
14 def main(argv=None):
15     if not argv:
16         argv = sys.argv
17
18     usage = "usage: python %prog genome infilename prefix [--geneName] [--field fieldID]"
19
20     parser = makeParser(usage)
21     (options, args) = parser.parse_args(argv[1:])
22
23     if len(args) < 3:
24         print usage
25         sys.exit(1)
26
27     fieldID = 1
28     if options.translateGene:
29         fieldID = 0
30
31     if options.fieldID is not None:
32         fieldID = options.fieldID
33
34     genome = args[0]
35     infilename = args[1]
36     prefix = args[2]
37
38     analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID, numTests=options.numTests)
39
40
41 def makeParser(usage=""):
42     parser = optparse.OptionParser(usage=usage)
43     parser.add_option("--geneName", action="store_true", dest="translateGene",
44                       help="translate gene")
45     parser.add_option("--field", type="int", dest="fieldID",
46                       help="column containing gene ID/Name")
47     parser.add_option("--trials", type="int", dest="numTests",
48                       help="column containing gene ID/Name")
49
50     configParser = getConfigParser()
51     section = "analyzego"
52     translateGene = getConfigOption(configParser, section, "translateGene", False)
53     fieldID = getConfigOption(configParser, section, "fieldID", None)
54     numTests = getConfigIntOption(configParser, section, "numTests", 1)
55
56     parser.set_defaults(translateGene=translateGene, fieldID=fieldID, numTests=numTests)
57
58     return parser
59
60
61 def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1, numTests=1):
62     infile = open(infilename)
63     analyzeGO(genome, infile, prefix, translateGene, fieldID, numTests)
64     infile.close()
65
66
67 def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1, numTests=1):
68     if translateGene:
69         symbolToGidDict = getSymbolDict(genome)
70
71     locusList = []
72     for line in geneInfoList:
73         fields = line.split()
74         if translateGene:
75             gene = fields[fieldID]
76             if "LOC" in gene:
77                 gID = gene[3:]
78             elif "FAR" in gene:
79                 print "ignoring %s" % gene
80                 continue
81             else:
82                 try:
83                     gID = symbolToGidDict[gene]
84                 except KeyError:
85                     print "ignoring %s" % gene
86                     continue
87         else:
88             gID = fields[fieldID]
89
90         if (genome, gID) not in locusList:
91             locusList.append((genome, gID))
92
93     if len(locusList) > 0:
94         calculateGOStats(locusList, prefix, trials=numTests)
95
96
97 def getSymbolDict(genome):
98     geneinfoDict = getGeneInfoDict(genome, cache=True)
99     symbolToGidDict = {}
100     for gid in geneinfoDict:
101         symbol = geneinfoDict[gid][0][0].strip()
102         symbolToGidDict[symbol] = gid
103
104     return symbolToGidDict
105
106
107 if __name__ == "__main__":
108     main(sys.argv)