first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneNeighbors.py
1 #
2 #  geneNeighbors.py
3 #  ENRAGE
4 #
5
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 import sys
13 import optparse
14 from commoncode import getMergedRegions, getLocusByChromDict, getConfigParser, getConfigIntOption, getConfigBoolOption, getConfigOption
15 from cistematic.genomes import Genome
16 from commoncode import getGeneInfoDict
17
18 print "geneNeighbors: version 2.5" % sys.argv[0]
19
20
21 def main(argv=None):
22     if not argv:
23         argv = sys.argv
24
25     usage = "usage: python %prog genome outfilename [--regions acceptfile] [--downstream bp] [--upstream bp] [--mindist bp] [--minlocus bp] [--maxlocus bp] [--samesense]"
26
27     parser = getParser(usage)
28     (options, args) = parser.parse_args(argv[1:])
29
30     if len(args) < 2:
31         print usage
32         sys.exit(1)
33
34     genome = args[0]
35     outfilename = args[1]
36
37     index = geneNeighbors(genome, outfilename, options.acceptFile, options.checkSense,
38                           options.downMax, options.upMax, options.minDist, options.minLocus,
39                           options.maxLocus)
40
41     print "\n%d genes matched" % index
42
43
44 def getParser(usage):
45     parser = optparse.OptionParser(usage=usage)
46     parser.add_option("--regions", dest="acceptFile")
47     parser.add_option("--downstream", type="int", dest="downMax")
48     parser.add_option("--upstream", type="int", dest="upMax")
49     parser.add_option("--mindist", type="int", dest="minDist")
50     parser.add_option("--minlocus", type="int", dest="minLocus")
51     parser.add_option("--maxlocus", type="int", dest="maxLocus")
52     parser.add_option("--samesense", action="store_true", dest="checkSense")
53
54     configParser = getConfigParser()
55     section = "geneNeighbors"
56     acceptfile = getConfigOption(configParser, section, "acceptfile", "")
57     checkSense = getConfigBoolOption(configParser, section, "checkSense", False)
58     downMax = getConfigIntOption(configParser, section, "downMax", 10000000)
59     upMax = getConfigIntOption(configParser, section, "upMax", 10000000)
60     minDist = getConfigIntOption(configParser, section, "minDist", 0)
61     minLocus = getConfigIntOption(configParser, section, "minLocus", -1)
62     maxLocus = getConfigIntOption(configParser, section, "maxLocus", 10000000)
63
64     parser.set_defaults(acceptfile=acceptfile, checkSense=checkSense, downMax=downMax,
65                         upMax=upMax, minDist=minDist, minLocus=minLocus, maxLocus=maxLocus)
66
67     return parser
68
69
70 def geneNeighbors(genome, outfilename, acceptfile="", checkSense=False,
71                   downMax=10000000, upMax=10000000, minDist=0, minLocus=-1,
72                   maxLocus=10000000):
73
74     acceptDict = {}
75     if acceptfile:
76         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
77
78     hg = Genome(genome)
79     geneinfoDict = getGeneInfoDict(genome, cache=True)
80     locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
81
82     gidList = hg.allGIDs()
83     gidList.sort()
84     for chrom in acceptDict:
85         for region in acceptDict[chrom]:
86             if region.label not in gidList:
87                 gidList.append(region.label)
88
89     index = 0
90     outfile = open(outfilename,"w")
91     chromList = locusByChromDict.keys()
92     chromList.sort()
93     for chrom in chromList:
94         if len(locusByChromDict[chrom]) < 3 or "NT" in chrom or "MT" in chrom:
95             continue
96
97         print chrom + " ",
98     
99         prevStop = locusByChromDict[chrom][0][1]
100         prevGID = locusByChromDict[chrom][0][2]
101         if "FAR" not in prevGID:
102             symbol = "LOC" + prevGID
103             geneinfo = ""
104             try:
105                 geneinfo = geneinfoDict[prevGID]
106                 symbol = geneinfo[0][0]
107             except:
108                 pass
109         else:
110             symbol = prevGID
111
112         prevGID = symbol
113         prevSense = locusByChromDict[chrom][0][4]
114
115         currentStart = locusByChromDict[chrom][1][0]
116         currentStop = locusByChromDict[chrom][1][1]
117         currentGID = locusByChromDict[chrom][1][2]
118         if "FAR" not in currentGID:
119             symbol = "LOC" + currentGID
120             geneinfo = ""
121             try:
122                 geneinfo = geneinfoDict[currentGID]
123                 symbol = geneinfo[0][0]
124             except:
125                 pass
126         else:
127             symbol = currentGID
128
129         currentGID = symbol
130         currentGlen = locusByChromDict[chrom][1][3]
131         currentSense = locusByChromDict[chrom][1][4] 
132
133         for (nextStart, nextStop, nextGID, nextGlen, nextSense) in locusByChromDict[chrom][2:]:
134             if "FAR" not in nextGID:
135                 symbol = "LOC" + nextGID
136                 geneinfo = ""
137                 try:
138                     geneinfo = geneinfoDict[nextGID]
139                     symbol = geneinfo[0][0]
140                 except:
141                     pass
142             else:
143                 symbol = nextGID
144
145             nextGID = symbol
146             leftDist = currentStart - prevStop
147             rightDist = nextStart - currentStop
148             if (currentSense == "F" and minDist < leftDist < upMax and minDist < rightDist < downMax) or (currentSense == "R" and minDist < rightDist < upMax and minDist < leftDist < downMax):
149                 if not checkSense or currentSense == nextSense:
150                     if minLocus <= currentGlen <= maxLocus:
151                         outfile.write("%s\t%s\t%s\t%s\t%d\t%s\t%s\t%d\n" % (currentGID, currentSense, prevGID, prevSense, leftDist, nextGID, nextSense, rightDist))
152                         index += 1
153
154             prevStop = currentStop
155             prevGID = currentGID
156             prevSense = currentSense
157             currentStart = nextStart
158             currentStop = nextStop
159             currentGID = nextGID
160             currentGlen = nextGlen
161             currentSense = nextSense
162
163     outfile.close()
164     return index
165
166
167 if __name__ == "__main__":
168     main(sys.argv)