snapshot of 4.0a development. initial git repo commit
[erange.git] / geneNeighbors.py
1 #
2 #  geneNeighbors.py
3 #  ENRAGE
4 #
5
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 import sys, optparse
13 from commoncode import getMergedRegions, getLocusByChromDict
14 from cistematic.genomes import Genome
15 from cistematic.core.geneinfo import geneinfoDB
16
17 print "%prog: version 2.4"
18
19
20 def main(argv=None):
21     if not argv:
22         argv = sys.argv
23
24     usage = "usage: python %prog genome outfilename [--regions acceptfile] [--downstream bp] [--upstream bp] [--mindist bp] [--minlocus bp] [--maxlocus bp] [--samesense]"
25
26     parser = optparse.OptionParser(usage=usage)
27     parser.add_option("--regions", dest="acceptFile")
28     parser.add_option("--downstream", type="int", dest="downMax")
29     parser.add_option("--upstream", type="int", dest="upMax")
30     parser.add_option("--mindist", type="int", dest="minDist")
31     parser.add_option("--minlocus", type="int", dest="minLocus")
32     parser.add_option("--maxlocus", type="int", dest="maxLocus")
33     parser.add_option("--samesense", action="store_true", dest="checkSense")
34     parser.set_defaults(acceptfile="", checkSense=False, downMax=10000000,
35                         upMax=10000000, minDist=0, minLocus=-1, maxLocus=10000000)
36     (options, args) = parser.parse_args(argv[1:])
37
38     if len(args) < 2:
39         print usage
40         sys.exit(1)
41
42     genome = args[0]
43     outfilename = args[1]
44
45     index = geneNeighbors(genome, outfilename, options.acceptFile, options.checkSense,
46                           options.downMax, options.upMax, options.minDist, options.minLocus,
47                           options.maxLocus)
48
49     print "\n%d genes matched" % index
50
51
52 def geneNeighbors(genome, outfilename, acceptfile="", checkSense=False,
53                   downMax=10000000, upMax=10000000, minDist=0, minLocus=-1,
54                   maxLocus=10000000):
55
56     acceptDict = {}
57     if acceptfile:
58         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
59
60     hg = Genome(genome)
61     idb = geneinfoDB(cache=True)
62
63     geneinfoDict = idb.getallGeneInfo(genome)
64     locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
65
66     gidList = hg.allGIDs()
67     gidList.sort()
68     for chrom in acceptDict:
69         for (label, start, stop, length) in acceptDict[chrom]:
70             if label not in gidList:
71                 gidList.append(label)
72
73     index = 0
74     outfile = open(outfilename,"w")
75     chromList = locusByChromDict.keys()
76     chromList.sort()
77     for chrom in chromList:
78         if len(locusByChromDict[chrom]) < 3 or "NT" in chrom or "MT" in chrom:
79             continue
80
81         print chrom + " ",
82     
83         prevStop = locusByChromDict[chrom][0][1]
84         prevGID = locusByChromDict[chrom][0][2]
85         if "FAR" not in prevGID:
86             symbol = "LOC" + prevGID
87             geneinfo = ""
88             try:
89                 geneinfo = geneinfoDict[prevGID]
90                 symbol = geneinfo[0][0]
91             except:
92                 pass
93         else:
94             symbol = prevGID
95
96         prevGID = symbol
97         prevSense = locusByChromDict[chrom][0][4]
98
99         currentStart = locusByChromDict[chrom][1][0]
100         currentStop = locusByChromDict[chrom][1][1]
101         currentGID = locusByChromDict[chrom][1][2]
102         if "FAR" not in currentGID:
103             symbol = "LOC" + currentGID
104             geneinfo = ""
105             try:
106                 geneinfo = geneinfoDict[currentGID]
107                 symbol = geneinfo[0][0]
108             except:
109                 pass
110         else:
111             symbol = currentGID
112
113         currentGID = symbol
114         currentGlen = locusByChromDict[chrom][1][3]
115         currentSense = locusByChromDict[chrom][1][4] 
116
117         for (nextStart, nextStop, nextGID, nextGlen, nextSense) in locusByChromDict[chrom][2:]:
118             if "FAR" not in nextGID:
119                 symbol = "LOC" + nextGID
120                 geneinfo = ""
121                 try:
122                     geneinfo = geneinfoDict[nextGID]
123                     symbol = geneinfo[0][0]
124                 except:
125                     pass
126             else:
127                 symbol = nextGID
128
129             nextGID = symbol
130             leftDist = currentStart - prevStop
131             rightDist = nextStart - currentStop
132             if (currentSense == "F" and minDist < leftDist < upMax and minDist < rightDist < downMax) or (currentSense == "R" and minDist < rightDist < upMax and minDist < leftDist < downMax):
133                 if not checkSense or currentSense == nextSense:
134                     if minLocus <= currentGlen <= maxLocus:
135                         outfile.write("%s\t%s\t%s\t%s\t%d\t%s\t%s\t%d\n" % (currentGID, currentSense, prevGID, prevSense, leftDist, nextGID, nextSense, rightDist))
136                         index += 1
137
138             prevStop = currentStop
139             prevGID = currentGID
140             prevSense = currentSense
141             currentStart = nextStart
142             currentStop = nextStop
143             currentGID = nextGID
144             currentGlen = nextGlen
145             currentSense = nextSense
146
147     outfile.close()
148     return index
149
150
151 if __name__ == "__main__":
152     main(sys.argv)