snapshot of 4.0a development. initial git repo commit
[erange.git] / intersects.py
1 #
2 #  intersects.py
3 #  ERANGE
4 #
5
6 import sys, optparse
7
# Version banner: this statement sits at module level, so it is written to
# stdout every time the module is executed or imported.
print("version 2.0")
9
def main(argv=None):
    """Parse the command line and hand the work to intersects().

    argv is the full argument vector with the program name at index 0; when
    it is None or empty, sys.argv is used instead.

    Positional arguments: infile1 infile2 outfile.
    Options:
        -d DELIM        field delimiter (default: tab)
        --file3 FILE    optional third input file
        -1/-2/-3 N      zero-based match column for input file 1/2/3
        --reject1 FILE  file receiving file-1 candidates that match nothing
        --trackGID      also carry the column following the match column

    Prints the usage string and exits with status 1 when fewer than three
    positional arguments are supplied.
    """
    if not argv:
        argv = sys.argv

    usage = "usage: python %prog infile1 infile2 outfile [options]"

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-d", dest="delimiter")
    parser.add_option("--file3", dest="infile3")
    # The dest names must match both set_defaults() and the attributes read
    # below; otherwise the values given on the command line are never used.
    parser.add_option("-1", type="int", dest="matchField1")
    parser.add_option("-2", type="int", dest="matchField2")
    parser.add_option("-3", type="int", dest="matchField3")
    # optparse only accepts multi-character option names with a leading "--";
    # a single-dash spelling raises OptionError as soon as it is registered.
    parser.add_option("--reject1", dest="rejectFileName")
    parser.add_option("--trackGID", action="store_true", dest="trackGID")
    parser.set_defaults(delimiter="\t", infile3=None, matchField1=0, matchField2=0,
                        matchField3=0, rejectFileName="", trackGID=False)
    (options, args) = parser.parse_args(argv[1:])

    if len(args) < 3:
        print(usage)
        sys.exit(1)

    infile1 = args[0]
    infile2 = args[1]
    outfile = args[2]

    intersects(infile1, infile2, outfile, options.delimiter, options.infile3,
               options.matchField1, options.matchField2, options.matchField3,
               options.rejectFileName, options.trackGID)
39
40
def intersects(infile1Name, infile2Name, outfileName, delimiter="\t", infile3Name=None,
               matchField1=0, matchField2=0, matchField3=0, rejectFileName="", trackGID=False):
    """Write the candidates common to all input files to outfileName.

    A candidate is the value found in column matchFieldN of input file N
    (as returned by the candidate-reading helpers). A candidate from file 1
    is written to outfileName when it also occurs in file 2 and, if
    infile3Name is given, in file 3 as well. One candidate is written per
    line, in file-1 order.

    infile3Name    -- optional third input; None selects two-file mode.
    rejectFileName -- when non-empty, file-1 candidates that occur in no
                      other input are written to this file.
    trackGID       -- when true, every written line is followed by the
                      delimiter and the GID recorded for that candidate.

    Summary counts are printed to stdout: the size of each candidate list,
    the pairwise-only match counts (three-file mode only), and the number of
    candidates written to outfileName.
    """
    doReject1 = bool(rejectFileName)
    doFile3 = infile3Name is not None

    gidDict = {}
    list3 = []

    if trackGID:
        # Pass a snapshot of the GIDs collected so far to each read.
        list1, fileGIDDict = getCandidatesAndGIDFromFile(infile1Name, delimiter, matchField1,
                                                         list(gidDict))
        gidDict.update(fileGIDDict)

        list2, fileGIDDict = getCandidatesAndGIDFromFile(infile2Name, delimiter, matchField2,
                                                         list(gidDict))
        gidDict.update(fileGIDDict)

        if doFile3:
            list3, fileGIDDict = getCandidatesAndGIDFromFile(infile3Name, delimiter, matchField3,
                                                             list(gidDict))
            gidDict.update(fileGIDDict)
    else:
        list1 = getCandidateListFromFile(infile1Name, delimiter, matchField1)
        list2 = getCandidateListFromFile(infile2Name, delimiter, matchField2)
        if doFile3:
            list3 = getCandidateListFromFile(infile3Name, delimiter, matchField3)

    # Sets give constant-time membership tests; the lists keep file order.
    set1 = set(list1)
    set2 = set(list2)
    set3 = set(list3)

    def formatEntry(candidate):
        # One output line, optionally carrying the candidate's GID.
        if trackGID:
            return "%s%s%s\n" % (candidate, delimiter, gidDict[candidate])
        return "%s\n" % candidate

    matchedList = []
    matchedList12 = []
    matchedList13 = []
    matchedList23 = []

    # The reject file is written to, so it must be opened in write mode.
    reject1file = None
    if doReject1:
        reject1file = open(rejectFileName, "w")

    try:
        for candidate in list1:
            inFile2 = candidate in set2
            inFile3 = doFile3 and candidate in set3

            if doFile3 and inFile2 and inFile3:
                matchedList.append(candidate)
            elif inFile3:
                matchedList13.append(candidate)
            elif doFile3 and inFile2:
                matchedList12.append(candidate)
            elif inFile2:
                matchedList.append(candidate)
            elif doReject1:
                reject1file.write(formatEntry(candidate))
    finally:
        if reject1file is not None:
            reject1file.close()

    if doFile3:
        for candidate in list2:
            if candidate not in set1 and candidate in set3:
                matchedList23.append(candidate)

    # list3 only exists in three-file mode, so report it only then.
    if doFile3:
        print("%d %d %d" % (len(list1), len(list2), len(list3)))
        print("%d %d %d" % (len(matchedList12), len(matchedList13), len(matchedList23)))
    else:
        print("%d %d" % (len(list1), len(list2)))

    print(len(matchedList))

    outfile = open(outfileName, "w")
    try:
        for match in matchedList:
            outfile.write(formatEntry(match))
    finally:
        outfile.close()
116
117
def getCandidatesFromFile(filename, delimiter, matchField, trackGID=False, gidList=None):
    """Read the distinct candidates found in one column of a delimited file.

    filename   -- path of the file to read.
    delimiter  -- string used to split each line into fields.
    matchField -- zero-based index of the field holding the candidate.
    trackGID   -- when true, also record the field that follows matchField
                  for every candidate not already listed in gidList.
    gidList    -- candidates whose GID should not be recorded (default: none).

    Lines beginning with "#" are skipped.

    Returns a (candidateList, gidDict) pair. candidateList holds each
    candidate once, in order of first appearance. gidDict maps candidate to
    its GID field and is empty unless trackGID is true; if a candidate
    occurs on several lines, the last line's GID is kept.

    Raises IndexError if a line has too few fields for matchField (or for
    matchField + 1 when trackGID is true).
    """
    if gidList is None:
        gidList = []

    candidateList = []
    seenCandidates = set()  # constant-time duplicate check
    gidDict = {}

    infile = open(filename)
    try:
        for line in infile:
            if line.startswith("#"):
                continue

            # strip() also removes leading/trailing whitespace delimiters,
            # so empty fields at either end of a tab-separated line are lost.
            fields = line.strip().split(delimiter)
            candidate = fields[matchField]
            if candidate not in seenCandidates:
                seenCandidates.add(candidate)
                candidateList.append(candidate)

            if trackGID and candidate not in gidList:
                gidDict[candidate] = fields[matchField + 1]
    finally:
        # Close the file even when a malformed line raises.
        infile.close()

    return candidateList, gidDict
137
138
def getCandidatesAndGIDFromFile(filename, delimiter, matchField, gidList=None):
    """Read candidates and their GIDs from filename.

    Calls getCandidatesFromFile() with trackGID=True and returns its
    (candidateList, gidDict) result unchanged.

    gidList -- candidates whose GID is already known; it is forwarded so
               those candidates are left out of the returned gidDict.
               Defaults to no known candidates.
    """
    if gidList is None:
        gidList = []

    # Forward the caller's gidList rather than a fresh empty list, so the
    # argument actually takes effect.
    return getCandidatesFromFile(filename, delimiter, matchField, trackGID=True, gidList=gidList)
141
142
def getCandidateListFromFile(filename, delimiter, matchField):
    """Return only the candidate list read from filename.

    Calls getCandidatesFromFile() without GID tracking and returns the first
    element of the pair it produces; the GID dictionary is discarded.
    """
    result = getCandidatesFromFile(filename, delimiter, matchField)
    return result[0]
146
147
# Script entry point: when run directly, pass the process arguments to main().
if __name__ == "__main__":
    main(argv=sys.argv)