first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / intersects.py
1 #
2 #  intersects.py
3 #  ENRAGE
4 #
5
6 import sys
7 import optparse
8 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
9
10 print "intersects: version 2.1"
11
12 def main(argv=None):
13     if not argv:
14         argv = sys.argv
15
16     usage = "usage: python %prog infile1 infile2 outfile [options]"
17
18     parser = getParser(usage)
19     (options, args) = parser.parse_args(argv[1:])
20
21     if len(args) < 3:
22         print usage
23         sys.exit(1)
24
25     infile1 = args[0]
26     infile2 = args[1]
27     outfile = args[2]
28
29     intersects(infile1, infile2, outfile, options.delimiter, options.infile3,
30                options.matchField1, options.matchField2, options.matchField3,
31                options.rejectFileName, options.trackGID)
32
33
34 def getParser(usage):
35     parser = optparse.OptionParser(usage=usage)
36     parser.add_option("-d", dest="delimiter")
37     parser.add_option("--file3", dest="infile3")
38     parser.add_option("-1", type="int", dest="matchfield1")
39     parser.add_option("-2", type="int", dest="matchfield2")
40     parser.add_option("-3", type="int", dest="matchfield3")
41     parser.add_option("-reject1", dest="reject1file")
42     parser.add_option("-trackGID", action="store_true", dest="trackGID")
43
44     configParser = getConfigParser()
45     section = "geneMrnaCountsWeighted"
46     delimiter = getConfigOption(configParser, section, "delimiter", "\t")
47     infile3 = getConfigOption(configParser, section, "infile3", None)
48     matchField1 = getConfigIntOption(configParser, section, "matchField1", 0)
49     matchField2 = getConfigIntOption(configParser, section, "matchField2", 0)
50     matchField3 = getConfigIntOption(configParser, section, "matchField3", 0)
51     rejectFileName = getConfigOption(configParser, section, "rejectFileName", "\t")
52     trackGID = getConfigBoolOption(configParser, section, "trackGID", False)
53
54     parser.set_defaults(delimiter=delimiter, infile3=infile3, matchField1=matchField1, matchField2=matchField2,
55                         matchField3=matchField3, rejectFileName=rejectFileName, trackGID=trackGID)
56
57     return parser
58
59
60 def intersects(infile1Name, infile2Name, outfileName, delimiter="\t", infile3Name=None,
61                matchField1=0, matchField2=0, matchField3=0, rejectFileName="", trackGID=False):
62
63     if rejectFileName:
64         doReject1 = True
65         reject1file = open(rejectFileName)
66     else:
67         doReject1 = False
68
69     if infile3Name is not None:
70         doFile3 = True
71     else:
72         doFile3 = False
73
74     matchedList = []
75     matchedList12 = []
76     matchedList13 = []
77     matchedList23 = []
78     gidDict = {}
79
80     if trackGID:
81         gidKeys = gidDict.keys()
82         list1, fileGIDDict = getCandidatesAndGIDFromFile(infile1Name, delimiter, matchField1, gidKeys)
83         for entry in fileGIDDict.keys():
84             gidDict[entry] = fileGIDDict[entry]
85
86         gidKeys = gidDict.keys()
87         list2, fileGIDDict = getCandidatesAndGIDFromFile(infile2Name, delimiter, matchField2, gidKeys)
88         for entry in fileGIDDict.keys():
89             gidDict[entry] = fileGIDDict[entry]
90             
91         if doFile3:
92             gidKeys = gidDict.keys()
93             list3, fileGIDDict = getCandidatesAndGIDFromFile(infile3Name, delimiter, matchField3, gidKeys)
94             for entry in fileGIDDict.keys():
95                 gidDict[entry] = fileGIDDict[entry]
96     else:
97         list1 = getCandidateListFromFile(infile1Name, delimiter, matchField1)
98         list2 = getCandidateListFromFile(infile2Name, delimiter, matchField2)
99         if doFile3:
100             list3 = getCandidateListFromFile(infile3Name, delimiter, matchField3)
101
102     for candidate in list1:
103         if doFile3 and candidate in list2 and candidate in list3:
104             matchedList.append(candidate)
105         elif doFile3 and candidate in list3:
106             matchedList13.append(candidate)
107         elif doFile3 and candidate in list2:
108             matchedList12.append(candidate)
109         elif not doFile3 and candidate in list2:
110             matchedList.append(candidate)
111         elif doReject1:
112             if trackGID:
113                 reject1file.write("%s%s%s\n" % (candidate, delimiter, gidDict[candidate]))
114             else:
115                 reject1file.write("%s\n" % candidate)
116
117     if doFile3:
118         for candidate in list2:
119             if candidate not in list1 and candidate in list3:
120                 matchedList23.append(candidate)
121
122     print len(list1), len(list2), len(list3)
123     if doFile3:
124         print len(matchedList12), len(matchedList13), len(matchedList23)
125     print len(matchedList)
126
127     outfile = open(outfileName, "w")
128     for match in matchedList:
129         if trackGID:
130             outfile.write("%s%s%s\n" % (match, delimiter, gidDict[match]))
131         else:
132             outfile.write("%s\n" % match)
133
134     outfile.close()
135
136
137 def getCandidatesFromFile(filename, delimiter, matchField, trackGID=False, gidList=[]):
138     infile = open(filename)
139     candidateList = []
140     gidDict = {}
141
142     for line in infile:
143         if line[0] == "#":
144             continue
145
146         fields = line.strip().split(delimiter)
147         candidate = fields[matchField]
148         if candidate not in candidateList:
149             candidateList.append(candidate)
150
151         if trackGID and candidate not in gidList:
152             gidDict[candidate] = fields[matchField + 1]
153
154     infile.close()
155     return candidateList, gidDict
156
157
158 def getCandidatesAndGIDFromFile(filename, delimiter, matchField, gidList=[]):
159     return getCandidatesFromFile(filename, delimiter, matchField, trackGID=True, gidList=[])
160
161
162 def getCandidateListFromFile(filename, delimiter, matchField):
163     candidateList, gidDict = getCandidatesFromFile(filename, delimiter, matchField)
164     return candidateList
165
166
167 if __name__ == "__main__":
168     main(sys.argv)