8 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
10 print "intersects: version 2.1"
16 usage = "usage: python %prog infile1 infile2 outfile [options]"
18 parser = getParser(usage)
19 (options, args) = parser.parse_args(argv[1:])
29 intersects(infile1, infile2, outfile, options.delimiter, options.infile3,
30 options.matchField1, options.matchField2, options.matchField3,
31 options.rejectFileName, options.trackGID)
35 parser = optparse.OptionParser(usage=usage)
36 parser.add_option("-d", dest="delimiter")
37 parser.add_option("--file3", dest="infile3")
38 parser.add_option("-1", type="int", dest="matchfield1")
39 parser.add_option("-2", type="int", dest="matchfield2")
40 parser.add_option("-3", type="int", dest="matchfield3")
41 parser.add_option("-reject1", dest="reject1file")
42 parser.add_option("-trackGID", action="store_true", dest="trackGID")
44 configParser = getConfigParser()
45 section = "geneMrnaCountsWeighted"
46 delimiter = getConfigOption(configParser, section, "delimiter", "\t")
47 infile3 = getConfigOption(configParser, section, "infile3", None)
48 matchField1 = getConfigIntOption(configParser, section, "matchField1", 0)
49 matchField2 = getConfigIntOption(configParser, section, "matchField2", 0)
50 matchField3 = getConfigIntOption(configParser, section, "matchField3", 0)
51 rejectFileName = getConfigOption(configParser, section, "rejectFileName", "\t")
52 trackGID = getConfigBoolOption(configParser, section, "trackGID", False)
54 parser.set_defaults(delimiter=delimiter, infile3=infile3, matchField1=matchField1, matchField2=matchField2,
55 matchField3=matchField3, rejectFileName=rejectFileName, trackGID=trackGID)
60 def intersects(infile1Name, infile2Name, outfileName, delimiter="\t", infile3Name=None,
61 matchField1=0, matchField2=0, matchField3=0, rejectFileName="", trackGID=False):
65 reject1file = open(rejectFileName)
69 if infile3Name is not None:
81 gidKeys = gidDict.keys()
82 list1, fileGIDDict = getCandidatesAndGIDFromFile(infile1Name, delimiter, matchField1, gidKeys)
83 for entry in fileGIDDict.keys():
84 gidDict[entry] = fileGIDDict[entry]
86 gidKeys = gidDict.keys()
87 list2, fileGIDDict = getCandidatesAndGIDFromFile(infile2Name, delimiter, matchField2, gidKeys)
88 for entry in fileGIDDict.keys():
89 gidDict[entry] = fileGIDDict[entry]
92 gidKeys = gidDict.keys()
93 list3, fileGIDDict = getCandidatesAndGIDFromFile(infile3Name, delimiter, matchField3, gidKeys)
94 for entry in fileGIDDict.keys():
95 gidDict[entry] = fileGIDDict[entry]
97 list1 = getCandidateListFromFile(infile1Name, delimiter, matchField1)
98 list2 = getCandidateListFromFile(infile2Name, delimiter, matchField2)
100 list3 = getCandidateListFromFile(infile3Name, delimiter, matchField3)
102 for candidate in list1:
103 if doFile3 and candidate in list2 and candidate in list3:
104 matchedList.append(candidate)
105 elif doFile3 and candidate in list3:
106 matchedList13.append(candidate)
107 elif doFile3 and candidate in list2:
108 matchedList12.append(candidate)
109 elif not doFile3 and candidate in list2:
110 matchedList.append(candidate)
113 reject1file.write("%s%s%s\n" % (candidate, delimiter, gidDict[candidate]))
115 reject1file.write("%s\n" % candidate)
118 for candidate in list2:
119 if candidate not in list1 and candidate in list3:
120 matchedList23.append(candidate)
122 print len(list1), len(list2), len(list3)
124 print len(matchedList12), len(matchedList13), len(matchedList23)
125 print len(matchedList)
127 outfile = open(outfileName, "w")
128 for match in matchedList:
130 outfile.write("%s%s%s\n" % (match, delimiter, gidDict[match]))
132 outfile.write("%s\n" % match)
137 def getCandidatesFromFile(filename, delimiter, matchField, trackGID=False, gidList=[]):
138 infile = open(filename)
146 fields = line.strip().split(delimiter)
147 candidate = fields[matchField]
148 if candidate not in candidateList:
149 candidateList.append(candidate)
151 if trackGID and candidate not in gidList:
152 gidDict[candidate] = fields[matchField + 1]
155 return candidateList, gidDict
158 def getCandidatesAndGIDFromFile(filename, delimiter, matchField, gidList=[]):
159 return getCandidatesFromFile(filename, delimiter, matchField, trackGID=True, gidList=[])
162 def getCandidateListFromFile(filename, delimiter, matchField):
163 candidateList, gidDict = getCandidatesFromFile(filename, delimiter, matchField)
167 if __name__ == "__main__":