snapshot of 4.0a development. initial git repo commit
[erange.git] / chiapet / .svn / text-base / markLinkers.py.svn-base
1 import sys
2
def main(argv=None):
    """Command-line entry point: markLinkers.py linkerfile infile outfile.

    argv is the full argument vector, program name included.  When it is
    None or empty, sys.argv is used instead.

    Exits with status 1 after printing a usage line to stderr when fewer
    than three file arguments are supplied, instead of failing with a bare
    IndexError.
    """
    if not argv:
        argv = sys.argv

    if len(argv) < 4:
        sys.stderr.write("usage: python markLinkers.py linkerfile infile outfile\n")
        sys.exit(1)

    linkerfile, infile, outfile = argv[1:4]

    markLinkers(linkerfile, infile, outfile)
12
13
def markLinkers(linkerFileName, inFileName, outFileName):
    """Label every sequence line of inFileName by the linker(s) it contains.

    Lines containing "@" are taken as read identifiers (the "@" characters
    are removed).  Every other line of length >= 2 is treated as a sequence
    and searched for each linker sequence returned by
    getLinkerInformationFromFile(linkerFileName).

    For each linker found at 0-based offset 19 or later, a two-line record
    is written to outFileName:

        >L<last character of linker ID>_<readID>
        <first 20 characters of the line>

    If no linker is found in the line, a single ">NA_<readID>" record is
    written instead.

    Both files are closed on exit, including when an error occurs.
    """
    linkerDict, linkerList = getLinkerInformationFromFile(linkerFileName)
    # Resolve each linker's sequence once instead of once per input line.
    linkers = [(linkerID, linkerDict[linkerID]) for linkerID in linkerList]

    # A sequence line that appears before any "@" line gets an empty ID
    # rather than triggering a NameError.
    readID = ""

    with open(inFileName) as infile:
        with open(outFileName, "w") as outfile:
            for line in infile:
                if len(line) < 2:
                    continue

                if "@" in line:
                    readID = line.strip().replace("@", "")
                    continue

                found = False
                for linkerID, linkerSequence in linkers:
                    if line.find(linkerSequence) >= 19:
                        found = True
                        outfile.write(">L%s_%s\n" % (linkerID[-1:], readID))
                        outfile.write("%s\n" % line[:20])

                # Decide "NA" only after every linker has been tried; doing it
                # inside the loop emitted spurious NA records for each linker
                # tested before the matching one.
                if not found:
                    outfile.write(">NA_%s\n" % readID)
                    outfile.write("%s\n" % line[:20])
38
39
def getLinkerInformationFromFile(linkerFileName):
    """Read linker definitions from the file named linkerFileName.

    Returns the (linkerDict, linkerList) pair produced by
    getLinkerInformation() on the file's lines.  If the file cannot be
    opened or read (IOError), an empty dict and an empty list are returned
    instead of raising.  The file is always closed before returning.
    """
    try:
        with open(linkerFileName) as linkerfile:
            return getLinkerInformation(linkerfile)
    except IOError:
        # Best-effort behaviour kept from the original: an unreadable linker
        # file simply means "no linkers".
        return {}, []
48
49
def getLinkerInformation(linkerInformationList):
    """Parse FASTA-style linker lines into a lookup dict and an ordered ID list.

    linkerInformationList is any iterable of text lines (an open file
    works).  A line containing ">" starts a linker: its ID is the stripped
    line minus the first character.  Any other non-blank line is the
    sequence for the most recent ID; only its first 10 characters are kept,
    and a later sequence line for the same ID replaces the earlier one.

    Blank or whitespace-only lines are ignored, so a trailing empty line
    cannot overwrite the last linker's sequence with an empty string.

    Returns (linkerDict, linkerList): linkerDict maps ID -> 10-character
    sequence prefix, linkerList holds the IDs in order of appearance.

    Raises ValueError if a sequence line appears before any ">" header.
    """
    linkerDict = {}
    linkerList = []
    linkerID = None

    for entry in linkerInformationList:
        entry = entry.strip()
        if not entry:
            continue

        if ">" in entry:
            linkerID = entry[1:]
            linkerList.append(linkerID)
        elif linkerID is None:
            raise ValueError("linker sequence %r appears before any '>' header" % entry)
        else:
            linkerDict[linkerID] = entry[:10]

    return linkerDict, linkerList
64
65
66
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main(sys.argv)