snapshot of 4.0a development. initial git repo commit
[erange.git] / rnapath / .svn / tmp / processvelvet.py.tmp
1 import sys
2 import optparse
3
4 print "%prog: version 1.1"
5
6 def main(argv=None):
7     if not argv:
8         argv = sys.argv
9
10     usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]"
11
12     parser = optparse.OptionParser(usage=usage)
13     parser.add_option("--prefix", dest="contigPrefix")
14     parser.add_option("--filter", dest="filterFileName")
15     parser.add_option("--min", type="int", dest="minSize")
16     parser.add_option("--keepcov", action="store_true", dest="keepCoverage")
17     parser.set_defaults(contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False)
18     (options, args) = parser.parse_args(argv[1:])
19
20     if len(args) < 2:
21         print usage
22         sys.exit(2)
23
24     infile = args[0]
25     outfile = args[1]
26
27     processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage)
28
29
30 def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False):
31     infile = open(inFileName)
32     outfile = open(outFileName, "w")
33     filterList = getFilterList(filterFileName)
34
35     node = {"contigPrefix": contigPrefix,
36             "completeID": "",
37             "currentSeq": ""
38     }
39
40     counts = {"acceptedSize": 0,
41               "nSize": 0,
42               "contigsAccepted": 0,
43               "filteredSize": 0
44     }
45
46     for line in infile:
47         if ">NODE" in line:
48             writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
49             node["completeID"] = line.strip()[1:]
50             node["currentSeq"] = ""
51         else:
52             node["currentSeq"] += line
53
54     writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
55
56     infile.close()
57     outfile.close()
58
59     print "%d contigs accepted" % counts["contigsAccepted"]
60     print "%d bp original" % (counts["acceptedSize"] + counts["filteredSize"])
61     print "%d bp accepted" % counts["acceptedSize"]
62     print "%d bp accepted N" % counts["nSize"]
63     print "%d bp filtered\n" % counts["filteredSize"]
64
65
66 def getFilterList(filterFileName=""):
67     filterList = []
68
69     if filterFileName:
70         try:
71             filterFile = open(filterFileName)
72         except IOError:
73             return filterList
74
75         for line in filterFile:
76             if "NODE" in line:
77                 fields = line.strip().split()
78                 try:
79                     exclude = fields[9]
80                 except IndexError:
81                     continue
82
83                 if exclude not in filterList:
84                     filterList.append(exclude)
85
86         filterFile.close()
87
88     return filterList
89
90
91 def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False):
92     completeID = node["completeID"]
93     currentSeq = node["currentSeq"]
94     sequenceLength = len(currentSeq) - currentSeq.count("\n")
95     if len(completeID) > 5 and completeID not in filterList:
96         fields = completeID.split("_")
97         newID = fields[1]
98         if keepCoverage:
99             newID = fields[1] + "_" + fields[-1].strip()
100
101         if sequenceLength >= minSize:
102             outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq))
103             counts["acceptedSize"] += sequenceLength
104             counts["nSize"] += currentSeq.count("N")
105             counts["contigsAccepted"] += 1
106     else:
107         counts["filteredSize"] += sequenceLength
108
109 if __name__ == "__main__":
110     main(sys.argv)