first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / rnapath / processvelvet.py
1 import sys
2 import optparse
3 from erange.commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
4
5 print "processvelvet: version 1.2"
6
7 def main(argv=None):
8     if not argv:
9         argv = sys.argv
10
11     usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]"
12
13     parser = getParser(usage)
14     (options, args) = parser.parse_args(argv[1:])
15
16     if len(args) < 2:
17         print usage
18         sys.exit(2)
19
20     infile = args[0]
21     outfile = args[1]
22
23     processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage)
24
25
26 def getParser(usage):
27     parser = optparse.OptionParser(usage=usage)
28     parser.add_option("--prefix", dest="contigPrefix")
29     parser.add_option("--filter", dest="filterFileName")
30     parser.add_option("--min", type="int", dest="minSize")
31     parser.add_option("--keepcov", action="store_true", dest="keepCoverage")
32
33     configParser = getConfigParser()
34     section = "processvelvet"
35     contigPrefix = getConfigOption(configParser, section, "contigPrefix", "chr")
36     filterFileName = getConfigOption(configParser, section, "filterFileName", "")
37     minSize = getConfigIntOption(configParser, section, "minSize", 0)
38     keepCoverage = getConfigBoolOption(configParser, section, "keepCoverage", False)
39
40     parser.set_defaults(contigPrefix=contigPrefix, filterFileName=filterFileName, minSize=minSize, keepCoverage=keepCoverage)
41
42     return parser
43
44
45 def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False):
46     infile = open(inFileName)
47     outfile = open(outFileName, "w")
48     filterList = getFilterList(filterFileName)
49
50     node = {"contigPrefix": contigPrefix,
51             "completeID": "",
52             "currentSeq": ""
53     }
54
55     counts = {"acceptedSize": 0,
56               "nSize": 0,
57               "contigsAccepted": 0,
58               "filteredSize": 0
59     }
60
61     for line in infile:
62         if ">NODE" in line:
63             writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
64             node["completeID"] = line.strip()[1:]
65             node["currentSeq"] = ""
66         else:
67             node["currentSeq"] += line
68
69     writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
70
71     infile.close()
72     outfile.close()
73
74     print "%d contigs accepted" % counts["contigsAccepted"]
75     print "%d bp original" % (counts["acceptedSize"] + counts["filteredSize"])
76     print "%d bp accepted" % counts["acceptedSize"]
77     print "%d bp accepted N" % counts["nSize"]
78     print "%d bp filtered\n" % counts["filteredSize"]
79
80
81 def getFilterList(filterFileName=""):
82     filterList = []
83
84     if filterFileName:
85         try:
86             filterFile = open(filterFileName)
87         except IOError:
88             return filterList
89
90         for line in filterFile:
91             if "NODE" in line:
92                 fields = line.strip().split()
93                 try:
94                     exclude = fields[9]
95                 except IndexError:
96                     continue
97
98                 if exclude not in filterList:
99                     filterList.append(exclude)
100
101         filterFile.close()
102
103     return filterList
104
105
106 def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False):
107     completeID = node["completeID"]
108     currentSeq = node["currentSeq"]
109     sequenceLength = len(currentSeq) - currentSeq.count("\n")
110     if len(completeID) > 5 and completeID not in filterList:
111         fields = completeID.split("_")
112         newID = fields[1]
113         if keepCoverage:
114             newID = fields[1] + "_" + fields[-1].strip()
115
116         if sequenceLength >= minSize:
117             outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq))
118             counts["acceptedSize"] += sequenceLength
119             counts["nSize"] += currentSeq.count("N")
120             counts["contigsAccepted"] += 1
121     else:
122         counts["filteredSize"] += sequenceLength
123
124 if __name__ == "__main__":
125     main(sys.argv)