import sys import optparse print "%prog: version 1.1" def main(argv=None): if not argv: argv = sys.argv usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]" parser = optparse.OptionParser(usage=usage) parser.add_option("--prefix", dest="contigPrefix") parser.add_option("--filter", dest="filterFileName") parser.add_option("--min", type="int", dest="minSize") parser.add_option("--keepcov", action="store_true", dest="keepCoverage") parser.set_defaults(contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False) (options, args) = parser.parse_args(argv[1:]) if len(args) < 2: print usage sys.exit(2) infile = args[0] outfile = args[1] processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage) def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False): infile = open(inFileName) outfile = open(outFileName, "w") filterList = getFilterList(filterFileName) node = {"contigPrefix": contigPrefix, "completeID": "", "currentSeq": "" } counts = {"acceptedSize": 0, "nSize": 0, "contigsAccepted": 0, "filteredSize": 0 } for line in infile: if ">NODE" in line: writeNode(outfile, node, filterList, counts, minSize, keepCoverage) node["completeID"] = line.strip()[1:] node["currentSeq"] = "" else: node["currentSeq"] += line writeNode(outfile, node, filterList, counts, minSize, keepCoverage) infile.close() outfile.close() print "%d contigs accepted" % counts["contigsAccepted"] print "%d bp original" % (counts["acceptedSize"] + counts["filteredSize"]) print "%d bp accepted" % counts["acceptedSize"] print "%d bp accepted N" % counts["nSize"] print "%d bp filtered\n" % counts["filteredSize"] def getFilterList(filterFileName=""): filterList = [] if filterFileName: try: filterFile = open(filterFileName) except IOError: return filterList for line in filterFile: if "NODE" in line: fields = line.strip().split() try: exclude = fields[9] except IndexError: continue if exclude not in filterList: filterList.append(exclude) filterFile.close() return filterList def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False): completeID = node["completeID"] currentSeq = node["currentSeq"] sequenceLength = len(currentSeq) - currentSeq.count("\n") if len(completeID) > 5 and completeID not in filterList: fields = completeID.split("_") newID = fields[1] if keepCoverage: newID = fields[1] + "_" + fields[-1].strip() if sequenceLength >= minSize: outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq)) counts["acceptedSize"] += sequenceLength counts["nSize"] += currentSeq.count("N") counts["contigsAccepted"] += 1 else: counts["filteredSize"] += sequenceLength if __name__ == "__main__": main(sys.argv)