rnapath/.svn/tmp/processvelvet.py.tmp

   1 import sys
   2 import optparse
   3
   4 print "%prog: version 1.1"
   5
   6 def main(argv=None):
   7     if not argv:
   8         argv = sys.argv
   9
  10     usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]"
  11
  12     parser = optparse.OptionParser(usage=usage)
  13     parser.add_option("--prefix", dest="contigPrefix")
  14     parser.add_option("--filter", dest="filterFileName")
  15     parser.add_option("--min", type="int", dest="minSize")
  16     parser.add_option("--keepcov", action="store_true", dest="keepCoverage")
  17     parser.set_defaults(contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False)
  18     (options, args) = parser.parse_args(argv[1:])
  19
  20     if len(args) < 2:
  21         print usage
  22         sys.exit(2)
  23
  24     infile = args[0]
  25     outfile = args[1]
  26
  27     processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage)
  28
  29
  30 def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False):
  31     infile = open(inFileName)
  32     outfile = open(outFileName, "w")
  33     filterList = getFilterList(filterFileName)
  34
  35     node = {"contigPrefix": contigPrefix,
  36             "completeID": "",
  37             "currentSeq": ""
  38     }
  39
  40     counts = {"acceptedSize": 0,
  41               "nSize": 0,
  42               "contigsAccepted": 0,
  43               "filteredSize": 0
  44     }
  45
  46     for line in infile:
  47         if ">NODE" in line:
  48             writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
  49             node["completeID"] = line.strip()[1:]
  50             node["currentSeq"] = ""
  51         else:
  52             node["currentSeq"] += line
  53
  54     writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
  55
  56     infile.close()
  57     outfile.close()
  58
  59     print "%d contigs accepted" % counts["contigsAccepted"]
  60     print "%d bp original" % (counts["acceptedSize"] + counts["filteredSize"])
  61     print "%d bp accepted" % counts["acceptedSize"]
  62     print "%d bp accepted N" % counts["nSize"]
  63     print "%d bp filtered\n" % counts["filteredSize"]
  64
  65
  66 def getFilterList(filterFileName=""):
  67     filterList = []
  68
  69     if filterFileName:
  70         try:
  71             filterFile = open(filterFileName)
  72         except IOError:
  73             return filterList
  74
  75         for line in filterFile:
  76             if "NODE" in line:
  77                 fields = line.strip().split()
  78                 try:
  79                     exclude = fields[9]
  80                 except IndexError:
  81                     continue
  82
  83                 if exclude not in filterList:
  84                     filterList.append(exclude)
  85
  86         filterFile.close()
  87
  88     return filterList
  89
  90
  91 def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False):
  92     completeID = node["completeID"]
  93     currentSeq = node["currentSeq"]
  94     sequenceLength = len(currentSeq) - currentSeq.count("\n")
  95     if len(completeID) > 5 and completeID not in filterList:
  96         fields = completeID.split("_")
  97         newID = fields[1]
  98         if keepCoverage:
  99             newID = fields[1] + "_" + fields[-1].strip()
 100
 101         if sequenceLength >= minSize:
 102             outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq))
 103             counts["acceptedSize"] += sequenceLength
 104             counts["nSize"] += currentSeq.count("N")
 105             counts["contigsAccepted"] += 1
 106     else:
 107         counts["filteredSize"] += sequenceLength
 108
 109 if __name__ == "__main__":
 110     main(sys.argv)