first pass cleanup of cistematic/genomes; change bamPreprocessing

[erange.git] / combineRPKMs.py
diff --git a/combineRPKMs.py b/combineRPKMs.py

index 8fd8f9f56b80d6af257cccdfdbad8f2e8fef1c9b..1e2d2aaedbf3476fc43058500c30c76c2b7c3e03 100755 (executable)
--- a/combineRPKMs.py
+++ b/combineRPKMs.py
@@ -3,14 +3,17 @@
  #  ENRAGE
  #
  
-print 'version 1.0'
+print "combineRPKMs: version 1.1"
  try:
      import psyco
      psyco.full()
  except:
      pass
  
-import sys, optparse
+import sys
+import optparse
+import string
+from commoncode import getConfigParser, getConfigBoolOption
  
  
  def main(argv=None):
@@ -18,9 +21,7 @@ def main(argv=None):
          argv = sys.argv
  
      usage = "usage: python %prog firstRPKM expandedRPKM finalRPKM combinedOutfile [--withmultifraction]"
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--withmultifraction", action="store_true", dest="doFraction")
-    parser.set_defaults(doFraction=False)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -35,53 +36,71 @@ def main(argv=None):
      combineRPKMs(firstfile, expandedfile, finalfile, outfile, options.doFraction)
  
  
-def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
-    firstfile = open(firstfileName)
-    expandedfile = open(expandedfileName)
-    finalfile = open(finalfileName)
-    outfile = open(outfileName, "w")
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--withmultifraction", action="store_true", dest="doFraction")
  
-    firstDict = {}
-    gidDict = {}
-    expandedDict = {}
+    configParser = getConfigParser()
+    section = "combineRPKMs"
+    doFraction = getConfigBoolOption(configParser, section, "doFraction", False)
  
-    for line in firstfile:
-        fields = line.strip().split()
-        firstDict[fields[1]] = fields[-1]
+    parser.set_defaults(doFraction=doFraction)
  
-    firstfile.close()
+    return parser
  
-    for line in expandedfile:
-        fields = line.strip().split()
-        expandedDict[fields[1]] = fields[-1]
-        gidDict[fields[1]] = fields[0]
  
-    expandedfile.close()
+def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
+
+    firstDict = getRPKMDict(firstfileName)
+    gidDict, expandedDict = getRPKMDict(expandedfileName, getGIDDict=True)
  
      if doFraction:
          header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
      else:
          header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\n"
  
+    outfile = open(outfileName, "w")
      outfile.write(header)
  
+    finalfile = open(finalfileName)
+    #TODO: the output lines are driven by finalfile.  Genes that appear in the first or expanded
+    #      RPKM files but not in finalfile are never written to the combined output.
      for line in finalfile:
          fields = line.strip().split()
          gene = fields[0]
          rnakb = fields[1]
          finalRPKM = fields[2]
          firstRPKM = firstDict.get(gene, "")
-        outline = "%s\t%s\t%s\t%s\t%s\t%s" % (gidDict[gene], rnakb, gene, firstRPKM, expandedDict[gene], finalRPKM)
+        outputFields = [gidDict[gene], rnakb, gene, firstRPKM, expandedDict[gene], finalRPKM]
  
          if doFraction:
              fraction = fields[3]
-            outline += "\t%s" % fraction
-    
-        outfile.write(outline + '\n')
+            outputFields.append(fraction)
+
+        outline = "%s\n" % string.join(outputFields, "\t")
+        outfile.write(outline)
  
      finalfile.close()
      outfile.close()
  
  
+def getRPKMDict(rpkmFileName, getGIDDict=False):
+    gidDict = {}
+    rpkmDict = {}
+    rpkmFile = open(rpkmFileName)
+    for line in rpkmFile:
+        fields = line.strip().split()
+        rpkmDict[fields[1]] = fields[-1]
+        if getGIDDict:
+            gidDict[fields[1]] = fields[0]
+
+    rpkmFile.close()
+
+    if getGIDDict:
+        return gidDict, rpkmDict
+    else:
+        return rpkmDict
+
+
  if __name__ == "__main__":
      main(sys.argv)
 \ No newline at end of file