first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / combineRPKMs.py
index 8fd8f9f56b80d6af257cccdfdbad8f2e8fef1c9b..1e2d2aaedbf3476fc43058500c30c76c2b7c3e03 100755 (executable)
@@ -3,14 +3,17 @@
 #  ENRAGE
 #
 
-print 'version 1.0'
+print "combineRPKMs: version 1.1"
 try:
     import psyco
     psyco.full()
 except:
     pass
 
-import sys, optparse
+import sys
+import optparse
+import string
+from commoncode import getConfigParser, getConfigBoolOption
 
 
 def main(argv=None):
@@ -18,9 +21,7 @@ def main(argv=None):
         argv = sys.argv
 
     usage = "usage: python %prog firstRPKM expandedRPKM finalRPKM combinedOutfile [--withmultifraction]"
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--withmultifraction", action="store_true", dest="doFraction")
-    parser.set_defaults(doFraction=False)
+    parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     if len(args) < 3:
@@ -35,53 +36,71 @@ def main(argv=None):
     combineRPKMs(firstfile, expandedfile, finalfile, outfile, options.doFraction)
 
 
-def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
-    firstfile = open(firstfileName)
-    expandedfile = open(expandedfileName)
-    finalfile = open(finalfileName)
-    outfile = open(outfileName, "w")
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--withmultifraction", action="store_true", dest="doFraction")
 
-    firstDict = {}
-    gidDict = {}
-    expandedDict = {}
+    configParser = getConfigParser()
+    section = "combineRPKMs"
+    doFraction = getConfigBoolOption(configParser, section, "doFraction", False)
 
-    for line in firstfile:
-        fields = line.strip().split()
-        firstDict[fields[1]] = fields[-1]
+    parser.set_defaults(doFraction=doFraction)
 
-    firstfile.close()
+    return parser
 
-    for line in expandedfile:
-        fields = line.strip().split()
-        expandedDict[fields[1]] = fields[-1]
-        gidDict[fields[1]] = fields[0]
 
-    expandedfile.close()
+def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
+
+    firstDict = getRPKMDict(firstfileName)
+    gidDict, expandedDict = getRPKMDict(expandedfileName, getGIDDict=True)
 
     if doFraction:
         header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
     else:
         header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\n"
 
+    outfile = open(outfileName, "w")
     outfile.write(header)
 
+    finalfile = open(finalfileName)
+    #TODO: output lines are driven by finalfile, so genes that appear only in firstfile
+    #      or expandedfile (and not in finalfile) are silently dropped from the output.
     for line in finalfile:
         fields = line.strip().split()
         gene = fields[0]
         rnakb = fields[1]
         finalRPKM = fields[2]
         firstRPKM = firstDict.get(gene, "")
-        outline = "%s\t%s\t%s\t%s\t%s\t%s" % (gidDict[gene], rnakb, gene, firstRPKM, expandedDict[gene], finalRPKM)
+        outputFields = [gidDict[gene], rnakb, gene, firstRPKM, expandedDict[gene], finalRPKM]
 
         if doFraction:
             fraction = fields[3]
-            outline += "\t%s" % fraction
-    
-        outfile.write(outline + '\n')
+            outputFields.append(fraction)
+
+        outline = "%s\n" % string.join(outputFields, "\t")
+        outfile.write(outline)
 
     finalfile.close()
     outfile.close()
 
 
+def getRPKMDict(rpkmFileName, getGIDDict=False):
+    gidDict = {}
+    rpkmDict = {}
+    rpkmFile = open(rpkmFileName)
+    for line in rpkmFile:
+        fields = line.strip().split()
+        rpkmDict[fields[1]] = fields[-1]
+        if getGIDDict:
+            gidDict[fields[1]] = fields[0]
+
+    rpkmFile.close()
+
+    if getGIDDict:
+        return gidDict, rpkmDict
+    else:
+        return rpkmDict
+
+
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file