first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / combineRPKMs.py
index ead4e1b81d0b37335fbf00908f1aea615cec33dd..1e2d2aaedbf3476fc43058500c30c76c2b7c3e03 100755 (executable)
@@ -51,23 +51,8 @@ def makeParser(usage=""):
 
 def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
 
-    firstDict = {}
-    firstfile = open(firstfileName)
-    for line in firstfile:
-        fields = line.strip().split()
-        firstDict[fields[1]] = fields[-1]
-
-    firstfile.close()
-
-    expandedDict = {}
-    gidDict = {}
-    expandedfile = open(expandedfileName)
-    for line in expandedfile:
-        fields = line.strip().split()
-        expandedDict[fields[1]] = fields[-1]
-        gidDict[fields[1]] = fields[0]
-
-    expandedfile.close()
+    firstDict = getRPKMDict(firstfileName)
+    gidDict, expandedDict = getRPKMDict(expandedfileName, getGIDDict=True)
 
     if doFraction:
         header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
@@ -78,6 +63,8 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
     outfile.write(header)
 
     finalfile = open(finalfileName)
+    #TODO: the output lines are driven by finalfile.  Genes that appear only in firstfile or
+    #      expandedfile (and not in finalfile) are silently dropped from the output.
     for line in finalfile:
         fields = line.strip().split()
         gene = fields[0]
@@ -97,5 +84,23 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
     outfile.close()
 
 
+def getRPKMDict(rpkmFileName, getGIDDict=False):
+    """Map each line's 2nd field to its last field; optionally also to its 1st field.
+
+    Returns rpkmDict alone, or the tuple (gidDict, rpkmDict) when getGIDDict is true.
+    """
+    gidDict = {}
+    rpkmDict = {}
+    # "with" guarantees the file is closed even if a short line raises IndexError
+    with open(rpkmFileName) as rpkmFile:
+        for line in rpkmFile:
+            fields = line.strip().split()
+            rpkmDict[fields[1]] = fields[-1]
+            if getGIDDict:
+                gidDict[fields[1]] = fields[0]
+
+    return (gidDict, rpkmDict) if getGIDDict else rpkmDict
+
+
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file