X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=combineRPKMs.py;h=1e2d2aaedbf3476fc43058500c30c76c2b7c3e03;hp=ead4e1b81d0b37335fbf00908f1aea615cec33dd;hb=HEAD;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/combineRPKMs.py b/combineRPKMs.py index ead4e1b..1e2d2aa 100755 --- a/combineRPKMs.py +++ b/combineRPKMs.py @@ -51,23 +51,8 @@ def makeParser(usage=""): def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False): - firstDict = {} - firstfile = open(firstfileName) - for line in firstfile: - fields = line.strip().split() - firstDict[fields[1]] = fields[-1] - - firstfile.close() - - expandedDict = {} - gidDict = {} - expandedfile = open(expandedfileName) - for line in expandedfile: - fields = line.strip().split() - expandedDict[fields[1]] = fields[-1] - gidDict[fields[1]] = fields[0] - - expandedfile.close() + firstDict = getRPKMDict(firstfileName) + gidDict, expandedDict = getRPKMDict(expandedfileName, getGIDDict=True) if doFraction: header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n" @@ -78,6 +63,8 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do outfile.write(header) finalfile = open(finalfileName) + #TODO: the output lines are driven by finalfile. If there are genes in the first 2 that + # are not in the finalfile then they will be lost. 
def getRPKMDict(rpkmFileName, getGIDDict=False):
    """Load RPKM values from a whitespace-delimited text file.

    Every non-blank line is split on whitespace. The second field is used
    as the dictionary key, the last field is stored as the RPKM value (kept
    as a string, no numeric conversion), and the first field is stored as
    the gid for that key.

    Args:
        rpkmFileName: path of the file to read.
        getGIDDict: when True, also build and return the mapping from the
            second field to the first field of each line.

    Returns:
        rpkmDict alone when getGIDDict is False; otherwise the tuple
        (gidDict, rpkmDict).  If a key occurs on several lines the last
        occurrence wins.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
    """
    gidDict = {}
    rpkmDict = {}
    # The context manager guarantees the handle is closed even when a
    # malformed line raises part-way through the file.
    with open(rpkmFileName) as rpkmFile:
        for line in rpkmFile:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) hold no record.
                continue

            key = fields[1]
            rpkmDict[key] = fields[-1]
            if getGIDDict:
                gidDict[key] = fields[0]

    if getGIDDict:
        return gidDict, rpkmDict

    return rpkmDict