erange version 4.0a dev release
[erange.git] / rnafarPairs.py
index d1baebd3e7e6e61278c4c141a84fe2396e323f28..4d70c4965ef77c2eea4f646e58526db83682ffa6 100755 (executable)
@@ -13,26 +13,21 @@ try:
 except:
     pass
 
-import sys, time, optparse
-from commoncode import readDataset
-from cistematic.core.geneinfo import geneinfoDB
-from cistematic.genomes import Genome
+import sys
+import time
+import optparse
+import ReadDataset
+from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
+
 
 def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    print "%prog: version 3.6"
+    print "rnafarPairs: version 3.7"
     usage = "usage: python %prog genome goodfile rdsfile outfile [options]"
 
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--verbose", action="store_true", dest="doVerbose",
-                      help="verbose output")
-    parser.add_option("--cache", action="store_true", dest="doCache",
-                      help="use cache")
-    parser.add_option("--maxDist", type="int", dest="maxDist",
-                      help="maximum distance")
-    parser.set_defaults(doVerbose=False, doCache=False, maxDist=500000)
+    parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     if len(args) < 4:
@@ -45,7 +40,27 @@ def main(argv=None):
     outfilename = args[3]
 
     rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
-    
+
+
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--verbose", action="store_true", dest="doVerbose",
+                      help="verbose output")
+    parser.add_option("--cache", action="store_true", dest="doCache",
+                      help="use cache")
+    parser.add_option("--maxDist", type="int", dest="maxDist",
+                      help="maximum distance")
+
+    configParser = getConfigParser()
+    section = "rnafarPairs"
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
+
+    parser.set_defaults(doVerbose=doVerbose, doCache=doCache, maxDist=maxDist)
+
+    return parser
+
 
 def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
     goodDict = {}
@@ -54,33 +69,25 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
         fields = line.split()
         goodDict[fields[0]] = line
 
-    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
-    rdsChromList = RDS.getChromosomes()
-
+    goodfile.close()
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
+    chromosomeList = RDS.getChromosomes()
     if doVerbose:
         print time.ctime()
 
     distinct = 0
     total = 0
     outfile = open(outfilename,"w")
-
-    idb = geneinfoDB()
-    if genome == "dmelanogaster":
-        geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
-    else:
-        geneinfoDict = idb.getallGeneInfo(genome)
-
-    hg = Genome(genome)
-    geneannotDict = hg.allAnnotInfo()
-
+    geneinfoDict = getGeneInfoDict(genome)
+    geneannotDict = getGeneAnnotDict(genome)
     assigned = {}
     farConnected = {}
-    for achrom in rdsChromList:
-        if achrom == "chrM":
+    for chromosome in chromosomeList:
+        if doNotProcessChromosome(chromosome):
             continue
 
-        print achrom
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
+        print chromosome
+        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
         if doVerbose:
             print len(uniqDict), time.ctime()    
 
@@ -88,82 +95,124 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
             readList = uniqDict[readID]
             if len(readList) == 2:
                 total += 1
-                (start1, flag1, pair1) = readList[0]
-                (start2, flag2, pair2) = readList[1]
-
-                if flag1 != flag2:
-                    dist = abs(start1 - start2)
-                    if flag1 != "NM" and flag2 != "NM" and dist < maxDist:
-                        geneID = ""
-                        saw1 = False
-                        saw2 = False
-                        if flag1 in goodDict:
-                            geneID = flag2
-                            farFlag = flag1
-                            saw1 = True
-
-                        if flag2 in goodDict:
-                            geneID = flag1
-                            farFlag = flag2
-                            saw2 = True
-
-                        if saw1 or saw2:
-                            total += 1
-
-                        if saw1 and saw2:
-                            if flag1 < flag2:
-                                geneID = flag1
-                                farFlag = flag2
-                            else:
-                                geneID = flag2
-                                farFlag = flag1
-
-                            if geneID in farConnected:
-                                farConnected[geneID].append(farFlag)
-                            else:
-                                farConnected[geneID] = [farFlag]
-                        elif geneID != "":
-                            try:
-                                if genome == "dmelanogaster":
-                                    symbol = geneinfoDict["Dmel_" + geneID][0][0]
-                                else:
-                                    symbol = geneinfoDict[geneID][0][0]
-                            except:
-                                try:
-                                    symbol = geneannotDict[(genome, geneID)][0]
-                                except:
-                                    symbol = "LOC" + geneID
-
-                            symbol = symbol.strip()
-                            symbol = symbol.replace(" ","|")
-                            symbol = symbol.replace("\t","|")
-                            if farFlag not in assigned:
-                                assigned[farFlag] = (symbol, geneID)
-                                print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
-                                outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
-                                distinct += 1
-
-    farIndex = 0
+                if processReads(readList[:2], maxDist):
+                    flags = (readList[0]["flag"], readList[1]["flag"])
+                    processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+                    total += processed
+                    distinct += distinctPairs
+
+    entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
+    distinct += entriesWritten
+    outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
+    outfile.close()
+    print "distinct: %d\ttotal: %d" % (distinct, total)
+    print time.ctime()
+
+
+def doNotProcessChromosome(chromosome):
+    return chromosome == "chrM"
+
+
+def processReads(reads, maxDist):
+    process = False
+    start1 = reads[0]["start"]
+    start2 = reads[1]["start"]
+    dist = abs(start1 - start2)
+    flag1 = reads[0]["flag"]
+    flag2 = reads[1]["flag"]
+
+    if flag1 != flag2 and flag1 != "NM" and flag2 != "NM" and dist < maxDist:
+        process = True
+
+    return process
+
+
+def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+    flag1, flag2 = flags
+    total = 0
+    distinct = 0
+    read1IsGood = flag1 in goodDict
+    read2IsGood = flag2 in goodDict
+
+    if read1IsGood and read2IsGood:
+        if flag1 < flag2:
+            geneID = flag1
+            farFlag = flag2
+        else:
+            geneID = flag2
+            farFlag = flag1
+
+        try:
+            farConnected[geneID].append(farFlag)
+        except KeyError:
+            farConnected[geneID] = [farFlag]
+    elif read1IsGood or read2IsGood:
+        total += 1
+        if read2IsGood:
+            farFlag = flag2
+            geneID = flag1
+        else:
+            farFlag = flag1
+            geneID = flag2
+
+        try:
+            if genome == "dmelanogaster":
+                symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
+            else:
+                symbol = geneInfoDict[geneID][0][0]
+        except (KeyError, IndexError):
+            try:
+                symbol = geneAnnotDict[(genome, geneID)][0]
+            except (KeyError, IndexError):
+                symbol = "LOC%s" % geneID
+
+        symbol = symbol.strip()
+        symbol = symbol.replace(" ","|")
+        symbol = symbol.replace("\t","|")
+
+        if farFlag not in assigned:
+            assigned[farFlag] = (symbol, geneID)
+            print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
+            outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
+            distinct += 1
+
+    return total, distinct
+
+
+def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
+    total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
+    writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
+
+    return written
+
+
+def writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile):
+    total = 0
+    written = 0
     for farFlag in farConnected:
         geneID = ""
         symbol = ""
         idList = [farFlag] + farConnected[farFlag]
-        for oneID in idList:
-            if oneID in assigned:
-                (symbol, geneID) = assigned[oneID]
+        for ID in idList:
+            if ID in assigned:
+                (symbol, geneID) = assigned[ID]
 
         if geneID == "":
-            farIndex += 1
-            symbol = "FAR%d" % farIndex
-            geneID = -1 * farIndex
+            total += 1
+            symbol = "FAR%d" % total
+            geneID = -1 * total
 
-        for oneID in idList:
-            if oneID not in assigned:
-                print "%s %s %s" % (symbol, geneID, goodDict[oneID].strip())
-                outfile.write("%s %s %s" % (symbol, geneID, goodDict[oneID]))
-                distinct += 1
-                assigned[oneID] = (symbol, geneID)
+        for ID in idList:
+            if ID not in assigned:
+                print "%s %s %s" % (symbol, geneID, goodDict[ID].strip())
+                outfile.write("%s %s %s" % (symbol, geneID, goodDict[ID]))
+                written += 1
+                assigned[ID] = (symbol, geneID)
 
+    return total, written
+
+
+def writeUnassignedGoodReadsToFile(farIndex, goodDict, assigned, outfile):
     for farFlag in goodDict:
         if farFlag not in assigned:
             farIndex += 1
@@ -171,10 +220,6 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
             print line.strip()
             outfile.write(line)
 
-    outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
-    outfile.close()
-    print "distinct: %d\ttotal: %d" % (distinct, total)
-    print time.ctime()
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file