except:
pass
-import sys, time, optparse
-from commoncode import readDataset
-from cistematic.core.geneinfo import geneinfoDB
-from cistematic.genomes import Genome
+import sys
+import time
+import optparse
+import ReadDataset
+from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
+
def main(argv=None):
if not argv:
argv = sys.argv
- print "%prog: version 3.6"
+ print "rnafarPairs: version 3.7"
usage = "usage: python %prog genome goodfile rdsfile outfile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--verbose", action="store_true", dest="doVerbose",
- help="verbose output")
- parser.add_option("--cache", action="store_true", dest="doCache",
- help="use cache")
- parser.add_option("--maxDist", type="int", dest="maxDist",
- help="maximum distance")
- parser.set_defaults(doVerbose=False, doCache=False, maxDist=500000)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 4:
outfilename = args[3]
rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
-
+
+
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--verbose", action="store_true", dest="doVerbose",
+ help="verbose output")
+ parser.add_option("--cache", action="store_true", dest="doCache",
+ help="use cache")
+ parser.add_option("--maxDist", type="int", dest="maxDist",
+ help="maximum distance")
+
+ configParser = getConfigParser()
+ section = "rnafarPairs"
+ doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+ doCache = getConfigBoolOption(configParser, section, "doCache", False)
+ maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
+
+ parser.set_defaults(doVerbose=doVerbose, doCache=doCache, maxDist=maxDist)
+
+ return parser
+
def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
goodDict = {}
fields = line.split()
goodDict[fields[0]] = line
- RDS = readDataset(rdsfile, verbose = True, cache=doCache)
- rdsChromList = RDS.getChromosomes()
-
+ goodfile.close()
+ RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
+ chromosomeList = RDS.getChromosomes()
if doVerbose:
print time.ctime()
distinct = 0
total = 0
outfile = open(outfilename,"w")
-
- idb = geneinfoDB()
- if genome == "dmelanogaster":
- geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
- else:
- geneinfoDict = idb.getallGeneInfo(genome)
-
- hg = Genome(genome)
- geneannotDict = hg.allAnnotInfo()
-
+ geneinfoDict = getGeneInfoDict(genome)
+ geneannotDict = getGeneAnnotDict(genome)
assigned = {}
farConnected = {}
- for achrom in rdsChromList:
- if achrom == "chrM":
+ for chromosome in chromosomeList:
+ if doNotProcessChromosome(chromosome):
continue
- print achrom
- uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
+ print chromosome
+ uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
if doVerbose:
print len(uniqDict), time.ctime()
readList = uniqDict[readID]
if len(readList) == 2:
total += 1
- (start1, flag1, pair1) = readList[0]
- (start2, flag2, pair2) = readList[1]
-
- if flag1 != flag2:
- dist = abs(start1 - start2)
- if flag1 != "NM" and flag2 != "NM" and dist < maxDist:
- geneID = ""
- saw1 = False
- saw2 = False
- if flag1 in goodDict:
- geneID = flag2
- farFlag = flag1
- saw1 = True
-
- if flag2 in goodDict:
- geneID = flag1
- farFlag = flag2
- saw2 = True
-
- if saw1 or saw2:
- total += 1
-
- if saw1 and saw2:
- if flag1 < flag2:
- geneID = flag1
- farFlag = flag2
- else:
- geneID = flag2
- farFlag = flag1
-
- if geneID in farConnected:
- farConnected[geneID].append(farFlag)
- else:
- farConnected[geneID] = [farFlag]
- elif geneID != "":
- try:
- if genome == "dmelanogaster":
- symbol = geneinfoDict["Dmel_" + geneID][0][0]
- else:
- symbol = geneinfoDict[geneID][0][0]
- except:
- try:
- symbol = geneannotDict[(genome, geneID)][0]
- except:
- symbol = "LOC" + geneID
-
- symbol = symbol.strip()
- symbol = symbol.replace(" ","|")
- symbol = symbol.replace("\t","|")
- if farFlag not in assigned:
- assigned[farFlag] = (symbol, geneID)
- print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
- outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
- distinct += 1
-
- farIndex = 0
+ if processReads(readList[:2], maxDist):
+ flags = (readList[0]["flag"], readList[1]["flag"])
+ processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+ total += processed
+ distinct += distinctPairs
+
+ entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
+ distinct += entriesWritten
+ outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
+ outfile.close()
+ print "distinct: %d\ttotal: %d" % (distinct, total)
+ print time.ctime()
+
+
+def doNotProcessChromosome(chromosome):
+ return chromosome == "chrM"
+
+
+def processReads(reads, maxDist):
+ process = False
+ start1 = reads[0]["start"]
+ start2 = reads[1]["start"]
+ dist = abs(start1 - start2)
+ flag1 = reads[0]["flag"]
+ flag2 = reads[1]["flag"]
+
+ if flag1 != flag2 and flag1 != "NM" and flag2 != "NM" and dist < maxDist:
+ process = True
+
+ return process
+
+
+def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+ flag1, flag2 = flags
+ total = 0
+ distinct = 0
+ read1IsGood = flag1 in goodDict
+ read2IsGood = flag2 in goodDict
+
+ if read1IsGood and read2IsGood:
+ if flag1 < flag2:
+ geneID = flag1
+ farFlag = flag2
+ else:
+ geneID = flag2
+ farFlag = flag1
+
+ try:
+ farConnected[geneID].append(farFlag)
+ except KeyError:
+ farConnected[geneID] = [farFlag]
+ elif read1IsGood or read2IsGood:
+ total += 1
+ if read2IsGood:
+ farFlag = flag2
+ geneID = flag1
+ else:
+ farFlag = flag1
+ geneID = flag2
+
+ try:
+ if genome == "dmelanogaster":
+ symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
+ else:
+ symbol = geneInfoDict[geneID][0][0]
+ except (KeyError, IndexError):
+ try:
+ symbol = geneAnnotDict[(genome, geneID)][0]
+ except (KeyError, IndexError):
+ symbol = "LOC%s" % geneID
+
+ symbol = symbol.strip()
+ symbol = symbol.replace(" ","|")
+ symbol = symbol.replace("\t","|")
+
+ if farFlag not in assigned:
+ assigned[farFlag] = (symbol, geneID)
+ print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
+ outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
+ distinct += 1
+
+ return total, distinct
+
+
+def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
+ total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
+ writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
+
+ return written
+
+
+def writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile):
+ total = 0
+ written = 0
for farFlag in farConnected:
geneID = ""
symbol = ""
idList = [farFlag] + farConnected[farFlag]
- for oneID in idList:
- if oneID in assigned:
- (symbol, geneID) = assigned[oneID]
+ for ID in idList:
+ if ID in assigned:
+ (symbol, geneID) = assigned[ID]
if geneID == "":
- farIndex += 1
- symbol = "FAR%d" % farIndex
- geneID = -1 * farIndex
+ total += 1
+ symbol = "FAR%d" % total
+ geneID = -1 * total
- for oneID in idList:
- if oneID not in assigned:
- print "%s %s %s" % (symbol, geneID, goodDict[oneID].strip())
- outfile.write("%s %s %s" % (symbol, geneID, goodDict[oneID]))
- distinct += 1
- assigned[oneID] = (symbol, geneID)
+ for ID in idList:
+ if ID not in assigned:
+ print "%s %s %s" % (symbol, geneID, goodDict[ID].strip())
+ outfile.write("%s %s %s" % (symbol, geneID, goodDict[ID]))
+ written += 1
+ assigned[ID] = (symbol, geneID)
+ return total, written
+
+
+def writeUnassignedGoodReadsToFile(farIndex, goodDict, assigned, outfile):
for farFlag in goodDict:
if farFlag not in assigned:
farIndex += 1
print line.strip()
outfile.write(line)
- outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
- outfile.close()
- print "distinct: %d\ttotal: %d" % (distinct, total)
- print time.ctime()
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file