- for readID in uniqDict:
- readList = uniqDict[readID]
- if len(readList) == 2:
- total += 1
- (start1, flag1, pair1) = readList[0]
- (start2, flag2, pair2) = readList[1]
-
- if flag1 != flag2:
- dist = abs(start1 - start2)
- if flag1 != "NM" and flag2 != "NM" and dist < maxDist:
- geneID = ""
- saw1 = False
- saw2 = False
- if flag1 in goodDict:
- geneID = flag2
- farFlag = flag1
- saw1 = True
-
- if flag2 in goodDict:
- geneID = flag1
- farFlag = flag2
- saw2 = True
-
- if saw1 or saw2:
- total += 1
-
- if saw1 and saw2:
- if flag1 < flag2:
- geneID = flag1
- farFlag = flag2
- else:
- geneID = flag2
- farFlag = flag1
-
- if geneID in farConnected:
- farConnected[geneID].append(farFlag)
- else:
- farConnected[geneID] = [farFlag]
- elif geneID != "":
- try:
- if genome == "dmelanogaster":
- symbol = geneinfoDict["Dmel_" + geneID][0][0]
- else:
- symbol = geneinfoDict[geneID][0][0]
- except:
- try:
- symbol = geneannotDict[(genome, geneID)][0]
- except:
- symbol = "LOC" + geneID
-
- symbol = symbol.strip()
- symbol = symbol.replace(" ","|")
- symbol = symbol.replace("\t","|")
- if farFlag not in assigned:
- assigned[farFlag] = (symbol, geneID)
- print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
- outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
- distinct += 1
-
- farIndex = 0
+ total += pairCount
+ for readID, readList in uniqDict.items():
+ flags = (readList[0]["flag"], readList[1]["flag"])
+ processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+ total += processed
+ distinct += distinctPairs
+
+ entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
+ distinct += entriesWritten
+ outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
+ outfile.close()
+ print "distinct: %d\ttotal: %d" % (distinct, total)
+ print time.ctime()
+
+
+def getUniqueReadIDFlags(bamfile, regions, chromosome, maxDist):
+ """ Returns dictionary of readsIDs with each entry consisting of a list of dictionaries of read start and read flag.
+ Only returns unique non-spliced read pairs matching the criteria given in processReads().
+ """
+ start = 1
+ readDict = {}
+ for regionstart, regionstop, regionname in regions:
+ for alignedread in bamfile.fetch(chromosome, start, regionstop):
+ if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+ if alignedread.pos >= regionstart:
+ flag = regionname
+ else:
+ flag = alignedread.opt("ZG")
+
+ try:
+ readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+ except KeyError:
+ readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
+
+ start = regionstop + 1
+
+ for alignedread in bamfile.fetch(chromosome, start):
+ if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+ flag = alignedread.opt("ZG")
+
+ try:
+ readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+ except KeyError:
+ readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
+
+ pairCount = len(readDict.keys())
+ for readID, readList in readDict.items():
+ if len(readList) != 2:
+ del readDict[readID]
+ pairCount -= 1
+ elif not processReads(readList, maxDist):
+ del readDict[readID]
+
+ return readDict, pairCount
+
+
+def processReads(reads, maxDist=500000):
+ """ For a pair of readID's to be acceptable:
+ - flags must be different
+ - neither flag can be 'NM'
+ - the read starts have to be within maxDist
+ """
+ process = False
+ dist = abs(reads[0]["start"] - reads[1]["start"])
+ flag1 = reads[0]["flag"]
+ flag2 = reads[1]["flag"]
+
+ if flag1 != flag2 and flag1 != "NM" and flag2 != "NM" and dist < maxDist:
+ process = True
+
+ return process
+
+
+def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+ """ Writes out the region information along with symbol and geneID for paired reads where one read
+ of the pair is in a known gene and the other is in a new region. If both reads in the pair are
+ in new regions the region is added to farConnected. No action is taken if both reads in the
+ pair are in known genes.
+ """
+ flag1, flag2 = flags
+ total = 0
+ distinct = 0
+ read1IsGood = flag1 in goodDict
+ read2IsGood = flag2 in goodDict
+
+ if read1IsGood and read2IsGood:
+ if flag1 < flag2:
+ geneID = flag1
+ farFlag = flag2
+ else:
+ geneID = flag2
+ farFlag = flag1
+
+ try:
+ farConnected[geneID].append(farFlag)
+ except KeyError:
+ farConnected[geneID] = [farFlag]
+ elif read1IsGood or read2IsGood:
+ total = 1
+ if read2IsGood:
+ farFlag = flag2
+ geneID = flag1
+ else:
+ farFlag = flag1
+ geneID = flag2
+
+ symbol = getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict)
+ if farFlag not in assigned:
+ assigned[farFlag] = (symbol, geneID)
+ print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
+ outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
+ distinct = 1
+
+ return total, distinct
+
+
+def getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict):
+ try:
+ if genome == "dmelanogaster":
+ symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
+ else:
+ symbol = geneInfoDict[geneID][0][0]
+ except (KeyError, IndexError):
+ try:
+ symbol = geneAnnotDict[(genome, geneID)][0]
+ except (KeyError, IndexError):
+ symbol = "LOC%s" % geneID
+
+ symbol = symbol.strip()
+ symbol = symbol.replace(" ","|")
+ symbol = symbol.replace("\t","|")
+
+ return symbol
+
+
+def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
+ total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
+ writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
+
+ return written
+
+
+def writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile):
+ total = 0
+ written = 0