1 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
2 looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
14 from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption, isSpliceEntry
21 print "rnafarPairs: version 3.7"
22 usage = "usage: python %prog genome goodfile rdsfile outfile [options]"
24 parser = makeParser(usage)
25 (options, args) = parser.parse_args(argv[1:])
32 goodfilename = args[1]
36 bamfile = pysam.Samfile(bamfilename, "rb")
38 rnaFarPairs(genome, goodfilename, bamfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
41 def makeParser(usage=""):
42 parser = optparse.OptionParser(usage=usage)
43 parser.add_option("--verbose", action="store_true", dest="doVerbose",
44 help="verbose output")
45 parser.add_option("--cache", action="store_true", dest="doCache",
47 parser.add_option("--maxDist", type="int", dest="maxDist",
48 help="maximum distance")
50 configParser = getConfigParser()
51 section = "rnafarPairs"
52 doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
53 doCache = getConfigBoolOption(configParser, section, "doCache", False)
54 maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
56 parser.set_defaults(doVerbose=doVerbose, doCache=doCache, maxDist=maxDist)
61 def rnaFarPairs(genome, goodfilename, bamfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
62 """ map all candidate regions that have paired ends overlapping with known genes
65 chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
67 for chromosome in chromosomeList:
68 regions[chromosome] = []
71 goodfile = open(goodfilename)
75 start = int(fields[2])
77 goodDict[label] = line
78 regions[chromosome].append((start, stop, label))
86 outfile = open(outfilename,"w")
87 geneinfoDict = getGeneInfoDict(genome)
88 geneannotDict = getGeneAnnotDict(genome)
91 for chromosome in chromosomeList:
93 regionList = regions[chromosome].sort()
94 uniqDict, pairCount = getUniqueReadIDFlags(bamfile, regionList, chromosome, maxDist)
96 print len(uniqDict), time.ctime()
99 for readID, readList in uniqDict.items():
100 flags = (readList[0]["flag"], readList[1]["flag"])
101 processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
103 distinct += distinctPairs
105 entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
106 distinct += entriesWritten
107 outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
109 print "distinct: %d\ttotal: %d" % (distinct, total)
113 def getUniqueReadIDFlags(bamfile, regions, chromosome, maxDist):
114 """ Returns dictionary of readsIDs with each entry consisting of a list of dictionaries of read start and read flag.
115 Only returns unique non-spliced read pairs matching the criteria given in processReads().
119 for regionstart, regionstop, regionname in regions:
120 for alignedread in bamfile.fetch(chromosome, start, regionstop):
121 if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
122 if alignedread.pos >= regionstart:
125 flag = alignedread.opt("ZG")
128 readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
130 readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
132 start = regionstop + 1
134 for alignedread in bamfile.fetch(chromosome, start):
135 if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
136 flag = alignedread.opt("ZG")
139 readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
141 readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
143 pairCount = len(readDict.keys())
144 for readID, readList in readDict.items():
145 if len(readList) != 2:
148 elif not processReads(readList, maxDist):
151 return readDict, pairCount
154 def processReads(reads, maxDist=500000):
155 """ For a pair of readID's to be acceptable:
156 - flags must be different
157 - neither flag can be 'NM'
158 - the read starts have to be within maxDist
161 dist = abs(reads[0]["start"] - reads[1]["start"])
162 flag1 = reads[0]["flag"]
163 flag2 = reads[1]["flag"]
165 if flag1 != flag2 and flag1 != "NM" and flag2 != "NM" and dist < maxDist:
171 def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
172 """ Writes out the region information along with symbol and geneID for paired reads where one read
173 of the pair is in a known gene and the other is in a new region. If both reads in the pair are
174 in new regions the region is added to farConnected. No action is taken if both reads in the
175 pair are in known genes.
180 read1IsGood = flag1 in goodDict
181 read2IsGood = flag2 in goodDict
183 if read1IsGood and read2IsGood:
192 farConnected[geneID].append(farFlag)
194 farConnected[geneID] = [farFlag]
195 elif read1IsGood or read2IsGood:
204 symbol = getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict)
205 if farFlag not in assigned:
206 assigned[farFlag] = (symbol, geneID)
207 print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
208 outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
211 return total, distinct
214 def getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict):
216 if genome == "dmelanogaster":
217 symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
219 symbol = geneInfoDict[geneID][0][0]
220 except (KeyError, IndexError):
222 symbol = geneAnnotDict[(genome, geneID)][0]
223 except (KeyError, IndexError):
224 symbol = "LOC%s" % geneID
226 symbol = symbol.strip()
227 symbol = symbol.replace(" ","|")
228 symbol = symbol.replace("\t","|")
233 def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
234 total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
235 writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
240 def writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile):
243 for farFlag in farConnected:
246 idList = [farFlag] + farConnected[farFlag]
249 (symbol, geneID) = assigned[ID]
253 symbol = "FAR%d" % total
257 if ID not in assigned:
258 print "%s %s %s" % (symbol, geneID, goodDict[ID].strip())
259 outfile.write("%s %s %s" % (symbol, geneID, goodDict[ID]))
261 assigned[ID] = (symbol, geneID)
263 return total, written
266 def writeUnassignedGoodReadsToFile(farIndex, goodDict, assigned, outfile):
267 for farFlag in goodDict:
268 if farFlag not in assigned:
270 line = "FAR%d %d %s" % (farIndex, -1 * farIndex, goodDict[farFlag])
275 if __name__ == "__main__":