4 from numpy import zeros, int16
5 from erange.commoncode import getConfigParser, getConfigOption, getConfigIntOption
7 versionString = "RNAPATH: version 0.96"
12 """ returns the complementary basepair to base nt
14 compDict = { "A": "T",
37 return compDict.get(nt, "N")
40 def complement(sequence, length=-1):
41 """ returns the complement of the sequence.
45 seqLength = len(sequence)
47 if length == seqLength or length < 0:
48 seqList = list(sequence)
50 return "".join(map(compNT, seqList))
52 #TODO: this seems to want to deal with case where length is more than
53 # sequence length except that a negative index on a sequence is fine
54 # index will only be overrun if length is negative but that case is
56 for index in range(seqLength - 1,seqLength - length - 1, -1):
58 newSeq += compNT(sequence[index])
69 usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]"
71 parser = getParser(usage)
72 (options, args) = parser.parse_args(argv[1:])
78 incontigfilename = args[0]
79 distalPairsfile = args[1]
80 outpathfilename = args[2]
81 outcontigfilename = args[3]
83 rnaPath(incontigfilename, distalPairsfile, outpathfilename,
84 outcontigfilename, options.pathPrefix, options.overlap)
88 parser = optparse.OptionParser(usage=usage)
89 parser.add_option("--prefix", dest="pathPrefix")
90 parser.add_option("--overlap", type="int", dest="overlap")
92 configParser = getConfigParser()
94 pathPrefix = getConfigOption(configParser, section, "pathPrefix", "RNAPATH")
95 overlap = getConfigIntOption(configParser, section, "overlap", 30)
97 parser.set_defaults(pathPrefix=pathPrefix, overlap=overlap)
102 def rnaPath(incontigfilename, distalPairsfile, outpathfilename,
103 outcontigfilename, pathPrefix="RNAPATH", overlap=30):
105 outpathfile = open(outpathfilename, "w")
107 outheader = "#settings: %s" % " ".join(sys.argv)
109 print >> outpathfile, outheader
111 contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename)
112 halfSize = calculateN50(origSize)
113 print "building the adjacency graph"
114 pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList)
116 print "found %d paths" % len(pathList)
120 outcontigfile = open(outcontigfilename, "w")
121 for path in pathList:
123 outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path)))
126 vertexNameList.append(nameList[vertex])
127 pathDescription = string.join(vertexNameList, ",")
129 print >> outpathfile, pathDescription
130 currentVertex = path[0]
132 assemblyList = currentVertex
133 sequence = contigDict[currentVertex]
134 for nextVertex in path[1:]:
135 if (currentVertex, nextVertex) in edgeSenseDict:
136 senseList = edgeSenseDict[currentVertex, nextVertex]
137 FR = senseList.count(("+", "-"))
138 RF = senseList.count(("-", "+"))
140 senseList = edgeSenseDict[nextVertex, currentVertex]
142 FR = senseList.count(("-", "+"))
143 RF = senseList.count(("+", "-"))
145 FF = senseList.count(("+", "+"))
146 RR = senseList.count(("-", "-"))
147 if currentSense == "-":
148 # we had flipped the upstream piece! Must flip again
156 if FR >= FF and FR >= RR and FR >= RF:
157 # we have FR - leave alone
160 assemblyList = ((assemblyList, "+"), (nextVertex, "+"))
161 seqleft = sequence[-20:]
162 seqright = contigDict[nextVertex][:overlap]
163 if seqleft in seqright:
164 pos = seqright.index(seqleft)
166 outstring = "stitching %d and %d using %d overlap" % (currentVertex, nextVertex, offset)
168 print >> outpathfile, outstring
169 sequence += contigDict[nextVertex][offset:]
171 sequence += "NN" + contigDict[nextVertex]
174 elif FF >= RR and FF >= RF:
175 # we have FF - flip seqright
178 assemblyList = ((assemblyList, "+"), (nextVertex, "-"))
179 seqleft = sequence[-20:]
180 seqright = complement(contigDict[nextVertex])[:overlap]
181 if seqleft in seqright:
182 pos = seqright.index(seqleft)
184 outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
186 print >> outpathfile, outstring
187 sequence += complement(contigDict[nextVertex])[offset:]
189 sequence += "NN" + complement(contigDict[nextVertex])
193 # we have RR - flip seqleft
196 assemblyList = ((assemblyList, "-"), (nextVertex, "+"))
197 seqleft = complement(sequence)[:20]
198 seqright = contigDict[nextVertex][:overlap]
199 if seqleft in seqright:
200 pos = seqright.index(seqleft)
202 outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
204 print >> outpathfile, outstring
205 sequence = complement(sequence) + contigDict[nextVertex][offset:]
207 sequence = complement(sequence) + "NN" + contigDict[nextVertex]
211 # we have RF - flip both
214 assemblyList = ((assemblyList, "-"), (nextVertex, "-"))
215 seqleft = complement(sequence)[-20:]
216 seqright = complement(contigDict[nextVertex])[:overlap]
217 if seqleft in seqright:
218 pos = seqright.index(seqleft)
220 outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
222 print >> outpathfile, outstring
223 sequence = complement(sequence) + complement(contigDict[nextVertex])[offset:]
225 sequence = complement(sequence) + "NN" + complement(contigDict[nextVertex])
229 outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList))
231 print >> outpathfile, outstring
232 currentVertex = nextVertex
234 outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence))
235 newSizeList.append(len(sequence))
237 for vertex in contigDict:
238 if vertex in visitedDict:
241 newSizeList.append(len(contigDict[vertex]))
242 outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex]))
244 calculateN50(newSizeList, referenceMean=halfSize)
247 def calculateN50(sizeList, referenceMean=None):
248 if referenceMean is None:
249 totalSize = sum(sizeList)
250 referenceMean = totalSize / 2
254 currentTotalLength = 0
255 for size in sizeList:
256 if currentTotalLength + size > referenceMean:
257 print "#contigs", len(sizeList)
261 currentTotalLength += size
268 def getContigsFromFile(contigFileName):
277 incontigfile = open(contigFileName)
279 print "Error opening contig file: %s" % contigFileName
280 return contigNum, nameList, contigDict, origSize
282 for line in incontigfile:
284 if currentChrom !="":
285 nameList.append(currentChrom)
286 contigDict[contigNum] = seq
287 origSize.append(len(seq))
290 currentChrom = line.strip().split()[0][1:]
297 return contigNum, nameList, contigDict, origSize
300 def getPath(contigNum, distalPairsfile, nameList):
301 edgeMatrix = EdgeMatrix(contigNum)
303 print len(edgeMatrix.edgeArray)
305 print len(edgeMatrix.edgeArray[50])
309 print "processing distal pairs"
310 verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList)
312 willVisitList = verticesWithEdges.keys()
314 print "visiting %d vertices" % len(willVisitList)
316 print "cleaning up graph of edges with weight 1"
317 verticesToDelete = []
318 for rindex in willVisitList:
319 if rindex not in notSoloDict:
320 cindex = vertexEdges[rindex][0]
321 edgeMatrix.edgeArray[rindex][cindex] = 0
322 edgeMatrix.edgeArray[cindex][rindex] = 0
323 verticesToDelete.append(rindex)
325 for vertex in verticesToDelete:
326 willVisitList.remove(vertex)
328 print "%d 1-edges zeroed out" % len(verticesToDelete)
331 print "visiting %d vertices" % len(willVisitList)
334 print "picking top 2 edges per vertex - zero out others"
335 for rindex in willVisitList:
336 vertices = vertexEdges[rindex]
338 for avertex in vertices:
339 if avertex in willVisitList:
340 rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex))
345 zeroedEdge += len(rEdges[2:])
346 for (weight, cindex) in rEdges[2:]:
347 edgeMatrix.edgeArray[rindex][cindex] = 0
348 edgeMatrix.edgeArray[cindex][rindex] = 0
349 elif len(rEdges) == 1:
350 if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1:
351 leafList.append(rindex)
353 print "zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge
354 pathList, visitedDict = traverseGraph(leafList, edgeMatrix)
356 return pathList, edgeSenseDict, visitedDict
359 def traverseGraph(leafList, edgeMatrix):
363 print "traveling through the graph"
364 for rindex in leafList:
365 if visitedDict.has_key(rindex):
368 path = edgeMatrix.visitLink(rindex)
371 visitedDict[vertex] = ""
374 pathList.append(path)
376 return pathList, visitedDict
379 def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList):
380 contigToRowLookup = {}
381 verticesWithEdges = {}
386 distalPairs = open(distalPairsfilename)
387 for line in distalPairs:
391 fields = line.strip().split()
392 contA = "chr%s" % fields[1]
394 contig1 = contigToRowLookup[contA]
397 contig1 = nameList.index(contA)
398 contigToRowLookup[contA] = contig1
400 print "problem with end1: ", line
405 contB = "chr%s" % fields[4]
407 contig2 = contigToRowLookup[contB]
410 contig2 = nameList.index(contB)
411 contigToRowLookup[contB] = contig2
413 print "problem with end2: ", line
418 edgeMatrix.edgeArray[contig1][contig2] += 1
419 edgeMatrix.edgeArray[contig2][contig1] += 1
420 verticesWithEdges[contig1] = ""
421 verticesWithEdges[contig2] = ""
422 if (contig1, contig2) in edgeSenseDict:
423 edgeSenseDict[contig1, contig2].append((sense1, sense2))
424 elif (contig2, contig1) in edgeSenseDict:
425 edgeSenseDict[contig2, contig1].append((sense2, sense1))
427 edgeSenseDict[contig1, contig2] = [(sense1, sense2)]
429 if contig1 in vertexEdges:
430 if contig2 not in vertexEdges[contig1]:
431 vertexEdges[contig1].append(contig2)
433 vertexEdges[contig1] = [contig2]
435 if contig2 in vertexEdges:
436 if contig1 not in vertexEdges[contig2]:
437 vertexEdges[contig2].append(contig1)
439 vertexEdges[contig2] = [contig1]
441 if edgeMatrix.edgeArray[contig1][contig2] > 1:
442 notSoloDict[contig1] = ""
443 notSoloDict[contig2] = ""
447 return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict
451 """ Describes a sparse matrix to hold edge data.
454 def __init__(self, dimension):
455 self.dimension = dimension
456 self.edgeArray = zeros((self.dimension, self.dimension), int16)
459 def visitLink(self, fromVertex, ignoreList=[]):
460 returnPath = [fromVertex]
462 for toindex in xrange(self.dimension):
463 if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList:
464 toVertex.append(toindex)
466 for vertex in toVertex:
467 if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]:
468 self.edgeArray[fromVertex][vertex] = 0
469 self.edgeArray[vertex][fromVertex] = 0
470 return returnPath + [vertex]
472 self.edgeArray[fromVertex][vertex] = 0
474 return returnPath + self.visitLink(vertex, returnPath)
476 return returnPath + [vertex]
480 if __name__ == "__main__":