4 from numpy import zeros, int16
6 versionString = "%s: version 0.95" % sys.argv[0]
11 """ returns the complementary basepair to base nt
13 compDict = { "A": "T",
36 return compDict.get(nt, "N")
39 def complement(sequence, length=-1):
40 """ returns the complement of the sequence.
44 seqLength = len(sequence)
46 if length == seqLength or length < 0:
47 seqList = list(sequence)
49 return "".join(map(compNT, seqList))
51 #TODO: this seems to want to deal with case where length is more than
52 # sequence length except that a negative index on a sequence is fine
53 # index will only be overrun if length is negative but that case is
55 for index in range(seqLength - 1,seqLength - length - 1, -1):
57 newSeq += compNT(sequence[index])
68 usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]"
70 parser = optparse.OptionParser(usage=usage)
71 parser.add_option("--prefix", dest="pathPrefix")
72 parser.add_option("--overlap", type="int", dest="overlap")
73 parser.set_defaults(pathPrefix="RNAPATH", overlap=30)
74 (options, args) = parser.parse_args(argv[1:])
80 incontigfilename = args[0]
81 distalPairsfile = args[1]
82 outpathfilename = args[2]
83 outcontigfilename = args[3]
85 rnaPath(incontigfilename, distalPairsfile, outpathfilename,
86 outcontigfilename, options.pathPrefix, options.overlap)
89 def rnaPath(incontigfilename, distalPairsfile, outpathfilename,
90 outcontigfilename, pathPrefix="RNAPATH", overlap=30):
92 outpathfile = open(outpathfilename, "w")
94 outheader = "#settings: %s" % " ".join(sys.argv)
96 print >> outpathfile, outheader
98 contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename)
99 halfSize = calculateN50(origSize)
100 print "building the adjacency graph"
101 pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList)
103 print "found %d paths" % len(pathList)
107 outcontigfile = open(outcontigfilename, "w")
108 for path in pathList:
110 outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path)))
113 vertexNameList.append(nameList[vertex])
114 pathDescription = string.join(vertexNameList, ",")
116 print >> outpathfile, pathDescription
117 currentVertex = path[0]
119 assemblyList = currentVertex
120 sequence = contigDict[currentVertex]
121 for nextVertex in path[1:]:
122 if (currentVertex, nextVertex) in edgeSenseDict:
123 senseList = edgeSenseDict[currentVertex, nextVertex]
124 FR = senseList.count(("+", "-"))
125 RF = senseList.count(("-", "+"))
127 senseList = edgeSenseDict[nextVertex, currentVertex]
129 FR = senseList.count(("-", "+"))
130 RF = senseList.count(("+", "-"))
132 FF = senseList.count(("+", "+"))
133 RR = senseList.count(("-", "-"))
134 if currentSense == "-":
135 # we had flipped the upstream piece! Must flip again
143 if FR >= FF and FR >= RR and FR >= RF:
144 # we have FR - leave alone
147 assemblyList = ((assemblyList, "+"), (nextVertex, "+"))
148 seqleft = sequence[-20:]
149 seqright = contigDict[nextVertex][:overlap]
150 if seqleft in seqright:
151 pos = seqright.index(seqleft)
153 outstring = "stitching %d and %d using %d overlap" % (currentVertex, nextVertex, offset)
155 print >> outpathfile, outstring
156 sequence += contigDict[nextVertex][offset:]
158 sequence += "NN" + contigDict[nextVertex]
161 elif FF >= RR and FF >= RF:
162 # we have FF - flip seqright
165 assemblyList = ((assemblyList, "+"), (nextVertex, "-"))
166 seqleft = sequence[-20:]
167 seqright = complement(contigDict[nextVertex])[:overlap]
168 if seqleft in seqright:
169 pos = seqright.index(seqleft)
171 outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
173 print >> outpathfile, outstring
174 sequence += complement(contigDict[nextVertex])[offset:]
176 sequence += "NN" + complement(contigDict[nextVertex])
180 # we have RR - flip seqleft
183 assemblyList = ((assemblyList, "-"), (nextVertex, "+"))
184 seqleft = complement(sequence)[:20]
185 seqright = contigDict[nextVertex][:overlap]
186 if seqleft in seqright:
187 pos = seqright.index(seqleft)
189 outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
191 print >> outpathfile, outstring
192 sequence = complement(sequence) + contigDict[nextVertex][offset:]
194 sequence = complement(sequence) + "NN" + contigDict[nextVertex]
198 # we have RF - flip both
201 assemblyList = ((assemblyList, "-"), (nextVertex, "-"))
202 seqleft = complement(sequence)[-20:]
203 seqright = complement(contigDict[nextVertex])[:overlap]
204 if seqleft in seqright:
205 pos = seqright.index(seqleft)
207 outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
209 print >> outpathfile, outstring
210 sequence = complement(sequence) + complement(contigDict[nextVertex])[offset:]
212 sequence = complement(sequence) + "NN" + complement(contigDict[nextVertex])
216 outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList))
218 print >> outpathfile, outstring
219 currentVertex = nextVertex
221 outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence))
222 newSizeList.append(len(sequence))
224 for vertex in contigDict:
225 if vertex in visitedDict:
228 newSizeList.append(len(contigDict[vertex]))
229 outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex]))
231 calculateN50(newSizeList, referenceMean=halfSize)
234 def calculateN50(sizeList, referenceMean=None):
235 if referenceMean is None:
236 totalSize = sum(sizeList)
237 referenceMean = totalSize / 2
241 currentTotalLength = 0
242 for size in sizeList:
243 if currentTotalLength + size > referenceMean:
244 print "#contigs", len(sizeList)
248 currentTotalLength += size
255 def getContigsFromFile(contigFileName):
264 incontigfile = open(contigFileName)
266 print "Error opening contig file: %s" % contigFileName
267 return contigNum, nameList, contigDict, origSize
269 for line in incontigfile:
271 if currentChrom !="":
272 nameList.append(currentChrom)
273 contigDict[contigNum] = seq
274 origSize.append(len(seq))
277 currentChrom = line.strip().split()[0][1:]
284 return contigNum, nameList, contigDict, origSize
287 def getPath(contigNum, distalPairsfile, nameList):
288 edgeMatrix = EdgeMatrix(contigNum)
290 print len(edgeMatrix.edgeArray)
292 print len(edgeMatrix.edgeArray[50])
296 print "processing distal pairs"
297 verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList)
299 willVisitList = verticesWithEdges.keys()
301 print "visiting %d vertices" % len(willVisitList)
303 print "cleaning up graph of edges with weight 1"
304 verticesToDelete = []
305 for rindex in willVisitList:
306 if rindex not in notSoloDict:
307 cindex = vertexEdges[rindex][0]
308 edgeMatrix.edgeArray[rindex][cindex] = 0
309 edgeMatrix.edgeArray[cindex][rindex] = 0
310 verticesToDelete.append(rindex)
312 for vertex in verticesToDelete:
313 willVisitList.remove(vertex)
315 print "%d 1-edges zeroed out" % len(verticesToDelete)
318 print "visiting %d vertices" % len(willVisitList)
321 print "picking top 2 edges per vertex - zero out others"
322 for rindex in willVisitList:
323 vertices = vertexEdges[rindex]
325 for avertex in vertices:
326 if avertex in willVisitList:
327 rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex))
332 zeroedEdge += len(rEdges[2:])
333 for (weight, cindex) in rEdges[2:]:
334 edgeMatrix.edgeArray[rindex][cindex] = 0
335 edgeMatrix.edgeArray[cindex][rindex] = 0
336 elif len(rEdges) == 1:
337 if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1:
338 leafList.append(rindex)
340 print "zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge
341 pathList, visitedDict = traverseGraph(leafList, edgeMatrix)
343 return pathList, edgeSenseDict, visitedDict
346 def traverseGraph(leafList, edgeMatrix):
350 print "traveling through the graph"
351 for rindex in leafList:
352 if visitedDict.has_key(rindex):
355 path = edgeMatrix.visitLink(rindex)
358 visitedDict[vertex] = ""
361 pathList.append(path)
363 return pathList, visitedDict
366 def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList):
367 contigToRowLookup = {}
368 verticesWithEdges = {}
373 distalPairs = open(distalPairsfilename)
374 for line in distalPairs:
378 fields = line.strip().split()
379 contA = "chr%s" % fields[1]
381 contig1 = contigToRowLookup[contA]
384 contig1 = nameList.index(contA)
385 contigToRowLookup[contA] = contig1
387 print "problem with end1: ", line
392 contB = "chr%s" % fields[4]
394 contig2 = contigToRowLookup[contB]
397 contig2 = nameList.index(contB)
398 contigToRowLookup[contB] = contig2
400 print "problem with end2: ", line
405 edgeMatrix.edgeArray[contig1][contig2] += 1
406 edgeMatrix.edgeArray[contig2][contig1] += 1
407 verticesWithEdges[contig1] = ""
408 verticesWithEdges[contig2] = ""
409 if (contig1, contig2) in edgeSenseDict:
410 edgeSenseDict[contig1, contig2].append((sense1, sense2))
411 elif (contig2, contig1) in edgeSenseDict:
412 edgeSenseDict[contig2, contig1].append((sense2, sense1))
414 edgeSenseDict[contig1, contig2] = [(sense1, sense2)]
416 if contig1 in vertexEdges:
417 if contig2 not in vertexEdges[contig1]:
418 vertexEdges[contig1].append(contig2)
420 vertexEdges[contig1] = [contig2]
422 if contig2 in vertexEdges:
423 if contig1 not in vertexEdges[contig2]:
424 vertexEdges[contig2].append(contig1)
426 vertexEdges[contig2] = [contig1]
428 if edgeMatrix.edgeArray[contig1][contig2] > 1:
429 notSoloDict[contig1] = ""
430 notSoloDict[contig2] = ""
434 return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict
438 """ Describes a sparse matrix to hold edge data.
441 def __init__(self, dimension):
442 self.dimension = dimension
443 self.edgeArray = zeros((self.dimension, self.dimension), int16)
446 def visitLink(self, fromVertex, ignoreList=[]):
447 returnPath = [fromVertex]
449 for toindex in xrange(self.dimension):
450 if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList:
451 toVertex.append(toindex)
453 for vertex in toVertex:
454 if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]:
455 self.edgeArray[fromVertex][vertex] = 0
456 self.edgeArray[vertex][fromVertex] = 0
457 return returnPath + [vertex]
459 self.edgeArray[fromVertex][vertex] = 0
461 return returnPath + self.visitLink(vertex, returnPath)
463 return returnPath + [vertex]
467 if __name__ == "__main__":