snapshot of 4.0a development. initial git repo commit
[erange.git] / rnapath / .svn / tmp / RNAPATH.py.tmp
1 import sys
2 import optparse
3 import string
4 from numpy import zeros, int16
5
6 versionString = "%s: version 0.95" % sys.argv[0]
7 print versionString
8
9
10 def compNT(nt):
11     """ returns the complementary basepair to base nt
12     """
13     compDict = { "A": "T",
14                  "T": "A",
15                  "G": "C",
16                  "C": "G",
17                  "S": "S",
18                  "W": "W",
19                  "R": "Y",
20                  "Y": "R",
21                  "M": "K",
22                  "K": "M",
23                  "H": "D",
24                  "D": "H",
25                  "B": "V",
26                  "V": "B",
27                  "N": "N",
28                  "a": "t",
29                  "t": "a",
30                  "g": "c",
31                  "c": "g",
32                  "n": "n",
33                  "z": "z"
34     }
35
36     return compDict.get(nt, "N")
37
38
39 def complement(sequence, length=-1):
40     """ returns the complement of the sequence.
41     """
42     newSeq = ""
43     
44     seqLength = len(sequence)
45     
46     if length == seqLength or length < 0:
47         seqList = list(sequence)
48         seqList.reverse()
49         return "".join(map(compNT, seqList))
50
51     #TODO: this seems to want to deal with case where length is more than
52     # sequence length except that a negative index on a sequence is fine
53     # index will only be overrun if length is negative but that case is
54     # handled above
55     for index in range(seqLength - 1,seqLength - length - 1, -1):
56         try:
57             newSeq += compNT(sequence[index])
58         except:
59             newSeq += "N"
60
61     return newSeq
62
63
64 def main(argv=None):
65     if not argv:
66         argv = sys.argv
67
68     usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]"
69
70     parser = optparse.OptionParser(usage=usage)
71     parser.add_option("--prefix", dest="pathPrefix")
72     parser.add_option("--overlap", type="int", dest="overlap")
73     parser.set_defaults(pathPrefix="RNAPATH", overlap=30)
74     (options, args) = parser.parse_args(argv[1:])
75
76     if len(args) < 4:
77         print usage
78         sys.exit(0)
79
80     incontigfilename = args[0]
81     distalPairsfile = args[1]
82     outpathfilename = args[2]
83     outcontigfilename = args[3]
84
85     rnaPath(incontigfilename, distalPairsfile, outpathfilename,
86             outcontigfilename, options.pathPrefix, options.overlap)
87
88
89 def rnaPath(incontigfilename, distalPairsfile, outpathfilename,
90             outcontigfilename, pathPrefix="RNAPATH", overlap=30):
91
92     outpathfile = open(outpathfilename, "w")
93     
94     outheader = "#settings: %s" % " ".join(sys.argv)
95     print outheader
96     print >> outpathfile, outheader
97    
98     contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename)
99     halfSize = calculateN50(origSize)
100     print "building the adjacency graph"
101     pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList)
102
103     print "found %d paths" % len(pathList)            
104
105     newSizeList = []
106     pathID = 0
107     outcontigfile = open(outcontigfilename, "w")
108     for path in pathList:
109         pathID += 1
110         outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path))) 
111         vertexNameList = []
112         for vertex in path:
113             vertexNameList.append(nameList[vertex])
114             pathDescription = string.join(vertexNameList, ",")
115
116         print >> outpathfile, pathDescription
117         currentVertex = path[0]
118         currentSense = "+"
119         assemblyList = currentVertex
120         sequence = contigDict[currentVertex]
121         for nextVertex in path[1:]:
122             if (currentVertex, nextVertex) in edgeSenseDict:
123                 senseList = edgeSenseDict[currentVertex, nextVertex]
124                 FR = senseList.count(("+", "-"))
125                 RF = senseList.count(("-", "+"))
126             else:
127                 senseList = edgeSenseDict[nextVertex, currentVertex]
128                 # flip
129                 FR = senseList.count(("-", "+"))
130                 RF = senseList.count(("+", "-"))
131
132             FF = senseList.count(("+", "+"))
133             RR = senseList.count(("-", "-"))
134             if currentSense == "-":
135                 # we had flipped the upstream piece! Must flip again
136                 temp1 = FR
137                 temp2 = FF
138                 FR = RR
139                 FF = RF
140                 RR = temp1
141                 RF = temp2
142
143             if FR >= FF and FR >= RR and FR >= RF:
144                 # we have FR - leave alone
145                 sense1 = "+"
146                 sense2 = "-"
147                 assemblyList = ((assemblyList, "+"), (nextVertex, "+"))
148                 seqleft = sequence[-20:]
149                 seqright = contigDict[nextVertex][:overlap]
150                 if seqleft in seqright:
151                     pos = seqright.index(seqleft)
152                     offset = pos + 20
153                     outstring = "stitching %d and %d using %d overlap" % (currentVertex, nextVertex, offset)
154                     print outstring
155                     print >> outpathfile, outstring
156                     sequence += contigDict[nextVertex][offset:]
157                 else:
158                     sequence += "NN" + contigDict[nextVertex]
159
160                 currentSense = "+"
161             elif FF >= RR and FF >= RF:
162                 # we have FF - flip seqright
163                 sense1 = "+"
164                 sense2 = "+"
165                 assemblyList = ((assemblyList, "+"), (nextVertex, "-"))
166                 seqleft = sequence[-20:]
167                 seqright = complement(contigDict[nextVertex])[:overlap]
168                 if seqleft in seqright:
169                     pos = seqright.index(seqleft)
170                     offset = pos + 20
171                     outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
172                     print outstring
173                     print >> outpathfile, outstring
174                     sequence += complement(contigDict[nextVertex])[offset:]
175                 else:
176                     sequence += "NN" + complement(contigDict[nextVertex])
177
178                 currentSense = "-"
179             elif RR >= RF:
180                 # we have RR - flip seqleft
181                 sense1 = "-"
182                 sense2 = "-"
183                 assemblyList = ((assemblyList, "-"), (nextVertex, "+"))
184                 seqleft = complement(sequence)[:20]
185                 seqright = contigDict[nextVertex][:overlap]
186                 if seqleft in seqright:
187                     pos = seqright.index(seqleft)
188                     offset = pos + 20
189                     outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
190                     print outstring
191                     print >> outpathfile, outstring
192                     sequence = complement(sequence) + contigDict[nextVertex][offset:]
193                 else:
194                     sequence = complement(sequence) + "NN" + contigDict[nextVertex]
195
196                 currentSense = "+"
197             else:
198                 # we have RF - flip both
199                 sense1 = "-"
200                 sense2 = "+"
201                 assemblyList = ((assemblyList, "-"), (nextVertex, "-"))
202                 seqleft = complement(sequence)[-20:]
203                 seqright = complement(contigDict[nextVertex])[:overlap]
204                 if seqleft in seqright:
205                     pos = seqright.index(seqleft)
206                     offset = pos + 20
207                     outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
208                     print outstring
209                     print >> outpathfile, outstring
210                     sequence = complement(sequence) + complement(contigDict[nextVertex])[offset:]
211                 else:
212                     sequence = complement(sequence) + "NN" + complement(contigDict[nextVertex])
213
214                 currentSense = "-"
215
216             outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList))
217             print outstring
218             print >> outpathfile, outstring
219             currentVertex = nextVertex
220
221         outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence))
222         newSizeList.append(len(sequence))
223
224     for vertex in contigDict:
225         if vertex in visitedDict:
226             continue
227
228         newSizeList.append(len(contigDict[vertex]))
229         outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex]))
230
231     calculateN50(newSizeList, referenceMean=halfSize)
232
233
234 def calculateN50(sizeList, referenceMean=None):
235     if referenceMean is None:
236         totalSize = sum(sizeList)
237         referenceMean = totalSize / 2
238
239     sizeList.sort()
240     sizeList.reverse()
241     currentTotalLength = 0
242     for size in sizeList:
243         if currentTotalLength + size > referenceMean:
244             print "#contigs", len(sizeList)
245             print "N50", size
246             break
247
248         currentTotalLength += size
249
250     print sizeList[:50]
251
252     return referenceMean
253
254
255 def getContigsFromFile(contigFileName):
256     nameList = []
257     origSize = []
258     contigNum = 0
259     currentChrom = ""
260     seq = ""
261     contigDict = {}
262
263     try:
264         incontigfile = open(contigFileName)
265     except IOError:
266         print "Error opening contig file: %s" % contigFileName
267         return contigNum, nameList, contigDict, origSize
268
269     for line in incontigfile:
270         if ">" in line:
271             if currentChrom !="":
272                 nameList.append(currentChrom)
273                 contigDict[contigNum] = seq
274                 origSize.append(len(seq))
275                 contigNum += 1
276
277             currentChrom = line.strip().split()[0][1:]
278             seq = ""
279         else:
280             seq += line.strip()
281
282     incontigfile.close()
283
284     return contigNum, nameList, contigDict, origSize
285
286
287 def getPath(contigNum, distalPairsfile, nameList):
288     edgeMatrix = EdgeMatrix(contigNum)
289
290     print len(edgeMatrix.edgeArray)
291     try:
292         print len(edgeMatrix.edgeArray[50])
293     except IndexError:
294         pass
295
296     print "processing distal pairs"
297     verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList)
298
299     willVisitList = verticesWithEdges.keys()
300     willVisitList.sort()
301     print "visiting %d vertices" % len(willVisitList)
302
303     print "cleaning up graph of edges with weight 1"
304     verticesToDelete = []
305     for rindex in willVisitList:
306         if rindex not in notSoloDict:
307             cindex = vertexEdges[rindex][0]
308             edgeMatrix.edgeArray[rindex][cindex] = 0
309             edgeMatrix.edgeArray[cindex][rindex] = 0
310             verticesToDelete.append(rindex)
311
312     for vertex in verticesToDelete:
313         willVisitList.remove(vertex)
314
315     print "%d 1-edges zeroed out" % len(verticesToDelete)
316
317     zeroedEdge = 0
318     print "visiting %d vertices" % len(willVisitList)
319
320     leafList = []
321     print "picking top 2 edges per vertex - zero out others"
322     for rindex in willVisitList:
323         vertices = vertexEdges[rindex]
324         rEdges = []
325         for avertex in vertices:
326             if avertex in willVisitList:
327                 rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex))
328
329         if len(rEdges) > 2:
330             rEdges.sort()
331             rEdges.reverse()
332             zeroedEdge += len(rEdges[2:])
333             for (weight, cindex) in rEdges[2:]:
334                 edgeMatrix.edgeArray[rindex][cindex] = 0
335                 edgeMatrix.edgeArray[cindex][rindex] = 0
336         elif len(rEdges) == 1:
337             if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1:
338                 leafList.append(rindex)
339
340     print "zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge
341     pathList, visitedDict = traverseGraph(leafList, edgeMatrix)
342
343     return pathList, edgeSenseDict, visitedDict
344
345
346 def traverseGraph(leafList, edgeMatrix):
347     pathList = []
348     visitedDict = {}
349     leafList.sort()
350     print "traveling through the graph"
351     for rindex in leafList:
352         if visitedDict.has_key(rindex):
353             pass
354         else:
355             path = edgeMatrix.visitLink(rindex)
356             if len(path) > 1:
357                 for vertex in path:
358                     visitedDict[vertex] = ""
359
360                 print path
361                 pathList.append(path)
362
363     return pathList, visitedDict
364
365
366 def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList):
367     contigToRowLookup = {}
368     verticesWithEdges = {}
369     vertexEdges = {}
370     notSoloDict = {}
371     edgeSenseDict = {}
372
373     distalPairs = open(distalPairsfilename)
374     for line in distalPairs:
375         if line[0] == "#":
376             continue
377
378         fields = line.strip().split()
379         contA = "chr%s" % fields[1]
380         try:
381             contig1 = contigToRowLookup[contA]
382         except KeyError:
383             try:
384                 contig1 = nameList.index(contA)
385                 contigToRowLookup[contA] = contig1
386             except ValueError:
387                 print "problem with end1: ", line
388                 continue
389
390         sense1 = fields[3]
391
392         contB = "chr%s" % fields[4]
393         try:
394             contig2 = contigToRowLookup[contB]
395         except KeyError:
396             try:
397                 contig2 = nameList.index(contB)
398                 contigToRowLookup[contB] = contig2
399             except ValueError:
400                 print "problem with end2: ", line
401                 continue
402
403         sense2 = fields[6]
404
405         edgeMatrix.edgeArray[contig1][contig2] += 1
406         edgeMatrix.edgeArray[contig2][contig1] += 1
407         verticesWithEdges[contig1] = ""
408         verticesWithEdges[contig2] = ""
409         if (contig1, contig2) in edgeSenseDict:
410             edgeSenseDict[contig1, contig2].append((sense1, sense2))
411         elif (contig2, contig1) in edgeSenseDict:
412             edgeSenseDict[contig2, contig1].append((sense2, sense1))
413         else:
414             edgeSenseDict[contig1, contig2] = [(sense1, sense2)]
415
416         if contig1 in vertexEdges:
417             if contig2 not in vertexEdges[contig1]:
418                 vertexEdges[contig1].append(contig2)
419         else:
420             vertexEdges[contig1] = [contig2]
421
422         if contig2 in vertexEdges:
423             if contig1 not in vertexEdges[contig2]:
424                 vertexEdges[contig2].append(contig1)
425         else:
426             vertexEdges[contig2] = [contig1]
427
428         if edgeMatrix.edgeArray[contig1][contig2] > 1:
429             notSoloDict[contig1] = ""
430             notSoloDict[contig2] = ""
431
432     distalPairs.close()
433     
434     return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict
435
436
437 class EdgeMatrix:
438     """ Describes a sparse matrix to hold edge data.
439     """
440
441     def __init__(self, dimension):
442         self.dimension = dimension
443         self.edgeArray = zeros((self.dimension, self.dimension), int16)
444
445
446     def visitLink(self, fromVertex, ignoreList=[]):
447         returnPath = [fromVertex]
448         toVertex = []
449         for toindex in xrange(self.dimension):
450             if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList:
451                 toVertex.append(toindex)
452
453         for vertex in toVertex:
454             if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]:
455                 self.edgeArray[fromVertex][vertex] = 0
456                 self.edgeArray[vertex][fromVertex] = 0
457                 return returnPath + [vertex]
458             else:
459                 self.edgeArray[fromVertex][vertex] = 0
460                 try:
461                     return returnPath + self.visitLink(vertex, returnPath)
462                 except IOError:
463                     return returnPath + [vertex]
464         return []
465
466
467 if __name__ == "__main__":
468     main(sys.argv)