first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / makeGraphs.py
import os
import subprocess
import sys
3
4
def _shortenNodeName(name):
    """Return the second "_"-separated field of name, or name if it has none."""
    fields = name.split("_")
    if len(fields) > 1:
        return fields[1]

    return name


def getEdges(nodeList, shorten=False):
    """Build an undirected adjacency dictionary from tab-delimited edge lines.

    Every entry of nodeList is expected to look like
    "node1<TAB>node2<TAB>count".  Entries that do not have exactly three
    fields, or whose count is not an integer (for example a header line),
    are skipped.

    nodeList -- iterable of strings, e.g. an open file or a list of lines
    shorten  -- when True, a node name containing "_" is replaced by the
                text between its first and second underscore

    Returns a dict mapping each node name to a list of (neighbour, count)
    tuples.  Each edge is stored under both of its endpoints, and an
    identical (neighbour, count) pair is stored only once per node.
    """
    edgeDict = {}

    for nodeEntry in nodeList:
        try:
            (node1, node2, count) = nodeEntry.strip().split("\t")
            count = int(count)
        except ValueError:
            # wrong number of fields, or a count that is not an integer
            continue

        if shorten:
            node1 = _shortenNodeName(node1)
            node2 = _shortenNodeName(node2)

        # record the edge in both directions so either end can be looked up
        for (source, target) in ((node1, node2), (node2, node1)):
            neighbours = edgeDict.setdefault(source, [])
            detail = (target, count)
            if detail not in neighbours:
                neighbours.append(detail)

    return edgeDict
40
41
def getEdgesFromFile(inFileName, shorten=False):
    """Read the edge file inFileName and return its adjacency dictionary.

    inFileName -- path of a text file that is handed line by line to getEdges
    shorten    -- passed through to getEdges unchanged

    Returns whatever getEdges returns for the file's lines.  The file is
    closed even if parsing raises.
    """
    with open(inFileName) as infile:
        return getEdges(infile, shorten)
49
50
def getOutputLine(currentNode, node, nodeCount):
    """Format one undirected edge as a line of Graphviz source.

    currentNode, node -- names of the two endpoints (written in quotes)
    nodeCount         -- edge weight; used as the label, and additionally
                         as the pen width when it is greater than 2

    Returns the edge statement as a string ending in a newline.
    """
    heavyEdge = nodeCount > 2
    if heavyEdge:
        return '\t"%s" -- "%s" [ label = "%d", penwidth=%d, color="red", constraint=false] ; \n' % (currentNode, node, nodeCount, nodeCount)

    return '\t"%s" -- "%s" [ label = "%d", color="red", constraint=false] ; \n' % (currentNode, node, nodeCount)
58
59
# positional arguments: the edge file to read, then the prefix that every
# output file name starts with
(infilename, outprefix) = (sys.argv[1], sys.argv[2])

# optional flag, handed to getEdgesFromFile
shorten = "-shorten" in sys.argv

edgeDict = getEdgesFromFile(infilename, shorten)
nodeList = edgeDict.keys()

# state shared between visitNodes() and the main loop below
seenNodeDict = {}
seenEdgeDict = {}
treeList = []

# per-graph accumulators; the main loop rebinds them for every new graph
currentNodeList = []
currentEdgeList = []
localCount = []

# one summary row per graph is appended to this file by the main loop
outstat = open("%s.stats" % outprefix, "w")
outstat.write("#gID\tnodes\tedges\tweight\n")
79
def visitNodes(currentNode):
    """Collect every node and edge reachable from currentNode.

    Does a depth-first walk over edgeDict using an explicit stack rather
    than recursion, so long chains of nodes cannot hit the interpreter's
    recursion limit and leave a graph only partially traversed.

    Works entirely through module-level state:
      seenNodeDict    -- gains an entry for each node visited
      seenEdgeDict    -- gains an entry for each edge emitted, keyed by the
                         string form of the sorted node pair
      currentNodeList -- gains each newly reached node name
      currentEdgeList -- gains one getOutputLine() string per new edge
      localCount[0]   -- increased by the count of each new edge

    Returns None; does nothing if currentNode was already visited.
    """
    if currentNode in seenNodeDict:
        return

    seenNodeDict[currentNode] = []
    # each stack entry is (node, iterator over that node's remaining edges)
    pending = [(currentNode, iter(edgeDict[currentNode]))]
    while pending:
        (parent, neighbours) = pending[-1]
        for (node, nodeCount) in neighbours:
            nodePair = [node, parent]
            nodePair.sort()
            edgeKey = str(nodePair)
            if edgeKey in seenEdgeDict:
                continue

            if node not in currentNodeList:
                currentNodeList.append(node)

            currentEdgeList.append(getOutputLine(parent, node, nodeCount))
            seenEdgeDict[edgeKey] = 0
            localCount[0] += nodeCount
            if node not in seenNodeDict:
                # descend into the new node before finishing the parent's
                # remaining edges, matching the order of a recursive walk
                seenNodeDict[node] = []
                pending.append((node, iter(edgeDict[node])))
                break
        else:
            # every edge of this node has been handled
            pending.pop()
100
# Each node not reached so far seeds a new graph: visitNodes() gathers the
# nodes and edges connected to it, then the graph is written to its own
# Graphviz file and summarised in one row of the stats file.
print("getting trees")
for node in nodeList:
    if node in seenNodeDict:
        continue

    # visitNodes() reads and fills these module-level accumulators
    currentNodeList = [node]
    currentEdgeList = []
    localCount = [0]
    treeList.append(node)
    visitNodes(node)
    currentNodeList.sort()

    with open("%s.%s.gv" % (outprefix, node), "w") as outfile:
        outfile.write("graph g%s {\n" % node)
        # chain all nodes, in sorted order, inside one subgraph
        outfile.write('subgraph G0 {\n\t"%s" ' % currentNodeList[0])
        for anode in currentNodeList[1:]:
            outfile.write('-- "%s" ' % anode)

        outfile.write(" [ weight = 100 ] ;\n\tordering = out ;\n}\n")
        for line in currentEdgeList:
            outfile.write(line)

        outfile.write("}\n")

    outstat.write("%s\t%d\t%d\t%d\n" % (node, len(currentNodeList), len(currentEdgeList), localCount[0]))

outstat.close()

print("generating pngs")
for node in treeList:
    gvFileName = "%s.%s.gv" % (outprefix, node)
    pngFileName = "%s.%s.png" % (outprefix, node)
    try:
        # argument list instead of a shell string: node names and the prefix
        # come from user data and may contain spaces or shell metacharacters
        subprocess.call(["dot", "-Tpng", gvFileName, "-o", pngFileName])
    except OSError as err:
        # dot could not be started at all; further attempts would fail too
        sys.stderr.write("could not run dot: %s\n" % err)
        break