snapshot of 4.0a development. initial git repo commit
[erange.git] / rnafarPairs.py
1 #
2 #  RNAFARpairs.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 11/2/08.
6 #
7 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
8            looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
9 """
10 try:
11     import psyco
12     psyco.full()
13 except:
14     pass
15
16 import sys, time, optparse
17 from commoncode import readDataset
18 from cistematic.core.geneinfo import geneinfoDB
19 from cistematic.genomes import Genome
20
21 def main(argv=None):
22     if not argv:
23         argv = sys.argv
24
25     print "%prog: version 3.6"
26     usage = "usage: python %prog genome goodfile rdsfile outfile [options]"
27
28     parser = optparse.OptionParser(usage=usage)
29     parser.add_option("--verbose", action="store_true", dest="doVerbose",
30                       help="verbose output")
31     parser.add_option("--cache", action="store_true", dest="doCache",
32                       help="use cache")
33     parser.add_option("--maxDist", type="int", dest="maxDist",
34                       help="maximum distance")
35     parser.set_defaults(doVerbose=False, doCache=False, maxDist=500000)
36     (options, args) = parser.parse_args(argv[1:])
37
38     if len(args) < 4:
39         print usage
40         sys.exit(1)
41
42     genome = args[0]
43     goodfilename = args[1]
44     rdsfile = args[2]
45     outfilename = args[3]
46
47     rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
48     
49
50 def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
51     goodDict = {}
52     goodfile = open(goodfilename)
53     for line in goodfile:
54         fields = line.split()
55         goodDict[fields[0]] = line
56
57     RDS = readDataset(rdsfile, verbose = True, cache=doCache)
58     rdsChromList = RDS.getChromosomes()
59
60     if doVerbose:
61         print time.ctime()
62
63     distinct = 0
64     total = 0
65     outfile = open(outfilename,"w")
66
67     idb = geneinfoDB()
68     if genome == "dmelanogaster":
69         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
70     else:
71         geneinfoDict = idb.getallGeneInfo(genome)
72
73     hg = Genome(genome)
74     geneannotDict = hg.allAnnotInfo()
75
76     assigned = {}
77     farConnected = {}
78     for achrom in rdsChromList:
79         if achrom == "chrM":
80             continue
81
82         print achrom
83         uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
84         if doVerbose:
85             print len(uniqDict), time.ctime()    
86
87         for readID in uniqDict:
88             readList = uniqDict[readID]
89             if len(readList) == 2:
90                 total += 1
91                 (start1, flag1, pair1) = readList[0]
92                 (start2, flag2, pair2) = readList[1]
93
94                 if flag1 != flag2:
95                     dist = abs(start1 - start2)
96                     if flag1 != "NM" and flag2 != "NM" and dist < maxDist:
97                         geneID = ""
98                         saw1 = False
99                         saw2 = False
100                         if flag1 in goodDict:
101                             geneID = flag2
102                             farFlag = flag1
103                             saw1 = True
104
105                         if flag2 in goodDict:
106                             geneID = flag1
107                             farFlag = flag2
108                             saw2 = True
109
110                         if saw1 or saw2:
111                             total += 1
112
113                         if saw1 and saw2:
114                             if flag1 < flag2:
115                                 geneID = flag1
116                                 farFlag = flag2
117                             else:
118                                 geneID = flag2
119                                 farFlag = flag1
120
121                             if geneID in farConnected:
122                                 farConnected[geneID].append(farFlag)
123                             else:
124                                 farConnected[geneID] = [farFlag]
125                         elif geneID != "":
126                             try:
127                                 if genome == "dmelanogaster":
128                                     symbol = geneinfoDict["Dmel_" + geneID][0][0]
129                                 else:
130                                     symbol = geneinfoDict[geneID][0][0]
131                             except:
132                                 try:
133                                     symbol = geneannotDict[(genome, geneID)][0]
134                                 except:
135                                     symbol = "LOC" + geneID
136
137                             symbol = symbol.strip()
138                             symbol = symbol.replace(" ","|")
139                             symbol = symbol.replace("\t","|")
140                             if farFlag not in assigned:
141                                 assigned[farFlag] = (symbol, geneID)
142                                 print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
143                                 outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
144                                 distinct += 1
145
146     farIndex = 0
147     for farFlag in farConnected:
148         geneID = ""
149         symbol = ""
150         idList = [farFlag] + farConnected[farFlag]
151         for oneID in idList:
152             if oneID in assigned:
153                 (symbol, geneID) = assigned[oneID]
154
155         if geneID == "":
156             farIndex += 1
157             symbol = "FAR%d" % farIndex
158             geneID = -1 * farIndex
159
160         for oneID in idList:
161             if oneID not in assigned:
162                 print "%s %s %s" % (symbol, geneID, goodDict[oneID].strip())
163                 outfile.write("%s %s %s" % (symbol, geneID, goodDict[oneID]))
164                 distinct += 1
165                 assigned[oneID] = (symbol, geneID)
166
167     for farFlag in goodDict:
168         if farFlag not in assigned:
169             farIndex += 1
170             line = "FAR%d %d %s" % (farIndex, -1 * farIndex, goodDict[farFlag])
171             print line.strip()
172             outfile.write(line)
173
174     outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
175     outfile.close()
176     print "distinct: %d\ttotal: %d" % (distinct, total)
177     print time.ctime()
178
179 if __name__ == "__main__":
180     main(sys.argv)