erange version 4.0a dev release
[erange.git] / rnafarPairs.py
1 #
2 #  RNAFARpairs.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 11/2/08.
6 #
7 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
8            looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
9 """
10 try:
11     import psyco
12     psyco.full()
13 except:
14     pass
15
16 import sys
17 import time
18 import optparse
19 import ReadDataset
20 from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
21
22
23 def main(argv=None):
24     if not argv:
25         argv = sys.argv
26
27     print "rnafarPairs: version 3.7"
28     usage = "usage: python %prog genome goodfile rdsfile outfile [options]"
29
30     parser = makeParser(usage)
31     (options, args) = parser.parse_args(argv[1:])
32
33     if len(args) < 4:
34         print usage
35         sys.exit(1)
36
37     genome = args[0]
38     goodfilename = args[1]
39     rdsfile = args[2]
40     outfilename = args[3]
41
42     rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
43
44
45 def makeParser(usage=""):
46     parser = optparse.OptionParser(usage=usage)
47     parser.add_option("--verbose", action="store_true", dest="doVerbose",
48                       help="verbose output")
49     parser.add_option("--cache", action="store_true", dest="doCache",
50                       help="use cache")
51     parser.add_option("--maxDist", type="int", dest="maxDist",
52                       help="maximum distance")
53
54     configParser = getConfigParser()
55     section = "rnafarPairs"
56     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
57     doCache = getConfigBoolOption(configParser, section, "doCache", False)
58     maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
59
60     parser.set_defaults(doVerbose=doVerbose, doCache=doCache, maxDist=maxDist)
61
62     return parser
63
64
65 def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
66     goodDict = {}
67     goodfile = open(goodfilename)
68     for line in goodfile:
69         fields = line.split()
70         goodDict[fields[0]] = line
71
72     goodfile.close()
73     RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
74     chromosomeList = RDS.getChromosomes()
75     if doVerbose:
76         print time.ctime()
77
78     distinct = 0
79     total = 0
80     outfile = open(outfilename,"w")
81     geneinfoDict = getGeneInfoDict(genome)
82     geneannotDict = getGeneAnnotDict(genome)
83     assigned = {}
84     farConnected = {}
85     for chromosome in chromosomeList:
86         if doNotProcessChromosome(chromosome):
87             continue
88
89         print chromosome
90         uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
91         if doVerbose:
92             print len(uniqDict), time.ctime()    
93
94         for readID in uniqDict:
95             readList = uniqDict[readID]
96             if len(readList) == 2:
97                 total += 1
98                 if processReads(readList[:2], maxDist):
99                     flags = (readList[0]["flag"], readList[1]["flag"])
100                     processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
101                     total += processed
102                     distinct += distinctPairs
103
104     entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
105     distinct += entriesWritten
106     outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
107     outfile.close()
108     print "distinct: %d\ttotal: %d" % (distinct, total)
109     print time.ctime()
110
111
112 def doNotProcessChromosome(chromosome):
113     return chromosome == "chrM"
114
115
116 def processReads(reads, maxDist):
117     process = False
118     start1 = reads[0]["start"]
119     start2 = reads[1]["start"]
120     dist = abs(start1 - start2)
121     flag1 = reads[0]["flag"]
122     flag2 = reads[1]["flag"]
123
124     if flag1 != flag2 and flag1 != "NM" and flag2 != "NM" and dist < maxDist:
125         process = True
126
127     return process
128
129
130 def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
131     flag1, flag2 = flags
132     total = 0
133     distinct = 0
134     read1IsGood = flag1 in goodDict
135     read2IsGood = flag2 in goodDict
136
137     if read1IsGood and read2IsGood:
138         if flag1 < flag2:
139             geneID = flag1
140             farFlag = flag2
141         else:
142             geneID = flag2
143             farFlag = flag1
144
145         try:
146             farConnected[geneID].append(farFlag)
147         except KeyError:
148             farConnected[geneID] = [farFlag]
149     elif read1IsGood or read2IsGood:
150         total += 1
151         if read2IsGood:
152             farFlag = flag2
153             geneID = flag1
154         else:
155             farFlag = flag1
156             geneID = flag2
157
158         try:
159             if genome == "dmelanogaster":
160                 symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
161             else:
162                 symbol = geneInfoDict[geneID][0][0]
163         except (KeyError, IndexError):
164             try:
165                 symbol = geneAnnotDict[(genome, geneID)][0]
166             except (KeyError, IndexError):
167                 symbol = "LOC%s" % geneID
168
169         symbol = symbol.strip()
170         symbol = symbol.replace(" ","|")
171         symbol = symbol.replace("\t","|")
172
173         if farFlag not in assigned:
174             assigned[farFlag] = (symbol, geneID)
175             print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
176             outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
177             distinct += 1
178
179     return total, distinct
180
181
182 def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
183     total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
184     writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
185
186     return written
187
188
189 def writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile):
190     total = 0
191     written = 0
192     for farFlag in farConnected:
193         geneID = ""
194         symbol = ""
195         idList = [farFlag] + farConnected[farFlag]
196         for ID in idList:
197             if ID in assigned:
198                 (symbol, geneID) = assigned[ID]
199
200         if geneID == "":
201             total += 1
202             symbol = "FAR%d" % total
203             geneID = -1 * total
204
205         for ID in idList:
206             if ID not in assigned:
207                 print "%s %s %s" % (symbol, geneID, goodDict[ID].strip())
208                 outfile.write("%s %s %s" % (symbol, geneID, goodDict[ID]))
209                 written += 1
210                 assigned[ID] = (symbol, geneID)
211
212     return total, written
213
214
215 def writeUnassignedGoodReadsToFile(farIndex, goodDict, assigned, outfile):
216     for farFlag in goodDict:
217         if farFlag not in assigned:
218             farIndex += 1
219             line = "FAR%d %d %s" % (farIndex, -1 * farIndex, goodDict[farFlag])
220             print line.strip()
221             outfile.write(line)
222
223
224 if __name__ == "__main__":
225     main(sys.argv)