5 # Created by Ali Mortazavi on 11/2/08.
7 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
8 looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
20 from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
27 print "rnafarPairs: version 3.7"
28 usage = "usage: python %prog genome goodfile rdsfile outfile [options]"
30 parser = makeParser(usage)
31 (options, args) = parser.parse_args(argv[1:])
38 goodfilename = args[1]
42 rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
45 def makeParser(usage=""):
46 parser = optparse.OptionParser(usage=usage)
47 parser.add_option("--verbose", action="store_true", dest="doVerbose",
48 help="verbose output")
49 parser.add_option("--cache", action="store_true", dest="doCache",
51 parser.add_option("--maxDist", type="int", dest="maxDist",
52 help="maximum distance")
54 configParser = getConfigParser()
55 section = "rnafarPairs"
56 doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
57 doCache = getConfigBoolOption(configParser, section, "doCache", False)
58 maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
60 parser.set_defaults(doVerbose=doVerbose, doCache=doCache, maxDist=maxDist)
65 def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
67 goodfile = open(goodfilename)
70 goodDict[fields[0]] = line
73 RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
74 chromosomeList = RDS.getChromosomes()
80 outfile = open(outfilename,"w")
81 geneinfoDict = getGeneInfoDict(genome)
82 geneannotDict = getGeneAnnotDict(genome)
85 for chromosome in chromosomeList:
86 if doNotProcessChromosome(chromosome):
90 uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
92 print len(uniqDict), time.ctime()
94 for readID in uniqDict:
95 readList = uniqDict[readID]
96 if len(readList) == 2:
98 if processReads(readList[:2], maxDist):
99 flags = (readList[0]["flag"], readList[1]["flag"])
100 processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
102 distinct += distinctPairs
104 entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
105 distinct += entriesWritten
106 outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
108 print "distinct: %d\ttotal: %d" % (distinct, total)
112 def doNotProcessChromosome(chromosome):
113 return chromosome == "chrM"
116 def processReads(reads, maxDist):
118 start1 = reads[0]["start"]
119 start2 = reads[1]["start"]
120 dist = abs(start1 - start2)
121 flag1 = reads[0]["flag"]
122 flag2 = reads[1]["flag"]
124 if flag1 != flag2 and flag1 != "NM" and flag2 != "NM" and dist < maxDist:
130 def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
134 read1IsGood = flag1 in goodDict
135 read2IsGood = flag2 in goodDict
137 if read1IsGood and read2IsGood:
146 farConnected[geneID].append(farFlag)
148 farConnected[geneID] = [farFlag]
149 elif read1IsGood or read2IsGood:
159 if genome == "dmelanogaster":
160 symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
162 symbol = geneInfoDict[geneID][0][0]
163 except (KeyError, IndexError):
165 symbol = geneAnnotDict[(genome, geneID)][0]
166 except (KeyError, IndexError):
167 symbol = "LOC%s" % geneID
169 symbol = symbol.strip()
170 symbol = symbol.replace(" ","|")
171 symbol = symbol.replace("\t","|")
173 if farFlag not in assigned:
174 assigned[farFlag] = (symbol, geneID)
175 print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
176 outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
179 return total, distinct
182 def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
183 total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
184 writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
189 def writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile):
192 for farFlag in farConnected:
195 idList = [farFlag] + farConnected[farFlag]
198 (symbol, geneID) = assigned[ID]
202 symbol = "FAR%d" % total
206 if ID not in assigned:
207 print "%s %s %s" % (symbol, geneID, goodDict[ID].strip())
208 outfile.write("%s %s %s" % (symbol, geneID, goodDict[ID]))
210 assigned[ID] = (symbol, geneID)
212 return total, written
215 def writeUnassignedGoodReadsToFile(farIndex, goodDict, assigned, outfile):
216 for farFlag in goodDict:
217 if farFlag not in assigned:
219 line = "FAR%d %d %s" % (farIndex, -1 * farIndex, goodDict[farFlag])
224 if __name__ == "__main__":