5 # Created by Ali Mortazavi on 11/2/08.
7 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
8 looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
16 import sys, time, optparse
17 from commoncode import readDataset
18 from cistematic.core.geneinfo import geneinfoDB
19 from cistematic.genomes import Genome
25 print "%prog: version 3.6"
26 usage = "usage: python %prog genome goodfile rdsfile outfile [options]"
28 parser = optparse.OptionParser(usage=usage)
29 parser.add_option("--verbose", action="store_true", dest="doVerbose",
30 help="verbose output")
31 parser.add_option("--cache", action="store_true", dest="doCache",
33 parser.add_option("--maxDist", type="int", dest="maxDist",
34 help="maximum distance")
35 parser.set_defaults(doVerbose=False, doCache=False, maxDist=500000)
36 (options, args) = parser.parse_args(argv[1:])
43 goodfilename = args[1]
47 rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
50 def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
52 goodfile = open(goodfilename)
55 goodDict[fields[0]] = line
57 RDS = readDataset(rdsfile, verbose = True, cache=doCache)
58 rdsChromList = RDS.getChromosomes()
65 outfile = open(outfilename,"w")
68 if genome == "dmelanogaster":
69 geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
71 geneinfoDict = idb.getallGeneInfo(genome)
74 geneannotDict = hg.allAnnotInfo()
78 for achrom in rdsChromList:
83 uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
85 print len(uniqDict), time.ctime()
87 for readID in uniqDict:
88 readList = uniqDict[readID]
89 if len(readList) == 2:
91 (start1, flag1, pair1) = readList[0]
92 (start2, flag2, pair2) = readList[1]
95 dist = abs(start1 - start2)
96 if flag1 != "NM" and flag2 != "NM" and dist < maxDist:
100 if flag1 in goodDict:
105 if flag2 in goodDict:
121 if geneID in farConnected:
122 farConnected[geneID].append(farFlag)
124 farConnected[geneID] = [farFlag]
127 if genome == "dmelanogaster":
128 symbol = geneinfoDict["Dmel_" + geneID][0][0]
130 symbol = geneinfoDict[geneID][0][0]
133 symbol = geneannotDict[(genome, geneID)][0]
135 symbol = "LOC" + geneID
137 symbol = symbol.strip()
138 symbol = symbol.replace(" ","|")
139 symbol = symbol.replace("\t","|")
140 if farFlag not in assigned:
141 assigned[farFlag] = (symbol, geneID)
142 print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
143 outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
147 for farFlag in farConnected:
150 idList = [farFlag] + farConnected[farFlag]
152 if oneID in assigned:
153 (symbol, geneID) = assigned[oneID]
157 symbol = "FAR%d" % farIndex
158 geneID = -1 * farIndex
161 if oneID not in assigned:
162 print "%s %s %s" % (symbol, geneID, goodDict[oneID].strip())
163 outfile.write("%s %s %s" % (symbol, geneID, goodDict[oneID]))
165 assigned[oneID] = (symbol, geneID)
167 for farFlag in goodDict:
168 if farFlag not in assigned:
170 line = "FAR%d %d %s" % (farIndex, -1 * farIndex, goodDict[farFlag])
174 outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
176 print "distinct: %d\ttotal: %d" % (distinct, total)
179 if __name__ == "__main__":