8 print "psyco not running"
10 from cistematic.genomes import Genome
11 from commoncode import getConfigParser, getConfigIntOption
18 verstring = "getsplicefa: version 1.1"
22 usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
23 \n\twhere ucscModels is the gene model file in same format as the UCSC Table Browser\
24 \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
25 \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
27 parser = makeParser(usage)
28 parser.set_defaults(doVerbose=False, spacer=2)
29 (options, args) = parser.parse_args(argv[1:])
33 datafilename = args[1]
35 maxBorder = int(args[3])
40 getSpliceFasta(genome, datafilename, outfilename, maxBorder, options.doVerbose, options.spacer, delimiter)
43 def makeParser(usage=""):
44 parser = optparse.OptionParser(usage=usage)
45 parser.add_option("--verbose", action="store_true", dest="doVerbose",
46 help="show verbose messages [default: False]")
47 parser.add_option("--spacer", type="int", dest="spacer",
48 help="number of spacer NTs to use [default: 2")
50 configParser = getConfigParser()
51 section = "getsplicefa"
52 doVerbose = getConfigIntOption(configParser, section, "doVerbose", False)
53 spacer = getConfigIntOption(configParser, section, "spacer", 2)
55 parser.set_defaults(doVerbose=doVerbose, spacer=spacer)
60 def getSpliceFasta(genome, datafilename, outfilename, maxBorder, doVerbose=False, spacer=2, delimiter="|"):
61 spacerseq = "N" * spacer
63 datafile = open(datafilename)
71 nameToComplementDict = {}
78 spliceCount = int(fields[7]) - 1
82 counter += spliceCount
83 spliceCountDict[name] = spliceCount
88 nameToChromDict[name] = chrom
89 if chrom not in alreadySeen:
90 alreadySeen[chrom] = []
92 nameToComplementDict[name] = fields[2]
95 for val in fields[8].split(",")[:-1]:
96 exonStarts.append(int(val))
98 for val in fields[9].split(",")[:-1]:
99 exonStops.append(int(val))
101 exonStartDict[name] = exonStarts
102 exonStopDict[name] = exonStops
104 for index in range(spliceCount + 1):
105 exonLengths.append(exonStops[index] - exonStarts[index] + 1)
107 exonLengthDict[name] = exonLengths
109 print len(spliceCountDict)
116 outfile = open(outfilename, "w")
117 for name in nameToChromDict:
119 spliceCount = spliceCountDict[name]
123 exonStarts = exonStartDict[name]
124 exonStops = exonStopDict[name]
125 exonLengths = exonLengthDict[name]
126 chrom = nameToChromDict[name]
127 for index in range(spliceCount):
128 if (exonStops[index], exonStarts[index + 1]) in alreadySeen[chrom]:
131 regionstart = exonStops[index] - maxBorder
132 alreadySeen[chrom].append((exonStops[index], exonStarts[index + 1]))
133 beforeLen = exonLengths[index]
134 afterLen = exonLengths[index + 1]
135 if (beforeLen + afterLen) < maxBorder + spacer:
139 if (beforeLen + afterLen) < 2 * maxBorder:
142 if beforeLen > maxBorder:
143 beforeLen = maxBorder
145 if afterLen > maxBorder:
149 beforeSplice = hg.sequence(chrom, exonStops[index] - maxBorder, maxBorder)
150 afterSplice = hg.sequence(chrom, exonStarts[index + 1], maxBorder)
153 print "could not get chr%s:%d-%d" % (chrom, exonStops[index], exonStarts[index + 1])
156 sequenceHeader = string.join([name, delimiter, str(index), delimiter, str(regionstart)], "")
157 spliceJunctionSequence = string.join([spacerseq, beforeSplice.upper(), afterSplice.upper(), spacerseq], "")
158 outstring = ">%s\n%s\n" % (sequenceHeader, spliceJunctionSequence)
159 outfile.write(outstring)
163 if spliceCounter > 10000:
164 print "%d genes" % splicefileindex
169 print "%d splices too short to be seen" % missedCount
170 print "%d splices will be under-reported" % depressedCount
172 if __name__ == "__main__":