8 print "psyco not running"
9 from cistematic.core import complement
10 from cistematic.genomes import Genome
17 verstring = "%prog: version 1.0"
21 usage = "usage: python %prog genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
22 \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
23 \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
25 parser = optparse.OptionParser(usage=usage)
26 parser.add_option("--verbose", action="store_true", dest="doVerbose",
27 help="show verbose messages [default: False]")
28 parser.add_option("--spacer", type="int", dest="spacer",
29 help="number of spacer NTs to use [default: 2")
30 parser.set_defaults(doVerbose=False, spacer=2)
31 (options, args) = parser.parse_args(argv[1:])
35 datafilename = args[1]
42 getSpliceFasta(genome, datafilename, outfilename, maxBorder, options.doVerbose, options.spacer, delimiter)
45 def getSpliceFasta(genome, datafilename, outfilename, maxBorder, doVerbose=False, spacer=2, delimiter="|"):
46 spacerseq = "N" * spacer
48 datafile = open(datafilename)
56 nameToComplementDict = {}
63 spliceCount = int(fields[7]) - 1
67 counter += spliceCount
68 spliceCountDict[name] = spliceCount
73 nameToChromDict[name] = chrom
74 if chrom not in alreadySeen:
75 alreadySeen[chrom] = []
77 nameToComplementDict[name] = fields[2]
80 for val in fields[8].split(",")[:-1]:
81 exonStarts.append(int(val))
83 for val in fields[9].split(",")[:-1]:
84 exonStops.append(int(val))
86 exonStartDict[name] = exonStarts
87 exonStopDict[name] = exonStops
89 for index in range(spliceCount + 1):
90 exonLengths.append(exonStops[index] - exonStarts[index])
92 exonLengthDict[name] = exonLengths
94 print len(spliceCountDict)
101 outfile = open(outfilename, "w")
102 for name in nameToChromDict:
104 spliceCount = spliceCountDict[name]
108 exonStarts = exonStartDict[name]
109 exonStops = exonStopDict[name]
110 exonLengths = exonLengthDict[name]
111 chrom = nameToChromDict[name]
112 for index in range(spliceCount):
113 if (exonStops[index], exonStarts[index + 1]) in alreadySeen[chrom]:
116 regionstart = exonStops[index] - maxBorder
117 alreadySeen[chrom].append((exonStops[index], exonStarts[index + 1]))
118 beforeLen = exonLengths[index]
119 afterLen = exonLengths[index + 1]
120 if (beforeLen + afterLen) < maxBorder + spacer:
124 if (beforeLen + afterLen) < 2 * maxBorder:
127 if beforeLen > maxBorder:
128 beforeLen = maxBorder
130 if afterLen > maxBorder:
134 beforeSplice = hg.sequence(chrom, exonStops[index] - maxBorder, maxBorder)
135 afterSplice = hg.sequence(chrom, exonStarts[index + 1], maxBorder)
138 print "could not get chr%s:%d-%d" % (chrom, exonStops[index], exonStarts[index + 1])
141 sequenceHeader = string.join([name, delimiter, str(index), delimiter, str(regionstart)], "")
142 spliceJunctionSequence = string.join([spacerseq, beforeSplice.upper(), afterSplice.upper(), spacerseq], "")
143 outstring = ">%s\n%s\n" % (sequenceHeader, spliceJunctionSequence)
144 outfile.write(outstring)
148 if spliceCounter > 10000:
149 print "%d genes" % splicefileindex
154 print "%d splices too short to be seen" % missedCount
155 print "%d splices will be under-reported" % depressedCount
157 if __name__ == "__main__":