1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
31 from pysqlite2 import dbapi2 as sqlite
33 from sqlite3 import dbapi2 as sqlite
37 from os import environ
39 if environ.get("CISTEMATIC_ROOT"):
40 cisRoot = environ.get("CISTEMATIC_ROOT")
42 cisRoot = "/proj/genome"
44 __all__ = ["scerevisiae", "athaliana", "celegans", "cbriggsae", "cbrenneri",
45 "cremanei", "dmelanogaster", "mmusculus", "hsapiens", "rnorvegicus",
46 "spurpuratus", "ggallus", "cfamiliaris", "mdomestica", "xtropicalis",
47 "btaurus", "drerio", "ecaballus"
50 supportedGenomes = ["athaliana", "cfamiliaris", "mmusculus", "hsapiens",
51 "rnorvegicus", "ggallus", "scerevisiae", "celegans", "cbriggsae",
52 "cbrenneri", "cremanei", "dmelanogaster", "mdomestica",
53 "spurpuratus", "xtropicalis","btaurus", "drerio", "ecaballus"
69 """ returns the complementary basepair to base nt
71 compDict = {"A": "T", "T": "A",
86 return compDict.get(nt, "N")
89 def complement(sequence, length=0):
90 """ returns the complement of the sequence.
93 seqLength = len(sequence)
94 if length == seqLength:
95 seqList = list(sequence)
97 return "".join(map(compNT, seqList))
99 for index in range(seqLength - 1, seqLength - length - 1, -1):
101 newSeq += compNT(sequence[index])
115 customAnnotations = False
118 def __init__(self, genome, chrom="", version="", dbFile="", inRAM=False):
121 self.chromosome = chrom
124 self.version = version
129 if genome in supportedGenomes and dbFile == "":
130 self.dbFile = geneDB[genome]
131 self.supported = True
134 self.memoryBacked = True
135 self.memoryDB = sqlite.connect(":memory:")
136 self.createGeneDB(inMemory=True)
137 sql = self.memoryDB.cursor()
139 sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000")
140 sql.execute("ATTACH '%s' as diskdb" % self.dbFile)
141 for table in ["gene_entries", "gene_annotation", "gene_ontology", "sequences", "chromosomes", "sequence_features"]:
142 sql.execute("insert into %s select * from diskdb.%s" % (table, table))
144 sql.execute("DETACH diskdb")
146 if self.dbFile != "":
147 print "could not import %s" % self.dbFile
150 self.createIndices(inMemory=True)
152 self.memoryBacked = False
156 def setGeneDB(self, dbFile):
158 self.supported = False
161 def setChromosome(self, chrom):
162 self.chromosome = chrom
165 def checkGene(self, geneID):
166 """ returns True if the geneID matches an entry in the genome database.
168 (genome, gID) = geneID
169 if genome != self.genome:
173 stmt = "SELECT chromosome from gene_entries where name = '%s' " % gID
174 res = self.queryDB(stmt)
183 def geneInfo(self, geneID, version="1"):
184 (genome, gID) = geneID
186 if genome != self.genome:
190 stmt = "SELECT chromosome, start, stop, length, orientation from gene_entries where name = '%s' and version = '%s' " % (gID, version)
191 res = self.queryDB(stmt)
203 result = (chrom, start, stop, length, orientation)
210 def getallGeneInfo(self, name="", chrom="", version=""):
213 stmt = "select name, chromosome, start, stop, orientation from gene_entries order by name, start "
214 res = self.queryDB(stmt, fetchall=True)
216 (name, chromosome, start, stop, orientation) = entry
217 if name not in resultDict:
218 resultDict[name] = []
220 if chromosome not in chromList:
221 resultDict[chromosome] = []
222 chromList.append(chromosome)
224 resultDict[chromosome].append((name, chromosome, int(start), int(stop), orientation))
229 def leftGeneDistance(self, geneID, radius=50000, version="1"):
231 res = self.geneInfo(geneID, version)
233 (chrom, start, stop, length, orientation) = res
239 stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, start - radius, start, start - radius, start)
240 res = self.queryDB(stmt, fetchall=True)
242 rstart = int(entry[1])
243 rstop = int(entry[2])
249 thelength = start - rstart
250 if (start - rstop) > 0 and (start - rstop) < thelength:
251 thelength = start - rstop
253 if thelength > 0 and thelength < result:
258 def rightGeneDistance(self, geneID, radius=50000, version="1"):
260 res = self.geneInfo(geneID, version)
262 (chrom, start, stop, length, orientation) = res
268 stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, stop, stop + radius, stop, stop + radius)
269 res = self.queryDB(stmt, fetchall=True)
271 rstart = int(entry[1])
272 rstop = int(entry[2])
278 thelength = rstop - stop
279 if (rstart - stop) > 0 and (rstart - stop) < thelength:
280 thelength = rstart - stop
282 if thelength > 0 and thelength < result:
288 def annotInfo(self, geneID):
289 (genome, gID) = geneID
291 if genome != self.genome:
294 stmt = "SELECT description from gene_annotation where name = '%s'" % gID
295 res = self.queryDB(stmt, fetchall=True)
298 result.append(entry[0])
303 def goInfo(self, geneID):
304 (genome, gID) = geneID
306 if genome != self.genome:
309 stmt = "SELECT GOID, objType, objName, isNot, GOterm, evidence from gene_ontology where name = '%s'" % gID
310 res = self.queryDB(stmt, fetchall=True)
313 result.append(string.join(entry,"\t"))
318 def chromInfo(self, chrom=""):
319 if chrom == "" and self.chromosome != "":
320 chrom = self.chromosome
322 stmt = "SELECT sequenceName, storageType from chromosomes where name = '%s' " % chrom
323 res = self.queryDB(stmt)
324 result = "%s\t%s" % (res[0], res[1])
330 """ returns [ list of all orf names]
333 stmt = "SELECT distinct name from gene_entries"
334 res = self.queryDB(stmt, fetchall=True)
336 result.append(entry[0])
341 def allGIDsbyGOID(self, GOID):
342 """ returns [ list of all orf names] that match a particular GOID
345 stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID
346 res = self.queryDB(stmt, fetchall=True)
348 result.append(entry[0])
354 """ returns the list of GOID's in the genome
357 stmt = "SELECT distinct GOID from gene_ontology"
358 res = self.queryDB(stmt, fetchall=True)
360 result.append(entry[0])
365 def allGOterms(self):
366 """ returns the list of GOID's and their associated GO term in the genome
369 stmt = "SELECT distinct GOID, GOterm from gene_ontology"
370 res = self.queryDB(stmt, fetchall=True)
372 result[str(entry[0])] = str(entry[1])
377 def getGOIDCount(self, GOID):
378 """ returns the match count for a particular GOID
380 stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID
381 res = self.queryDB(stmt, fetchall=True)
386 def allAnnotInfo(self):
388 stmt = "SELECT name, description from gene_annotation"
389 res = self.queryDB(stmt, fetchall=True)
392 geneID = (self.genome, entry[0])
393 if geneID not in result:
396 result[(self.genome,entry[0])].append(entry[1])
403 stmt = "SELECT name, GOID, objType, objName, isNot, GOterm, evidence, other from gene_ontology order by name "
404 res = self.queryDB(stmt, fetchall=True)
406 geneID = (self.genome, entry[0])
407 if geneID not in result:
410 result[(self.genome, entry[0])].append(string.join(entry[1:], "\t"))
415 def allChromNames(self, partition=1, slice=0):
417 stmt = "SELECT distinct name from chromosomes"
418 res = self.queryDB(stmt, fetchall=True)
420 for index in range(reslen):
421 if (index + slice) % partition == 0:
423 result.append(entry[0])
428 def geneSeq(self, gID, maskCDS=False, maskLower=False, version="1"):
429 (chrom, start, stop, length, orientation) = self.geneInfo(gID, version)
430 seq = self.sequence(chrom, start, length, maskCDS, maskLower)
431 if orientation == "R":
432 seq = complement(seq, length)
437 def getGeneFeatures(self, gID, type="", version="1"):
440 (genome, geneid) = gID
442 featureClause = ' and type = "%s" ' % type
444 stmt = 'select type, chromosome, start, stop, orientation from sequence_features where name = "%s" and version = "%s" %s order by start ' % (geneid, str(version), featureClause)
445 res = self.queryDB(stmt, fetchall=True)
447 (type, chromosome, start, stop, orientation) = entry
448 results.append((type, chromosome, start, stop, orientation))
453 def getallGeneFeatures(self):
455 stmt = "select name, type, chromosome, start, stop, orientation from sequence_features order by name, start "
456 res = self.queryDB(stmt, fetchall=True)
458 (name, type, chromosome, start, stop, orientation) = entry
459 if name not in resultDict:
460 resultDict[name] = []
462 resultDict[name].append((type, chromosome, start, stop, orientation))
467 def getFeatureTypes(self, ftype=''):
468 """ Returns the distinct feature types available in the sequence_features
469 tables. Can optionally limit by feature type; the wild-card % can be
470 used to search feature substrings.
478 if len(ftype) > 0 and not useLike:
479 whereClause = 'where type = "%s" ' % ftype
481 whereClause = 'where type LIKE "%s" ' % ftype
483 stmt = "select distinct type from sequence_features %s" % whereClause
484 res = self.queryDB(stmt, fetchall=True)
486 results.append(entry[0])
490 def getFeatures(self, ftype, name="", chrom="", version =""):
491 """ Get features stored in sequence_features that match a feature type,
492 optionally restricted by name/value, chromosome, or version. Will
493 search for substrings when ftype and/or name are given with a % to
494 indicate the location of the wildcard. Returns a dictionary of features
495 with chromosomes as the keys.
512 nameClause = ' and name %s "%s" ' % (nameLike, name)
515 chromClause = ' and chromosome = "%s" ' % chrom
518 versionClause = ' and version = "%s" ' % version
520 stmt = 'select name, version, chromosome, start, stop, orientation, type from sequence_features where type %s "%s" %s %s %s order by type' % (useLike, ftype, chromClause, nameClause, versionClause)
521 res = self.queryDB(stmt, fetchall=True)
523 (name, version, chromosome, start, stop, orientation, atype) = entry
524 if chromosome not in chromList:
525 results[chromosome] = []
526 chromList.append(chromosome)
528 results[chromosome].append((name, version, chromosome, start, stop, orientation, atype))
533 def getFeaturesIntersecting(self, chrom, qstart, qlength, ftype=""):
534 """ Return features that are on a particular stretch of the genome. Can optionally
535 limit by feature type; the wild-card % can be used to search feature substrings.
539 qstop = qstart + qlength
544 if len(ftype) > 0 and not useLike:
545 featureClause = ' and type = "%s" ' % ftype
547 featureClause = ' and type LIKE "%s" ' % ftype
549 #select all features that encompass our start/stop
550 stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start < %d and stop > %d %s order by start' % (chrom, qstart, qstop, featureClause)
551 res = self.queryDB(stmt, fetchall=True)
553 (chromosome, start, stop, orientation, name, version, atype) = entry
554 results.append((name, version, chromosome, start, stop, orientation, atype))
556 # select all features that have a "stop" between start and stop that aren't yet in the dataset
557 stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
558 res = self.queryDB(stmt, fetchall=True)
560 (chromosome, start, stop, orientation, name, version, atype) = entry
561 if (name, version, chromosome, start, stop, orientation, atype) not in results:
562 results.append((name, version, chromosome, start, stop, orientation, atype))
564 # select all features on chromosome that have a "start" between start and stop
565 stmt = 'chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
566 res = self.queryDB(stmt, fetchall=True)
568 (chromosome, start, stop, orientation, name, version, atype) = entry
569 if (name, version, chromosome, start, stop, orientation, atype) not in results:
570 results.append((name, version, chromosome, start, stop, orientation, atype))
575 def getGenesIntersecting(self, chrom, qstart, qlength):
576 """ Return features that are on a ptarticular stretch of the genome. Can optionally
577 limit by feature type; the wild-card % can be used to search feature substrings.
580 qstop = qstart + qlength
582 #select all features that encompass our start/stop
583 stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start < %d and stop > %d order by start' % (chrom, qstart, qstop)
584 res = self.queryDB(stmt, fetchall=True)
586 (chromosome, start, stop, orientation, name, version) = entry
587 results.append((name, version, chromosome, start, stop, orientation, atype))
589 # select all features that have a "stop" between start and stop that aren't yet in the dataset
590 stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d order by start' % (chrom, qstart, qstop)
591 res = self.queryDB(stmt, fetchall=True)
593 (chromosome, start, stop, orientation, name, version) = entry
594 if (name, version, chromosome, start, stop, orientation) not in results:
595 results.append((name, version, chromosome, start, stop, orientation, atype))
597 # select all features on chromosome that have a "start" between start and stop
598 stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start >= %d and start <= %d order by start' % (chrom, qstart, qstop)
599 res = self.queryDB(stmt, fetchall=True)
601 (chromosome, start, stop, orientation, name, version) = entry
602 if (name, version, chromosome, start, stop, orientation) not in results:
603 results.append((name, version, chromosome, start, stop, orientation, atype))
608 def sequence(self, chrom, start, length, maskCDS=False, maskLower=False):
610 if chrom == "" and self.chromosome != "":
611 chrom = self.chromosome
613 stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom
614 res = self.queryDB(stmt)
618 stmt = "select sequence from sequences where name = '%s' " % seqName
619 res = self.queryDB(stmt)
620 seq = res[0][start: start + length]
623 chromFile = open("%s%s" % (cisRoot, seqName), "r")
624 mymap = mmap(chromFile.fileno(),1,access=ACCESS_READ)
625 mymap = mmap(chromFile.fileno(), mymap.size(), access=ACCESS_READ)
627 seq = mymap.read(length)
631 stop = start + length - 1
633 features = self.getFeaturesIntersecting(chrom, start, length, "CDS")
634 for entry in features:
635 (name, geneID, chromosome, fstart, fstop, forientation, type) = entry
642 nstart = fstart - start
643 nstop = fstop - start + 1
644 for pos in range(nstop - nstart):
645 seqArray[nstart + pos] = "N"
647 seq = string.join(seqArray, "")
651 for index in range(len(seqArray)):
652 if seqArray[index] in ["a", "c" , "g", "t"]:
653 seqArray[index] = "N"
655 seq = string.join(seqArray, "")
660 def getChromosomeSequence(self, chrom=""):
662 if chrom == "" and self.chromosome != "":
663 chrom = self.chromosome
665 stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom
666 res = self.queryDB(stmt)
668 print "Could not find chromosome %s" % chrom
674 res = self.queryDB('select sequence from sequences where name = "%s"' % chrom)
678 chromFile = open("%s%s" % (cisRoot, seqName), "r")
679 seq = chromFile.readline()
685 def chromGeneEntries(self, chrom="", lowerbound=-1, upperbound=-1):
687 if chrom == "" and self.chromosome != "":
688 chrom = self.chromosome
690 stmt = "select distinct start, stop, orientation, name from gene_entries where chromosome = '%s' " % chrom
691 res = self.queryDB(stmt, fetchall=True)
692 if lowerbound > 0 and upperbound > 0:
694 start = int(entry[0])
697 start, stop = stop, start
699 if stop < lowerbound or start > upperbound:
702 result.append((start, stop, entry[2], (self.genome, entry[3])))
705 start = int(entry[0])
708 start, stop = stop, start
710 result.append((start, stop, entry[2], (self.genome, entry[3])))
715 def createGeneDB(self, dbFile="", inMemory=False):
719 tableDict = {"gene_entries": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start varchar, stop varchar, length varchar, orientation varchar, feature varchar",
720 "gene_annotation": "ID INTEGER PRIMARY KEY, name varchar, description varchar",
721 "gene_ontology": "ID INTEGER PRIMARY KEY, name varchar, GOID varchar, objType varchar, objName varchar, isNot varchar, GOterm varchar, evidence varchar, other varchar",
722 "sequences": "ID INTEGER PRIMARY KEY, name varchar, sequenceLength varchar, sequenceType varchar, sequence varchar",
723 "chromosomes": "ID INTEGER PRIMARY KEY, name varchar, sequenceName varchar, storageType varchar",
724 "sequence_features": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start int, stop int, length varchar, orientation varchar, type varchar"
727 for table in tableDict.keys():
728 stmt = "create table %s(%s)" % (table, tableDict[table])
729 self.writeDB(stmt, useMemory=inMemory)
732 def createIndices(self, inMemory=False):
733 indexDict = {"nameIndex1": ("gene_entries", "name"),
734 "nameIndex2": ("gene_annotation", "name"),
735 "nameIndex3": ("gene_ontology", "name"),
736 "goidIndex": ("gene_ontology", "GOID"),
737 "nameIndex4": ("sequences", "name"),
738 "nameIndex5": ("chromosomes", "name"),
739 "geneIDIndex": ("sequence_features", "name, type"),
740 "posIndex": ("sequence_features", "chromosome, start, stop, type"),
741 "typeIndex": ("sequence_features", "type")
744 for indexName in indexDict.keys():
745 (tableName, fields) = indexDict[indexName]
746 stmt = "CREATE INDEX %s on %s(%s)" % (indexName, tableName, fields)
747 self.writeDB(stmt, useMemory=inMemory)
750 def addGeneEntry(self, geneID, chrom, start, stop, orientation, feature="", version=1.0):
751 (genome, gID) = geneID
752 length = str(abs(int(start) - int(stop)) + 1)
753 stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(version), chrom, start, stop, length, orientation, feature)
757 def addFeatureEntry(self, name, geneID, chrom, start, stop, orientation, type=""):
758 (genome, gID) = geneID
759 length = str(abs(int(start) - int(stop)) + 1)
760 stmt = "insert into sequence_features(ID, name, geneID, chromosome, start, stop, length, orientation, type) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (name, gID, chrom, start, stop, length, orientation, type)
764 def addAnnotation(self, geneID, description):
765 (genome, gID) = geneID
766 stmt = "insert into gene_annotation(ID, name, description) values (NULL, '%s', '%s') " % (gID, description)
770 def addGoInfo(self, geneID, GOID, objType="", objName="", isNot="", GOterm="", evidence="", other=""):
771 (genome, gID) = geneID
772 stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(GOID), objType, objName, isNot, GOterm, evidence, other)
776 def addSequence(self, geneID, seq, seqType, seqLen="-1"):
777 (genome, gID) = geneID
779 seqLen = str(len(seq))
781 stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, '%s', '%s', '%s', '%s')" % (gID, str(seqLen), seqType, seq)
785 def addChromosomeEntry(self, chromo, seqName, storageType):
786 stmt = "insert into chromosomes(ID, name, sequenceName, storageType) values (NULL, '%s', '%s', '%s')" % (chromo, seqName, storageType)
790 def addSequenceBatch(self, entryArray, inMemory=False):
792 stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, ?, ?, ?, ?)"
793 for entry in entryArray:
794 (geneID, seq, seqType, seqLen) = entry
795 (genome, gID) = geneID
797 seqLen = str(len(seq))
799 stmtArray.append((gID, str(seqLen), seqType, seq))
801 self.writeBatchDB(stmt, stmtArray)
804 def addChromosomeEntryBatch(self, entryArray, inMemory=False):
805 stmt = "insert into chromosomes(ID, name, sequenceName, storageType) values (NULL, ?, ?, ?)"
806 self.writeBatchDB(stmt, entryArray, useMemory=inMemory)
809 def addGeneEntryBatch(self, entryArray, inMemory=False):
811 stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) "
812 for entry in entryArray:
813 (geneID, chrom, start, stop, orientation, feature, version) = entry
814 (genome, gID) = geneID
815 length = str(abs(int(start) - int(stop)) + 1)
816 stmtArray.append((gID, str(version), chrom, start, stop, length, orientation, feature))
818 self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
821 def addFeatureEntryBatch(self, entryArray, inMemory=False):
823 stmt = "insert into sequence_features(ID, name, version, chromosome, start, stop, length, orientation, type) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) "
824 for entry in entryArray:
825 (geneID, version, chrom, start, stop, orientation, type) = entry
826 (genome, gID) = geneID
827 length = str(abs(int(start) - int(stop)) + 1)
828 stmtArray.append((gID, version, chrom, int(start), int(stop), length, orientation, type))
830 self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
833 def addAnnotationBatch(self, entryArray, inMemory=False):
835 stmt = "insert into gene_annotation(ID, name, description) values (NULL, ?, ?) "
836 for entry in entryArray:
837 (geneID, description) = entry
838 (genome, gID) = geneID
839 stmtArray.append((gID, description))
841 self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
844 def addGoInfoBatch(self, entryArray, inMemory=False):
846 stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) "
847 for entry in entryArray:
848 (geneID, GOID, objType, objName, isNot, GOterm, evidence, other) = entry
849 (genome, gID) = geneID
850 stmtArray.append((gID, str(GOID), objType, objName, isNot, GOterm, evidence, other))
852 self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
855 def extendFeatures(self, featureFile, fileType="cistematic", replace=False):
864 senseArray = {"+": "F",
869 featfile = open(featureFile)
870 if fileType == "cistematic":
871 for line in featfile:
875 fields = line.strip().split()
876 if currentGene == "":
877 currentGene = fields[0]
879 gstart = int(fields[3])
880 gsense = senseArray[fields[5]]
882 if fields[0] != currentGene:
883 geneEntryList.append(((self.genome, currentGene), gchrom, gstart, gstop, gsense, "Transcript", "1"))
884 currentGene = fields[0]
886 gstart = int(fields[3])
887 gsense = senseArray[fields[5]]
889 lstart = int(fields[3])
890 gstop = int(fields[4])
892 geneFeatureList.append(((self.genome, currentGene), "1", gchrom, lstart, gstop, gsense, ftype))
893 elif fileType == "UCSC":
894 for line in featfile:
898 geneFields = line.split("\t")
899 exonNum = int(geneFields[8])
900 exonStarts = geneFields[9].split(",")
901 exonStops = geneFields[10].split(",")
902 gchrom = geneFields[2][3:]
903 gsense = senseArray[geneFields[3]]
904 gstop = int(geneFields[7]) - 1
905 gstart = int(geneFields[6]) - 1
906 geneID = ("generic", geneFields[1])
907 for index in range(exonNum):
908 estart = int(exonStarts[index]) - 1
909 estop = int(exonStops[index]) - 1
910 if estart >= gstart and estop <= gstop:
911 geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, "CDS"))
912 elif estop <= gstart:
918 geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType))
919 elif estart >= gstop:
925 geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType))
926 elif estart <= gstop and estart > gstart:
932 geneFeatureList.append((geneID, "1", gchrom, estart, gstop, gsense, "CDS"))
933 geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop, gsense, fType))
934 elif estart < gstart and estop <= gstop:
940 geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType))
941 geneFeatureList.append((geneID, "1", gchrom, gstart, estop, gsense, "CDS"))
950 geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType1))
951 geneFeatureList.append((geneID, "1", gchrom, gstart, gstop, gsense, "CDS"))
952 geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop - 1, gsense, fType2))
954 print "%s format not supported yet"
960 self.queryDB("delete from sequence_features", useMemory=True)
961 self.queryDB("delete from gene_entries", useMemory=True)
963 self.addGeneEntryBatch(geneEntryList, inMemory=True)
964 self.addFeatureEntryBatch(geneFeatureList, inMemory=True)
967 def queryDB(self, stmt, fetchall=False, useMemory=False):
968 if useMemory or self.memoryBacked:
971 db = sqlite.connect(self.dbFile, timeout = 60)
976 results = sql.fetchall()
978 results = sql.fetchone()
981 if not (useMemory or self.memoryBacked):
987 def writeDB(self, stmt, useMemory=False):
991 db = sqlite.connect(self.dbFile, timeout = 60)
1001 def writeBatchDB(self, stmt, stmtArray, useMemory=False):
1005 db = sqlite.connect(self.dbFile, timeout = 60)
1009 sql.executemany(stmt, stmtArray)
1011 print "writeBatchDB: problem with %s" % stmt
1019 def processSql(sqlFile, keyFields={}):
1020 """ process a UCSC formatted .sql file to identify the table name and the position of key fields. Specifying the
1021 name of important fields in keyFields is highly recommended. A key in keyFields represents the sql column
1022 name in the file, while the corresponding value corresponds to the feature name. Blank values mean that the
1023 same field name will be used. For example, one possible keyFields dict would be:
1024 keyFields = {'chromStart':'start', 'chromEnd':'stop', 'chrom':'', 'name':'', 'score':'value', 'strand':'orientation'}
1027 infile = open(sqlFile)
1029 while "CREATE TABLE" not in line:
1030 line = infile.readline()
1033 tabFields = line.split()
1034 tabName = tabFields[2]
1043 lineFields = line.split()
1044 if lineFields[0] in keyFields.keys():
1045 if keyFields[lineFields[0]] == "":
1046 outfield = lineFields[0]
1048 outfield = keyFields[lineFields[0]]
1050 fields[outfield] = index
1056 return (tabName, fields)
1059 def processTrack(genomeObject, dataFile, typeName="MISC", dataFields={}, version="1"):
1060 """ process data for a UCSC track, given an instantiated genome object, the data, the name for the feature
1061 type in sequence_features, the dataFields in the format returned by processSql(), and a version.
1062 If genomeObject is the empty string '', then processTrack() will simply print out the aggregate records,
1063 rather than use addFeatureEntryBatch() to record the added features.
1065 Note that numberic values are overloaded into the name field using @ as a delimiter.
1068 if dataFields == {}:
1069 dataFields["chrom"] = 1
1070 dataFields["start"] = 2
1071 dataFields["stop"] = 3
1072 dataFields["name"] = 4
1074 infile = open(dataFile)
1076 fields = line.strip().split("\t")
1077 if "name" in dataFields:
1078 recName = fields[dataFields["name"]]
1082 if "value" in dataFields:
1083 recName += "@" + fields[dataFields["value"]]
1085 start = int(fields[dataFields["start"]])
1086 stop = int(fields[dataFields["stop"]])
1087 chrom = fields[dataFields["chrom"]][3:]
1088 chrom.replace("_random", "rand")
1090 if "orientation" in dataFields:
1091 if fields[dataFields["orientation"]] == "-":
1094 if "version" in dataFields:
1095 version = fields[dataFields["version"]]
1097 records.append((("placeholder", recName), version, chrom, start, stop, orientation, typeName.upper()))
1099 if genomeObject != "":
1100 genomeObject.addFeatureEntryBatch(records)
1105 # Voodoo to get recursive imports working
1106 for genome in supportedGenomes:
1107 importString = "import %s" % genome
1109 geneDB[genome] = eval("%s.geneDB" % genome)