import pysam
import optparse
from cistematic.genomes import Genome
-from commoncode import isSpliceEntry, getFeaturesByChromDict
+from commoncode import isSpliceEntry
def main(argv=None):
samfile = pysam.Samfile(BAM, "rb" )
readMultiplicityDict = getReadMultiplicity(samfile)
- counts = getReadCounts(samfile, readMultiplicityDict)
+ counts, readLength = getReadCounts(samfile, readMultiplicityDict)
outheader = buildHeader(samfile.header, counts)
if options.markGID:
genome = Genome(options.genomeName, inRAM=True)
genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
print "getting gene features...."
- featuresByChromDict = getFeaturesByChromDict(genome)
outheader['CO'].append(options.genomeComment)
outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
if options.markGID:
chrom = samfile.getrname(alignedread.tid)
chromNum = chrom[3:]
- geneModelFlag = "NM"
- for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
- if start < alignedread.pos and stop > alignedread.pos:
- geneModelFlag = gid
- continue
+ #new direct query
+ geneFeatures = genome.getFeaturesIntersecting(chromNum, alignedread.pos, readLength)
+ try:
+ (name, version, chromosome, start, stop, orientation, atype) = geneFeatures[0]
+ geneModelFlag = name
+ except IndexError:
+ geneModelFlag = "NM"
ID = getReadID(alignedread)
try:
"ReadLength\t%d" % readLength
]
- return counts
+ return counts, readLength
def buildHeader(templateheader, commentList):
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
results.append((name, version, chromosome, start, stop, orientation, atype))
# select all features on chromosome that have a "start" between start and stop
- stmt = 'chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
+ stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
res = self.queryDB(stmt, fetchall=True)
for entry in res:
(chromosome, start, stop, orientation, name, version, atype) = entry
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
-
+
geneDB = "%s/A_thaliana/athaliana.genedb" % cisRoot
chromSize = {"1": 30432563,
return (fType, gid, chrom, start, stop, sense, otherDict)
-def loadChromosome(db, chromID, chromPath, chromOut):
- seqArray = []
- atGenome = Genome("athaliana", dbFile=db)
+def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+ genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
+ annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
+ goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir
- inFile = open(chromPath, "r")
- line = inFile.readline()
- for line in inFile:
- seqArray.append(line.strip())
+ print "Creating database %s" % db
+ createDBFile(db)
- seq = string.join(seqArray,"")
- seqLen = len(seq)
- if seqLen < 1:
- print "Problems reading sequence from file"
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
- print "writing to file %s" % (chromOut)
- outFile = open(cisRoot + chromOut, "w")
- outFile.write(seq)
- outFile.close()
- seq = ""
+ print "Adding feature entries"
+ loadFeatureEntries(db, genePath)
- atGenome.addChromosomeEntry(chromID, chromOut, "file")
- # Add alternative chromID - should be A-O and 01-09
- atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")
+ print "Adding gene annotations"
+ loadGeneAnnotations(db, annotPath)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath)
+
+ for chromID in ["1", "2", "3", "4", "5", "C", "M"]:
+ print "Loading chromosome %s" % chromID
+ chromPath = "%s/chr%s.fas" % (downloadDir, chromID)
+ loadChromosome(db, chromID, chromPath, "/A_thaliana/chr%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ atGenome = Genome("athaliana", dbFile=db)
+ atGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
featureEntries = []
trackedGenes = []
atGenome = Genome("athaliana", dbFile=db)
- featureTranslation = {"CDS": "CDS",
+ featureTranslation = {"CDS": "CDS",
"three_prime_UTR": "3UTR",
"five_prime_UTR": "5UTR",
"miRNA": "5UTR",
"exon": "5UTR"
}
+
geneFile = open(gFile, "r")
for line in geneFile:
fields = line.split("\t")
def loadGeneAnnotations(db, annotPath):
geneAnnotations = []
- annotFile = open(annotPath, "r")
+ annotFile = open(annotPath, "r")
annotFile.readline()
lines = annotFile.readlines()
annotFile.close()
atGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
- atGenome = Genome("athaliana", dbFile=db)
- atGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
atGenome = Genome("athaliana", dbFile=db)
- atGenome.createIndices()
-
-
-def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
- genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
- annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
- goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir
- chromos = {"1": "%s/chr1.fas" % downloadDir,
- "2": "%s/chr2.fas" % downloadDir,
- "3": "%s/chr3.fas" % downloadDir,
- "4": "%s/chr4.fas" % downloadDir,
- "5": "%s/chr5.fas" % downloadDir,
- "C": "%s/chrC.fas" % downloadDir,
- "M": "%s/chrM.fas" % downloadDir
- }
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Adding gene entries"
- loadGeneEntries(db, genePath)
-
- print "Adding feature entries"
- loadFeatureEntries(db, genePath)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
- print "Adding gene annotations"
- loadGeneAnnotations(db, annotPath)
+ seq = string.join(seqArray,"")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
- print "Adding gene ontology"
- loadGeneOntology(db, goPath)
+ print "writing to file %s" % (chromOut)
+ outFile = open(cisRoot + chromOut, "w")
+ outFile.write(seq)
+ outFile.close()
+ seq = ""
- for chromID in chromos.keys():
- print "Loading chromosome %s" % chromID
- loadChromosome(db, chromID, chromos[chromID], "/A_thaliana/chr%s.bin" % chromID)
+ atGenome.addChromosomeEntry(chromID, chromOut, "file")
+ # Add alternative chromID - should be A-O and 01-09
+ atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ atGenome = Genome("athaliana", dbFile=db)
+ atGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
-geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot
+geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot
-def loadChromosome(db, chromPath, chromOutPath):
- seqArray = []
- seqLen = 0
- btGenome = Genome("btaurus", dbFile=db)
- inFile = open(chromPath, "r")
- header = inFile.readline()
- while header != "":
- seqArray = []
- seqLen = 0
- chromID = header.strip()[1:]
- currentLine = inFile.readline()
+def buildBtaurusDB(db=geneDB):
+ genePath = "%s/download/bt2/genscan.txt" % cisRoot
+ chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
+ chromoOutPath = "/B_taurus/"
- while currentLine != "" and currentLine[0] != ">":
- lineSeq = currentLine.strip()
- seqLen += len(lineSeq)
- seqArray.append(lineSeq)
- currentLine = inFile.readline()
+ print "Creating database %s" % db
+ createDBFile(db)
- seq = string.join(seqArray, "")
- if seqLen < 500000:
- print "Added contig %s to database" % chromID
- btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
- btGenome.addChromosomeEntry(chromID, chromID, "db")
- else:
- outFileName = "%s%s.bin" % (chromOutPath, chromID)
- outFile = open("%s%s" % (cisRoot, outFileName), "w")
- outFile.write(seq)
- outFile.close()
- print "Added contig file %s to database" % outFileName
- btGenome.addChromosomeEntry(chromID, outFileName, "file")
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
- header = currentLine
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath)
- inFile.close()
+ print "Loading sequences"
+ loadChromosome(db, chromoPath, chromoOutPath)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ btGenome = Genome("btaurus", dbFile=db)
+ btGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
- """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
- """
+ #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
geneEntries = []
btGenome = Genome("btaurus", dbFile=db)
geneFile = open(gFile, "r")
btGenome.addGeneEntryBatch(geneEntries)
-def loadGeneAnnotations(db, annotPath):
- geneAnnotations = []
- annotFile = open(annotPath, "r")
- btGenome = Genome("btaurus", dbFile=db)
- for line in annotFile:
- try:
- cols = line.split("\t")
- locID = cols[0]
- geneDesc = cols[6]
- if len(locID) > 0:
- geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
- except:
- pass
-
- print "Adding %d annotations" % len(geneAnnotations)
- btGenome.addAnnotationBatch(geneAnnotations)
-
-
def loadGeneFeatures(db, gfile):
geneFile = open(gfile, "r")
senseArray = {"+": "F",
btGenome.addFeatureEntryBatch(insertArray)
+def loadGeneAnnotations(db, annotPath):
+ geneAnnotations = []
+ annotFile = open(annotPath, "r")
+ btGenome = Genome("btaurus", dbFile=db)
+ for line in annotFile:
+ try:
+ cols = line.split("\t")
+ locID = cols[0]
+ geneDesc = cols[6]
+ if len(locID) > 0:
+ geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
+ except:
+ pass
+
+ print "Adding %d annotations" % len(geneAnnotations)
+ btGenome.addAnnotationBatch(geneAnnotations)
+
+
def loadGeneOntology(db, goPath, goDefPath, annotPath):
btGenome = Genome("btaurus", dbFile=db)
goDefFile = open(goDefPath, "r")
goFile = open(goPath, "r")
- annotFile = open(annotPath, "r")
+ annotFile = open(annotPath, "r")
annotEntries = annotFile.readlines()
annotFile.close()
goDefEntries = goDefFile.readlines()
btGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
- btGenome = Genome("btaurus", dbFile=db)
- btGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+ seqArray = []
+ seqLen = 0
btGenome = Genome("btaurus", dbFile=db)
- btGenome.createIndices()
-
+ inFile = open(chromPath, "r")
+ header = inFile.readline()
+ while header != "":
+ seqArray = []
+ seqLen = 0
+ chromID = header.strip()[1:]
+ currentLine = inFile.readline()
-def buildBtaurusDB(db=geneDB):
- genePath = "%s/download/bt2/genscan.txt" % cisRoot
- chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
- chromoOutPath = "/B_taurus/"
-
- print "Creating database %s" % db
- createDBFile(db)
+ while currentLine != "" and currentLine[0] != ">":
+ lineSeq = currentLine.strip()
+ seqLen += len(lineSeq)
+ seqArray.append(lineSeq)
+ currentLine = inFile.readline()
- print "Adding gene entries"
- loadGeneEntries(db, genePath)
+ seq = string.join(seqArray, "")
+ if seqLen < 500000:
+ print "Added contig %s to database" % chromID
+ btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
+ btGenome.addChromosomeEntry(chromID, chromID, "db")
+ else:
+ outFileName = "%s%s.bin" % (chromOutPath, chromID)
+ outFile = open("%s%s" % (cisRoot, outFileName), "w")
+ outFile.write(seq)
+ outFile.close()
+ print "Added contig file %s to database" % outFileName
+ btGenome.addChromosomeEntry(chromID, outFileName, "file")
- print "Adding gene features"
- loadGeneFeatures(db, genePath)
+ header = currentLine
- print "Loading sequences"
- loadChromosome(db, chromoPath, chromoOutPath)
+ inFile.close()
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ btGenome = Genome("btaurus", dbFile=db)
+ btGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/C_brenneri/cbrenneri.genedb" % cisRoot
-def loadChromosome(db, chromPath, chromOutPath):
- seqArray = []
- seqLen = 0
- cbGenome = Genome("cbrenneri", dbFile=db)
- inFile = open(chromPath, "r")
- header = inFile.readline()
- while header != "":
- seqArray = []
- seqLen = 0
- chromHeader = header.strip()[1:].split()
- chromID = chromHeader[0]
- currentLine = inFile.readline()
+def buildCbrenneriDB(db=geneDB):
+ gffPath = "%s/download/PB2801_2007feb09.gff" % cisRoot # using EMS special version
+ chromoPath = "%s/download/PB2801_supercontigs.fa" % cisRoot
+ chromoOutPath = "/C_brenneri/"
- while currentLine != "" and currentLine[0] != ">":
- lineSeq = currentLine.strip()
- seqLen += len(lineSeq)
- seqArray.append(lineSeq)
- currentLine = inFile.readline()
+ print "Creating database %s" % db
+ createDBFile(db)
- seq = string.join(seqArray, "")
- if seqLen < 100000:
- print "Added contig %s to database" % chromID
- cbGenome.addSequence(("cbrenneri", chromID), seq, "chromosome", str(seqLen))
- cbGenome.addChromosomeEntry(chromID, chromID, "db")
- else:
- outFileName = "%s%s.bin" % (chromOutPath, chromID)
- outFile = open( "%s%s" % (cisRoot, outFileName), "w")
- outFile.write(seq)
- outFile.close()
- print "Added contig file %s to database" % outFileName
- cbGenome.addChromosomeEntry(chromID, outFileName, "file")
+ print "Adding gene entries"
+ loadGeneEntries(db, gffPath)
- header = currentLine
+ print "Adding feature entries"
+ loadFeatureEntries(db, gffPath)
- inFile.close()
+ print "Loading genomic sequence"
+ loadChromosome(db, chromoPath, chromoOutPath)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
+ cbGenome.createGeneDB(db)
def loadGeneEntries(db, gffFile):
cbGenome.addFeatureEntryBatch(featureEntries)
-def createDBFile(db):
- cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
- cbGenome.createGeneDB(db)
-
-
-def createDBindices(db):
- cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
- cbGenome.createIndices()
-
-
-def buildCbrenneriDB(db=geneDB):
- gffPath = "%s/download/PB2801_2007feb09.gff" % cisRoot # using EMS special version
- chromoPath = "%s/download/PB2801_supercontigs.fa" % cisRoot
- chromoOutPath = "/C_brenneri/"
+def loadChromosome(db, chromPath, chromOutPath):
+ seqArray = []
+ seqLen = 0
+ cbGenome = Genome("cbrenneri", dbFile=db)
+ inFile = open(chromPath, "r")
+ header = inFile.readline()
+ while header != "":
+ seqArray = []
+ seqLen = 0
+ chromHeader = header.strip()[1:].split()
+ chromID = chromHeader[0]
+ currentLine = inFile.readline()
- print "Creating database %s" % db
- createDBFile(db)
+ while currentLine != "" and currentLine[0] != ">":
+ lineSeq = currentLine.strip()
+ seqLen += len(lineSeq)
+ seqArray.append(lineSeq)
+ currentLine = inFile.readline()
- print "Adding gene entries"
- loadGeneEntries(db, gffPath)
+ seq = string.join(seqArray, "")
+ if seqLen < 100000:
+ print "Added contig %s to database" % chromID
+ cbGenome.addSequence(("cbrenneri", chromID), seq, "chromosome", str(seqLen))
+ cbGenome.addChromosomeEntry(chromID, chromID, "db")
+ else:
+ outFileName = "%s%s.bin" % (chromOutPath, chromID)
+ outFile = open( "%s%s" % (cisRoot, outFileName), "w")
+ outFile.write(seq)
+ outFile.close()
+ print "Added contig file %s to database" % outFileName
+ cbGenome.addChromosomeEntry(chromID, outFileName, "file")
- print "Adding feature entries"
- loadFeatureEntries(db, gffPath)
+ header = currentLine
- print "Loading genomic sequence"
- loadChromosome(db, chromoPath, chromoOutPath)
+ inFile.close()
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
+ cbGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/C_briggsae/cbriggsae.genedb" % cisRoot
-def loadChromosome(db, chromPath, chromOutPath):
- seqArray = []
- seqLen = 0
- cbGenome = Genome("cbriggsae", dbFile=db)
- inFile = open(chromPath, "r")
- header = inFile.readline()
- while header != "":
- seqArray = []
- seqLen = 0
- chromID = header.strip()[1:]
- currentLine = inFile.readline()
- while currentLine != "" and currentLine[0] != ">":
- lineSeq = currentLine.strip()
- seqLen += len(lineSeq)
- seqArray.append(lineSeq)
- currentLine = inFile.readline()
+def buildCbriggsaeDB(db=geneDB):
+ gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
+ chromoPath = "%s/download/briggsae_25.fa" % cisRoot
+ chromoOutPath = "/C_briggsae/"
- seq = string.join(seqArray, "")
- if seqLen < 900000:
- print "Added contig %s to database" % chromID
- cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
- cbGenome.addChromosomeEntry(chromID, chromID, "db")
- else:
- outFileName = "%s%s.bin" % (chromOutPath, chromID)
- outFile = open("%s%s" % (cisRoot, outFileName), "w")
- outFile.write(seq)
- outFile.close()
- print "Added contig file %s to database" % outFileName
- cbGenome.addChromosomeEntry(chromID, outFileName, "file")
+ print "Creating database %s" % db
+ createDBFile(db)
- header = currentLine
+ print "Adding gene entries"
+ loadGeneEntries(db, gffPath)
- inFile.close()
+ print "Adding feature entries"
+ loadFeatureEntries(db, gffPath)
+
+ print "Loading genomic sequence"
+ loadChromosome(db, chromoPath, chromoOutPath)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
+ cbGenome.createGeneDB(db)
def loadGeneEntries(db, gffFile):
featureEntries = []
seenFeatures = {}
featureTranslation = {"coding_exon": "CDS",
- "three_prime_UTR": "3UTR",
+ "three_prime_UTR": "3UTR",
"five_prime_UTR": "5UTR"
}
cbGenome.addFeatureEntryBatch(featureEntries)
-def createDBFile(db):
- cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
- cbGenome.createGeneDB(db)
-
-
-def createDBindices(db):
- cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
- cbGenome.createIndices()
-
-
-def buildCbriggsaeDB(db=geneDB):
- gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
- chromoPath = "%s/download/briggsae_25.fa" % cisRoot
- chromoOutPath = "/C_briggsae/"
-
- print "Creating database %s" % db
- createDBFile(db)
+def loadChromosome(db, chromPath, chromOutPath):
+ seqArray = []
+ seqLen = 0
+ cbGenome = Genome("cbriggsae", dbFile=db)
+ inFile = open(chromPath, "r")
+ header = inFile.readline()
+ while header != "":
+ seqArray = []
+ seqLen = 0
+ chromID = header.strip()[1:]
+ currentLine = inFile.readline()
+ while currentLine != "" and currentLine[0] != ">":
+ lineSeq = currentLine.strip()
+ seqLen += len(lineSeq)
+ seqArray.append(lineSeq)
+ currentLine = inFile.readline()
- print "Adding gene entries"
- loadGeneEntries(db, gffPath)
+ seq = string.join(seqArray, "")
+ if seqLen < 900000:
+ print "Added contig %s to database" % chromID
+ cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
+ cbGenome.addChromosomeEntry(chromID, chromID, "db")
+ else:
+ outFileName = "%s%s.bin" % (chromOutPath, chromID)
+ outFile = open("%s%s" % (cisRoot, outFileName), "w")
+ outFile.write(seq)
+ outFile.close()
+ print "Added contig file %s to database" % outFileName
+ cbGenome.addChromosomeEntry(chromID, outFileName, "file")
- print "Adding feature entries"
- loadFeatureEntries(db, gffPath)
+ header = currentLine
- print "Loading genomic sequence"
- loadChromosome(db, chromoPath, chromoOutPath)
+ inFile.close()
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
+ cbGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/C_elegans/celegans.genedb" % cisRoot
-def loadChromosome(db, chromID, chromPath, chromOut):
- seqArray = []
- ceGenome = Genome("celegans", dbFile=db)
- inFile = open(chromPath, "r")
- line = inFile.readline()
- for line in inFile:
- seqArray.append(line.strip())
+def buildCelegansDB(db=geneDB, downloadRoot=""):
+ if downloadRoot == "":
+ downloadRoot = "%s/download/" % cisRoot
- seq = string.join(seqArray, "")
- seqLen = len(seq)
- if seqLen < 1:
- print "Problems reading sequence from file"
+ geneIDPath = "%sgeneIDs.WS200" % downloadRoot
+ goDefPath = "%sGO.terms_and_ids" % downloadRoot
+ goPath = "%sgene_association.wb" % downloadRoot
- print "writing to file %s" % chromOut
- outFile = open("%s%s" % (cisRoot, chromOut), "w")
- outFile.write(seq)
- outFile.close()
- ceGenome.addChromosomeEntry(chromID, chromOut, "file")
+ # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
+ gffPath = "%selegansWS200.gff" % downloadRoot
+
+ print "Creating database %s" % db
+ createDBFile(db)
+
+ print "Adding gene entries"
+ loadGeneEntries(db, gffPath)
+
+ print "Adding feature entries"
+ loadFeatureEntries(db, gffPath)
+
+ print "Adding gene annotations"
+ loadGeneAnnotations(db, geneIDPath)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath, goDefPath, geneIDPath)
+
+ # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
+ for chromID in ["I", "II", "III", "IV", "V", "X"]:
+ print "Loading chromosome %s" % chromID
+ chromPath = "%sCHROMOSOME_%s_softmasked.dna" % (downloadRoot, chromID)
+ loadChromosome(db, chromID, chromPath, "/C_elegans/chr%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ ceGenome = Genome("celegans", version="WS200", dbFile=db)
+ ceGenome.createGeneDB(db)
def loadGeneEntries(db, gffFile):
else:
gidGene = giddots[1]
gidLetter = "a"
+
gid = "%s.%s" % (giddots[0], gidGene)
geneID = ("celegans", gid)
gidVersion = 1
def loadGeneAnnotations(db, geneIDPath):
geneAnnotations = []
- geneIDFile = open(geneIDPath, "r")
+ geneIDFile = open(geneIDPath, "r")
lines = geneIDFile.readlines()
geneIDFile.close()
ceGenome = Genome("celegans", dbFile=db)
ceGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
- ceGenome = Genome("celegans", version="WS200", dbFile=db)
- ceGenome.createGeneDB(db)
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
+ ceGenome = Genome("celegans", dbFile=db)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
+
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
+
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ ceGenome.addChromosomeEntry(chromID, chromOut, "file")
def createDBindices(db):
ceGenome = Genome("celegans", version="WS200", dbFile=db)
ceGenome.createIndices()
-
-
-def buildCelegansDB(db=geneDB, downloadRoot=""):
- if downloadRoot == "":
- downloadRoot = "%s/download/" % cisRoot
-
- geneIDPath = "%sgeneIDs.WS200" % downloadRoot
- goDefPath = "%sGO.terms_and_ids" % downloadRoot
- goPath = "%sgene_association.wb" % downloadRoot
-
- # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
- chromos = {"I": "%sCHROMOSOME_I_softmasked.dna" % downloadRoot,
- "II": "%sCHROMOSOME_II_softmasked.dna" % downloadRoot,
- "III": "%sCHROMOSOME_III_softmasked.dna" % downloadRoot,
- "IV": "%sCHROMOSOME_IV_softmasked.dna" % downloadRoot,
- "V": "%sCHROMOSOME_V_softmasked.dna" % downloadRoot,
- "X": "%sCHROMOSOME_X_softmasked.dna" % downloadRoot
- }
-
- # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
- gffPath = "%selegansWS200.gff" % downloadRoot
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Adding gene entries"
- loadGeneEntries(db, gffPath)
-
- print "Adding feature entries"
- loadFeatureEntries(db, gffPath)
-
- print "Adding gene annotations"
- loadGeneAnnotations(db, geneIDPath)
-
- print "Adding gene ontology"
- loadGeneOntology(db, goPath, goDefPath, geneIDPath)
-
- for chromID in chromos:
- print "Loading chromosome %s" % chromID
- loadChromosome(db, chromID, chromos[chromID], "/C_elegans/chr%s.bin" % chromID)
-
- print "Creating Indices"
- createDBindices(db)
-
- print "Finished creating database %s" % db
\ No newline at end of file
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/C_familiaris/cfamiliaris.genedb" % cisRoot
-def loadChromosome(db, chromID, chromPath, chromOut):
- seqArray = []
- cfGenome = Genome("cfamiliaris", dbFile=db)
- inFile = open(chromPath, "r")
- line = inFile.readline()
- for line in inFile:
- seqArray.append(line.strip())
+def buildDogDB(db=geneDB):
+ genePath = "%s/download/seq_gene.md" % cisRoot
+ print "Creating database %s" % db
+ createDBFile(db)
- seq = string.join(seqArray, "")
- seqLen = len(seq)
- if seqLen < 1:
- print "Problems reading sequence from file"
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
- print "writing to file %s" % chromOut
- outFile = open("%s%s" % (cisRoot, chromOut), "w")
- outFile.write(seq)
- outFile.close()
- cfGenome.addChromosomeEntry(chromID, chromOut, "file")
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath)
+
+ chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+ "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+ "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
+ "31", "32", "33", "34", "35", "36", "37", "38", "X", "Un"
+ ]
+ for chromID in chromList:
+ print "Loading chromosome %s" % chromID
+ chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+ loadChromosome(db, chromID, chromPath, "/C_familiaris/chromo%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ cfGenome = Genome("cfamiliaris", dbFile=db)
+ cfGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
- """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
- """
+ #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
geneEntries = []
alreadySeen = []
cfGenome = Genome("cfamiliaris", dbFile=db)
synonyms = idb.geneIDSynonyms(gID)
if len(synonyms) >0:
for entry in synonyms:
- gene_name += ","
+ gene_name += ","
gene_name += entry
else:
gene_name = " "
cfGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
- cfGenome = Genome("cfamiliaris", dbFile=db)
- cfGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
cfGenome = Genome("cfamiliaris", dbFile=db)
- cfGenome.createIndices()
-
-
-def buildDogDB(db=geneDB):
- genePath = "%s/download/seq_gene.md" % cisRoot
- chromos = {"1": "%s/download/chr1.fa" % cisRoot,
- "2": "%s/download/chr2.fa" % cisRoot,
- "3": "%s/download/chr3.fa" % cisRoot,
- "4": "%s/download/chr4.fa" % cisRoot,
- "5": "%s/download/chr5.fa" % cisRoot,
- "6": "%s/download/chr6.fa" % cisRoot,
- "7": "%s/download/chr7.fa" % cisRoot,
- "8": "%s/download/chr8.fa" % cisRoot,
- "9": "%s/download/chr9.fa" % cisRoot,
- "10": "%s/download/chr10.fa" % cisRoot,
- "11": "%s/download/chr11.fa" % cisRoot,
- "12": "%s/download/chr12.fa" % cisRoot,
- "13": "%s/download/chr13.fa" % cisRoot,
- "14": "%s/download/chr14.fa" % cisRoot,
- "15": "%s/download/chr15.fa" % cisRoot,
- "16": "%s/download/chr16.fa" % cisRoot,
- "17": "%s/download/chr17.fa" % cisRoot,
- "18": "%s/download/chr18.fa" % cisRoot,
- "19": "%s/download/chr19.fa" % cisRoot,
- "20": "%s/download/chr20.fa" % cisRoot,
- "21": "%s/download/chr21.fa" % cisRoot,
- "22": "%s/download/chr22.fa" % cisRoot,
- '23': "%s/download/chr23.fa" % cisRoot,
- "24": "%s/download/chr24.fa" % cisRoot,
- "25": "%s/download/chr25.fa" % cisRoot,
- "26": "%s/download/chr26.fa" % cisRoot,
- "27": "%s/download/chr27.fa" % cisRoot,
- "28": "%s/download/chr28.fa" % cisRoot,
- "29": "%s/download/chr29.fa" % cisRoot,
- "30": "%s/download/chr30.fa" % cisRoot,
- "31": "%s/download/chr31.fa" % cisRoot,
- "32": "%s/download/chr32.fa" % cisRoot,
- "33": "%s/download/chr33.fa" % cisRoot,
- "34": "%s/download/chr34.fa" % cisRoot,
- "35": "%s/download/chr35.fa" % cisRoot,
- "36": "%s/download/chr36.fa" % cisRoot,
- "37": "%s/download/chr37.fa" % cisRoot,
- "38": "%s/download/chr38.fa" % cisRoot,
- "X": "%s/download/chrX.fa" % cisRoot,
- "Un": "%s/download/chrUn.fa" % cisRoot
- }
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Adding gene entries"
- loadGeneEntries(db, genePath)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
- print "Adding gene features"
- loadGeneFeatures(db, genePath)
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
- for chromID in chromos.keys():
- print "Loading chromosome %s" % chromID
- loadChromosome(db, chromID, chromos[chromID], "/C_familiaris/chromo%s.bin" % chromID)
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ cfGenome.addChromosomeEntry(chromID, chromOut, "file")
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ cfGenome = Genome("cfamiliaris", dbFile=db)
+ cfGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/C_remanei/cremanei.genedb" % cisRoot
-def loadChromosomes(db, inPath, chromOutPath):
- crGenome = Genome("cremanei", dbFile=db)
- scontigList = os.listdir(inPath)
- for scontig in scontigList:
- seq = ''
- seqArray = []
- seqLen = 0
- inFile = open("%s/%s" % (inPath, scontig), "r")
- index = 0
- header = inFile.readline()
- chromID = header.strip()[1:]
- while header != "":
- seqArray = []
- seqLen = 0
- currentLine = inFile.readline()
- while currentLine != "" and currentLine[0] != ">":
- lineSeq = currentLine.strip()
- seqLen += len(lineSeq)
- seqArray.append(lineSeq)
- currentLine = inFile.readline()
+def buildCremaneiDB(db=geneDB):
+ gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
+ chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
+ chromoOutPath = "/C_remanei/"
- seq = string.join(seqArray, "")
- if seqLen < 100000:
- print "Added contig %s to database" % chromID
- crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
- crGenome.addChromosomeEntry(chromID, chromID, "db")
- else:
- outFileName = "%s%s.bin" % (chromOutPath, chromID)
- outFile = open("%s%s" % (cisRoot, outFileName), "w")
- outFile.write(seq)
- outFile.close()
- print "Added contig file %s to database" % outFileName
- crGenome.addChromosomeEntry(chromID, outFileName, "file")
+ print "Creating database %s" % db
+ createDBFile(db)
- index += 1
- header = currentLine
+ print "Adding gene entries"
+ loadGeneEntries(db, gffPath)
- inFile.close()
+ print "Adding feature entries"
+ loadFeatureEntries(db, gffPath)
+
+ print "Loading genomic sequence"
+ loadChromosomes(db, chromoPath, chromoOutPath)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
+ crGenome.createGeneDB(db)
def loadGeneEntries(db, gffFile):
crGenome.addFeatureEntryBatch(featureEntries)
-def createDBFile(db):
- crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
- crGenome.createGeneDB(db)
-
-
-def createDBindices(db):
- crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
- crGenome.createIndices()
-
-
-def buildCremaneiDB(db=geneDB):
- gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
- chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
- chromoOutPath = "/C_remanei/"
-
- print "Creating database %s" % db
- createDBFile(db)
+def loadChromosomes(db, inPath, chromOutPath):
+ crGenome = Genome("cremanei", dbFile=db)
+ scontigList = os.listdir(inPath)
+ for scontig in scontigList:
+ seq = ''
+ seqArray = []
+ seqLen = 0
+ inFile = open("%s/%s" % (inPath, scontig), "r")
+ index = 0
+ header = inFile.readline()
+ chromID = header.strip()[1:]
+ while header != "":
+ seqArray = []
+ seqLen = 0
+ currentLine = inFile.readline()
+ while currentLine != "" and currentLine[0] != ">":
+ lineSeq = currentLine.strip()
+ seqLen += len(lineSeq)
+ seqArray.append(lineSeq)
+ currentLine = inFile.readline()
- print "Adding gene entries"
- loadGeneEntries(db, gffPath)
+ seq = string.join(seqArray, "")
+ if seqLen < 100000:
+ print "Added contig %s to database" % chromID
+ crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
+ crGenome.addChromosomeEntry(chromID, chromID, "db")
+ else:
+ outFileName = "%s%s.bin" % (chromOutPath, chromID)
+ outFile = open("%s%s" % (cisRoot, outFileName), "w")
+ outFile.write(seq)
+ outFile.close()
+ print "Added contig file %s to database" % outFileName
+ crGenome.addChromosomeEntry(chromID, outFileName, "file")
- print "Adding feature entries"
- loadFeatureEntries(db, gffPath)
+ index += 1
+ header = currentLine
- print "Loading genomic sequence"
- loadChromosomes(db, chromoPath, chromoOutPath)
+ inFile.close()
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
+ crGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
"Z": "26"
}
-def loadChromosome(db, chromID, chromPath, chromOut):
- seqArray = []
- dmGenome = Genome("dmelanogaster", dbFile=db)
- inFile = open(chromPath, "r")
- line = inFile.readline()
- for line in inFile:
- seqArray.append(line.strip())
- seq = string.join(seqArray, "")
- seqLen = len(seq)
- if seqLen < 1:
- print "Problems reading sequence from file"
+def buildDmelanogasterDB(db=geneDB):
+ """ genes and annotations are from UCSC. GO association file is from geneontology.org.
+ """
+ genePath = "%s/download/flyBaseGene.txt" % cisRoot
+ annotPath = "%s/download/gene_info" % cisRoot
+ goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
+ goPath = "%s/download/gene2go" % cisRoot
- print "writing to file %s" % chromOut
- outFile = open("%s%s" % (cisRoot, chromOut), "w")
- outFile.write(seq)
- outFile.close()
- dmGenome.addChromosomeEntry(chromID, chromOut, "file")
+ print "Creating database %s" % db
+ createDBFile(db)
+
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
+
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath)
+
+ print "Adding gene annotations"
+ loadGeneAnnotations(db, annotPath)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath, goDefPath, annotPath)
+
+ chromList = ["2L", "2R", "2LHet", "2RHet", "3L", "3R", "3LHet", "3RHet",
+ "4", "X", "XHet", "YHet", "U", "UExtra", "M"
+ ]
+ for chromID in chromList:
+ print "Loading chromosome %s" % chromID
+ chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+ loadChromosome(db, chromID, chromPath, "/D_melanogaster/chromo%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ dmGenome = Genome("dmelanogaster", dbFile=db)
+ dmGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
def loadGeneAnnotations(db, annotPath):
geneAnnotations = []
- annotFile = open(annotPath, "r")
+ annotFile = open(annotPath, "r")
dmGenome = Genome("dmelanogaster", dbFile=db)
for line in annotFile:
try:
dmGenome = Genome("dmelanogaster", dbFile=db)
goDefFile = open(goDefPath, "r")
goFile = open(goPath, "r")
- annotFile = open(annotPath, "r")
+ annotFile = open(annotPath, "r")
annotEntries = annotFile.readlines()
annotFile.close()
goDefEntries = goDefFile.readlines()
dmGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
dmGenome = Genome("dmelanogaster", dbFile=db)
- dmGenome.createGeneDB(db)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
+
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
+
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ dmGenome.addChromosomeEntry(chromID, chromOut, "file")
def createDBindices(db):
dmGenome = Genome("dmelanogaster", dbFile=db)
dmGenome.createIndices()
-
-
-def buildDmelanogasterDB(db=geneDB):
- """ genes and annotations are from UCSC. GO association file is from geneontology.org.
- """
- genePath = "%s/download/flyBaseGene.txt" % cisRoot
- annotPath = "%s/download/gene_info" % cisRoot
- goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
- goPath = "%s/download/gene2go" % cisRoot
- chromos = {"2L": "%s/download/chr2L.fa" % cisRoot,
- "2R": "%s/download/chr2R.fa" % cisRoot,
- "2LHet": "%s/download/chr2LHet.fa" % cisRoot,
- "2RHet": "%s/download/chr2RHet.fa" % cisRoot,
- "3L": "%s/download/chr3L.fa" % cisRoot,
- "3LHet": "%s/download/chr3LHet.fa" % cisRoot,
- "3R": "%s/download/chr3R.fa" % cisRoot,
- "3RHet": "%s/download/chr3RHet.fa" % cisRoot,
- "4": "%s/download/chr4.fa" % cisRoot,
- "X": "%s/download/chrX.fa" % cisRoot,
- "XHet": "%s/download/chrXHet.fa" % cisRoot,
- "YHet": "%s/download/chrYHet.fa" % cisRoot,
- "U": "%s/download/chrU.fa" % cisRoot,
- "Uextra": "%s/download/chrUextra.fa" % cisRoot,
- "M": "%s/download/chrM.fa" % cisRoot
- }
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Adding gene entries"
- loadGeneEntries(db, genePath)
-
- print "Adding gene features"
- loadGeneFeatures(db, genePath)
-
- print "Adding gene annotations"
- loadGeneAnnotations(db, annotPath)
-
- print "Adding gene ontology"
- loadGeneOntology(db, goPath, goDefPath, annotPath)
-
- for chromID in chromos.keys():
- print "Loading chromosome %s" % chromID
- loadChromosome(db, chromID, chromos[chromID], "/D_melanogaster/chromo%s.bin" % chromID)
-
- print "Creating Indices"
- createDBindices(db)
-
- print "Finished creating database %s" % db
\ No newline at end of file
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/D_rerio/drerio.genedb" % cisRoot
-def loadChromosome(db, chromPath, chromOutPath):
- seqArray = []
- seqLen = 0
- drGenome = Genome("drerio", dbFile=db)
- files = os.listdir(chromPath)
- for filename in files:
- inFile = open("%s/%s" % (chromPath, filename), "r")
- header = inFile.readline()
- while header != "":
- seqArray = []
- seqLen = 0
- chromID = header.strip()[1:]
- currentLine = inFile.readline()
- while currentLine != "" and currentLine[0] != ">":
- lineSeq = currentLine.strip()
- seqLen += len(lineSeq)
- seqArray.append(lineSeq)
- currentLine = inFile.readline()
+def buildDrerioDB(db=geneDB):
+ """ genes and annotations are from UCSC (dr3).
+ """
+ #genePath = "%s/download/xenoRefFlat.txt" % cisRoot
+ chromoPath = "%s/download/dr3" % cisRoot
+ chromoOutPath = "/D_rerio/"
+ print "Creating database %s" % db
+ createDBFile(db)
- seq = string.join(seqArray, "")
- if seqLen < 250000:
- print "Added contig %s to database" % chromID
- drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen))
- drGenome.addChromosomeEntry(chromID, chromID, "db")
- else:
- outFileName = "%s%s.bin" % (chromOutPath, chromID)
- outFile = open("%s%s" % (cisRoot, outFileName), "w")
- outFile.write(seq)
- outFile.close()
- print "Added contig file %s to database" % outFileName
- drGenome.addChromosomeEntry(chromID, outFileName, "file")
+ #print "Adding gene entries"
+ #loadGeneEntries(db, genePath)
- header = currentLine
+ #print "Adding gene features"
+ #loadGeneFeatures(db, genePath)
- inFile.close()
+ print "Loading chromosomes"
+ loadChromosome(db, chromoPath, chromoOutPath)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ drGenome = Genome("drerio", dbFile=db)
+ drGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
drGenome.addFeatureEntryBatch(insertArray)
-def createDBFile(db):
- drGenome = Genome("drerio", dbFile=db)
- drGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+ seqArray = []
+ seqLen = 0
drGenome = Genome("drerio", dbFile=db)
- drGenome.createIndices()
-
-
-def buildDrerioDB(db=geneDB):
- """ genes and annotations are from UCSC (dr3).
- """
- #genePath = "%s/download/xenoRefFlat.txt" % cisRoot
- chromoPath = "%s/download/dr3" % cisRoot
- chromoOutPath = "/D_rerio/"
- print "Creating database %s" % db
- createDBFile(db)
+ files = os.listdir(chromPath)
+ for filename in files:
+ inFile = open("%s/%s" % (chromPath, filename), "r")
+ header = inFile.readline()
+ while header != "":
+ seqArray = []
+ seqLen = 0
+ chromID = header.strip()[1:]
+ currentLine = inFile.readline()
+ while currentLine != "" and currentLine[0] != ">":
+ lineSeq = currentLine.strip()
+ seqLen += len(lineSeq)
+ seqArray.append(lineSeq)
+ currentLine = inFile.readline()
- #print "Adding gene entries"
- #loadGeneEntries(db, genePath)
+ seq = string.join(seqArray, "")
+ if seqLen < 250000:
+ print "Added contig %s to database" % chromID
+ drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen))
+ drGenome.addChromosomeEntry(chromID, chromID, "db")
+ else:
+ outFileName = "%s%s.bin" % (chromOutPath, chromID)
+ outFile = open("%s%s" % (cisRoot, outFileName), "w")
+ outFile.write(seq)
+ outFile.close()
+ print "Added contig file %s to database" % outFileName
+ drGenome.addChromosomeEntry(chromID, outFileName, "file")
- #print "Adding gene features"
- #loadGeneFeatures(db, genePath)
+ header = currentLine
- print "Loading chromosomes"
- loadChromosome(db, chromoPath, chromoOutPath)
+ inFile.close()
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ drGenome = Genome("drerio", dbFile=db)
+ drGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/E_caballus/ecaballus.genedb" % cisRoot
-def loadChromosome(db, chromID, chromPath, chromOut):
- seqArray = []
- ecGenome = Genome("ecaballus", dbFile=db)
- inFile = open(chromPath, "r")
- line = inFile.readline()
- for line in inFile:
- seqArray.append(line.strip())
+def buildHorseDB(db=geneDB):
+ genePath = "%s/download/seq_gene.md" % cisRoot
- seq = string.join(seqArray, "")
- seqLen = len(seq)
- if seqLen < 1:
- print "Problems reading sequence from file"
+ print "Creating database %s" % db
+ createDBFile(db)
- print "writing to file %s" % chromOut
- outFile = open("%s%s" % (cisRoot, chromOut), "w")
- outFile.write(seq)
- outFile.close()
- ecGenome.addChromosomeEntry(chromID, chromOut, "file")
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
+
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath)
+
+ chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+ "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+ "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
+ "31", "M", "X", "Un"
+ ]
+ for chromID in chromList:
+ print "Loading chromosome %s" % chromID
+ chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+ loadChromosome(db, chromID, chromPath, "/E_caballus/chromo%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ ecGenome = Genome("ecaballus", dbFile=db)
+ ecGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
- """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
- """
+ #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
geneEntries = []
alreadySeen = []
ecGenome = Genome("ecaballus", dbFile=db)
ecGenome.addFeatureEntryBatch(featureEntries)
-def createDBFile(db):
- ecGenome = Genome("ecaballus", dbFile=db)
- ecGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
ecGenome = Genome("ecaballus", dbFile=db)
- ecGenome.createIndices()
-
-
-def buildHorseDB(db=geneDB):
- genePath = "%s/download/seq_gene.md" % cisRoot
- chromos = {"1": "%s/download/chr1.fa" % cisRoot,
- "2": "%s/download/chr2.fa" % cisRoot,
- "3": "%s/download/chr3.fa" % cisRoot,
- "4": "%s/download/chr4.fa" % cisRoot,
- "5": "%s/download/chr5.fa" % cisRoot,
- "6": "%s/download/chr6.fa" % cisRoot,
- "7": "%s/download/chr7.fa" % cisRoot,
- "8": "%s/download/chr8.fa" % cisRoot,
- "9": "%s/download/chr9.fa" % cisRoot,
- "10": "%s/download/chr10.fa" % cisRoot,
- "11": "%s/download/chr11.fa" % cisRoot,
- "12": "%s/download/chr12.fa" % cisRoot,
- "13": "%s/download/chr13.fa" % cisRoot,
- "14": "%s/download/chr14.fa" % cisRoot,
- "15": "%s/download/chr15.fa" % cisRoot,
- "16": "%s/download/chr16.fa" % cisRoot,
- "17": "%s/download/chr17.fa" % cisRoot,
- "18": "%s/download/chr18.fa" % cisRoot,
- "19": "%s/download/chr19.fa" % cisRoot,
- "20": "%s/download/chr20.fa" % cisRoot,
- "21": "%s/download/chr21.fa" % cisRoot,
- "22": "%s/download/chr22.fa" % cisRoot,
- "23": "%s/download/chr23.fa" % cisRoot,
- "24": "%s/download/chr24.fa" % cisRoot,
- "25": "%s/download/chr25.fa" % cisRoot,
- "26": "%s/download/chr26.fa" % cisRoot,
- "27": "%s/download/chr27.fa" % cisRoot,
- "28": "%s/download/chr28.fa" % cisRoot,
- "29": "%s/download/chr29.fa" % cisRoot,
- "30": "%s/download/chr30.fa" % cisRoot,
- "31": "%s/download/chr31.fa" % cisRoot,
- "M": "%s/download/chrM.fa" % cisRoot,
- "X": "%s/download/chrX.fa" % cisRoot,
- "Un": "%s/download/chrUn.fa" % cisRoot
- }
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Adding gene entries"
- loadGeneEntries(db, genePath)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
- print "Adding gene features"
- loadGeneFeatures(db, genePath)
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
- for chromID in chromos.keys():
- print "Loading chromosome %s" % chromID
- loadChromosome(db, chromID, chromos[chromID], "/E_caballus/chromo%s.bin" % chromID)
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ ecGenome.addChromosomeEntry(chromID, chromOut, "file")
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ ecGenome = Genome("ecaballus", dbFile=db)
+ ecGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/G_gallus/ggallus.genedb" % cisRoot
-def loadChromosome(db, chromID, chromPath, chromOut):
- seqArray = []
- ggGenome = Genome("ggallus", dbFile=db)
- inFile = open(chromPath, "r")
- line = inFile.readline()
- for line in inFile:
- seqArray.append(line.strip())
+def buildChickenDB(db=geneDB):
+ genePath = "%s/download/seq_gene.md" % cisRoot
+ goDefPath = "%s/download/GO.terms_and_ids" % cisRoot # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
+ goPath = "%s/download/gene2go" % cisRoot # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
- seq = string.join(seqArray, "")
- seqLen = len(seq)
- if seqLen < 1:
- print "Problems reading sequence from file"
+ print "Creating database %s" % db
+ createDBFile(db)
- print "writing to file %s" % chromOut
- outFile = open("%s%s" % (cisRoot, chromOut), "w")
- outFile.write(seq)
- outFile.close()
- ggGenome.addChromosomeEntry(chromID, chromOut, "file")
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
+
+ #print "Adding gene annotations"
+ #loadGeneAnnotations(db, annotPath)
+
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath)
+
+ chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+ "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+ "21", "22", "23", "24", "25", "26", "27", "28",
+ "32", "W", "Z", "M", "E22C19W28_E50C23", "E64",
+ "1_random", "2_random", "4_random", "5_random", "6_random",
+ "7_random", "8_random", "10_random", "11_random", "12_random",
+ "13_random", "16_random", "17_random", "18_random", "20_random",
+ "22_random", "25_random", "28_random", "Un_random", "W_random",
+ "E64_random", "Z_random", "E22C19W28_E50C23_random"
+ ]
+ for chromID in chromList:
+ print "Loading chromosome %s" % chromID
+ chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+ loadChromosome(db, chromID, chromPath, "/G_gallus/chromo%s.bin" % chromID)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath, goDefPath)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ ggGenome = Genome("ggallus", dbFile=db)
+ ggGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
- """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
- """
+ #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
geneEntries = []
alreadySeen = []
ggGenome = Genome("ggallus", dbFile=db)
ggGenome.addFeatureEntryBatch(featureEntries)
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
+ ggGenome = Genome("ggallus", dbFile=db)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
+
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
+
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ ggGenome.addChromosomeEntry(chromID, chromOut, "file")
+
+
def loadGeneOntology(db, goPath, goDefPath):
ggGenome = Genome("ggallus", dbFile=db)
goDefFile = open(goDefPath, "r")
ggGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
- ggGenome = Genome("ggallus", dbFile=db)
- ggGenome.createGeneDB(db)
-
-
def createDBindices(db):
ggGenome = Genome("ggallus", dbFile=db)
ggGenome.createIndices()
-
-
-def buildChickenDB(db=geneDB):
- genePath = "%s/download/seq_gene.md" % cisRoot
- goDefPath = "%s/download/GO.terms_and_ids" % cisRoot # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
- goPath = "%s/download/gene2go" % cisRoot # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
- chromos = {"1": "%s/download/chr1.fa" % cisRoot,
- "2": "%s/download/chr2.fa" % cisRoot,
- "3": "%s/download/chr3.fa" % cisRoot,
- "4": "%s/download/chr4.fa" % cisRoot,
- "5": "%s/download/chr5.fa" % cisRoot,
- "6": "%s/download/chr6.fa" % cisRoot,
- "7": "%s/download/chr7.fa" % cisRoot,
- "8": "%s/download/chr8.fa" % cisRoot,
- "9": "%s/download/chr9.fa" % cisRoot,
- "10": "%s/download/chr10.fa" % cisRoot,
- "11": "%s/download/chr11.fa" % cisRoot,
- "12": "%s/download/chr12.fa" % cisRoot,
- "13": "%s/download/chr13.fa" % cisRoot,
- "14": "%s/download/chr14.fa" % cisRoot,
- "15": "%s/download/chr15.fa" % cisRoot,
- "16": "%s/download/chr16.fa" % cisRoot,
- "17": "%s/download/chr17.fa" % cisRoot,
- "18": "%s/download/chr18.fa" % cisRoot,
- "19": "%s/download/chr19.fa" % cisRoot,
- "20": "%s/download/chr20.fa" % cisRoot,
- "21": "%s/download/chr21.fa" % cisRoot,
- "22": "%s/download/chr22.fa" % cisRoot,
- "23": "%s/download/chr23.fa" % cisRoot,
- "24": "%s/download/chr24.fa" % cisRoot,
- "25": "%s/download/chr25.fa" % cisRoot,
- "26": "%s/download/chr26.fa" % cisRoot,
- "27": "%s/download/chr27.fa" % cisRoot,
- "28": "%s/download/chr28.fa" % cisRoot,
- "32": "%s/download/chr32.fa" % cisRoot,
- "W": "%s/download/chrW.fa" % cisRoot,
- "Z": "%s/download/chrZ.fa" % cisRoot,
- "M": "%s/download/chrM.fa" % cisRoot,
- "E22C19W28_E50C23": "%s/download/chrE22C19W28_E50C23.fa" % cisRoot,
- "E64": "%s/download/chrE64.fa" % cisRoot,
- "1_random": "%s/download/chr1_random.fa" % cisRoot,
- "2_random": "%s/download/chr2_random.fa" % cisRoot,
- "4_random": "%s/download/chr4_random.fa" % cisRoot,
- "5_random": "%s/download/chr5_random.fa" % cisRoot,
- "6_random": "%s/download/chr6_random.fa" % cisRoot,
- "7_random": "%s/download/chr7_random.fa" % cisRoot,
- "8_random": "%s/download/chr8_random.fa" % cisRoot,
- "10_random": "%s/download/chr10_random.fa" % cisRoot,
- "11_random": "%s/download/chr11_random.fa" % cisRoot,
- "12_random": "%s/download/chr12_random.fa" % cisRoot,
- "13_random": "%s/download/chr13_random.fa" % cisRoot,
- "16_random": "%s/download/chr16_random.fa" % cisRoot,
- "17_random": "%s/download/chr17_random.fa" % cisRoot,
- "18_random": "%s/download/chr18_random.fa" % cisRoot,
- "20_random": "%s/download/chr20_random.fa" % cisRoot,
- "22_random": "%s/download/chr22_random.fa" % cisRoot,
- "25_random": "%s/download/chr25_random.fa" % cisRoot,
- "28_random": "%s/download/chr28_random.fa" % cisRoot,
- "Un_random": "%s/download/chrUn_random.fa" % cisRoot,
- "W_random": "%s/download/chrW_random.fa" % cisRoot,
- "E64_random": "%s/download/chrE64_random.fa" % cisRoot,
- "Z_random": "%s/download/chrZ_random.fa" % cisRoot,
- "E22C19W28_E50C23_random": "%s/download/chrE22C19W28_E50C23_random.fa" % cisRoot
- }
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Adding gene entries"
- loadGeneEntries(db, genePath)
-
- #print "Adding gene annotations"
- #loadGeneAnnotations(db, annotPath)
-
- print "Adding gene features"
- loadGeneFeatures(db, genePath)
-
- for chromID in chromos.keys():
- print "Loading chromosome %s" % chromID
- loadChromosome(db, chromID, chromos[chromID], "/G_gallus/chromo%s.bin" % chromID)
-
- print "Adding gene ontology"
- loadGeneOntology(db, goPath, goDefPath)
-
- print "Creating Indices"
- createDBindices(db)
-
- print "Finished creating database %s" % db
\ No newline at end of file
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/H_sapiens/hsapiens.genedb" % cisRoot
-def loadChromosome(db, chromID, chromPath, chromOut):
- seqArray = []
- hsGenome = Genome("hsapiens", dbFile=db)
- inFile = open(chromPath, "r")
- line = inFile.readline()
- for line in inFile:
- seqArray.append(line.strip())
- seq = string.join(seqArray, "")
- seqLen = len(seq)
- if seqLen < 1:
- print "Problems reading sequence from file"
- print "writing to file %s" % chromOut
- outFile = open("%s%s" % (cisRoot, chromOut), "w")
- outFile.write(seq)
- outFile.close()
- hsGenome.addChromosomeEntry(chromID, chromOut, "file")
+def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+ genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
+ goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
+ goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
+ # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
+ chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+ "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+ "21", "22", "X", "Y"
+ ]
+
+ print "Creating database %s" % db
+ createDBFile(db)
+
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath, chromList)
+
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath, chromList)
+
+ print "Adding gene annotations"
+ loadGeneAnnotations(db)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath, goDefPath)
+
+ for chromID in chromList:
+ print "Loading chromosome %s" % chromID
+ chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
+ loadChromosome(db, chromID, chromPath, "/H_sapiens/chromo%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ hsGenome = Genome("hsapiens", dbFile=db)
+ hsGenome.createGeneDB(db)
def loadGeneEntries(db, gFile, cDict):
- """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
- """
+ #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
geneEntries = []
hsGenome = Genome("hsapiens", dbFile=db)
geneFile = open(gFile, "r")
chrom = cols[1].strip()
if chrom not in cDict:
continue
-
+
name = cols[10].split(":")
gid = name[1]
start = int(cols[2])
cols = line.split("\t")
if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
continue
+
if cols[12] == "Celera":
continue
+
chrom = cols[1].strip()
if chrom not in cDict:
continue
synonyms = idb.geneIDSynonyms(gID)
if len(synonyms) >0:
for entry in synonyms:
- gene_name += ","
+ gene_name += ","
gene_name += entry
else:
gene_name = " "
hsGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
- hsGenome = Genome("hsapiens", dbFile=db)
- hsGenome.createGeneDB(db)
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
+ hsGenome = Genome("hsapiens", dbFile=db)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
+
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ hsGenome.addChromosomeEntry(chromID, chromOut, "file")
def createDBindices(db):
hsGenome = Genome("hsapiens", dbFile=db)
hsGenome.createIndices()
-
-
-def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
- genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
- goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
- goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
- # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
- chromDict = {"1": "%s/chr1.fa" % downloadDir,
- "2": "%s/chr2.fa" % downloadDir,
- "3": "%s/chr3.fa" % downloadDir,
- "4": "%s/chr4.fa" % downloadDir,
- "5": "%s/chr5.fa" % downloadDir,
- "6": "%s/chr6.fa" % downloadDir,
- "7": "%s/chr7.fa" % downloadDir,
- "8": "%s/chr8.fa" % downloadDir,
- "9": "%s/chr9.fa" % downloadDir,
- "10": "%s/chr10.fa" % downloadDir,
- "11": "%s/chr11.fa" % downloadDir,
- "12": "%s/chr12.fa" % downloadDir,
- "13": "%s/chr13.fa" % downloadDir,
- "14": "%s/chr14.fa" % downloadDir,
- "15": "%s/chr15.fa" % downloadDir,
- "16": "%s/chr16.fa" % downloadDir,
- "17": "%s/chr17.fa" % downloadDir,
- "18": "%s/chr18.fa" % downloadDir,
- "19": "%s/chr19.fa" % downloadDir,
- "20": "%s/chr20.fa" % downloadDir,
- "21": "%s/chr21.fa" % downloadDir,
- "22": "%s/chr22.fa" % downloadDir,
- "X": "%s/chrX.fa" % downloadDir,
- "Y": "%s/chrY.fa" % downloadDir
- }
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Adding gene entries"
- loadGeneEntries(db, genePath, chromDict)
-
- print "Adding gene features"
- loadGeneFeatures(db, genePath, chromDict)
-
- print "Adding gene annotations"
- loadGeneAnnotations(db)
-
- print "Adding gene ontology"
- loadGeneOntology(db, goPath, goDefPath)
-
- for chromID in chromDict.keys():
- print "Loading chromosome %s" % chromID
- loadChromosome(db, chromID, chromDict[chromID], "/H_sapiens/chromo%s.bin" % chromID)
-
- print "Creating Indices"
- createDBindices(db)
-
- print "Finished creating database %s" % db
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot
-def loadChromosome(db, chromPath, chromOutPath):
- seqArray = []
- seqLen = 0
- mdGenome = Genome("mdomestica", dbFile=db)
- inFile = open(chromPath, "r")
- header = inFile.readline()
- while header != "":
- seqArray = []
- seqLen = 0
- chromID = header.strip()[1:]
- currentLine = inFile.readline()
- while currentLine != "" and currentLine[0] != ">":
- lineSeq = currentLine.strip()
- seqLen += len(lineSeq)
- seqArray.append(lineSeq)
- currentLine = inFile.readline()
+def buildMdomesticaDB(db=geneDB):
+ genePath = "%s/download/mondom/genscan.txt" % cisRoot
+ chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
+ chromoOutPath = "/M_domestica/"
- seq = string.join(seqArray, "")
- if seqLen < 500000:
- print "Added contig %s to database" % chromID
- mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
- mdGenome.addChromosomeEntry(chromID, chromID, "db")
- else:
- outFileName = "%s%s.bin" % (chromOutPath, chromID)
- outFile = open("%s%s" % (cisRoot, outFileName), "w")
- outFile.write(seq)
- outFile.close()
- print "Added contig file %s to database" % outFileName
- mdGenome.addChromosomeEntry(chromID, outFileName, "file")
+ print "Creating database %s" % db
+ createDBFile(db)
- header = currentLine
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
- inFile.close()
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath)
+
+ print "Loading chromosomes"
+ loadChromosome(db, chromoPath, chromoOutPath)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ mdGenome = Genome("mdomestica", dbFile=db)
+ mdGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
- """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
- """
+ #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
geneEntries = []
mdGenome = Genome("mdomestica", dbFile=db)
geneFile = open(gFile, "r")
def loadGeneAnnotations(db, annotPath):
geneAnnotations = []
- annotFile = open(annotPath, "r")
+ annotFile = open(annotPath, "r")
mdGenome = Genome("mdomestica", dbFile=db)
for line in annotFile:
try:
mdGenome.addFeatureEntryBatch(insertArray)
-def createDBFile(db):
- mdGenome = Genome("mdomestica", dbFile=db)
- mdGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+ seqArray = []
+ seqLen = 0
mdGenome = Genome("mdomestica", dbFile=db)
- mdGenome.createIndices()
-
-
-def buildMdomesticaDB(db=geneDB):
- genePath = "%s/download/mondom/genscan.txt" % cisRoot
- chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
- chromoOutPath = "/M_domestica/"
-
- print "Creating database %s" % db
- createDBFile(db)
+ inFile = open(chromPath, "r")
+ header = inFile.readline()
+ while header != "":
+ seqArray = []
+ seqLen = 0
+ chromID = header.strip()[1:]
+ currentLine = inFile.readline()
+ while currentLine != "" and currentLine[0] != ">":
+ lineSeq = currentLine.strip()
+ seqLen += len(lineSeq)
+ seqArray.append(lineSeq)
+ currentLine = inFile.readline()
- print "Adding gene entries"
- loadGeneEntries(db, genePath)
+ seq = string.join(seqArray, "")
+ if seqLen < 500000:
+ print "Added contig %s to database" % chromID
+ mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
+ mdGenome.addChromosomeEntry(chromID, chromID, "db")
+ else:
+ outFileName = "%s%s.bin" % (chromOutPath, chromID)
+ outFile = open("%s%s" % (cisRoot, outFileName), "w")
+ outFile.write(seq)
+ outFile.close()
+ print "Added contig file %s to database" % outFileName
+ mdGenome.addChromosomeEntry(chromID, outFileName, "file")
- print "Adding gene features"
- loadGeneFeatures(db, genePath)
+ header = currentLine
- print "Loading chromosomes"
- loadChromosome(db, chromoPath, chromoOutPath)
+ inFile.close()
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ mdGenome = Genome("mdomestica", dbFile=db)
+ mdGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/M_musculus/mmusculus.genedb" % cisRoot
-def loadChromosome(db, chromID, chromPath, chromOut):
- seqArray = []
- mmGenome = Genome("mmusculus", dbFile=db)
- inFile = open(chromPath, "r")
- line = inFile.readline()
- for line in inFile:
- seqArray.append(line.strip())
+def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+ genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
+ goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
+ goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
+ # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
+ # but ignoring all random chromosomes
+ chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+ "11", "12", "13", "14", "15", "16", "17", "18", "19",
+ "X", "Y", "M"
+ ]
- seq = string.join(seqArray, "")
- seqLen = len(seq)
- if seqLen < 1:
- print "Problems reading sequence from file"
+ print "Creating database %s" % db
+ createDBFile(db)
- print "writing to file %s" % chromOut
- outFile = open("%s%s" % (cisRoot, chromOut), "w")
- outFile.write(seq)
- outFile.close()
- mmGenome.addChromosomeEntry(chromID, chromOut, "file")
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath, chromList)
+
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath, chromList)
+
+ print "Adding gene annotations"
+ loadGeneAnnotations(db)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath, goDefPath)
+
+ for chromID in chromList:
+ print "Loading chromosome %s" % chromID
+ chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
+ loadChromosome(db, chromID, chromPath, "/M_musculus/chromo%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ mmGenome = Genome("mmusculus", dbFile=db)
+ mmGenome.createGeneDB(db)
def loadGeneEntries(db, gFile, cDict):
- """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
- """
+ #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
geneEntries = []
alreadySeen = []
mmGenome = Genome("mmusculus", dbFile=db)
synonyms = idb.geneIDSynonyms(gID)
if len(synonyms) >0:
for entry in synonyms:
- gene_name += ","
+ gene_name += ","
gene_name += entry
else:
gene_name = " "
mmGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
- mmGenome = Genome("mmusculus", dbFile=db)
- mmGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
mmGenome = Genome("mmusculus", dbFile=db)
- mmGenome.createIndices()
-
-
-def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
- genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
- goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
- goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
- # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
- # but ignoring all random chromosomes
- chromDict = {"1": "%s/chr1.fa" % downloadDir,
- "2": "%s/chr2.fa" % downloadDir,
- "3": "%s/chr3.fa" % downloadDir,
- "4": "%s/chr4.fa" % downloadDir,
- "5": "%s/chr5.fa" % downloadDir,
- "6": "%s/chr6.fa" % downloadDir,
- "7": "%s/chr7.fa" % downloadDir,
- "8": "%s/chr8.fa" % downloadDir,
- "9": "%s/chr9.fa" % downloadDir,
- "10": "%s/chr10.fa" % downloadDir,
- "11": "%s/chr11.fa" % downloadDir,
- "12": "%s/chr12.fa" % downloadDir,
- "13": "%s/chr13.fa" % downloadDir,
- "14": "%s/chr14.fa" % downloadDir,
- "15": "%s/chr15.fa" % downloadDir,
- "16": "%s/chr16.fa" % downloadDir,
- "17": "%s/chr17.fa" % downloadDir,
- "18": "%s/chr18.fa" % downloadDir,
- "19": "%s/chr19.fa" % downloadDir,
- "X": "%s/chrX.fa" % downloadDir,
- "Y": "%s/chrY.fa" % downloadDir,
- "M": "%s/chrM.fa" % downloadDir
- }
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Adding gene entries"
- loadGeneEntries(db, genePath, chromDict)
-
- print "Adding gene features"
- loadGeneFeatures(db, genePath, chromDict)
-
- print "Adding gene annotations"
- loadGeneAnnotations(db)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
- print "Adding gene ontology"
- loadGeneOntology(db, goPath, goDefPath)
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
- for chromID in chromDict.keys():
- print "Loading chromosome %s" % chromID
- loadChromosome(db, chromID, chromDict[chromID], "/M_musculus/chromo%s.bin" % chromID)
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ mmGenome.addChromosomeEntry(chromID, chromOut, "file")
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ mmGenome = Genome("mmusculus", dbFile=db)
+ mmGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/R_norvegicus/rnorvegicus.genedb" % cisRoot
-def loadChromosome(db, chromID, chromPath, chromOut):
- seqArray = []
- rnGenome = Genome("rnorvegicus", dbFile=db)
- inFile = open(chromPath, "r")
- line = inFile.readline()
- for line in inFile:
- seqArray.append(line.strip())
+def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+ genePath = "%s/seq_gene.md" % downloadDir
+ goDefPath = "%s/GO.terms_and_ids" % downloadDir
+ goPath = "%s/gene2go" % downloadDir
- seq = string.join(seqArray, "")
- seqLen = len(seq)
- if seqLen < 1:
- print "Problems reading sequence from file"
+ print "Creating database %s" % db
+ createDBFile(db)
- print "writing to file %s" % chromOut
- outFile = open("%s%s" % (cisRoot, chromOut), "w")
- outFile.write(seq)
- outFile.close()
- rnGenome.addChromosomeEntry(chromID, chromOut, "file")
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
+
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath)
+
+ print "Adding gene annotations"
+ loadGeneAnnotations(db)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath, goDefPath)
+
+ # ignoring all random chromosomes
+ chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+ "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+ "Un", "X", "M"
+ ]
+ for chromID in chromList:
+ print "Loading chromosome %s" % chromID
+ chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
+ loadChromosome(db, chromID, chromPath, "/R_norvegicus/chromo%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ rnGenome = Genome("rnorvegicus", dbFile=db)
+ rnGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
- """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
- """
+ #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
geneEntries = []
rnGenome = Genome("rnorvegicus", dbFile=db)
geneFile = open(gFile, "r")
def loadGeneAnnotations(db):
geneAnnotations = []
- idb = geneinfoDB()
+ idb = geneinfoDB()
rnGenome = Genome("rnorvegicus", dbFile=db)
gidList = rnGenome.allGIDs()
for locID in gidList:
synonyms = idb.geneIDSynonyms(gID)
if len(synonyms) >0:
for entry in synonyms:
- gene_name += ","
+ gene_name += ","
gene_name += entry
else:
gene_name = " "
rnGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
- rnGenome = Genome("rnorvegicus", dbFile=db)
- rnGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
rnGenome = Genome("rnorvegicus", dbFile=db)
- rnGenome.createIndices()
-
-
-def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
- genePath = "%s/seq_gene.md" % downloadDir
- goDefPath = "%s/GO.terms_and_ids" % downloadDir
- goPath = "%s/gene2go" % downloadDir
- # ignoring all random chromosomes
- chromos = {"1": "%s/chr1.fa" % downloadDir,
- "2": "%s/chr2.fa" % downloadDir,
- "3": "%s/chr3.fa" % downloadDir,
- "4": "%s/chr4.fa" % downloadDir,
- "5": "%s/chr5.fa" % downloadDir,
- "6": "%s/chr6.fa" % downloadDir,
- "7": "%s/chr7.fa" % downloadDir,
- "8": "%s/chr8.fa" % downloadDir,
- "9": "%s/chr9.fa" % downloadDir,
- "10": "%s/chr10.fa" % downloadDir,
- "11": "%s/chr11.fa" % downloadDir,
- "12": "%s/chr12.fa" % downloadDir,
- "13": "%s/chr13.fa" % downloadDir,
- "14": "%s/chr14.fa" % downloadDir,
- "15": "%s/chr15.fa" % downloadDir,
- "16": "%s/chr16.fa" % downloadDir,
- "17": "%s/chr17.fa" % downloadDir,
- "18": "%s/chr18.fa" % downloadDir,
- "19": "%s/chr19.fa" % downloadDir,
- "Un": "%s/chrUn.fa" % downloadDir,
- "X": "%s/chrX.fa" % downloadDir,
- "20": "%s/chr20.fa" % downloadDir,
- "M": "%s/chrM.fa" % downloadDir
- }
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Adding gene entries"
- loadGeneEntries(db, genePath)
-
- print "Adding gene features"
- loadGeneFeatures(db, genePath)
-
- print "Adding gene annotations"
- loadGeneAnnotations(db)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
- print "Adding gene ontology"
- loadGeneOntology(db, goPath, goDefPath)
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
- for chromID in chromos.keys():
- print "Loading chromosome %s" % chromID
- loadChromosome(db, chromID, chromos[chromID], "/R_norvegicus/chromo%s.bin" % chromID)
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ rnGenome.addChromosomeEntry(chromID, chromOut, "file")
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ rnGenome = Genome("rnorvegicus", dbFile=db)
+ rnGenome.createIndices()
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/S_cerevisiae/scerevisiae.genedb" % cisRoot
-def loadChromosome(db, chromID, chromPath, chromOut):
- seqArray = []
- scGenome = Genome("scerevisiae", dbFile=db)
- inFile = open(chromPath, "r")
- line = inFile.readline()
- for line in inFile:
- seqArray.append(line.strip())
+def buildScerevisiaeDB(db=geneDB):
+ genePath = "%s/download/SGD_features.tab" % cisRoot
+ goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
+ goPath = "%s/download/gene_association.sgd" % cisRoot
- seq = string.join(seqArray, "")
- seqLen = len(seq)
- if seqLen < 1:
- print "Problems reading sequence from file"
+ print "Creating database %s" % db
+ createDBFile(db)
- print "writing to file %s" % chromOut
- outFile = open("%s%s" % (cisRoot, chromOut), "w")
- outFile.write(seq)
- outFile.close()
- seq = ""
- print "calling scGenome()"
- scGenome.addChromosomeEntry(chromID, chromOut, "file")
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
+
+ print "Adding gene annotations"
+ loadGeneAnnotations(db, genePath)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath, goDefPath)
+
+ for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]:
+ print "Loading chromosome %s" % chromID
+ chromPath = "%s/download/chr%s.fsa" % (cisRoot, chromID)
+ loadChromosome(db, chromID, chromPath, "/S_cerevisiae/chr%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
+ scGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
def loadGeneAnnotations(db, annotPath):
geneAnnotations = []
- annotFile = open(annotPath, "r")
+ annotFile = open(annotPath, "r")
lines = annotFile.readlines()
annotFile.close()
scGenome = Genome("scerevisiae", dbFile=db)
scGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
- scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
- scGenome.createGeneDB(db)
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
+ scGenome = Genome("scerevisiae", dbFile=db)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
+
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
+
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ seq = ""
+ print "calling scGenome()"
+ scGenome.addChromosomeEntry(chromID, chromOut, "file")
def createDBindices(db):
scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
scGenome.createIndices()
-
-
-def buildScerevisiaeDB(db=geneDB):
- genePath = "%s/download/SGD_features.tab" % cisRoot
- goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
- goPath = "%s/download/gene_association.sgd" % cisRoot
- chromos = {"1": "%s/download/chr01.fsa" % cisRoot,
- "2": "%s/download/chr02.fsa" % cisRoot,
- "3": "%s/download/chr03.fsa" % cisRoot,
- "4": "%s/download/chr04.fsa" % cisRoot,
- "5": "%s/download/chr05.fsa" % cisRoot,
- "6": "%s/download/chr06.fsa" % cisRoot,
- "7": "%s/download/chr07.fsa" % cisRoot,
- "8": "%s/download/chr08.fsa" % cisRoot,
- "9": "%s/download/chr09.fsa" % cisRoot,
- "10": "%s/download/chr10.fsa" % cisRoot,
- "11": "%s/download/chr11.fsa" % cisRoot,
- "12": "%s/download/chr12.fsa" % cisRoot,
- "13": "%s/download/chr13.fsa" % cisRoot,
- "14": "%s/download/chr14.fsa" % cisRoot,
- "15": "%s/download/chr15.fsa" % cisRoot,
- "16": "%s/download/chr16.fsa" % cisRoot
- }
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Adding gene entries"
- loadGeneEntries(db, genePath)
-
- print "Adding gene annotations"
- loadGeneAnnotations(db, genePath)
-
- print "Adding gene ontology"
- loadGeneOntology(db, goPath, goDefPath)
-
- for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]:
- print "Loading chromosome %s" % chromID
- loadChromosome(db, chromID, chromos[chromID], "/S_cerevisiae/chr%s.bin" % chromID)
-
- print "Creating Indices"
- createDBindices(db)
-
- print "Finished creating database %s" % db
\ No newline at end of file
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/S_purpuratus/spurpuratus.genedb" % cisRoot
+def buildSpurpuratusDB(db=geneDB):
+ chromoPath = "%s/download/Spur2.1_Nmasked.txt" % cisRoot
+ chromoOutPath = "/S_purpuratus/"
+
+ print "Creating database %s" % db
+ createDBFile(db)
+
+ print "Loading genomic sequence"
+ loadChromosome(db, chromoPath, chromoOutPath)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+def createDBFile(db):
+ spGenome = Genome("spurpuratus", version="2.1", dbFile=db)
+ spGenome.createGeneDB(db)
+
+
def loadChromosome(db, chromPath, chromOutPath):
seqArray = []
seqLen = 0
inFile.close()
-def createDBFile(db):
- spGenome = Genome("spurpuratus", version="2.1", dbFile=db)
- spGenome.createGeneDB(db)
-
-
def createDBindices(db):
spGenome = Genome("spurpuratus", version="2.1", dbFile=db)
spGenome.createIndices()
-
-
-def buildSpurpuratusDB(db=geneDB):
- chromoPath = "%s/download/Spur2.1_Nmasked.txt" % cisRoot
- chromoOutPath = "/S_purpuratus/"
-
- print "Creating database %s" % db
- createDBFile(db)
-
- print "Loading genomic sequence"
- loadChromosome(db, chromoPath, chromoOutPath)
-
- print "Creating Indices"
- createDBindices(db)
-
- print "Finished creating database %s" % db
\ No newline at end of file
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
-# Copyright (c) 2003-10 by: #
+# Copyright (c) 2003-13 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
from os import environ
if environ.get("CISTEMATIC_ROOT"):
- cisRoot = environ.get("CISTEMATIC_ROOT")
+ cisRoot = environ.get("CISTEMATIC_ROOT")
else:
cisRoot = "/proj/genome"
geneDB = "%s/X_tropicalis/xtropicalis.genedb" % cisRoot
-def loadChromosome(db, chromPath, chromOutPath):
- seqArray = []
- seqLen = 0
- xtGenome = Genome("xtropicalis", dbFile=db)
- inFile = open(chromPath, "r")
- header = inFile.readline()
- while header != "":
- seqArray = []
- seqLen = 0
- chromID = header.strip()[1:]
- currentLine = inFile.readline()
- while currentLine != "" and currentLine[0] != ">":
- lineSeq = currentLine.strip()
- seqLen += len(lineSeq)
- seqArray.append(lineSeq)
- currentLine = inFile.readline()
+def buildXtropicalisDB(db=geneDB):
+ genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
+ chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
+ chromoOutPath = "/X_tropicalis/"
- seq = string.join(seqArray, "")
- if seqLen < 500000:
- print "Added contig %s to database" % chromID
- xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
- xtGenome.addChromosomeEntry(chromID, chromID, "db")
- else:
- outFileName = "%s%s.bin" % (chromOutPath, chromID)
- outFile = open("%s%s" % (cisRoot, outFileName), "w")
- outFile.write(seq)
- outFile.close()
- print "Added contig file %s to database" % outFileName
- xtGenome.addChromosomeEntry(chromID, outFileName, "file")
+ print "Creating database %s" % db
+ createDBFile(db)
- header = currentLine
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
- inFile.close()
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath)
+
+ print "Loading sequences"
+ loadChromosome(db, chromoPath, chromoOutPath)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+ xtGenome = Genome("xtropicalis", dbFile=db)
+ xtGenome.createGeneDB(db)
def loadGeneEntries(db, gFile):
- """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
- """
+ #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
geneEntries = []
xtGenome = Genome("xtropicalis", dbFile=db)
geneFile = open(gFile, "r")
xtGenome.addGoInfoBatch(goArray)
-def createDBFile(db):
- xtGenome = Genome("xtropicalis", dbFile=db)
- xtGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+ seqArray = []
+ seqLen = 0
xtGenome = Genome("xtropicalis", dbFile=db)
- xtGenome.createIndices()
-
-
-def buildXtropicalisDB(db=geneDB):
- genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
- chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
- chromoOutPath = "/X_tropicalis/"
-
- print "Creating database %s" % db
- createDBFile(db)
+ inFile = open(chromPath, "r")
+ header = inFile.readline()
+ while header != "":
+ seqArray = []
+ seqLen = 0
+ chromID = header.strip()[1:]
+ currentLine = inFile.readline()
+ while currentLine != "" and currentLine[0] != ">":
+ lineSeq = currentLine.strip()
+ seqLen += len(lineSeq)
+ seqArray.append(lineSeq)
+ currentLine = inFile.readline()
- print "Adding gene entries"
- loadGeneEntries(db, genePath)
+ seq = string.join(seqArray, "")
+ if seqLen < 500000:
+ print "Added contig %s to database" % chromID
+ xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
+ xtGenome.addChromosomeEntry(chromID, chromID, "db")
+ else:
+ outFileName = "%s%s.bin" % (chromOutPath, chromID)
+ outFile = open("%s%s" % (cisRoot, outFileName), "w")
+ outFile.write(seq)
+ outFile.close()
+ print "Added contig file %s to database" % outFileName
+ xtGenome.addChromosomeEntry(chromID, outFileName, "file")
- print "Adding gene features"
- loadGeneFeatures(db, genePath)
+ header = currentLine
- print "Loading sequences"
- loadChromosome(db, chromoPath, chromoOutPath)
+ inFile.close()
- print "Creating Indices"
- createDBindices(db)
- print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+ xtGenome = Genome("xtropicalis", dbFile=db)
+ xtGenome.createIndices()
def regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
regionPasses = False
if regionPassesCriteria(regionFinder, region.numReads, region.foldRatio, regionLength):
+ #TODO: here is where the test dataset is failing
if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
regionPasses = True
if __name__ == "__main__":
- main(sys.argv)
+ main(sys.argv)
\ No newline at end of file