first pass cleanup of cistematic/genomes; change bamPreprocessing master
authorSean Upchurch <sau@caltech.edu>
Mon, 15 Jul 2013 19:19:18 +0000 (12:19 -0700)
committerSean Upchurch <sau@caltech.edu>
Mon, 15 Jul 2013 19:19:18 +0000 (12:19 -0700)
to query GID directly

21 files changed:
bamPreprocessing.py
cistematic/genomes/__init__.py
cistematic/genomes/athaliana.py
cistematic/genomes/btaurus.py
cistematic/genomes/cbrenneri.py
cistematic/genomes/cbriggsae.py
cistematic/genomes/celegans.py
cistematic/genomes/cfamiliaris.py
cistematic/genomes/cremanei.py
cistematic/genomes/dmelanogaster.py
cistematic/genomes/drerio.py
cistematic/genomes/ecaballus.py
cistematic/genomes/ggallus.py
cistematic/genomes/hsapiens.py
cistematic/genomes/mdomestica.py
cistematic/genomes/mmusculus.py
cistematic/genomes/rnorvegicus.py
cistematic/genomes/scerevisiae.py
cistematic/genomes/spurpuratus.py
cistematic/genomes/xtropicalis.py
findall.py

index dd5e8612001dd31f19d49470c956205c789f016c..9f8b6fa0e607114af40d95e73707922eabcd981b 100644 (file)
@@ -15,7 +15,7 @@ import sys
 import pysam
 import optparse
 from cistematic.genomes import Genome
-from commoncode import isSpliceEntry, getFeaturesByChromDict
+from commoncode import isSpliceEntry
 
 
 def main(argv=None):
@@ -35,7 +35,7 @@ def main(argv=None):
     samfile = pysam.Samfile(BAM, "rb" )
     readMultiplicityDict = getReadMultiplicity(samfile)
 
-    counts = getReadCounts(samfile, readMultiplicityDict)
+    counts, readLength = getReadCounts(samfile, readMultiplicityDict)
     outheader = buildHeader(samfile.header, counts)
     if options.markGID:
         genome = Genome(options.genomeName, inRAM=True)
@@ -43,7 +43,6 @@ def main(argv=None):
             genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
 
         print "getting gene features...."
-        featuresByChromDict = getFeaturesByChromDict(genome)    
         outheader['CO'].append(options.genomeComment)
 
     outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
@@ -52,11 +51,13 @@ def main(argv=None):
         if options.markGID:
             chrom = samfile.getrname(alignedread.tid)
             chromNum = chrom[3:]
-            geneModelFlag = "NM"
-            for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
-                if start < alignedread.pos and stop > alignedread.pos:
-                    geneModelFlag = gid
-                    continue
+            #new direct query
+            geneFeatures = genome.getFeaturesIntersecting(chromNum, alignedread.pos, readLength)
+            try:
+                (name, version, chromosome, start, stop, orientation, atype) = geneFeatures[0]
+                geneModelFlag = name
+            except IndexError:
+                geneModelFlag = "NM"
 
         ID = getReadID(alignedread)
         try:
@@ -156,7 +157,7 @@ def getReadCounts(samfile, readMultiplicityDict):
               "ReadLength\t%d" % readLength
     ]
 
-    return counts
+    return counts, readLength
 
 
 def buildHeader(templateheader, commentList):
index 4dee9248836da4133204428fd6a476ad759ce8a4..81ba4ec2665ca7ef56cdfa03ebba5a096ce3333a 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -562,7 +562,7 @@ class Genome:
                 results.append((name, version, chromosome, start, stop, orientation, atype))
 
         # select all features on chromosome that have a "start" between start and stop
-        stmt = 'chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
+        stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
         res = self.queryDB(stmt, fetchall=True)
         for entry in res:
             (chromosome, start, stop, orientation, name, version, atype) = entry
index 628be4dc4789ec61f8775429fd1d261020209063..9d2424639de8466d65510ce26ea0bf9402c5021c 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,10 +33,10 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
-    
+
 geneDB = "%s/A_thaliana/athaliana.genedb" % cisRoot
 
 chromSize = {"1": 30432563,
@@ -83,29 +83,40 @@ def decodeGFF3(cols):
     return (fType, gid, chrom, start, stop, sense, otherDict)
 
 
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    atGenome = Genome("athaliana", dbFile=db)
+def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+    genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
+    annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
+    goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir
 
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+    print "Creating database %s" % db
+    createDBFile(db)
 
-    seq = string.join(seqArray,"")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
 
-    print "writing to file %s" % (chromOut)
-    outFile = open(cisRoot + chromOut, "w")
-    outFile.write(seq)
-    outFile.close()
-    seq = ""
+    print "Adding feature entries"
+    loadFeatureEntries(db, genePath)
 
-    atGenome.addChromosomeEntry(chromID, chromOut, "file")
-    # Add alternative chromID - should be A-O and 01-09
-    atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")
+    print "Adding gene annotations"
+    loadGeneAnnotations(db, annotPath)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath)
+
+    for chromID in ["1", "2", "3", "4", "5", "C", "M"]:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/chr%s.fas" % (downloadDir, chromID)
+        loadChromosome(db, chromID, chromPath,  "/A_thaliana/chr%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    atGenome = Genome("athaliana", dbFile=db)
+    atGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
@@ -133,12 +144,13 @@ def loadFeatureEntries(db, gFile):
     featureEntries = []
     trackedGenes = []
     atGenome = Genome("athaliana", dbFile=db)
-    featureTranslation = {"CDS": "CDS", 
+    featureTranslation = {"CDS": "CDS",
                           "three_prime_UTR": "3UTR",
                           "five_prime_UTR": "5UTR",
                           "miRNA": "5UTR",
                           "exon": "5UTR"
     }
+
     geneFile = open(gFile, "r")
     for line in geneFile:
         fields = line.split("\t")
@@ -178,7 +190,7 @@ def loadFeatureEntries(db, gFile):
 
 def loadGeneAnnotations(db, annotPath):
     geneAnnotations = []
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
     annotFile.readline()
     lines = annotFile.readlines()
     annotFile.close()
@@ -220,50 +232,31 @@ def loadGeneOntology(db, goPath):
     atGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    atGenome = Genome("athaliana", dbFile=db)
-    atGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
     atGenome = Genome("athaliana", dbFile=db)
-    atGenome.createIndices()
-
-
-def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
-    genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
-    annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
-    goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir
 
-    chromos = {"1": "%s/chr1.fas" % downloadDir,
-               "2": "%s/chr2.fas" % downloadDir,
-               "3": "%s/chr3.fas" % downloadDir,
-               "4": "%s/chr4.fas" % downloadDir,
-               "5": "%s/chr5.fas" % downloadDir,
-               "C": "%s/chrC.fas" % downloadDir,
-               "M": "%s/chrM.fas" % downloadDir
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
-
-    print "Adding feature entries"
-    loadFeatureEntries(db, genePath)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
 
-    print "Adding gene annotations"
-    loadGeneAnnotations(db, annotPath)
+    seq = string.join(seqArray,"")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
 
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath)
+    print "writing to file %s" % (chromOut)
+    outFile = open(cisRoot + chromOut, "w")
+    outFile.write(seq)
+    outFile.close()
+    seq = ""
 
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID],  "/A_thaliana/chr%s.bin" % chromID)
+    atGenome.addChromosomeEntry(chromID, chromOut, "file")
+    # Add alternative chromID - should be A-O and 01-09
+    atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    atGenome = Genome("athaliana", dbFile=db)
+    atGenome.createIndices()
index c8a01b60d3978d0ebe56272b7f4b87edb62f8a5f..d703bf5f28b13e2b6a80e0216039384a242f8a6c 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,52 +33,44 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
-geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot 
+geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot
 
 
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    btGenome = Genome("btaurus", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
+def buildBtaurusDB(db=geneDB):
+    genePath = "%s/download/bt2/genscan.txt" % cisRoot
+    chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
+    chromoOutPath = "/B_taurus/"
 
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+    print "Creating database %s" % db
+    createDBFile(db)
 
-        seq = string.join(seqArray, "")
-        if seqLen < 500000:
-            print "Added contig %s to database" % chromID
-            btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
-            btGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            btGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
 
-        header = currentLine
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
 
-    inFile.close()
+    print "Loading sequences"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    btGenome = Genome("btaurus",  dbFile=db)
+    btGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
     geneEntries = []
     btGenome = Genome("btaurus", dbFile=db)
     geneFile = open(gFile, "r")
@@ -103,24 +95,6 @@ def loadGeneEntries(db, gFile):
     btGenome.addGeneEntryBatch(geneEntries)
 
 
-def loadGeneAnnotations(db, annotPath):
-    geneAnnotations = []
-    annotFile = open(annotPath, "r")
-    btGenome = Genome("btaurus", dbFile=db)
-    for line in annotFile:
-        try:
-            cols = line.split("\t")
-            locID = cols[0]
-            geneDesc = cols[6]
-            if len(locID) > 0:
-                geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
-        except:
-            pass
-
-    print "Adding %d annotations" % len(geneAnnotations)
-    btGenome.addAnnotationBatch(geneAnnotations)
-
-
 def loadGeneFeatures(db, gfile):
     geneFile = open(gfile, "r")
     senseArray = {"+": "F",
@@ -194,11 +168,29 @@ def loadGeneFeatures(db, gfile):
     btGenome.addFeatureEntryBatch(insertArray)
 
 
+def loadGeneAnnotations(db, annotPath):
+    geneAnnotations = []
+    annotFile = open(annotPath, "r")
+    btGenome = Genome("btaurus", dbFile=db)
+    for line in annotFile:
+        try:
+            cols = line.split("\t")
+            locID = cols[0]
+            geneDesc = cols[6]
+            if len(locID) > 0:
+                geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
+        except:
+            pass
+
+    print "Adding %d annotations" % len(geneAnnotations)
+    btGenome.addAnnotationBatch(geneAnnotations)
+
+
 def loadGeneOntology(db, goPath, goDefPath, annotPath):
     btGenome = Genome("btaurus", dbFile=db)
     goDefFile = open(goDefPath, "r")
     goFile = open(goPath, "r")
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
     annotEntries = annotFile.readlines()
     annotFile.close()
     goDefEntries = goDefFile.readlines()
@@ -238,34 +230,42 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath):
     btGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    btGenome = Genome("btaurus",  dbFile=db)
-    btGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
     btGenome = Genome("btaurus", dbFile=db)
-    btGenome.createIndices()
-
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
 
-def buildBtaurusDB(db=geneDB):
-    genePath = "%s/download/bt2/genscan.txt" % cisRoot
-    chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
-    chromoOutPath = "/B_taurus/"
-    
-    print "Creating database %s" % db
-    createDBFile(db)
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
 
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+        seq = string.join(seqArray, "")
+        if seqLen < 500000:
+            print "Added contig %s to database" % chromID
+            btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
+            btGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            btGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+        header = currentLine
 
-    print "Loading sequences" 
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    btGenome = Genome("btaurus", dbFile=db)
+    btGenome.createIndices()
index a227faacf6a5c73b6b48512c2e917875fb8d6d9c..7a338a9779a0f97983130f6a5846c03456bb3d34 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,48 +33,39 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/C_brenneri/cbrenneri.genedb" % cisRoot
 
 
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    cbGenome = Genome("cbrenneri", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromHeader = header.strip()[1:].split()
-        chromID = chromHeader[0]
-        currentLine = inFile.readline()
+def buildCbrenneriDB(db=geneDB):
+    gffPath = "%s/download/PB2801_2007feb09.gff" % cisRoot # using EMS special version
+    chromoPath = "%s/download/PB2801_supercontigs.fa" % cisRoot
+    chromoOutPath = "/C_brenneri/"
 
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+    print "Creating database %s" % db
+    createDBFile(db)
 
-        seq = string.join(seqArray, "")
-        if seqLen < 100000:
-            print "Added contig %s to database" % chromID
-            cbGenome.addSequence(("cbrenneri", chromID), seq, "chromosome", str(seqLen))
-            cbGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open( "%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            cbGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, gffPath)
 
-        header = currentLine
+    print "Adding feature entries"
+    loadFeatureEntries(db, gffPath)
 
-    inFile.close()
+    print "Loading genomic sequence"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    cbGenome = Genome("cbrenneri", version="PB2801_001",  dbFile=db)
+    cbGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gffFile):
@@ -164,34 +155,43 @@ def loadFeatureEntries(db, gffFile):
     cbGenome.addFeatureEntryBatch(featureEntries)
 
 
-def createDBFile(db):
-    cbGenome = Genome("cbrenneri", version="PB2801_001",  dbFile=db)
-    cbGenome.createGeneDB(db)
-
-
-def createDBindices(db):
-    cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
-    cbGenome.createIndices()
-
-
-def buildCbrenneriDB(db=geneDB):
-    gffPath = "%s/download/PB2801_2007feb09.gff" % cisRoot # using EMS special version
-    chromoPath = "%s/download/PB2801_supercontigs.fa" % cisRoot
-    chromoOutPath = "/C_brenneri/"
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
+    cbGenome = Genome("cbrenneri", dbFile=db)
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromHeader = header.strip()[1:].split()
+        chromID = chromHeader[0]
+        currentLine = inFile.readline()
 
-    print "Creating database %s" % db
-    createDBFile(db)
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
 
-    print "Adding gene entries"
-    loadGeneEntries(db, gffPath)
+        seq = string.join(seqArray, "")
+        if seqLen < 100000:
+            print "Added contig %s to database" % chromID
+            cbGenome.addSequence(("cbrenneri", chromID), seq, "chromosome", str(seqLen))
+            cbGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open( "%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            cbGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    print "Adding feature entries"
-    loadFeatureEntries(db, gffPath)
+        header = currentLine
 
-    print "Loading genomic sequence" 
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
+    cbGenome.createIndices()
index b913728a739bd5a43fb8c5a7447f272dfb65678c..00468edd08ada512ed641255bc130fb0f4875b8d 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,46 +33,39 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/C_briggsae/cbriggsae.genedb" % cisRoot
 
 
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    cbGenome = Genome("cbriggsae", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+def buildCbriggsaeDB(db=geneDB):
+    gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
+    chromoPath = "%s/download/briggsae_25.fa" % cisRoot
+    chromoOutPath = "/C_briggsae/"
 
-        seq = string.join(seqArray, "")
-        if seqLen < 900000:
-            print "Added contig %s to database" % chromID
-            cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
-            cbGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            cbGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
 
-        header = currentLine
+    print "Adding gene entries"
+    loadGeneEntries(db, gffPath)
 
-    inFile.close()
+    print "Adding feature entries"
+    loadFeatureEntries(db, gffPath)
+
+    print "Loading genomic sequence"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    cbGenome = Genome("cbriggsae", version="CB25",  dbFile=db)
+    cbGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gffFile):
@@ -125,7 +118,7 @@ def loadFeatureEntries(db, gffFile):
     featureEntries = []
     seenFeatures = {}
     featureTranslation = {"coding_exon": "CDS",
-                          "three_prime_UTR": "3UTR", 
+                          "three_prime_UTR": "3UTR",
                           "five_prime_UTR": "5UTR"
     }
 
@@ -161,34 +154,41 @@ def loadFeatureEntries(db, gffFile):
     cbGenome.addFeatureEntryBatch(featureEntries)
 
 
-def createDBFile(db):
-    cbGenome = Genome("cbriggsae", version="CB25",  dbFile=db)
-    cbGenome.createGeneDB(db)
-
-
-def createDBindices(db):
-    cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
-    cbGenome.createIndices()
-
-
-def buildCbriggsaeDB(db=geneDB):
-    gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
-    chromoPath = "%s/download/briggsae_25.fa" % cisRoot
-    chromoOutPath = "/C_briggsae/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
+    cbGenome = Genome("cbriggsae", dbFile=db)
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
 
-    print "Adding gene entries"
-    loadGeneEntries(db, gffPath)
+        seq = string.join(seqArray, "")
+        if seqLen < 900000:
+            print "Added contig %s to database" % chromID
+            cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
+            cbGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            cbGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    print "Adding feature entries"
-    loadFeatureEntries(db, gffPath)
+        header = currentLine
 
-    print "Loading genomic sequence" 
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
+    cbGenome.createIndices()
index e3df297269ddba790fc7380c6010cce2cf876f3b..e9287966e5379ea1fdb7eb45c315b6459b4d2825 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,31 +33,54 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/C_elegans/celegans.genedb" % cisRoot
 
 
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    ceGenome = Genome("celegans", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildCelegansDB(db=geneDB, downloadRoot=""):
+    if downloadRoot == "":
+        downloadRoot = "%s/download/" % cisRoot
 
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    geneIDPath = "%sgeneIDs.WS200" % downloadRoot
+    goDefPath = "%sGO.terms_and_ids" % downloadRoot
+    goPath = "%sgene_association.wb" % downloadRoot
 
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    ceGenome.addChromosomeEntry(chromID, chromOut, "file")
+    # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
+    gffPath = "%selegansWS200.gff" % downloadRoot
+
+    print "Creating database %s" % db
+    createDBFile(db)
+
+    print "Adding gene entries"
+    loadGeneEntries(db, gffPath)
+
+    print "Adding feature entries"
+    loadFeatureEntries(db, gffPath)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db, geneIDPath)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath, geneIDPath)
+
+    # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
+    for chromID in ["I", "II", "III", "IV", "V", "X"]:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%sCHROMOSOME_%s_softmasked.dna" % (downloadRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/C_elegans/chr%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    ceGenome = Genome("celegans", version="WS200",  dbFile=db)
+    ceGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gffFile):
@@ -84,6 +107,7 @@ def loadGeneEntries(db, gffFile):
         else:
             gidGene = giddots[1]
             gidLetter = "a"
+
         gid = "%s.%s" % (giddots[0], gidGene)
         geneID = ("celegans", gid)
         gidVersion = 1
@@ -177,7 +201,7 @@ def loadFeatureEntries(db, gffFile):
 
 def loadGeneAnnotations(db, geneIDPath):
     geneAnnotations = []
-    geneIDFile = open(geneIDPath, "r")  
+    geneIDFile = open(geneIDPath, "r")
     lines = geneIDFile.readlines()
     geneIDFile.close()
     ceGenome = Genome("celegans", dbFile=db)
@@ -258,56 +282,26 @@ def loadGeneOntology(db, goPath, goDefPath, geneIDPath):
     ceGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    ceGenome = Genome("celegans", version="WS200",  dbFile=db)
-    ceGenome.createGeneDB(db)
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
+    ceGenome = Genome("celegans", dbFile=db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
+
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
+
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    ceGenome.addChromosomeEntry(chromID, chromOut, "file")
 
 
 def createDBindices(db):
     ceGenome = Genome("celegans", version="WS200", dbFile=db)
     ceGenome.createIndices()
-
-
-def buildCelegansDB(db=geneDB, downloadRoot=""):
-    if downloadRoot == "":
-        downloadRoot = "%s/download/" % cisRoot
-
-    geneIDPath = "%sgeneIDs.WS200" % downloadRoot
-    goDefPath = "%sGO.terms_and_ids" % downloadRoot
-    goPath = "%sgene_association.wb" % downloadRoot
-
-    # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
-    chromos = {"I": "%sCHROMOSOME_I_softmasked.dna" % downloadRoot,
-               "II": "%sCHROMOSOME_II_softmasked.dna" % downloadRoot,
-               "III": "%sCHROMOSOME_III_softmasked.dna" % downloadRoot,
-               "IV": "%sCHROMOSOME_IV_softmasked.dna" % downloadRoot,
-               "V": "%sCHROMOSOME_V_softmasked.dna" % downloadRoot,
-               "X": "%sCHROMOSOME_X_softmasked.dna" % downloadRoot
-    }
-
-    # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
-    gffPath = "%selegansWS200.gff" % downloadRoot
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries" 
-    loadGeneEntries(db, gffPath)
-
-    print "Adding feature entries" 
-    loadFeatureEntries(db, gffPath)
-
-    print "Adding gene annotations"
-    loadGeneAnnotations(db, geneIDPath)
-
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath, geneIDPath)
-
-    for chromID in chromos:
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/C_elegans/chr%s.bin" % chromID)
-
-    print "Creating Indices"
-    createDBindices(db)
-
-    print "Finished creating database %s" % db
\ No newline at end of file
index 75c31d1c9fa21e84708e5f1c3891cb09d1f589ac..12eda3008a9b31d8b13f25d263b6812198065dc9 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,36 +33,48 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/C_familiaris/cfamiliaris.genedb" % cisRoot
 
 
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    cfGenome = Genome("cfamiliaris", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildDogDB(db=geneDB):
+    genePath = "%s/download/seq_gene.md" % cisRoot
+    print "Creating database %s" % db
+    createDBFile(db)
 
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
 
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    cfGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+                 "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
+                 "31", "32", "33", "34", "35", "36", "37", "38", "X", "Un"
+    ]
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/C_familiaris/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    cfGenome = Genome("cfamiliaris",  dbFile=db)
+    cfGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
     geneEntries = []
     alreadySeen = []
     cfGenome = Genome("cfamiliaris", dbFile=db)
@@ -177,7 +189,7 @@ def loadGeneOntology(db, goPath, goDefPath):
                 synonyms = idb.geneIDSynonyms(gID)
                 if len(synonyms) >0:
                     for entry in synonyms:
-                        gene_name += ","   
+                        gene_name += ","
                         gene_name += entry
                 else:
                     gene_name = " "
@@ -191,74 +203,26 @@ def loadGeneOntology(db, goPath, goDefPath):
     cfGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    cfGenome = Genome("cfamiliaris",  dbFile=db)
-    cfGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
     cfGenome = Genome("cfamiliaris", dbFile=db)
-    cfGenome.createIndices()
-
-
-def buildDogDB(db=geneDB):
-    genePath = "%s/download/seq_gene.md" % cisRoot
-    chromos = {"1": "%s/download/chr1.fa" % cisRoot,
-               "2": "%s/download/chr2.fa" % cisRoot,
-               "3": "%s/download/chr3.fa" % cisRoot,
-               "4": "%s/download/chr4.fa" % cisRoot,
-               "5": "%s/download/chr5.fa" % cisRoot,
-               "6": "%s/download/chr6.fa" % cisRoot,
-               "7": "%s/download/chr7.fa" % cisRoot,
-               "8": "%s/download/chr8.fa" % cisRoot,
-               "9": "%s/download/chr9.fa" % cisRoot,
-               "10": "%s/download/chr10.fa" % cisRoot,
-               "11": "%s/download/chr11.fa" % cisRoot,
-               "12": "%s/download/chr12.fa" % cisRoot,
-               "13": "%s/download/chr13.fa" % cisRoot,
-               "14": "%s/download/chr14.fa" % cisRoot,
-               "15": "%s/download/chr15.fa" % cisRoot,
-               "16": "%s/download/chr16.fa" % cisRoot,
-               "17": "%s/download/chr17.fa" % cisRoot,
-               "18": "%s/download/chr18.fa" % cisRoot,
-               "19": "%s/download/chr19.fa" % cisRoot,
-               "20": "%s/download/chr20.fa" % cisRoot,
-               "21": "%s/download/chr21.fa" % cisRoot,
-               "22": "%s/download/chr22.fa" % cisRoot,
-               '23': "%s/download/chr23.fa" % cisRoot,
-               "24": "%s/download/chr24.fa" % cisRoot,
-               "25": "%s/download/chr25.fa" % cisRoot,
-               "26": "%s/download/chr26.fa" % cisRoot,
-               "27": "%s/download/chr27.fa" % cisRoot,
-               "28": "%s/download/chr28.fa" % cisRoot,
-               "29": "%s/download/chr29.fa" % cisRoot,
-               "30": "%s/download/chr30.fa" % cisRoot,
-               "31": "%s/download/chr31.fa" % cisRoot,
-               "32": "%s/download/chr32.fa" % cisRoot,
-               "33": "%s/download/chr33.fa" % cisRoot,
-               "34": "%s/download/chr34.fa" % cisRoot,
-               "35": "%s/download/chr35.fa" % cisRoot,
-               "36": "%s/download/chr36.fa" % cisRoot,
-               "37": "%s/download/chr37.fa" % cisRoot,
-               "38": "%s/download/chr38.fa" % cisRoot,
-               "X": "%s/download/chrX.fa" % cisRoot,
-               "Un": "%s/download/chrUn.fa" % cisRoot
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
 
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
 
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/C_familiaris/chromo%s.bin" % chromID)
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    cfGenome.addChromosomeEntry(chromID, chromOut, "file")
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    cfGenome = Genome("cfamiliaris", dbFile=db)
+    cfGenome.createIndices()
index 86e5991b112b69a095760b5eebf056de0621c0a1..3bc6b8f6a0dd9c7ab6aa61a8ce3bf8e00b21fd5c 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,51 +33,39 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/C_remanei/cremanei.genedb" % cisRoot
 
 
-def loadChromosomes(db, inPath, chromOutPath):
-    crGenome = Genome("cremanei", dbFile=db)
-    scontigList = os.listdir(inPath)
-    for scontig in scontigList:
-        seq = ''
-        seqArray = []
-        seqLen = 0
-        inFile = open("%s/%s" % (inPath, scontig), "r")
-        index = 0
-        header = inFile.readline()
-        chromID = header.strip()[1:]
-        while header != "":
-            seqArray = []
-            seqLen = 0
-            currentLine = inFile.readline()
-            while currentLine != "" and currentLine[0] != ">":
-                lineSeq = currentLine.strip()
-                seqLen += len(lineSeq)
-                seqArray.append(lineSeq)
-                currentLine = inFile.readline()
+def buildCremaneiDB(db=geneDB):
+    gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
+    chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
+    chromoOutPath = "/C_remanei/"
 
-            seq = string.join(seqArray, "")
-            if seqLen < 100000:
-                print "Added contig %s to database" % chromID
-                crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
-                crGenome.addChromosomeEntry(chromID, chromID, "db")
-            else:
-                outFileName = "%s%s.bin" % (chromOutPath, chromID)
-                outFile = open("%s%s" % (cisRoot, outFileName), "w")
-                outFile.write(seq)
-                outFile.close()
-                print "Added contig file %s to database" % outFileName
-                crGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
 
-            index += 1
-            header = currentLine
+    print "Adding gene entries"
+    loadGeneEntries(db, gffPath)
 
-        inFile.close()
+    print "Adding feature entries"
+    loadFeatureEntries(db, gffPath)
+
+    print "Loading genomic sequence"
+    loadChromosomes(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    crGenome = Genome("cremanei", version="CR20050824",  dbFile=db)
+    crGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gffFile):
@@ -155,34 +143,46 @@ def loadFeatureEntries(db, gffFile):
     crGenome.addFeatureEntryBatch(featureEntries)
 
 
-def createDBFile(db):
-    crGenome = Genome("cremanei", version="CR20050824",  dbFile=db)
-    crGenome.createGeneDB(db)
-
-
-def createDBindices(db):
-    crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
-    crGenome.createIndices()
-
-
-def buildCremaneiDB(db=geneDB):
-    gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
-    chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
-    chromoOutPath = "/C_remanei/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
+def loadChromosomes(db, inPath, chromOutPath):
+    crGenome = Genome("cremanei", dbFile=db)
+    scontigList = os.listdir(inPath)
+    for scontig in scontigList:
+        seq = ''
+        seqArray = []
+        seqLen = 0
+        inFile = open("%s/%s" % (inPath, scontig), "r")
+        index = 0
+        header = inFile.readline()
+        chromID = header.strip()[1:]
+        while header != "":
+            seqArray = []
+            seqLen = 0
+            currentLine = inFile.readline()
+            while currentLine != "" and currentLine[0] != ">":
+                lineSeq = currentLine.strip()
+                seqLen += len(lineSeq)
+                seqArray.append(lineSeq)
+                currentLine = inFile.readline()
 
-    print "Adding gene entries"
-    loadGeneEntries(db, gffPath)
+            seq = string.join(seqArray, "")
+            if seqLen < 100000:
+                print "Added contig %s to database" % chromID
+                crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
+                crGenome.addChromosomeEntry(chromID, chromID, "db")
+            else:
+                outFileName = "%s%s.bin" % (chromOutPath, chromID)
+                outFile = open("%s%s" % (cisRoot, outFileName), "w")
+                outFile.write(seq)
+                outFile.close()
+                print "Added contig file %s to database" % outFileName
+                crGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    print "Adding feature entries"
-    loadFeatureEntries(db, gffPath)
+            index += 1
+            header = currentLine
 
-    print "Loading genomic sequence" 
-    loadChromosomes(db, chromoPath, chromoOutPath)
+        inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
+    crGenome.createIndices()
index 5c9f53a58bbc4466689f9ee2dc40378db3367c10..78ef7fb7594b11314ef1b7602c2a03fcd7cd521d 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,7 +33,7 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
@@ -67,24 +67,47 @@ version = {"A": "1",
            "Z": "26"
 }
 
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    dmGenome = Genome("dmelanogaster", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
 
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+def buildDmelanogasterDB(db=geneDB):
+    """ genes and annotations are from UCSC. GO association file is from geneontology.org.
+    """
+    genePath = "%s/download/flyBaseGene.txt" % cisRoot
+    annotPath = "%s/download/gene_info" % cisRoot
+    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
+    goPath = "%s/download/gene2go" % cisRoot
 
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    dmGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
+
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db, annotPath)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath, annotPath)
+
+    chromList = ["2L", "2R", "2LHet", "2RHet", "3L", "3R", "3LHet", "3RHet",
+                 "4", "X", "XHet", "YHet", "U", "UExtra", "M"
+    ]
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/D_melanogaster/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    dmGenome = Genome("dmelanogaster", dbFile=db)
+    dmGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
@@ -196,7 +219,7 @@ def loadGeneFeatures(db, gfile):
 
 def loadGeneAnnotations(db, annotPath):
     geneAnnotations = []
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
     dmGenome = Genome("dmelanogaster", dbFile=db)
     for line in annotFile:
         try:
@@ -222,7 +245,7 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath):
     dmGenome = Genome("dmelanogaster", dbFile=db)
     goDefFile = open(goDefPath, "r")
     goFile = open(goPath, "r")
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
     annotEntries = annotFile.readlines()
     annotFile.close()
     goDefEntries = goDefFile.readlines()
@@ -271,60 +294,26 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath):
     dmGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
     dmGenome = Genome("dmelanogaster", dbFile=db)
-    dmGenome.createGeneDB(db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
+
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
+
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    dmGenome.addChromosomeEntry(chromID, chromOut, "file")
 
 
 def createDBindices(db):
     dmGenome = Genome("dmelanogaster", dbFile=db)
     dmGenome.createIndices()
-
-
-def buildDmelanogasterDB(db=geneDB):
-    """ genes and annotations are from UCSC. GO association file is from geneontology.org. 
-    """
-    genePath = "%s/download/flyBaseGene.txt" % cisRoot
-    annotPath = "%s/download/gene_info" % cisRoot
-    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
-    goPath = "%s/download/gene2go" % cisRoot
-    chromos = {"2L": "%s/download/chr2L.fa" % cisRoot,
-               "2R": "%s/download/chr2R.fa" % cisRoot,
-               "2LHet": "%s/download/chr2LHet.fa" % cisRoot,
-               "2RHet": "%s/download/chr2RHet.fa" % cisRoot,
-               "3L": "%s/download/chr3L.fa" % cisRoot,
-               "3LHet": "%s/download/chr3LHet.fa" % cisRoot,
-               "3R": "%s/download/chr3R.fa" % cisRoot,
-               "3RHet": "%s/download/chr3RHet.fa" % cisRoot,
-               "4": "%s/download/chr4.fa" % cisRoot,
-               "X": "%s/download/chrX.fa" % cisRoot,
-               "XHet": "%s/download/chrXHet.fa" % cisRoot,
-               "YHet": "%s/download/chrYHet.fa" % cisRoot,
-               "U": "%s/download/chrU.fa" % cisRoot,
-               "Uextra": "%s/download/chrUextra.fa" % cisRoot,
-               "M": "%s/download/chrM.fa" % cisRoot
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
-
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
-
-    print "Adding gene annotations"
-    loadGeneAnnotations(db, annotPath)
-
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath, annotPath)
-
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/D_melanogaster/chromo%s.bin" % chromID)
-
-    print "Creating Indices"
-    createDBindices(db)
-
-    print "Finished creating database %s" % db
\ No newline at end of file
index 7541db8d9043e2f4a082c73f6f938eb4ff639d9c..f26884eea87d0d463037a0a91f0ae768d974edbd 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -34,48 +34,40 @@ from cistematic.core.geneinfo import geneinfoDB
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/D_rerio/drerio.genedb" % cisRoot
 
 
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    drGenome = Genome("drerio", dbFile=db)
-    files = os.listdir(chromPath)
-    for filename in files:
-        inFile = open("%s/%s" % (chromPath, filename), "r")
-        header = inFile.readline()
-        while header != "":
-            seqArray = []
-            seqLen = 0
-            chromID = header.strip()[1:]
-            currentLine = inFile.readline()
-            while currentLine != "" and currentLine[0] != ">":
-                lineSeq = currentLine.strip()
-                seqLen += len(lineSeq)
-                seqArray.append(lineSeq)
-                currentLine = inFile.readline()
+def buildDrerioDB(db=geneDB):
+    """ genes and annotations are from UCSC (dr3).
+    """
+    #genePath = "%s/download/xenoRefFlat.txt" % cisRoot
+    chromoPath = "%s/download/dr3" % cisRoot
+    chromoOutPath = "/D_rerio/"
+    print "Creating database %s" % db
+    createDBFile(db)
 
-            seq = string.join(seqArray, "")
-            if seqLen < 250000:
-                print "Added contig %s to database" % chromID
-                drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen))
-                drGenome.addChromosomeEntry(chromID, chromID, "db")
-            else:
-                outFileName = "%s%s.bin" % (chromOutPath, chromID)
-                outFile = open("%s%s" % (cisRoot, outFileName), "w")
-                outFile.write(seq)
-                outFile.close()
-                print "Added contig file %s to database" % outFileName
-                drGenome.addChromosomeEntry(chromID, outFileName, "file")
+    #print "Adding gene entries"
+    #loadGeneEntries(db, genePath)
 
-            header = currentLine
+    #print "Adding gene features"
+    #loadGeneFeatures(db, genePath)
 
-        inFile.close()
+    print "Loading chromosomes"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    drGenome = Genome("drerio",  dbFile=db)
+    drGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
@@ -216,35 +208,43 @@ def loadGeneFeatures(db, gfile):
     drGenome.addFeatureEntryBatch(insertArray)
 
 
-def createDBFile(db):
-    drGenome = Genome("drerio",  dbFile=db)
-    drGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
     drGenome = Genome("drerio", dbFile=db)
-    drGenome.createIndices()
-
-
-def buildDrerioDB(db=geneDB):
-    """ genes and annotations are from UCSC (dr3). 
-    """
-    #genePath = "%s/download/xenoRefFlat.txt" % cisRoot
-    chromoPath = "%s/download/dr3" % cisRoot
-    chromoOutPath = "/D_rerio/"
-    print "Creating database %s" % db
-    createDBFile(db)
+    files = os.listdir(chromPath)
+    for filename in files:
+        inFile = open("%s/%s" % (chromPath, filename), "r")
+        header = inFile.readline()
+        while header != "":
+            seqArray = []
+            seqLen = 0
+            chromID = header.strip()[1:]
+            currentLine = inFile.readline()
+            while currentLine != "" and currentLine[0] != ">":
+                lineSeq = currentLine.strip()
+                seqLen += len(lineSeq)
+                seqArray.append(lineSeq)
+                currentLine = inFile.readline()
 
-    #print "Adding gene entries"
-    #loadGeneEntries(db, genePath)
+            seq = string.join(seqArray, "")
+            if seqLen < 250000:
+                print "Added contig %s to database" % chromID
+                drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen))
+                drGenome.addChromosomeEntry(chromID, chromID, "db")
+            else:
+                outFileName = "%s%s.bin" % (chromOutPath, chromID)
+                outFile = open("%s%s" % (cisRoot, outFileName), "w")
+                outFile.write(seq)
+                outFile.close()
+                print "Added contig file %s to database" % outFileName
+                drGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    #print "Adding gene features"
-    #loadGeneFeatures(db, genePath)
+            header = currentLine
 
-    print "Loading chromosomes"
-    loadChromosome(db, chromoPath, chromoOutPath)
+        inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    drGenome = Genome("drerio", dbFile=db)
+    drGenome.createIndices()
index e845c426500550ef1169df4a3d4bf33fc99690d8..f2ac6d0da6369476e726500af3de550f638f2f6f 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,36 +33,49 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/E_caballus/ecaballus.genedb" % cisRoot
 
 
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    ecGenome = Genome("ecaballus", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildHorseDB(db=geneDB):
+    genePath = "%s/download/seq_gene.md" % cisRoot
 
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Creating database %s" % db
+    createDBFile(db)
 
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    ecGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+                 "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
+                 "31", "M", "X", "Un"
+    ]
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/E_caballus/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    ecGenome = Genome("ecaballus",  dbFile=db)
+    ecGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
     geneEntries = []
     alreadySeen = []
     ecGenome = Genome("ecaballus", dbFile=db)
@@ -131,68 +144,26 @@ def loadGeneFeatures(db, gFile):
     ecGenome.addFeatureEntryBatch(featureEntries)
 
 
-def createDBFile(db):
-    ecGenome = Genome("ecaballus",  dbFile=db)
-    ecGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
     ecGenome = Genome("ecaballus", dbFile=db)
-    ecGenome.createIndices()
-
-
-def buildHorseDB(db=geneDB):
-    genePath = "%s/download/seq_gene.md" % cisRoot
-    chromos = {"1": "%s/download/chr1.fa" % cisRoot,
-               "2": "%s/download/chr2.fa" % cisRoot,
-               "3": "%s/download/chr3.fa" % cisRoot,
-               "4": "%s/download/chr4.fa" % cisRoot,
-               "5": "%s/download/chr5.fa" % cisRoot,
-               "6": "%s/download/chr6.fa" % cisRoot,
-               "7": "%s/download/chr7.fa" % cisRoot,
-               "8": "%s/download/chr8.fa" % cisRoot,
-               "9": "%s/download/chr9.fa" % cisRoot,
-               "10": "%s/download/chr10.fa" % cisRoot,
-               "11": "%s/download/chr11.fa" % cisRoot,
-               "12": "%s/download/chr12.fa" % cisRoot,
-               "13": "%s/download/chr13.fa" % cisRoot,
-               "14": "%s/download/chr14.fa" % cisRoot,
-               "15": "%s/download/chr15.fa" % cisRoot,
-               "16": "%s/download/chr16.fa" % cisRoot,
-               "17": "%s/download/chr17.fa" % cisRoot,
-               "18": "%s/download/chr18.fa" % cisRoot,
-               "19": "%s/download/chr19.fa" % cisRoot,
-               "20": "%s/download/chr20.fa" % cisRoot,
-               "21": "%s/download/chr21.fa" % cisRoot,
-               "22": "%s/download/chr22.fa" % cisRoot,
-               "23": "%s/download/chr23.fa" % cisRoot,
-               "24": "%s/download/chr24.fa" % cisRoot,
-               "25": "%s/download/chr25.fa" % cisRoot,
-               "26": "%s/download/chr26.fa" % cisRoot,
-               "27": "%s/download/chr27.fa" % cisRoot,
-               "28": "%s/download/chr28.fa" % cisRoot,
-               "29": "%s/download/chr29.fa" % cisRoot,
-               "30": "%s/download/chr30.fa" % cisRoot,
-               "31": "%s/download/chr31.fa" % cisRoot,
-               "M": "%s/download/chrM.fa" % cisRoot,
-               "X": "%s/download/chrX.fa" % cisRoot,
-               "Un": "%s/download/chrUn.fa" % cisRoot
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
 
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
 
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/E_caballus/chromo%s.bin" % chromID)
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    ecGenome.addChromosomeEntry(chromID, chromOut, "file")
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    ecGenome = Genome("ecaballus", dbFile=db)
+    ecGenome.createIndices()
index 551a116654fbf1cd5a72cf944ba253ca3f777eed..eb0f5f30eae656dbf5fcd936030fd1f04f0d7a72 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -34,36 +34,62 @@ from cistematic.core.geneinfo import geneinfoDB
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/G_gallus/ggallus.genedb" % cisRoot
 
 
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    ggGenome = Genome("ggallus", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildChickenDB(db=geneDB):
+    genePath = "%s/download/seq_gene.md" % cisRoot
+    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
+    goPath = "%s/download/gene2go" % cisRoot # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
 
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Creating database %s" % db
+    createDBFile(db)
 
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    ggGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
+    
+    #print "Adding gene annotations"
+    #loadGeneAnnotations(db, annotPath)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+                 "21", "22", "23", "24", "25", "26", "27", "28",
+                 "32", "W", "Z", "M", "E22C19W28_E50C23", "E64",
+                 "1_random", "2_random", "4_random", "5_random", "6_random",
+                 "7_random", "8_random", "10_random", "11_random", "12_random",
+                 "13_random", "16_random", "17_random", "18_random", "20_random",
+                 "22_random", "25_random", "28_random", "Un_random", "W_random",
+                 "E64_random", "Z_random", "E22C19W28_E50C23_random"
+    ]
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/G_gallus/chromo%s.bin" % chromID)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    ggGenome = Genome("ggallus",  dbFile=db)
+    ggGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
     geneEntries = []
     alreadySeen = []
     ggGenome = Genome("ggallus", dbFile=db)
@@ -138,6 +164,26 @@ def loadGeneFeatures(db, gFile):
     ggGenome.addFeatureEntryBatch(featureEntries)
 
 
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
+    ggGenome = Genome("ggallus", dbFile=db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
+
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
+
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    ggGenome.addChromosomeEntry(chromID, chromOut, "file")
+
+
 def loadGeneOntology(db, goPath, goDefPath):
     ggGenome = Genome("ggallus", dbFile=db)
     goDefFile = open(goDefPath, "r")
@@ -176,99 +222,6 @@ def loadGeneOntology(db, goPath, goDefPath):
     ggGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    ggGenome = Genome("ggallus",  dbFile=db)
-    ggGenome.createGeneDB(db)
-
-
 def createDBindices(db):
     ggGenome = Genome("ggallus", dbFile=db)
     ggGenome.createIndices()
-
-
-def buildChickenDB(db=geneDB):
-    genePath = "%s/download/seq_gene.md" % cisRoot
-    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
-    goPath = "%s/download/gene2go" % cisRoot # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
-    chromos = {"1": "%s/download/chr1.fa" % cisRoot,
-               "2": "%s/download/chr2.fa" % cisRoot,
-               "3": "%s/download/chr3.fa" % cisRoot,
-               "4": "%s/download/chr4.fa" % cisRoot,
-               "5": "%s/download/chr5.fa" % cisRoot,
-               "6": "%s/download/chr6.fa" % cisRoot,
-               "7": "%s/download/chr7.fa" % cisRoot,
-               "8": "%s/download/chr8.fa" % cisRoot,
-               "9": "%s/download/chr9.fa" % cisRoot,
-               "10": "%s/download/chr10.fa" % cisRoot,
-               "11": "%s/download/chr11.fa" % cisRoot,
-               "12": "%s/download/chr12.fa" % cisRoot,
-               "13": "%s/download/chr13.fa" % cisRoot,
-               "14": "%s/download/chr14.fa" % cisRoot,
-               "15": "%s/download/chr15.fa" % cisRoot,
-               "16": "%s/download/chr16.fa" % cisRoot,
-               "17": "%s/download/chr17.fa" % cisRoot,
-               "18": "%s/download/chr18.fa" % cisRoot,
-               "19": "%s/download/chr19.fa" % cisRoot,
-               "20": "%s/download/chr20.fa" % cisRoot,
-               "21": "%s/download/chr21.fa" % cisRoot,
-               "22": "%s/download/chr22.fa" % cisRoot,
-               "23": "%s/download/chr23.fa" % cisRoot,
-               "24": "%s/download/chr24.fa" % cisRoot,
-               "25": "%s/download/chr25.fa" % cisRoot,
-               "26": "%s/download/chr26.fa" % cisRoot,
-               "27": "%s/download/chr27.fa" % cisRoot,
-               "28": "%s/download/chr28.fa" % cisRoot,
-               "32": "%s/download/chr32.fa" % cisRoot,
-               "W": "%s/download/chrW.fa" % cisRoot,
-               "Z": "%s/download/chrZ.fa" % cisRoot,
-               "M": "%s/download/chrM.fa" % cisRoot,
-               "E22C19W28_E50C23": "%s/download/chrE22C19W28_E50C23.fa" % cisRoot,
-               "E64": "%s/download/chrE64.fa" % cisRoot,
-               "1_random": "%s/download/chr1_random.fa" % cisRoot,
-               "2_random": "%s/download/chr2_random.fa" % cisRoot,
-               "4_random": "%s/download/chr4_random.fa" % cisRoot,
-               "5_random": "%s/download/chr5_random.fa" % cisRoot,
-               "6_random": "%s/download/chr6_random.fa" % cisRoot,
-               "7_random": "%s/download/chr7_random.fa" % cisRoot,
-               "8_random": "%s/download/chr8_random.fa" % cisRoot,
-               "10_random": "%s/download/chr10_random.fa" % cisRoot,
-               "11_random": "%s/download/chr11_random.fa" % cisRoot,
-               "12_random": "%s/download/chr12_random.fa" % cisRoot,
-               "13_random": "%s/download/chr13_random.fa" % cisRoot,
-               "16_random": "%s/download/chr16_random.fa" % cisRoot,
-               "17_random": "%s/download/chr17_random.fa" % cisRoot,
-               "18_random": "%s/download/chr18_random.fa" % cisRoot,
-               "20_random": "%s/download/chr20_random.fa" % cisRoot,
-               "22_random": "%s/download/chr22_random.fa" % cisRoot,
-               "25_random": "%s/download/chr25_random.fa" % cisRoot,
-               "28_random": "%s/download/chr28_random.fa" % cisRoot,
-               "Un_random": "%s/download/chrUn_random.fa" % cisRoot,
-               "W_random": "%s/download/chrW_random.fa" % cisRoot,
-               "E64_random": "%s/download/chrE64_random.fa" % cisRoot,
-               "Z_random": "%s/download/chrZ_random.fa" % cisRoot,
-               "E22C19W28_E50C23_random": "%s/download/chrE22C19W28_E50C23_random.fa" % cisRoot
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
-    
-    #print "Adding gene annotations"
-    #loadGeneAnnotations(db, annotPath)
-
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
-
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/G_gallus/chromo%s.bin" % chromID)
-
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath)
-
-    print "Creating Indices"
-    createDBindices(db)
-
-    print "Finished creating database %s" % db
\ No newline at end of file
index e9c677cace13c0e556d0c2234029d3ce42e8c98d..b0948f1327ba90b315a2b0bc245caeb68ed4c77d 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -34,34 +34,56 @@ from cistematic.core.geneinfo import geneinfoDB
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/H_sapiens/hsapiens.genedb" % cisRoot
 
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    hsGenome = Genome("hsapiens", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
 
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    hsGenome.addChromosomeEntry(chromID, chromOut, "file")
+def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+    genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
+    goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
+    goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
+    # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+                 "21", "22", "X", "Y"
+    ]
+
+    print "Creating database %s" % db
+    createDBFile(db)
+
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath, chromList)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath, chromList)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath)
+
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
+        loadChromosome(db, chromID, chromPath, "/H_sapiens/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    hsGenome = Genome("hsapiens",  dbFile=db)
+    hsGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile, cDict):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
     geneEntries = []
     hsGenome = Genome("hsapiens", dbFile=db)
     geneFile = open(gFile, "r")
@@ -76,7 +98,7 @@ def loadGeneEntries(db, gFile, cDict):
         chrom = cols[1].strip()
         if chrom not in cDict:
             continue
-  
+
         name = cols[10].split(":")
         gid = name[1]
         start = int(cols[2])
@@ -105,8 +127,10 @@ def loadGeneFeatures(db, gFile, cDict):
         cols = line.split("\t")
         if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
             continue
+
         if cols[12] == "Celera":
             continue
+
         chrom = cols[1].strip()
         if chrom not in cDict:
             continue
@@ -184,7 +208,7 @@ def loadGeneOntology(db, goPath, goDefPath):
                 synonyms = idb.geneIDSynonyms(gID)
                 if len(synonyms) >0:
                     for entry in synonyms:
-                        gene_name += ","    
+                        gene_name += ","
                         gene_name += entry
                 else:
                     gene_name = " "
@@ -198,67 +222,25 @@ def loadGeneOntology(db, goPath, goDefPath):
     hsGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    hsGenome = Genome("hsapiens",  dbFile=db)
-    hsGenome.createGeneDB(db)
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
+    hsGenome = Genome("hsapiens", dbFile=db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
+
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    hsGenome.addChromosomeEntry(chromID, chromOut, "file")
 
 
 def createDBindices(db):
     hsGenome = Genome("hsapiens", dbFile=db)
     hsGenome.createIndices()
-
-
-def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
-    genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
-    goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
-    goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
-    # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
-    chromDict = {"1": "%s/chr1.fa" % downloadDir,
-                 "2": "%s/chr2.fa" % downloadDir,
-                 "3": "%s/chr3.fa" % downloadDir,
-                 "4": "%s/chr4.fa" % downloadDir,
-                 "5": "%s/chr5.fa" % downloadDir,
-                 "6": "%s/chr6.fa" % downloadDir,
-                 "7": "%s/chr7.fa" % downloadDir,
-                 "8": "%s/chr8.fa" % downloadDir,
-                 "9": "%s/chr9.fa" % downloadDir,
-                 "10": "%s/chr10.fa" % downloadDir,
-                 "11": "%s/chr11.fa" % downloadDir,
-                 "12": "%s/chr12.fa" % downloadDir,
-                 "13": "%s/chr13.fa" % downloadDir,
-                 "14": "%s/chr14.fa" % downloadDir,
-                 "15": "%s/chr15.fa" % downloadDir,
-                 "16": "%s/chr16.fa" % downloadDir,
-                 "17": "%s/chr17.fa" % downloadDir,
-                 "18": "%s/chr18.fa" % downloadDir,
-                 "19": "%s/chr19.fa" % downloadDir,
-                 "20": "%s/chr20.fa" % downloadDir,
-                 "21": "%s/chr21.fa" % downloadDir,
-                 "22": "%s/chr22.fa" % downloadDir,
-                 "X": "%s/chrX.fa" % downloadDir,
-                 "Y": "%s/chrY.fa" % downloadDir
-    }
-    
-    print "Creating database %s" % db
-    createDBFile(db)
-    
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath, chromDict)
-    
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath, chromDict)
-    
-    print "Adding gene annotations"
-    loadGeneAnnotations(db)
-    
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath)
-    
-    for chromID in chromDict.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromDict[chromID], "/H_sapiens/chromo%s.bin" % chromID)
-    
-    print "Creating Indices"
-    createDBindices(db)
-    
-    print "Finished creating database %s" % db
index 516e784bb7eaedd005a4776029731dd91aae576b..e1fc731dd8e39f88bcbc7d2500ad84cfc7b04fdc 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,51 +33,44 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot
 
 
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    mdGenome = Genome("mdomestica", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+def buildMdomesticaDB(db=geneDB):
+    genePath = "%s/download/mondom/genscan.txt" % cisRoot
+    chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
+    chromoOutPath = "/M_domestica/"
 
-        seq = string.join(seqArray, "")
-        if seqLen < 500000:
-            print "Added contig %s to database" % chromID
-            mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
-            mdGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            mdGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
 
-        header = currentLine
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
 
-    inFile.close()
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    print "Loading chromosomes"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    mdGenome = Genome("mdomestica",  dbFile=db)
+    mdGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
     geneEntries = []
     mdGenome = Genome("mdomestica", dbFile=db)
     geneFile = open(gFile, "r")
@@ -103,7 +96,7 @@ def loadGeneEntries(db, gFile):
 
 def loadGeneAnnotations(db, annotPath):
     geneAnnotations = []
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
     mdGenome = Genome("mdomestica", dbFile=db)
     for line in annotFile:
         try:
@@ -191,34 +184,41 @@ def loadGeneFeatures(db, gfile):
     mdGenome.addFeatureEntryBatch(insertArray)
 
 
-def createDBFile(db):
-    mdGenome = Genome("mdomestica",  dbFile=db)
-    mdGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
     mdGenome = Genome("mdomestica", dbFile=db)
-    mdGenome.createIndices()
-
-
-def buildMdomesticaDB(db=geneDB):
-    genePath = "%s/download/mondom/genscan.txt" % cisRoot
-    chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
-    chromoOutPath = "/M_domestica/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
 
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+        seq = string.join(seqArray, "")
+        if seqLen < 500000:
+            print "Added contig %s to database" % chromID
+            mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
+            mdGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            mdGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+        header = currentLine
 
-    print "Loading chromosomes"
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    mdGenome = Genome("mdomestica", dbFile=db)
+    mdGenome.createIndices()
index 4e90cfe7e271f5b704d6cb152bb83da2b0264ad9..72f1350070ff7027118e5a9c569e1da23d5e8e41 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -34,36 +34,58 @@ from cistematic.core.geneinfo import geneinfoDB
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB  = "%s/M_musculus/mmusculus.genedb" % cisRoot
 
 
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    mmGenome = Genome("mmusculus", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+    genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
+    goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
+    goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
+    # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
+    # but ignoring all random chromosomes
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19",
+                 "X", "Y", "M"
+    ]
 
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Creating database %s" % db
+    createDBFile(db)
 
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    mmGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath, chromList)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath, chromList)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath)
+
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
+        loadChromosome(db, chromID, chromPath, "/M_musculus/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    mmGenome = Genome("mmusculus",  dbFile=db)
+    mmGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile, cDict):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
     geneEntries = []
     alreadySeen = []
     mmGenome = Genome("mmusculus", dbFile=db)
@@ -184,7 +206,7 @@ def loadGeneOntology(db, goPath, goDefPath):
                 synonyms = idb.geneIDSynonyms(gID)
                 if len(synonyms) >0:
                     for entry in synonyms:
-                        gene_name += ","    
+                        gene_name += ","
                         gene_name += entry
                 else:
                     gene_name = " "
@@ -198,66 +220,26 @@ def loadGeneOntology(db, goPath, goDefPath):
     mmGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    mmGenome = Genome("mmusculus",  dbFile=db)
-    mmGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
     mmGenome = Genome("mmusculus", dbFile=db)
-    mmGenome.createIndices()
-
-
-def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
-    genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
-    goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
-    goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
-    # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
-    # but ignoring all random chromosomes
-    chromDict = {"1": "%s/chr1.fa" % downloadDir,
-                 "2": "%s/chr2.fa" % downloadDir,
-                 "3": "%s/chr3.fa" % downloadDir,
-                 "4": "%s/chr4.fa" % downloadDir,
-                 "5": "%s/chr5.fa" % downloadDir,
-                 "6": "%s/chr6.fa" % downloadDir,
-                 "7": "%s/chr7.fa" % downloadDir,
-                 "8": "%s/chr8.fa" % downloadDir,
-                 "9": "%s/chr9.fa" % downloadDir,
-                 "10": "%s/chr10.fa" % downloadDir,
-                 "11": "%s/chr11.fa" % downloadDir,
-                 "12": "%s/chr12.fa" % downloadDir,
-                 "13": "%s/chr13.fa" % downloadDir,
-                 "14": "%s/chr14.fa" % downloadDir,
-                 "15": "%s/chr15.fa" % downloadDir,
-                 "16": "%s/chr16.fa" % downloadDir,
-                 "17": "%s/chr17.fa" % downloadDir,
-                 "18": "%s/chr18.fa" % downloadDir,
-                 "19": "%s/chr19.fa" % downloadDir,
-                 "X": "%s/chrX.fa" % downloadDir,
-                 "Y": "%s/chrY.fa" % downloadDir,
-                 "M": "%s/chrM.fa" % downloadDir
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath, chromDict)
-
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath, chromDict)
-
-    print "Adding gene annotations"
-    loadGeneAnnotations(db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
 
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath)
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
 
-    for chromID in chromDict.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromDict[chromID], "/M_musculus/chromo%s.bin" % chromID)
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    mmGenome.addChromosomeEntry(chromID, chromOut, "file")
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    mmGenome = Genome("mmusculus", dbFile=db)
+    mmGenome.createIndices()
index b0fae70e721bc11e5046d9a15b4e34120144c16e..6f4c7bac981945074543d916bf5b6cf4d307fd39 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -34,36 +34,57 @@ from cistematic.core.geneinfo import geneinfoDB
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB  = "%s/R_norvegicus/rnorvegicus.genedb" % cisRoot
 
 
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    rnGenome = Genome("rnorvegicus", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+    genePath = "%s/seq_gene.md" % downloadDir
+    goDefPath = "%s/GO.terms_and_ids" % downloadDir
+    goPath = "%s/gene2go" % downloadDir
 
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Creating database %s" % db
+    createDBFile(db)
 
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    rnGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath)
+
+    # ignoring all random chromosomes
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+                 "Un", "X", "M"
+    ]
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
+        loadChromosome(db, chromID, chromPath, "/R_norvegicus/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    rnGenome = Genome("rnorvegicus",  dbFile=db)
+    rnGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
     geneEntries = []
     rnGenome = Genome("rnorvegicus", dbFile=db)
     geneFile = open(gFile, "r")
@@ -130,7 +151,7 @@ def loadGeneFeatures(db, gFile):
 
 def loadGeneAnnotations(db):
     geneAnnotations = []
-    idb = geneinfoDB()  
+    idb = geneinfoDB()
     rnGenome = Genome("rnorvegicus", dbFile=db)
     gidList = rnGenome.allGIDs()
     for locID in gidList:
@@ -176,7 +197,7 @@ def loadGeneOntology(db, goPath, goDefPath):
                 synonyms = idb.geneIDSynonyms(gID)
                 if len(synonyms) >0:
                     for entry in synonyms:
-                        gene_name += ","    
+                        gene_name += ","
                         gene_name += entry
                 else:
                     gene_name = " "
@@ -190,66 +211,26 @@ def loadGeneOntology(db, goPath, goDefPath):
     rnGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    rnGenome = Genome("rnorvegicus",  dbFile=db)
-    rnGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
     rnGenome = Genome("rnorvegicus", dbFile=db)
-    rnGenome.createIndices()
-
-
-def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
-    genePath = "%s/seq_gene.md" % downloadDir
-    goDefPath = "%s/GO.terms_and_ids" % downloadDir
-    goPath = "%s/gene2go" % downloadDir
-    # ignoring all random chromosomes
-    chromos = {"1": "%s/chr1.fa" % downloadDir,
-               "2": "%s/chr2.fa" % downloadDir,
-               "3": "%s/chr3.fa" % downloadDir,
-               "4": "%s/chr4.fa" % downloadDir,
-               "5": "%s/chr5.fa" % downloadDir,
-               "6": "%s/chr6.fa" % downloadDir,
-               "7": "%s/chr7.fa" % downloadDir,
-               "8": "%s/chr8.fa" % downloadDir,
-               "9": "%s/chr9.fa" % downloadDir,
-               "10": "%s/chr10.fa" % downloadDir,
-               "11": "%s/chr11.fa" % downloadDir,
-               "12": "%s/chr12.fa" % downloadDir,
-               "13": "%s/chr13.fa" % downloadDir,
-               "14": "%s/chr14.fa" % downloadDir,
-               "15": "%s/chr15.fa" % downloadDir,
-               "16": "%s/chr16.fa" % downloadDir,
-               "17": "%s/chr17.fa" % downloadDir,
-               "18": "%s/chr18.fa" % downloadDir,
-               "19": "%s/chr19.fa" % downloadDir,
-               "Un": "%s/chrUn.fa" % downloadDir,
-               "X": "%s/chrX.fa" % downloadDir,
-               "20": "%s/chr20.fa" % downloadDir,
-               "M": "%s/chrM.fa" % downloadDir
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
-
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
-
-    print "Adding gene annotations"
-    loadGeneAnnotations(db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
 
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath)
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
 
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/R_norvegicus/chromo%s.bin" % chromID)
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    rnGenome.addChromosomeEntry(chromID, chromOut, "file")
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    rnGenome = Genome("rnorvegicus", dbFile=db)
+    rnGenome.createIndices()
index 5866f80963fac6914c81048fb1460548a4ef8a3b..d670c148773fc86b95acbe6d42bf5ae47a1a375f 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,33 +33,44 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/S_cerevisiae/scerevisiae.genedb" % cisRoot
 
 
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    scGenome = Genome("scerevisiae", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildScerevisiaeDB(db=geneDB):
+    genePath = "%s/download/SGD_features.tab" % cisRoot
+    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
+    goPath = "%s/download/gene_association.sgd" % cisRoot
 
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Creating database %s" % db
+    createDBFile(db)
 
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    seq = ""
-    print "calling scGenome()"
-    scGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db, genePath)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath)
+
+    for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/download/chr%s.fsa" % (cisRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/S_cerevisiae/chr%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    scGenome = Genome("scerevisiae", version="SGD1",  dbFile=db)
+    scGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
@@ -105,7 +116,7 @@ def loadGeneEntries(db, gFile):
 
 def loadGeneAnnotations(db, annotPath):
     geneAnnotations = []
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
     lines = annotFile.readlines()
     annotFile.close()
     scGenome = Genome("scerevisiae", dbFile=db)
@@ -162,55 +173,28 @@ def loadGeneOntology(db, goPath, goDefPath):
     scGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    scGenome = Genome("scerevisiae", version="SGD1",  dbFile=db)
-    scGenome.createGeneDB(db)
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
+    scGenome = Genome("scerevisiae", dbFile=db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
+
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
+
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    seq = ""
+    print "calling scGenome()"
+    scGenome.addChromosomeEntry(chromID, chromOut, "file")
 
 
 def createDBindices(db):
     scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
     scGenome.createIndices()
-
-
-def buildScerevisiaeDB(db=geneDB):
-    genePath = "%s/download/SGD_features.tab" % cisRoot
-    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
-    goPath = "%s/download/gene_association.sgd" % cisRoot
-    chromos = {"1": "%s/download/chr01.fsa" % cisRoot,
-               "2": "%s/download/chr02.fsa" % cisRoot,
-               "3": "%s/download/chr03.fsa" % cisRoot,
-               "4": "%s/download/chr04.fsa" % cisRoot,
-               "5": "%s/download/chr05.fsa" % cisRoot,
-               "6": "%s/download/chr06.fsa" % cisRoot,
-               "7": "%s/download/chr07.fsa" % cisRoot,
-               "8": "%s/download/chr08.fsa" % cisRoot,
-               "9": "%s/download/chr09.fsa" % cisRoot,
-               "10": "%s/download/chr10.fsa" % cisRoot,
-               "11": "%s/download/chr11.fsa" % cisRoot,
-               "12": "%s/download/chr12.fsa" % cisRoot,
-               "13": "%s/download/chr13.fsa" % cisRoot,
-               "14": "%s/download/chr14.fsa" % cisRoot,
-               "15": "%s/download/chr15.fsa" % cisRoot,
-               "16": "%s/download/chr16.fsa" % cisRoot
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
-
-    print "Adding gene annotations"
-    loadGeneAnnotations(db, genePath)
-
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath)
-
-    for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]:
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/S_cerevisiae/chr%s.bin" % chromID)
-
-    print "Creating Indices"
-    createDBindices(db)
-
-    print "Finished creating database %s" % db
\ No newline at end of file
index 6098562b45b54ea2916987552587e95ab50ab8df..e9d12b77a02f880d55082afe10e31132e6e187fd 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,13 +33,33 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/S_purpuratus/spurpuratus.genedb" % cisRoot
 
 
+def buildSpurpuratusDB(db=geneDB):
+    chromoPath = "%s/download/Spur2.1_Nmasked.txt" % cisRoot
+    chromoOutPath = "/S_purpuratus/"
+
+    print "Creating database %s" % db
+    createDBFile(db)
+
+    print "Loading genomic sequence"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+def createDBFile(db):
+    spGenome = Genome("spurpuratus", version="2.1",  dbFile=db)
+    spGenome.createGeneDB(db)
+
+
 def loadChromosome(db, chromPath, chromOutPath):
     seqArray = []
     seqLen = 0
@@ -82,27 +102,6 @@ def loadChromosome(db, chromPath, chromOutPath):
     inFile.close()
 
 
-def createDBFile(db):
-    spGenome = Genome("spurpuratus", version="2.1",  dbFile=db)
-    spGenome.createGeneDB(db)
-
-
 def createDBindices(db):
     spGenome = Genome("spurpuratus", version="2.1", dbFile=db)
     spGenome.createIndices()
-
-
-def buildSpurpuratusDB(db=geneDB):
-    chromoPath = "%s/download/Spur2.1_Nmasked.txt" % cisRoot
-    chromoOutPath = "/S_purpuratus/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Loading genomic sequence" 
-    loadChromosome(db, chromoPath, chromoOutPath)
-
-    print "Creating Indices"
-    createDBindices(db)
-
-    print "Finished creating database %s" % db
\ No newline at end of file
index eb37500e68a5cfb6f54cf8d054ad47397d2c312b..bda0dbef0cd472f9573a271124afac7bddad0023 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,51 +33,44 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/X_tropicalis/xtropicalis.genedb" % cisRoot
 
 
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    xtGenome = Genome("xtropicalis", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+def buildXtropicalisDB(db=geneDB):
+    genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
+    chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
+    chromoOutPath = "/X_tropicalis/"
 
-        seq = string.join(seqArray, "")
-        if seqLen < 500000:
-            print "Added contig %s to database" % chromID
-            xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
-            xtGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            xtGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
 
-        header = currentLine
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
 
-    inFile.close()
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    print "Loading sequences"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    xtGenome = Genome("xtropicalis",  dbFile=db)
+    xtGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
     geneEntries = []
     xtGenome = Genome("xtropicalis", dbFile=db)
     geneFile = open(gFile, "r")
@@ -233,34 +226,41 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath):
     xtGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    xtGenome = Genome("xtropicalis",  dbFile=db)
-    xtGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
     xtGenome = Genome("xtropicalis", dbFile=db)
-    xtGenome.createIndices()
-
-
-def buildXtropicalisDB(db=geneDB):
-    genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
-    chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
-    chromoOutPath = "/X_tropicalis/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
 
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+        seq = string.join(seqArray, "")
+        if seqLen < 500000:
+            print "Added contig %s to database" % chromID
+            xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
+            xtGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            xtGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+        header = currentLine
 
-    print "Loading sequences" 
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    xtGenome = Genome("xtropicalis", dbFile=db)
+    xtGenome.createIndices()
index f8907a66c2db6d3b36a4ac9a13872554c1f1d9ca..827ee503ba7956b37143481beabae171a35f7619 100755 (executable)
@@ -927,6 +927,7 @@ def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, cu
 def regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
     regionPasses = False
     if regionPassesCriteria(regionFinder, region.numReads, region.foldRatio, regionLength):
+        #TODO: here is where the test dataset is failing
         if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
             regionPasses = True
 
@@ -1008,4 +1009,4 @@ def getBestShiftInDict(shiftDict):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
\ No newline at end of file