first pass cleanup of cistematic/genomes; change bamPreprocessing

author Sean Upchurch <sau@caltech.edu>

Mon, 15 Jul 2013 19:19:18 +0000 (12:19 -0700)

committer Sean Upchurch <sau@caltech.edu>

Mon, 15 Jul 2013 19:19:18 +0000 (12:19 -0700)
author Sean Upchurch <sau@caltech.edu>
Mon, 15 Jul 2013 19:19:18 +0000 (12:19 -0700)
committer Sean Upchurch <sau@caltech.edu>
Mon, 15 Jul 2013 19:19:18 +0000 (12:19 -0700)
diff --git a/bamPreprocessing.py b/bamPreprocessing.py

index dd5e8612001dd31f19d49470c956205c789f016c..9f8b6fa0e607114af40d95e73707922eabcd981b 100644 (file)
--- a/bamPreprocessing.py
+++ b/bamPreprocessing.py
@@ -15,7 +15,7 @@ import sys
  import pysam
  import optparse
  from cistematic.genomes import Genome
-from commoncode import isSpliceEntry, getFeaturesByChromDict
+from commoncode import isSpliceEntry
  
  
  def main(argv=None):
@@ -35,7 +35,7 @@ def main(argv=None):
      samfile = pysam.Samfile(BAM, "rb" )
      readMultiplicityDict = getReadMultiplicity(samfile)
  
-    counts = getReadCounts(samfile, readMultiplicityDict)
+    counts, readLength = getReadCounts(samfile, readMultiplicityDict)
      outheader = buildHeader(samfile.header, counts)
      if options.markGID:
          genome = Genome(options.genomeName, inRAM=True)
@@ -43,7 +43,6 @@ def main(argv=None):
              genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
  
          print "getting gene features...."
-        featuresByChromDict = getFeaturesByChromDict(genome)    
          outheader['CO'].append(options.genomeComment)
  
      outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
@@ -52,11 +51,13 @@ def main(argv=None):
          if options.markGID:
              chrom = samfile.getrname(alignedread.tid)
              chromNum = chrom[3:]
-            geneModelFlag = "NM"
-            for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
-                if start < alignedread.pos and stop > alignedread.pos:
-                    geneModelFlag = gid
-                    continue
+            #new direct query
+            geneFeatures = genome.getFeaturesIntersecting(chromNum, alignedread.pos, readLength)
+            try:
+                (name, version, chromosome, start, stop, orientation, atype) = geneFeatures[0]
+                geneModelFlag = name
+            except IndexError:
+                geneModelFlag = "NM"
  
          ID = getReadID(alignedread)
          try:
@@ -156,7 +157,7 @@ def getReadCounts(samfile, readMultiplicityDict):
                "ReadLength\t%d" % readLength
      ]
  
-    return counts
+    return counts, readLength
  
  
  def buildHeader(templateheader, commentList):
diff --git a/cistematic/genomes/__init__.py b/cistematic/genomes/__init__.py

index 4dee9248836da4133204428fd6a476ad759ce8a4..81ba4ec2665ca7ef56cdfa03ebba5a096ce3333a 100644 (file)
--- a/cistematic/genomes/__init__.py
+++ b/cistematic/genomes/__init__.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -562,7 +562,7 @@ class Genome:
                  results.append((name, version, chromosome, start, stop, orientation, atype))
  
          # select all features on chromosome that have a "start" between start and stop
-        stmt = 'chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
+        stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
          res = self.queryDB(stmt, fetchall=True)
          for entry in res:
              (chromosome, start, stop, orientation, name, version, atype) = entry
diff --git a/cistematic/genomes/athaliana.py b/cistematic/genomes/athaliana.py

index 628be4dc4789ec61f8775429fd1d261020209063..9d2424639de8466d65510ce26ea0bf9402c5021c 100644 (file)
--- a/cistematic/genomes/athaliana.py
+++ b/cistematic/genomes/athaliana.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,10 +33,10 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
-    
+
  geneDB = "%s/A_thaliana/athaliana.genedb" % cisRoot
  
  chromSize = {"1": 30432563,
@@ -83,29 +83,40 @@ def decodeGFF3(cols):
      return (fType, gid, chrom, start, stop, sense, otherDict)
  
  
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    atGenome = Genome("athaliana", dbFile=db)
+def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+    genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
+    annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
+    goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir
  
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+    print "Creating database %s" % db
+    createDBFile(db)
  
-    seq = string.join(seqArray,"")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
  
-    print "writing to file %s" % (chromOut)
-    outFile = open(cisRoot + chromOut, "w")
-    outFile.write(seq)
-    outFile.close()
-    seq = ""
+    print "Adding feature entries"
+    loadFeatureEntries(db, genePath)
  
-    atGenome.addChromosomeEntry(chromID, chromOut, "file")
-    # Add alternative chromID - should be A-O and 01-09
-    atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")
+    print "Adding gene annotations"
+    loadGeneAnnotations(db, annotPath)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath)
+
+    for chromID in ["1", "2", "3", "4", "5", "C", "M"]:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/chr%s.fas" % (downloadDir, chromID)
+        loadChromosome(db, chromID, chromPath,  "/A_thaliana/chr%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    atGenome = Genome("athaliana", dbFile=db)
+    atGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
@@ -133,12 +144,13 @@ def loadFeatureEntries(db, gFile):
      featureEntries = []
      trackedGenes = []
      atGenome = Genome("athaliana", dbFile=db)
-    featureTranslation = {"CDS": "CDS", 
+    featureTranslation = {"CDS": "CDS",
                            "three_prime_UTR": "3UTR",
                            "five_prime_UTR": "5UTR",
                            "miRNA": "5UTR",
                            "exon": "5UTR"
      }
+
      geneFile = open(gFile, "r")
      for line in geneFile:
          fields = line.split("\t")
@@ -178,7 +190,7 @@ def loadFeatureEntries(db, gFile):
  
  def loadGeneAnnotations(db, annotPath):
      geneAnnotations = []
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
      annotFile.readline()
      lines = annotFile.readlines()
      annotFile.close()
@@ -220,50 +232,31 @@ def loadGeneOntology(db, goPath):
      atGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    atGenome = Genome("athaliana", dbFile=db)
-    atGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
      atGenome = Genome("athaliana", dbFile=db)
-    atGenome.createIndices()
-
-
-def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
-    genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
-    annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
-    goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir
  
-    chromos = {"1": "%s/chr1.fas" % downloadDir,
-               "2": "%s/chr2.fas" % downloadDir,
-               "3": "%s/chr3.fas" % downloadDir,
-               "4": "%s/chr4.fas" % downloadDir,
-               "5": "%s/chr5.fas" % downloadDir,
-               "C": "%s/chrC.fas" % downloadDir,
-               "M": "%s/chrM.fas" % downloadDir
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
-
-    print "Adding feature entries"
-    loadFeatureEntries(db, genePath)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
  
-    print "Adding gene annotations"
-    loadGeneAnnotations(db, annotPath)
+    seq = string.join(seqArray,"")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
  
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath)
+    print "writing to file %s" % (chromOut)
+    outFile = open(cisRoot + chromOut, "w")
+    outFile.write(seq)
+    outFile.close()
+    seq = ""
  
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID],  "/A_thaliana/chr%s.bin" % chromID)
+    atGenome.addChromosomeEntry(chromID, chromOut, "file")
+    # Add alternative chromID - should be A-O and 01-09
+    atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    atGenome = Genome("athaliana", dbFile=db)
+    atGenome.createIndices()
diff --git a/cistematic/genomes/btaurus.py b/cistematic/genomes/btaurus.py

index c8a01b60d3978d0ebe56272b7f4b87edb62f8a5f..d703bf5f28b13e2b6a80e0216039384a242f8a6c 100644 (file)
--- a/cistematic/genomes/btaurus.py
+++ b/cistematic/genomes/btaurus.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,52 +33,44 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
-geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot 
+geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot
  
  
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    btGenome = Genome("btaurus", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
+def buildBtaurusDB(db=geneDB):
+    genePath = "%s/download/bt2/genscan.txt" % cisRoot
+    chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
+    chromoOutPath = "/B_taurus/"
  
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+    print "Creating database %s" % db
+    createDBFile(db)
  
-        seq = string.join(seqArray, "")
-        if seqLen < 500000:
-            print "Added contig %s to database" % chromID
-            btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
-            btGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            btGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
  
-        header = currentLine
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
  
-    inFile.close()
+    print "Loading sequences"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    btGenome = Genome("btaurus",  dbFile=db)
+    btGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
      geneEntries = []
      btGenome = Genome("btaurus", dbFile=db)
      geneFile = open(gFile, "r")
@@ -103,24 +95,6 @@ def loadGeneEntries(db, gFile):
      btGenome.addGeneEntryBatch(geneEntries)
  
  
-def loadGeneAnnotations(db, annotPath):
-    geneAnnotations = []
-    annotFile = open(annotPath, "r")
-    btGenome = Genome("btaurus", dbFile=db)
-    for line in annotFile:
-        try:
-            cols = line.split("\t")
-            locID = cols[0]
-            geneDesc = cols[6]
-            if len(locID) > 0:
-                geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
-        except:
-            pass
-
-    print "Adding %d annotations" % len(geneAnnotations)
-    btGenome.addAnnotationBatch(geneAnnotations)
-
-
  def loadGeneFeatures(db, gfile):
      geneFile = open(gfile, "r")
      senseArray = {"+": "F",
@@ -194,11 +168,29 @@ def loadGeneFeatures(db, gfile):
      btGenome.addFeatureEntryBatch(insertArray)
  
  
+def loadGeneAnnotations(db, annotPath):
+    geneAnnotations = []
+    annotFile = open(annotPath, "r")
+    btGenome = Genome("btaurus", dbFile=db)
+    for line in annotFile:
+        try:
+            cols = line.split("\t")
+            locID = cols[0]
+            geneDesc = cols[6]
+            if len(locID) > 0:
+                geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
+        except:
+            pass
+
+    print "Adding %d annotations" % len(geneAnnotations)
+    btGenome.addAnnotationBatch(geneAnnotations)
+
+
  def loadGeneOntology(db, goPath, goDefPath, annotPath):
      btGenome = Genome("btaurus", dbFile=db)
      goDefFile = open(goDefPath, "r")
      goFile = open(goPath, "r")
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
      annotEntries = annotFile.readlines()
      annotFile.close()
      goDefEntries = goDefFile.readlines()
@@ -238,34 +230,42 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath):
      btGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    btGenome = Genome("btaurus",  dbFile=db)
-    btGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
      btGenome = Genome("btaurus", dbFile=db)
-    btGenome.createIndices()
-
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
  
-def buildBtaurusDB(db=geneDB):
-    genePath = "%s/download/bt2/genscan.txt" % cisRoot
-    chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
-    chromoOutPath = "/B_taurus/"
-    
-    print "Creating database %s" % db
-    createDBFile(db)
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
  
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+        seq = string.join(seqArray, "")
+        if seqLen < 500000:
+            print "Added contig %s to database" % chromID
+            btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
+            btGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            btGenome.addChromosomeEntry(chromID, outFileName, "file")
  
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+        header = currentLine
  
-    print "Loading sequences" 
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    btGenome = Genome("btaurus", dbFile=db)
+    btGenome.createIndices()
diff --git a/cistematic/genomes/cbrenneri.py b/cistematic/genomes/cbrenneri.py

index a227faacf6a5c73b6b48512c2e917875fb8d6d9c..7a338a9779a0f97983130f6a5846c03456bb3d34 100644 (file)
--- a/cistematic/genomes/cbrenneri.py
+++ b/cistematic/genomes/cbrenneri.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,48 +33,39 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/C_brenneri/cbrenneri.genedb" % cisRoot
  
  
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    cbGenome = Genome("cbrenneri", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromHeader = header.strip()[1:].split()
-        chromID = chromHeader[0]
-        currentLine = inFile.readline()
+def buildCbrenneriDB(db=geneDB):
+    gffPath = "%s/download/PB2801_2007feb09.gff" % cisRoot # using EMS special version
+    chromoPath = "%s/download/PB2801_supercontigs.fa" % cisRoot
+    chromoOutPath = "/C_brenneri/"
  
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+    print "Creating database %s" % db
+    createDBFile(db)
  
-        seq = string.join(seqArray, "")
-        if seqLen < 100000:
-            print "Added contig %s to database" % chromID
-            cbGenome.addSequence(("cbrenneri", chromID), seq, "chromosome", str(seqLen))
-            cbGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open( "%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            cbGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, gffPath)
  
-        header = currentLine
+    print "Adding feature entries"
+    loadFeatureEntries(db, gffPath)
  
-    inFile.close()
+    print "Loading genomic sequence"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    cbGenome = Genome("cbrenneri", version="PB2801_001",  dbFile=db)
+    cbGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gffFile):
@@ -164,34 +155,43 @@ def loadFeatureEntries(db, gffFile):
      cbGenome.addFeatureEntryBatch(featureEntries)
  
  
-def createDBFile(db):
-    cbGenome = Genome("cbrenneri", version="PB2801_001",  dbFile=db)
-    cbGenome.createGeneDB(db)
-
-
-def createDBindices(db):
-    cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
-    cbGenome.createIndices()
-
-
-def buildCbrenneriDB(db=geneDB):
-    gffPath = "%s/download/PB2801_2007feb09.gff" % cisRoot # using EMS special version
-    chromoPath = "%s/download/PB2801_supercontigs.fa" % cisRoot
-    chromoOutPath = "/C_brenneri/"
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
+    cbGenome = Genome("cbrenneri", dbFile=db)
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromHeader = header.strip()[1:].split()
+        chromID = chromHeader[0]
+        currentLine = inFile.readline()
  
-    print "Creating database %s" % db
-    createDBFile(db)
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
  
-    print "Adding gene entries"
-    loadGeneEntries(db, gffPath)
+        seq = string.join(seqArray, "")
+        if seqLen < 100000:
+            print "Added contig %s to database" % chromID
+            cbGenome.addSequence(("cbrenneri", chromID), seq, "chromosome", str(seqLen))
+            cbGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open( "%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            cbGenome.addChromosomeEntry(chromID, outFileName, "file")
  
-    print "Adding feature entries"
-    loadFeatureEntries(db, gffPath)
+        header = currentLine
  
-    print "Loading genomic sequence" 
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
+    cbGenome.createIndices()
diff --git a/cistematic/genomes/cbriggsae.py b/cistematic/genomes/cbriggsae.py

index b913728a739bd5a43fb8c5a7447f272dfb65678c..00468edd08ada512ed641255bc130fb0f4875b8d 100644 (file)
--- a/cistematic/genomes/cbriggsae.py
+++ b/cistematic/genomes/cbriggsae.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,46 +33,39 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/C_briggsae/cbriggsae.genedb" % cisRoot
  
  
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    cbGenome = Genome("cbriggsae", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+def buildCbriggsaeDB(db=geneDB):
+    gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
+    chromoPath = "%s/download/briggsae_25.fa" % cisRoot
+    chromoOutPath = "/C_briggsae/"
  
-        seq = string.join(seqArray, "")
-        if seqLen < 900000:
-            print "Added contig %s to database" % chromID
-            cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
-            cbGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            cbGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
  
-        header = currentLine
+    print "Adding gene entries"
+    loadGeneEntries(db, gffPath)
  
-    inFile.close()
+    print "Adding feature entries"
+    loadFeatureEntries(db, gffPath)
+
+    print "Loading genomic sequence"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    cbGenome = Genome("cbriggsae", version="CB25",  dbFile=db)
+    cbGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gffFile):
@@ -125,7 +118,7 @@ def loadFeatureEntries(db, gffFile):
      featureEntries = []
      seenFeatures = {}
      featureTranslation = {"coding_exon": "CDS",
-                          "three_prime_UTR": "3UTR", 
+                          "three_prime_UTR": "3UTR",
                            "five_prime_UTR": "5UTR"
      }
  
@@ -161,34 +154,41 @@ def loadFeatureEntries(db, gffFile):
      cbGenome.addFeatureEntryBatch(featureEntries)
  
  
-def createDBFile(db):
-    cbGenome = Genome("cbriggsae", version="CB25",  dbFile=db)
-    cbGenome.createGeneDB(db)
-
-
-def createDBindices(db):
-    cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
-    cbGenome.createIndices()
-
-
-def buildCbriggsaeDB(db=geneDB):
-    gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
-    chromoPath = "%s/download/briggsae_25.fa" % cisRoot
-    chromoOutPath = "/C_briggsae/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
+    cbGenome = Genome("cbriggsae", dbFile=db)
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
  
-    print "Adding gene entries"
-    loadGeneEntries(db, gffPath)
+        seq = string.join(seqArray, "")
+        if seqLen < 900000:
+            print "Added contig %s to database" % chromID
+            cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
+            cbGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            cbGenome.addChromosomeEntry(chromID, outFileName, "file")
  
-    print "Adding feature entries"
-    loadFeatureEntries(db, gffPath)
+        header = currentLine
  
-    print "Loading genomic sequence" 
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
+    cbGenome.createIndices()
diff --git a/cistematic/genomes/celegans.py b/cistematic/genomes/celegans.py

index e3df297269ddba790fc7380c6010cce2cf876f3b..e9287966e5379ea1fdb7eb45c315b6459b4d2825 100644 (file)
--- a/cistematic/genomes/celegans.py
+++ b/cistematic/genomes/celegans.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,31 +33,54 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/C_elegans/celegans.genedb" % cisRoot
  
  
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    ceGenome = Genome("celegans", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildCelegansDB(db=geneDB, downloadRoot=""):
+    if downloadRoot == "":
+        downloadRoot = "%s/download/" % cisRoot
  
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    geneIDPath = "%sgeneIDs.WS200" % downloadRoot
+    goDefPath = "%sGO.terms_and_ids" % downloadRoot
+    goPath = "%sgene_association.wb" % downloadRoot
  
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    ceGenome.addChromosomeEntry(chromID, chromOut, "file")
+    # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
+    gffPath = "%selegansWS200.gff" % downloadRoot
+
+    print "Creating database %s" % db
+    createDBFile(db)
+
+    print "Adding gene entries"
+    loadGeneEntries(db, gffPath)
+
+    print "Adding feature entries"
+    loadFeatureEntries(db, gffPath)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db, geneIDPath)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath, geneIDPath)
+
+    # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
+    for chromID in ["I", "II", "III", "IV", "V", "X"]:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%sCHROMOSOME_%s_softmasked.dna" % (downloadRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/C_elegans/chr%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    ceGenome = Genome("celegans", version="WS200",  dbFile=db)
+    ceGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gffFile):
@@ -84,6 +107,7 @@ def loadGeneEntries(db, gffFile):
          else:
              gidGene = giddots[1]
              gidLetter = "a"
+
          gid = "%s.%s" % (giddots[0], gidGene)
          geneID = ("celegans", gid)
          gidVersion = 1
@@ -177,7 +201,7 @@ def loadFeatureEntries(db, gffFile):
  
  def loadGeneAnnotations(db, geneIDPath):
      geneAnnotations = []
-    geneIDFile = open(geneIDPath, "r")  
+    geneIDFile = open(geneIDPath, "r")
      lines = geneIDFile.readlines()
      geneIDFile.close()
      ceGenome = Genome("celegans", dbFile=db)
@@ -258,56 +282,26 @@ def loadGeneOntology(db, goPath, goDefPath, geneIDPath):
      ceGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    ceGenome = Genome("celegans", version="WS200",  dbFile=db)
-    ceGenome.createGeneDB(db)
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
+    ceGenome = Genome("celegans", dbFile=db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
+
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
+
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    ceGenome.addChromosomeEntry(chromID, chromOut, "file")
  
  
  def createDBindices(db):
      ceGenome = Genome("celegans", version="WS200", dbFile=db)
      ceGenome.createIndices()
-
-
-def buildCelegansDB(db=geneDB, downloadRoot=""):
-    if downloadRoot == "":
-        downloadRoot = "%s/download/" % cisRoot
-
-    geneIDPath = "%sgeneIDs.WS200" % downloadRoot
-    goDefPath = "%sGO.terms_and_ids" % downloadRoot
-    goPath = "%sgene_association.wb" % downloadRoot
-
-    # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
-    chromos = {"I": "%sCHROMOSOME_I_softmasked.dna" % downloadRoot,
-               "II": "%sCHROMOSOME_II_softmasked.dna" % downloadRoot,
-               "III": "%sCHROMOSOME_III_softmasked.dna" % downloadRoot,
-               "IV": "%sCHROMOSOME_IV_softmasked.dna" % downloadRoot,
-               "V": "%sCHROMOSOME_V_softmasked.dna" % downloadRoot,
-               "X": "%sCHROMOSOME_X_softmasked.dna" % downloadRoot
-    }
-
-    # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
-    gffPath = "%selegansWS200.gff" % downloadRoot
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries" 
-    loadGeneEntries(db, gffPath)
-
-    print "Adding feature entries" 
-    loadFeatureEntries(db, gffPath)
-
-    print "Adding gene annotations"
-    loadGeneAnnotations(db, geneIDPath)
-
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath, geneIDPath)
-
-    for chromID in chromos:
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/C_elegans/chr%s.bin" % chromID)
-
-    print "Creating Indices"
-    createDBindices(db)
-
-    print "Finished creating database %s" % db
-\ No newline at end of file
diff --git a/cistematic/genomes/cfamiliaris.py b/cistematic/genomes/cfamiliaris.py

index 75c31d1c9fa21e84708e5f1c3891cb09d1f589ac..12eda3008a9b31d8b13f25d263b6812198065dc9 100644 (file)
--- a/cistematic/genomes/cfamiliaris.py
+++ b/cistematic/genomes/cfamiliaris.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,36 +33,48 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/C_familiaris/cfamiliaris.genedb" % cisRoot
  
  
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    cfGenome = Genome("cfamiliaris", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildDogDB(db=geneDB):
+    genePath = "%s/download/seq_gene.md" % cisRoot
+    print "Creating database %s" % db
+    createDBFile(db)
  
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
  
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    cfGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+                 "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
+                 "31", "32", "33", "34", "35", "36", "37", "38", "X", "Un"
+    ]
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/C_familiaris/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    cfGenome = Genome("cfamiliaris",  dbFile=db)
+    cfGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
      geneEntries = []
      alreadySeen = []
      cfGenome = Genome("cfamiliaris", dbFile=db)
@@ -177,7 +189,7 @@ def loadGeneOntology(db, goPath, goDefPath):
                  synonyms = idb.geneIDSynonyms(gID)
                  if len(synonyms) >0:
                      for entry in synonyms:
-                        gene_name += ","   
+                        gene_name += ","
                          gene_name += entry
                  else:
                      gene_name = " "
@@ -191,74 +203,26 @@ def loadGeneOntology(db, goPath, goDefPath):
      cfGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    cfGenome = Genome("cfamiliaris",  dbFile=db)
-    cfGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
      cfGenome = Genome("cfamiliaris", dbFile=db)
-    cfGenome.createIndices()
-
-
-def buildDogDB(db=geneDB):
-    genePath = "%s/download/seq_gene.md" % cisRoot
-    chromos = {"1": "%s/download/chr1.fa" % cisRoot,
-               "2": "%s/download/chr2.fa" % cisRoot,
-               "3": "%s/download/chr3.fa" % cisRoot,
-               "4": "%s/download/chr4.fa" % cisRoot,
-               "5": "%s/download/chr5.fa" % cisRoot,
-               "6": "%s/download/chr6.fa" % cisRoot,
-               "7": "%s/download/chr7.fa" % cisRoot,
-               "8": "%s/download/chr8.fa" % cisRoot,
-               "9": "%s/download/chr9.fa" % cisRoot,
-               "10": "%s/download/chr10.fa" % cisRoot,
-               "11": "%s/download/chr11.fa" % cisRoot,
-               "12": "%s/download/chr12.fa" % cisRoot,
-               "13": "%s/download/chr13.fa" % cisRoot,
-               "14": "%s/download/chr14.fa" % cisRoot,
-               "15": "%s/download/chr15.fa" % cisRoot,
-               "16": "%s/download/chr16.fa" % cisRoot,
-               "17": "%s/download/chr17.fa" % cisRoot,
-               "18": "%s/download/chr18.fa" % cisRoot,
-               "19": "%s/download/chr19.fa" % cisRoot,
-               "20": "%s/download/chr20.fa" % cisRoot,
-               "21": "%s/download/chr21.fa" % cisRoot,
-               "22": "%s/download/chr22.fa" % cisRoot,
-               '23': "%s/download/chr23.fa" % cisRoot,
-               "24": "%s/download/chr24.fa" % cisRoot,
-               "25": "%s/download/chr25.fa" % cisRoot,
-               "26": "%s/download/chr26.fa" % cisRoot,
-               "27": "%s/download/chr27.fa" % cisRoot,
-               "28": "%s/download/chr28.fa" % cisRoot,
-               "29": "%s/download/chr29.fa" % cisRoot,
-               "30": "%s/download/chr30.fa" % cisRoot,
-               "31": "%s/download/chr31.fa" % cisRoot,
-               "32": "%s/download/chr32.fa" % cisRoot,
-               "33": "%s/download/chr33.fa" % cisRoot,
-               "34": "%s/download/chr34.fa" % cisRoot,
-               "35": "%s/download/chr35.fa" % cisRoot,
-               "36": "%s/download/chr36.fa" % cisRoot,
-               "37": "%s/download/chr37.fa" % cisRoot,
-               "38": "%s/download/chr38.fa" % cisRoot,
-               "X": "%s/download/chrX.fa" % cisRoot,
-               "Un": "%s/download/chrUn.fa" % cisRoot
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
  
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
  
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/C_familiaris/chromo%s.bin" % chromID)
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    cfGenome.addChromosomeEntry(chromID, chromOut, "file")
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    cfGenome = Genome("cfamiliaris", dbFile=db)
+    cfGenome.createIndices()
diff --git a/cistematic/genomes/cremanei.py b/cistematic/genomes/cremanei.py

index 86e5991b112b69a095760b5eebf056de0621c0a1..3bc6b8f6a0dd9c7ab6aa61a8ce3bf8e00b21fd5c 100644 (file)
--- a/cistematic/genomes/cremanei.py
+++ b/cistematic/genomes/cremanei.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,51 +33,39 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/C_remanei/cremanei.genedb" % cisRoot
  
  
-def loadChromosomes(db, inPath, chromOutPath):
-    crGenome = Genome("cremanei", dbFile=db)
-    scontigList = os.listdir(inPath)
-    for scontig in scontigList:
-        seq = ''
-        seqArray = []
-        seqLen = 0
-        inFile = open("%s/%s" % (inPath, scontig), "r")
-        index = 0
-        header = inFile.readline()
-        chromID = header.strip()[1:]
-        while header != "":
-            seqArray = []
-            seqLen = 0
-            currentLine = inFile.readline()
-            while currentLine != "" and currentLine[0] != ">":
-                lineSeq = currentLine.strip()
-                seqLen += len(lineSeq)
-                seqArray.append(lineSeq)
-                currentLine = inFile.readline()
+def buildCremaneiDB(db=geneDB):
+    gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
+    chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
+    chromoOutPath = "/C_remanei/"
  
-            seq = string.join(seqArray, "")
-            if seqLen < 100000:
-                print "Added contig %s to database" % chromID
-                crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
-                crGenome.addChromosomeEntry(chromID, chromID, "db")
-            else:
-                outFileName = "%s%s.bin" % (chromOutPath, chromID)
-                outFile = open("%s%s" % (cisRoot, outFileName), "w")
-                outFile.write(seq)
-                outFile.close()
-                print "Added contig file %s to database" % outFileName
-                crGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
  
-            index += 1
-            header = currentLine
+    print "Adding gene entries"
+    loadGeneEntries(db, gffPath)
  
-        inFile.close()
+    print "Adding feature entries"
+    loadFeatureEntries(db, gffPath)
+
+    print "Loading genomic sequence"
+    loadChromosomes(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    crGenome = Genome("cremanei", version="CR20050824",  dbFile=db)
+    crGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gffFile):
@@ -155,34 +143,46 @@ def loadFeatureEntries(db, gffFile):
      crGenome.addFeatureEntryBatch(featureEntries)
  
  
-def createDBFile(db):
-    crGenome = Genome("cremanei", version="CR20050824",  dbFile=db)
-    crGenome.createGeneDB(db)
-
-
-def createDBindices(db):
-    crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
-    crGenome.createIndices()
-
-
-def buildCremaneiDB(db=geneDB):
-    gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
-    chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
-    chromoOutPath = "/C_remanei/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
+def loadChromosomes(db, inPath, chromOutPath):
+    crGenome = Genome("cremanei", dbFile=db)
+    scontigList = os.listdir(inPath)
+    for scontig in scontigList:
+        seq = ''
+        seqArray = []
+        seqLen = 0
+        inFile = open("%s/%s" % (inPath, scontig), "r")
+        index = 0
+        header = inFile.readline()
+        chromID = header.strip()[1:]
+        while header != "":
+            seqArray = []
+            seqLen = 0
+            currentLine = inFile.readline()
+            while currentLine != "" and currentLine[0] != ">":
+                lineSeq = currentLine.strip()
+                seqLen += len(lineSeq)
+                seqArray.append(lineSeq)
+                currentLine = inFile.readline()
  
-    print "Adding gene entries"
-    loadGeneEntries(db, gffPath)
+            seq = string.join(seqArray, "")
+            if seqLen < 100000:
+                print "Added contig %s to database" % chromID
+                crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
+                crGenome.addChromosomeEntry(chromID, chromID, "db")
+            else:
+                outFileName = "%s%s.bin" % (chromOutPath, chromID)
+                outFile = open("%s%s" % (cisRoot, outFileName), "w")
+                outFile.write(seq)
+                outFile.close()
+                print "Added contig file %s to database" % outFileName
+                crGenome.addChromosomeEntry(chromID, outFileName, "file")
  
-    print "Adding feature entries"
-    loadFeatureEntries(db, gffPath)
+            index += 1
+            header = currentLine
  
-    print "Loading genomic sequence" 
-    loadChromosomes(db, chromoPath, chromoOutPath)
+        inFile.close()
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
+    crGenome.createIndices()
diff --git a/cistematic/genomes/dmelanogaster.py b/cistematic/genomes/dmelanogaster.py

index 5c9f53a58bbc4466689f9ee2dc40378db3367c10..78ef7fb7594b11314ef1b7602c2a03fcd7cd521d 100644 (file)
--- a/cistematic/genomes/dmelanogaster.py
+++ b/cistematic/genomes/dmelanogaster.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,7 +33,7 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
@@ -67,24 +67,47 @@ version = {"A": "1",
             "Z": "26"
  }
  
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    dmGenome = Genome("dmelanogaster", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
  
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+def buildDmelanogasterDB(db=geneDB):
+    """ genes and annotations are from UCSC. GO association file is from geneontology.org.
+    """
+    genePath = "%s/download/flyBaseGene.txt" % cisRoot
+    annotPath = "%s/download/gene_info" % cisRoot
+    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
+    goPath = "%s/download/gene2go" % cisRoot
  
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    dmGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
+
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db, annotPath)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath, annotPath)
+
+    chromList = ["2L", "2R", "2LHet", "2RHet", "3L", "3R", "3LHet", "3RHet",
+                 "4", "X", "XHet", "YHet", "U", "UExtra", "M"
+    ]
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/D_melanogaster/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    dmGenome = Genome("dmelanogaster", dbFile=db)
+    dmGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
@@ -196,7 +219,7 @@ def loadGeneFeatures(db, gfile):
  
  def loadGeneAnnotations(db, annotPath):
      geneAnnotations = []
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
      dmGenome = Genome("dmelanogaster", dbFile=db)
      for line in annotFile:
          try:
@@ -222,7 +245,7 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath):
      dmGenome = Genome("dmelanogaster", dbFile=db)
      goDefFile = open(goDefPath, "r")
      goFile = open(goPath, "r")
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
      annotEntries = annotFile.readlines()
      annotFile.close()
      goDefEntries = goDefFile.readlines()
@@ -271,60 +294,26 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath):
      dmGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
      dmGenome = Genome("dmelanogaster", dbFile=db)
-    dmGenome.createGeneDB(db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
+
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
+
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    dmGenome.addChromosomeEntry(chromID, chromOut, "file")
  
  
  def createDBindices(db):
      dmGenome = Genome("dmelanogaster", dbFile=db)
      dmGenome.createIndices()
-
-
-def buildDmelanogasterDB(db=geneDB):
-    """ genes and annotations are from UCSC. GO association file is from geneontology.org. 
-    """
-    genePath = "%s/download/flyBaseGene.txt" % cisRoot
-    annotPath = "%s/download/gene_info" % cisRoot
-    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
-    goPath = "%s/download/gene2go" % cisRoot
-    chromos = {"2L": "%s/download/chr2L.fa" % cisRoot,
-               "2R": "%s/download/chr2R.fa" % cisRoot,
-               "2LHet": "%s/download/chr2LHet.fa" % cisRoot,
-               "2RHet": "%s/download/chr2RHet.fa" % cisRoot,
-               "3L": "%s/download/chr3L.fa" % cisRoot,
-               "3LHet": "%s/download/chr3LHet.fa" % cisRoot,
-               "3R": "%s/download/chr3R.fa" % cisRoot,
-               "3RHet": "%s/download/chr3RHet.fa" % cisRoot,
-               "4": "%s/download/chr4.fa" % cisRoot,
-               "X": "%s/download/chrX.fa" % cisRoot,
-               "XHet": "%s/download/chrXHet.fa" % cisRoot,
-               "YHet": "%s/download/chrYHet.fa" % cisRoot,
-               "U": "%s/download/chrU.fa" % cisRoot,
-               "Uextra": "%s/download/chrUextra.fa" % cisRoot,
-               "M": "%s/download/chrM.fa" % cisRoot
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
-
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
-
-    print "Adding gene annotations"
-    loadGeneAnnotations(db, annotPath)
-
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath, annotPath)
-
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/D_melanogaster/chromo%s.bin" % chromID)
-
-    print "Creating Indices"
-    createDBindices(db)
-
-    print "Finished creating database %s" % db
-\ No newline at end of file
diff --git a/cistematic/genomes/drerio.py b/cistematic/genomes/drerio.py

index 7541db8d9043e2f4a082c73f6f938eb4ff639d9c..f26884eea87d0d463037a0a91f0ae768d974edbd 100644 (file)
--- a/cistematic/genomes/drerio.py
+++ b/cistematic/genomes/drerio.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -34,48 +34,40 @@ from cistematic.core.geneinfo import geneinfoDB
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/D_rerio/drerio.genedb" % cisRoot
  
  
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    drGenome = Genome("drerio", dbFile=db)
-    files = os.listdir(chromPath)
-    for filename in files:
-        inFile = open("%s/%s" % (chromPath, filename), "r")
-        header = inFile.readline()
-        while header != "":
-            seqArray = []
-            seqLen = 0
-            chromID = header.strip()[1:]
-            currentLine = inFile.readline()
-            while currentLine != "" and currentLine[0] != ">":
-                lineSeq = currentLine.strip()
-                seqLen += len(lineSeq)
-                seqArray.append(lineSeq)
-                currentLine = inFile.readline()
+def buildDrerioDB(db=geneDB):
+    """ genes and annotations are from UCSC (dr3).
+    """
+    #genePath = "%s/download/xenoRefFlat.txt" % cisRoot
+    chromoPath = "%s/download/dr3" % cisRoot
+    chromoOutPath = "/D_rerio/"
+    print "Creating database %s" % db
+    createDBFile(db)
  
-            seq = string.join(seqArray, "")
-            if seqLen < 250000:
-                print "Added contig %s to database" % chromID
-                drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen))
-                drGenome.addChromosomeEntry(chromID, chromID, "db")
-            else:
-                outFileName = "%s%s.bin" % (chromOutPath, chromID)
-                outFile = open("%s%s" % (cisRoot, outFileName), "w")
-                outFile.write(seq)
-                outFile.close()
-                print "Added contig file %s to database" % outFileName
-                drGenome.addChromosomeEntry(chromID, outFileName, "file")
+    #print "Adding gene entries"
+    #loadGeneEntries(db, genePath)
  
-            header = currentLine
+    #print "Adding gene features"
+    #loadGeneFeatures(db, genePath)
  
-        inFile.close()
+    print "Loading chromosomes"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    drGenome = Genome("drerio",  dbFile=db)
+    drGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
@@ -216,35 +208,43 @@ def loadGeneFeatures(db, gfile):
      drGenome.addFeatureEntryBatch(insertArray)
  
  
-def createDBFile(db):
-    drGenome = Genome("drerio",  dbFile=db)
-    drGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
      drGenome = Genome("drerio", dbFile=db)
-    drGenome.createIndices()
-
-
-def buildDrerioDB(db=geneDB):
-    """ genes and annotations are from UCSC (dr3). 
-    """
-    #genePath = "%s/download/xenoRefFlat.txt" % cisRoot
-    chromoPath = "%s/download/dr3" % cisRoot
-    chromoOutPath = "/D_rerio/"
-    print "Creating database %s" % db
-    createDBFile(db)
+    files = os.listdir(chromPath)
+    for filename in files:
+        inFile = open("%s/%s" % (chromPath, filename), "r")
+        header = inFile.readline()
+        while header != "":
+            seqArray = []
+            seqLen = 0
+            chromID = header.strip()[1:]
+            currentLine = inFile.readline()
+            while currentLine != "" and currentLine[0] != ">":
+                lineSeq = currentLine.strip()
+                seqLen += len(lineSeq)
+                seqArray.append(lineSeq)
+                currentLine = inFile.readline()
  
-    #print "Adding gene entries"
-    #loadGeneEntries(db, genePath)
+            seq = string.join(seqArray, "")
+            if seqLen < 250000:
+                print "Added contig %s to database" % chromID
+                drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen))
+                drGenome.addChromosomeEntry(chromID, chromID, "db")
+            else:
+                outFileName = "%s%s.bin" % (chromOutPath, chromID)
+                outFile = open("%s%s" % (cisRoot, outFileName), "w")
+                outFile.write(seq)
+                outFile.close()
+                print "Added contig file %s to database" % outFileName
+                drGenome.addChromosomeEntry(chromID, outFileName, "file")
  
-    #print "Adding gene features"
-    #loadGeneFeatures(db, genePath)
+            header = currentLine
  
-    print "Loading chromosomes"
-    loadChromosome(db, chromoPath, chromoOutPath)
+        inFile.close()
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    drGenome = Genome("drerio", dbFile=db)
+    drGenome.createIndices()
diff --git a/cistematic/genomes/ecaballus.py b/cistematic/genomes/ecaballus.py

index e845c426500550ef1169df4a3d4bf33fc99690d8..f2ac6d0da6369476e726500af3de550f638f2f6f 100644 (file)
--- a/cistematic/genomes/ecaballus.py
+++ b/cistematic/genomes/ecaballus.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,36 +33,49 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/E_caballus/ecaballus.genedb" % cisRoot
  
  
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    ecGenome = Genome("ecaballus", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildHorseDB(db=geneDB):
+    genePath = "%s/download/seq_gene.md" % cisRoot
  
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Creating database %s" % db
+    createDBFile(db)
  
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    ecGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+                 "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
+                 "31", "M", "X", "Un"
+    ]
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/E_caballus/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    ecGenome = Genome("ecaballus",  dbFile=db)
+    ecGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
      geneEntries = []
      alreadySeen = []
      ecGenome = Genome("ecaballus", dbFile=db)
@@ -131,68 +144,26 @@ def loadGeneFeatures(db, gFile):
      ecGenome.addFeatureEntryBatch(featureEntries)
  
  
-def createDBFile(db):
-    ecGenome = Genome("ecaballus",  dbFile=db)
-    ecGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
      ecGenome = Genome("ecaballus", dbFile=db)
-    ecGenome.createIndices()
-
-
-def buildHorseDB(db=geneDB):
-    genePath = "%s/download/seq_gene.md" % cisRoot
-    chromos = {"1": "%s/download/chr1.fa" % cisRoot,
-               "2": "%s/download/chr2.fa" % cisRoot,
-               "3": "%s/download/chr3.fa" % cisRoot,
-               "4": "%s/download/chr4.fa" % cisRoot,
-               "5": "%s/download/chr5.fa" % cisRoot,
-               "6": "%s/download/chr6.fa" % cisRoot,
-               "7": "%s/download/chr7.fa" % cisRoot,
-               "8": "%s/download/chr8.fa" % cisRoot,
-               "9": "%s/download/chr9.fa" % cisRoot,
-               "10": "%s/download/chr10.fa" % cisRoot,
-               "11": "%s/download/chr11.fa" % cisRoot,
-               "12": "%s/download/chr12.fa" % cisRoot,
-               "13": "%s/download/chr13.fa" % cisRoot,
-               "14": "%s/download/chr14.fa" % cisRoot,
-               "15": "%s/download/chr15.fa" % cisRoot,
-               "16": "%s/download/chr16.fa" % cisRoot,
-               "17": "%s/download/chr17.fa" % cisRoot,
-               "18": "%s/download/chr18.fa" % cisRoot,
-               "19": "%s/download/chr19.fa" % cisRoot,
-               "20": "%s/download/chr20.fa" % cisRoot,
-               "21": "%s/download/chr21.fa" % cisRoot,
-               "22": "%s/download/chr22.fa" % cisRoot,
-               "23": "%s/download/chr23.fa" % cisRoot,
-               "24": "%s/download/chr24.fa" % cisRoot,
-               "25": "%s/download/chr25.fa" % cisRoot,
-               "26": "%s/download/chr26.fa" % cisRoot,
-               "27": "%s/download/chr27.fa" % cisRoot,
-               "28": "%s/download/chr28.fa" % cisRoot,
-               "29": "%s/download/chr29.fa" % cisRoot,
-               "30": "%s/download/chr30.fa" % cisRoot,
-               "31": "%s/download/chr31.fa" % cisRoot,
-               "M": "%s/download/chrM.fa" % cisRoot,
-               "X": "%s/download/chrX.fa" % cisRoot,
-               "Un": "%s/download/chrUn.fa" % cisRoot
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
  
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
  
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/E_caballus/chromo%s.bin" % chromID)
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    ecGenome.addChromosomeEntry(chromID, chromOut, "file")
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    ecGenome = Genome("ecaballus", dbFile=db)
+    ecGenome.createIndices()
diff --git a/cistematic/genomes/ggallus.py b/cistematic/genomes/ggallus.py

index 551a116654fbf1cd5a72cf944ba253ca3f777eed..eb0f5f30eae656dbf5fcd936030fd1f04f0d7a72 100644 (file)
--- a/cistematic/genomes/ggallus.py
+++ b/cistematic/genomes/ggallus.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -34,36 +34,62 @@ from cistematic.core.geneinfo import geneinfoDB
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/G_gallus/ggallus.genedb" % cisRoot
  
  
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    ggGenome = Genome("ggallus", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildChickenDB(db=geneDB):
+    genePath = "%s/download/seq_gene.md" % cisRoot
+    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
+    goPath = "%s/download/gene2go" % cisRoot # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
  
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Creating database %s" % db
+    createDBFile(db)
  
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    ggGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
+    
+    #print "Adding gene annotations"
+    #loadGeneAnnotations(db, annotPath)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+                 "21", "22", "23", "24", "25", "26", "27", "28",
+                 "32", "W", "Z", "M", "E22C19W28_E50C23", "E64",
+                 "1_random", "2_random", "4_random", "5_random", "6_random",
+                 "7_random", "8_random", "10_random", "11_random", "12_random",
+                 "13_random", "16_random", "17_random", "18_random", "20_random",
+                 "22_random", "25_random", "28_random", "Un_random", "W_random",
+                 "E64_random", "Z_random", "E22C19W28_E50C23_random"
+    ]
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/G_gallus/chromo%s.bin" % chromID)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    ggGenome = Genome("ggallus",  dbFile=db)
+    ggGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
      geneEntries = []
      alreadySeen = []
      ggGenome = Genome("ggallus", dbFile=db)
@@ -138,6 +164,26 @@ def loadGeneFeatures(db, gFile):
      ggGenome.addFeatureEntryBatch(featureEntries)
  
  
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
+    ggGenome = Genome("ggallus", dbFile=db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
+
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
+
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    ggGenome.addChromosomeEntry(chromID, chromOut, "file")
+
+
  def loadGeneOntology(db, goPath, goDefPath):
      ggGenome = Genome("ggallus", dbFile=db)
      goDefFile = open(goDefPath, "r")
@@ -176,99 +222,6 @@ def loadGeneOntology(db, goPath, goDefPath):
      ggGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    ggGenome = Genome("ggallus",  dbFile=db)
-    ggGenome.createGeneDB(db)
-
-
  def createDBindices(db):
      ggGenome = Genome("ggallus", dbFile=db)
      ggGenome.createIndices()
-
-
-def buildChickenDB(db=geneDB):
-    genePath = "%s/download/seq_gene.md" % cisRoot
-    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
-    goPath = "%s/download/gene2go" % cisRoot # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
-    chromos = {"1": "%s/download/chr1.fa" % cisRoot,
-               "2": "%s/download/chr2.fa" % cisRoot,
-               "3": "%s/download/chr3.fa" % cisRoot,
-               "4": "%s/download/chr4.fa" % cisRoot,
-               "5": "%s/download/chr5.fa" % cisRoot,
-               "6": "%s/download/chr6.fa" % cisRoot,
-               "7": "%s/download/chr7.fa" % cisRoot,
-               "8": "%s/download/chr8.fa" % cisRoot,
-               "9": "%s/download/chr9.fa" % cisRoot,
-               "10": "%s/download/chr10.fa" % cisRoot,
-               "11": "%s/download/chr11.fa" % cisRoot,
-               "12": "%s/download/chr12.fa" % cisRoot,
-               "13": "%s/download/chr13.fa" % cisRoot,
-               "14": "%s/download/chr14.fa" % cisRoot,
-               "15": "%s/download/chr15.fa" % cisRoot,
-               "16": "%s/download/chr16.fa" % cisRoot,
-               "17": "%s/download/chr17.fa" % cisRoot,
-               "18": "%s/download/chr18.fa" % cisRoot,
-               "19": "%s/download/chr19.fa" % cisRoot,
-               "20": "%s/download/chr20.fa" % cisRoot,
-               "21": "%s/download/chr21.fa" % cisRoot,
-               "22": "%s/download/chr22.fa" % cisRoot,
-               "23": "%s/download/chr23.fa" % cisRoot,
-               "24": "%s/download/chr24.fa" % cisRoot,
-               "25": "%s/download/chr25.fa" % cisRoot,
-               "26": "%s/download/chr26.fa" % cisRoot,
-               "27": "%s/download/chr27.fa" % cisRoot,
-               "28": "%s/download/chr28.fa" % cisRoot,
-               "32": "%s/download/chr32.fa" % cisRoot,
-               "W": "%s/download/chrW.fa" % cisRoot,
-               "Z": "%s/download/chrZ.fa" % cisRoot,
-               "M": "%s/download/chrM.fa" % cisRoot,
-               "E22C19W28_E50C23": "%s/download/chrE22C19W28_E50C23.fa" % cisRoot,
-               "E64": "%s/download/chrE64.fa" % cisRoot,
-               "1_random": "%s/download/chr1_random.fa" % cisRoot,
-               "2_random": "%s/download/chr2_random.fa" % cisRoot,
-               "4_random": "%s/download/chr4_random.fa" % cisRoot,
-               "5_random": "%s/download/chr5_random.fa" % cisRoot,
-               "6_random": "%s/download/chr6_random.fa" % cisRoot,
-               "7_random": "%s/download/chr7_random.fa" % cisRoot,
-               "8_random": "%s/download/chr8_random.fa" % cisRoot,
-               "10_random": "%s/download/chr10_random.fa" % cisRoot,
-               "11_random": "%s/download/chr11_random.fa" % cisRoot,
-               "12_random": "%s/download/chr12_random.fa" % cisRoot,
-               "13_random": "%s/download/chr13_random.fa" % cisRoot,
-               "16_random": "%s/download/chr16_random.fa" % cisRoot,
-               "17_random": "%s/download/chr17_random.fa" % cisRoot,
-               "18_random": "%s/download/chr18_random.fa" % cisRoot,
-               "20_random": "%s/download/chr20_random.fa" % cisRoot,
-               "22_random": "%s/download/chr22_random.fa" % cisRoot,
-               "25_random": "%s/download/chr25_random.fa" % cisRoot,
-               "28_random": "%s/download/chr28_random.fa" % cisRoot,
-               "Un_random": "%s/download/chrUn_random.fa" % cisRoot,
-               "W_random": "%s/download/chrW_random.fa" % cisRoot,
-               "E64_random": "%s/download/chrE64_random.fa" % cisRoot,
-               "Z_random": "%s/download/chrZ_random.fa" % cisRoot,
-               "E22C19W28_E50C23_random": "%s/download/chrE22C19W28_E50C23_random.fa" % cisRoot
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
-    
-    #print "Adding gene annotations"
-    #loadGeneAnnotations(db, annotPath)
-
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
-
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/G_gallus/chromo%s.bin" % chromID)
-
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath)
-
-    print "Creating Indices"
-    createDBindices(db)
-
-    print "Finished creating database %s" % db
-\ No newline at end of file
diff --git a/cistematic/genomes/hsapiens.py b/cistematic/genomes/hsapiens.py

index e9c677cace13c0e556d0c2234029d3ce42e8c98d..b0948f1327ba90b315a2b0bc245caeb68ed4c77d 100644 (file)
--- a/cistematic/genomes/hsapiens.py
+++ b/cistematic/genomes/hsapiens.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -34,34 +34,56 @@ from cistematic.core.geneinfo import geneinfoDB
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/H_sapiens/hsapiens.genedb" % cisRoot
  
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    hsGenome = Genome("hsapiens", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
  
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    hsGenome.addChromosomeEntry(chromID, chromOut, "file")
+def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+    genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
+    goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
+    goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
+    # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+                 "21", "22", "X", "Y"
+    ]
+
+    print "Creating database %s" % db
+    createDBFile(db)
+
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath, chromList)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath, chromList)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath)
+
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
+        loadChromosome(db, chromID, chromPath, "/H_sapiens/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    hsGenome = Genome("hsapiens",  dbFile=db)
+    hsGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile, cDict):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
      geneEntries = []
      hsGenome = Genome("hsapiens", dbFile=db)
      geneFile = open(gFile, "r")
@@ -76,7 +98,7 @@ def loadGeneEntries(db, gFile, cDict):
          chrom = cols[1].strip()
          if chrom not in cDict:
              continue
-  
+
          name = cols[10].split(":")
          gid = name[1]
          start = int(cols[2])
@@ -105,8 +127,10 @@ def loadGeneFeatures(db, gFile, cDict):
          cols = line.split("\t")
          if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
              continue
+
          if cols[12] == "Celera":
              continue
+
          chrom = cols[1].strip()
          if chrom not in cDict:
              continue
@@ -184,7 +208,7 @@ def loadGeneOntology(db, goPath, goDefPath):
                  synonyms = idb.geneIDSynonyms(gID)
                  if len(synonyms) >0:
                      for entry in synonyms:
-                        gene_name += ","    
+                        gene_name += ","
                          gene_name += entry
                  else:
                      gene_name = " "
@@ -198,67 +222,25 @@ def loadGeneOntology(db, goPath, goDefPath):
      hsGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    hsGenome = Genome("hsapiens",  dbFile=db)
-    hsGenome.createGeneDB(db)
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
+    hsGenome = Genome("hsapiens", dbFile=db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
+
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    hsGenome.addChromosomeEntry(chromID, chromOut, "file")
  
  
  def createDBindices(db):
      hsGenome = Genome("hsapiens", dbFile=db)
      hsGenome.createIndices()
-
-
-def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
-    genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
-    goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
-    goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
-    # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
-    chromDict = {"1": "%s/chr1.fa" % downloadDir,
-                 "2": "%s/chr2.fa" % downloadDir,
-                 "3": "%s/chr3.fa" % downloadDir,
-                 "4": "%s/chr4.fa" % downloadDir,
-                 "5": "%s/chr5.fa" % downloadDir,
-                 "6": "%s/chr6.fa" % downloadDir,
-                 "7": "%s/chr7.fa" % downloadDir,
-                 "8": "%s/chr8.fa" % downloadDir,
-                 "9": "%s/chr9.fa" % downloadDir,
-                 "10": "%s/chr10.fa" % downloadDir,
-                 "11": "%s/chr11.fa" % downloadDir,
-                 "12": "%s/chr12.fa" % downloadDir,
-                 "13": "%s/chr13.fa" % downloadDir,
-                 "14": "%s/chr14.fa" % downloadDir,
-                 "15": "%s/chr15.fa" % downloadDir,
-                 "16": "%s/chr16.fa" % downloadDir,
-                 "17": "%s/chr17.fa" % downloadDir,
-                 "18": "%s/chr18.fa" % downloadDir,
-                 "19": "%s/chr19.fa" % downloadDir,
-                 "20": "%s/chr20.fa" % downloadDir,
-                 "21": "%s/chr21.fa" % downloadDir,
-                 "22": "%s/chr22.fa" % downloadDir,
-                 "X": "%s/chrX.fa" % downloadDir,
-                 "Y": "%s/chrY.fa" % downloadDir
-    }
-    
-    print "Creating database %s" % db
-    createDBFile(db)
-    
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath, chromDict)
-    
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath, chromDict)
-    
-    print "Adding gene annotations"
-    loadGeneAnnotations(db)
-    
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath)
-    
-    for chromID in chromDict.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromDict[chromID], "/H_sapiens/chromo%s.bin" % chromID)
-    
-    print "Creating Indices"
-    createDBindices(db)
-    
-    print "Finished creating database %s" % db
diff --git a/cistematic/genomes/mdomestica.py b/cistematic/genomes/mdomestica.py

index 516e784bb7eaedd005a4776029731dd91aae576b..e1fc731dd8e39f88bcbc7d2500ad84cfc7b04fdc 100644 (file)
--- a/cistematic/genomes/mdomestica.py
+++ b/cistematic/genomes/mdomestica.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,51 +33,44 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot
  
  
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    mdGenome = Genome("mdomestica", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+def buildMdomesticaDB(db=geneDB):
+    genePath = "%s/download/mondom/genscan.txt" % cisRoot
+    chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
+    chromoOutPath = "/M_domestica/"
  
-        seq = string.join(seqArray, "")
-        if seqLen < 500000:
-            print "Added contig %s to database" % chromID
-            mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
-            mdGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            mdGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
  
-        header = currentLine
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
  
-    inFile.close()
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    print "Loading chromosomes"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    mdGenome = Genome("mdomestica",  dbFile=db)
+    mdGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
      geneEntries = []
      mdGenome = Genome("mdomestica", dbFile=db)
      geneFile = open(gFile, "r")
@@ -103,7 +96,7 @@ def loadGeneEntries(db, gFile):
  
  def loadGeneAnnotations(db, annotPath):
      geneAnnotations = []
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
      mdGenome = Genome("mdomestica", dbFile=db)
      for line in annotFile:
          try:
@@ -191,34 +184,41 @@ def loadGeneFeatures(db, gfile):
      mdGenome.addFeatureEntryBatch(insertArray)
  
  
-def createDBFile(db):
-    mdGenome = Genome("mdomestica",  dbFile=db)
-    mdGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
      mdGenome = Genome("mdomestica", dbFile=db)
-    mdGenome.createIndices()
-
-
-def buildMdomesticaDB(db=geneDB):
-    genePath = "%s/download/mondom/genscan.txt" % cisRoot
-    chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
-    chromoOutPath = "/M_domestica/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
  
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+        seq = string.join(seqArray, "")
+        if seqLen < 500000:
+            print "Added contig %s to database" % chromID
+            mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
+            mdGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            mdGenome.addChromosomeEntry(chromID, outFileName, "file")
  
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+        header = currentLine
  
-    print "Loading chromosomes"
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    mdGenome = Genome("mdomestica", dbFile=db)
+    mdGenome.createIndices()
diff --git a/cistematic/genomes/mmusculus.py b/cistematic/genomes/mmusculus.py

index 4e90cfe7e271f5b704d6cb152bb83da2b0264ad9..72f1350070ff7027118e5a9c569e1da23d5e8e41 100644 (file)
--- a/cistematic/genomes/mmusculus.py
+++ b/cistematic/genomes/mmusculus.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -34,36 +34,58 @@ from cistematic.core.geneinfo import geneinfoDB
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB  = "%s/M_musculus/mmusculus.genedb" % cisRoot
  
  
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    mmGenome = Genome("mmusculus", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+    genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
+    goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
+    goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
+    # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
+    # but ignoring all random chromosomes
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19",
+                 "X", "Y", "M"
+    ]
  
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Creating database %s" % db
+    createDBFile(db)
  
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    mmGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath, chromList)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath, chromList)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath)
+
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
+        loadChromosome(db, chromID, chromPath, "/M_musculus/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    mmGenome = Genome("mmusculus",  dbFile=db)
+    mmGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile, cDict):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
      geneEntries = []
      alreadySeen = []
      mmGenome = Genome("mmusculus", dbFile=db)
@@ -184,7 +206,7 @@ def loadGeneOntology(db, goPath, goDefPath):
                  synonyms = idb.geneIDSynonyms(gID)
                  if len(synonyms) >0:
                      for entry in synonyms:
-                        gene_name += ","    
+                        gene_name += ","
                          gene_name += entry
                  else:
                      gene_name = " "
@@ -198,66 +220,26 @@ def loadGeneOntology(db, goPath, goDefPath):
      mmGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    mmGenome = Genome("mmusculus",  dbFile=db)
-    mmGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
      mmGenome = Genome("mmusculus", dbFile=db)
-    mmGenome.createIndices()
-
-
-def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
-    genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
-    goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
-    goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
-    # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
-    # but ignoring all random chromosomes
-    chromDict = {"1": "%s/chr1.fa" % downloadDir,
-                 "2": "%s/chr2.fa" % downloadDir,
-                 "3": "%s/chr3.fa" % downloadDir,
-                 "4": "%s/chr4.fa" % downloadDir,
-                 "5": "%s/chr5.fa" % downloadDir,
-                 "6": "%s/chr6.fa" % downloadDir,
-                 "7": "%s/chr7.fa" % downloadDir,
-                 "8": "%s/chr8.fa" % downloadDir,
-                 "9": "%s/chr9.fa" % downloadDir,
-                 "10": "%s/chr10.fa" % downloadDir,
-                 "11": "%s/chr11.fa" % downloadDir,
-                 "12": "%s/chr12.fa" % downloadDir,
-                 "13": "%s/chr13.fa" % downloadDir,
-                 "14": "%s/chr14.fa" % downloadDir,
-                 "15": "%s/chr15.fa" % downloadDir,
-                 "16": "%s/chr16.fa" % downloadDir,
-                 "17": "%s/chr17.fa" % downloadDir,
-                 "18": "%s/chr18.fa" % downloadDir,
-                 "19": "%s/chr19.fa" % downloadDir,
-                 "X": "%s/chrX.fa" % downloadDir,
-                 "Y": "%s/chrY.fa" % downloadDir,
-                 "M": "%s/chrM.fa" % downloadDir
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath, chromDict)
-
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath, chromDict)
-
-    print "Adding gene annotations"
-    loadGeneAnnotations(db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
  
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath)
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
  
-    for chromID in chromDict.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromDict[chromID], "/M_musculus/chromo%s.bin" % chromID)
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    mmGenome.addChromosomeEntry(chromID, chromOut, "file")
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    mmGenome = Genome("mmusculus", dbFile=db)
+    mmGenome.createIndices()
diff --git a/cistematic/genomes/rnorvegicus.py b/cistematic/genomes/rnorvegicus.py

index b0fae70e721bc11e5046d9a15b4e34120144c16e..6f4c7bac981945074543d916bf5b6cf4d307fd39 100644 (file)
--- a/cistematic/genomes/rnorvegicus.py
+++ b/cistematic/genomes/rnorvegicus.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -34,36 +34,57 @@ from cistematic.core.geneinfo import geneinfoDB
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB  = "%s/R_norvegicus/rnorvegicus.genedb" % cisRoot
  
  
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    rnGenome = Genome("rnorvegicus", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+    genePath = "%s/seq_gene.md" % downloadDir
+    goDefPath = "%s/GO.terms_and_ids" % downloadDir
+    goPath = "%s/gene2go" % downloadDir
  
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Creating database %s" % db
+    createDBFile(db)
  
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    rnGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
+
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath)
+
+    # ignoring all random chromosomes
+    chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+                 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
+                 "Un", "X", "M"
+    ]
+    for chromID in chromList:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
+        loadChromosome(db, chromID, chromPath, "/R_norvegicus/chromo%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    rnGenome = Genome("rnorvegicus",  dbFile=db)
+    rnGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
      geneEntries = []
      rnGenome = Genome("rnorvegicus", dbFile=db)
      geneFile = open(gFile, "r")
@@ -130,7 +151,7 @@ def loadGeneFeatures(db, gFile):
  
  def loadGeneAnnotations(db):
      geneAnnotations = []
-    idb = geneinfoDB()  
+    idb = geneinfoDB()
      rnGenome = Genome("rnorvegicus", dbFile=db)
      gidList = rnGenome.allGIDs()
      for locID in gidList:
@@ -176,7 +197,7 @@ def loadGeneOntology(db, goPath, goDefPath):
                  synonyms = idb.geneIDSynonyms(gID)
                  if len(synonyms) >0:
                      for entry in synonyms:
-                        gene_name += ","    
+                        gene_name += ","
                          gene_name += entry
                  else:
                      gene_name = " "
@@ -190,66 +211,26 @@ def loadGeneOntology(db, goPath, goDefPath):
      rnGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    rnGenome = Genome("rnorvegicus",  dbFile=db)
-    rnGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
      rnGenome = Genome("rnorvegicus", dbFile=db)
-    rnGenome.createIndices()
-
-
-def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
-    genePath = "%s/seq_gene.md" % downloadDir
-    goDefPath = "%s/GO.terms_and_ids" % downloadDir
-    goPath = "%s/gene2go" % downloadDir
-    # ignoring all random chromosomes
-    chromos = {"1": "%s/chr1.fa" % downloadDir,
-               "2": "%s/chr2.fa" % downloadDir,
-               "3": "%s/chr3.fa" % downloadDir,
-               "4": "%s/chr4.fa" % downloadDir,
-               "5": "%s/chr5.fa" % downloadDir,
-               "6": "%s/chr6.fa" % downloadDir,
-               "7": "%s/chr7.fa" % downloadDir,
-               "8": "%s/chr8.fa" % downloadDir,
-               "9": "%s/chr9.fa" % downloadDir,
-               "10": "%s/chr10.fa" % downloadDir,
-               "11": "%s/chr11.fa" % downloadDir,
-               "12": "%s/chr12.fa" % downloadDir,
-               "13": "%s/chr13.fa" % downloadDir,
-               "14": "%s/chr14.fa" % downloadDir,
-               "15": "%s/chr15.fa" % downloadDir,
-               "16": "%s/chr16.fa" % downloadDir,
-               "17": "%s/chr17.fa" % downloadDir,
-               "18": "%s/chr18.fa" % downloadDir,
-               "19": "%s/chr19.fa" % downloadDir,
-               "Un": "%s/chrUn.fa" % downloadDir,
-               "X": "%s/chrX.fa" % downloadDir,
-               "20": "%s/chr20.fa" % downloadDir,
-               "M": "%s/chrM.fa" % downloadDir
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
-
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
-
-    print "Adding gene annotations"
-    loadGeneAnnotations(db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
  
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath)
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
  
-    for chromID in chromos.keys():
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/R_norvegicus/chromo%s.bin" % chromID)
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    rnGenome.addChromosomeEntry(chromID, chromOut, "file")
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    rnGenome = Genome("rnorvegicus", dbFile=db)
+    rnGenome.createIndices()
diff --git a/cistematic/genomes/scerevisiae.py b/cistematic/genomes/scerevisiae.py

index 5866f80963fac6914c81048fb1460548a4ef8a3b..d670c148773fc86b95acbe6d42bf5ae47a1a375f 100644 (file)
--- a/cistematic/genomes/scerevisiae.py
+++ b/cistematic/genomes/scerevisiae.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,33 +33,44 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/S_cerevisiae/scerevisiae.genedb" % cisRoot
  
  
-def loadChromosome(db, chromID, chromPath, chromOut):
-    seqArray = []
-    scGenome = Genome("scerevisiae", dbFile=db)
-    inFile = open(chromPath, "r")
-    line = inFile.readline()
-    for line in inFile:
-        seqArray.append(line.strip())
+def buildScerevisiaeDB(db=geneDB):
+    genePath = "%s/download/SGD_features.tab" % cisRoot
+    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
+    goPath = "%s/download/gene_association.sgd" % cisRoot
  
-    seq = string.join(seqArray, "")
-    seqLen = len(seq)
-    if seqLen < 1:
-        print "Problems reading sequence from file"
+    print "Creating database %s" % db
+    createDBFile(db)
  
-    print "writing to file %s" % chromOut
-    outFile = open("%s%s" % (cisRoot, chromOut), "w")
-    outFile.write(seq)
-    outFile.close()
-    seq = ""
-    print "calling scGenome()"
-    scGenome.addChromosomeEntry(chromID, chromOut, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
+
+    print "Adding gene annotations"
+    loadGeneAnnotations(db, genePath)
+
+    print "Adding gene ontology"
+    loadGeneOntology(db, goPath, goDefPath)
+
+    for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]:
+        print "Loading chromosome %s" % chromID
+        chromPath = "%s/download/chr%s.fsa" % (cisRoot, chromID)
+        loadChromosome(db, chromID, chromPath, "/S_cerevisiae/chr%s.bin" % chromID)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    scGenome = Genome("scerevisiae", version="SGD1",  dbFile=db)
+    scGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
@@ -105,7 +116,7 @@ def loadGeneEntries(db, gFile):
  
  def loadGeneAnnotations(db, annotPath):
      geneAnnotations = []
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
      lines = annotFile.readlines()
      annotFile.close()
      scGenome = Genome("scerevisiae", dbFile=db)
@@ -162,55 +173,28 @@ def loadGeneOntology(db, goPath, goDefPath):
      scGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    scGenome = Genome("scerevisiae", version="SGD1",  dbFile=db)
-    scGenome.createGeneDB(db)
+def loadChromosome(db, chromID, chromPath, chromOut):
+    seqArray = []
+    scGenome = Genome("scerevisiae", dbFile=db)
+    inFile = open(chromPath, "r")
+    line = inFile.readline()
+    for line in inFile:
+        seqArray.append(line.strip())
+
+    seq = string.join(seqArray, "")
+    seqLen = len(seq)
+    if seqLen < 1:
+        print "Problems reading sequence from file"
+
+    print "writing to file %s" % chromOut
+    outFile = open("%s%s" % (cisRoot, chromOut), "w")
+    outFile.write(seq)
+    outFile.close()
+    seq = ""
+    print "calling scGenome()"
+    scGenome.addChromosomeEntry(chromID, chromOut, "file")
  
  
  def createDBindices(db):
      scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
      scGenome.createIndices()
-
-
-def buildScerevisiaeDB(db=geneDB):
-    genePath = "%s/download/SGD_features.tab" % cisRoot
-    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
-    goPath = "%s/download/gene_association.sgd" % cisRoot
-    chromos = {"1": "%s/download/chr01.fsa" % cisRoot,
-               "2": "%s/download/chr02.fsa" % cisRoot,
-               "3": "%s/download/chr03.fsa" % cisRoot,
-               "4": "%s/download/chr04.fsa" % cisRoot,
-               "5": "%s/download/chr05.fsa" % cisRoot,
-               "6": "%s/download/chr06.fsa" % cisRoot,
-               "7": "%s/download/chr07.fsa" % cisRoot,
-               "8": "%s/download/chr08.fsa" % cisRoot,
-               "9": "%s/download/chr09.fsa" % cisRoot,
-               "10": "%s/download/chr10.fsa" % cisRoot,
-               "11": "%s/download/chr11.fsa" % cisRoot,
-               "12": "%s/download/chr12.fsa" % cisRoot,
-               "13": "%s/download/chr13.fsa" % cisRoot,
-               "14": "%s/download/chr14.fsa" % cisRoot,
-               "15": "%s/download/chr15.fsa" % cisRoot,
-               "16": "%s/download/chr16.fsa" % cisRoot
-    }
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
-
-    print "Adding gene annotations"
-    loadGeneAnnotations(db, genePath)
-
-    print "Adding gene ontology"
-    loadGeneOntology(db, goPath, goDefPath)
-
-    for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]:
-        print "Loading chromosome %s" % chromID
-        loadChromosome(db, chromID, chromos[chromID], "/S_cerevisiae/chr%s.bin" % chromID)
-
-    print "Creating Indices"
-    createDBindices(db)
-
-    print "Finished creating database %s" % db
-\ No newline at end of file
diff --git a/cistematic/genomes/spurpuratus.py b/cistematic/genomes/spurpuratus.py

index 6098562b45b54ea2916987552587e95ab50ab8df..e9d12b77a02f880d55082afe10e31132e6e187fd 100644 (file)
--- a/cistematic/genomes/spurpuratus.py
+++ b/cistematic/genomes/spurpuratus.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,13 +33,33 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/S_purpuratus/spurpuratus.genedb" % cisRoot
  
  
+def buildSpurpuratusDB(db=geneDB):
+    chromoPath = "%s/download/Spur2.1_Nmasked.txt" % cisRoot
+    chromoOutPath = "/S_purpuratus/"
+
+    print "Creating database %s" % db
+    createDBFile(db)
+
+    print "Loading genomic sequence"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+def createDBFile(db):
+    spGenome = Genome("spurpuratus", version="2.1",  dbFile=db)
+    spGenome.createGeneDB(db)
+
+
  def loadChromosome(db, chromPath, chromOutPath):
      seqArray = []
      seqLen = 0
@@ -82,27 +102,6 @@ def loadChromosome(db, chromPath, chromOutPath):
      inFile.close()
  
  
-def createDBFile(db):
-    spGenome = Genome("spurpuratus", version="2.1",  dbFile=db)
-    spGenome.createGeneDB(db)
-
-
  def createDBindices(db):
      spGenome = Genome("spurpuratus", version="2.1", dbFile=db)
      spGenome.createIndices()
-
-
-def buildSpurpuratusDB(db=geneDB):
-    chromoPath = "%s/download/Spur2.1_Nmasked.txt" % cisRoot
-    chromoOutPath = "/S_purpuratus/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
-
-    print "Loading genomic sequence" 
-    loadChromosome(db, chromoPath, chromoOutPath)
-
-    print "Creating Indices"
-    createDBindices(db)
-
-    print "Finished creating database %s" % db
-\ No newline at end of file
diff --git a/cistematic/genomes/xtropicalis.py b/cistematic/genomes/xtropicalis.py

index eb37500e68a5cfb6f54cf8d054ad47397d2c312b..bda0dbef0cd472f9573a271124afac7bddad0023 100644 (file)
--- a/cistematic/genomes/xtropicalis.py
+++ b/cistematic/genomes/xtropicalis.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,51 +33,44 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
  geneDB = "%s/X_tropicalis/xtropicalis.genedb" % cisRoot
  
  
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    xtGenome = Genome("xtropicalis", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+def buildXtropicalisDB(db=geneDB):
+    genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
+    chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
+    chromoOutPath = "/X_tropicalis/"
  
-        seq = string.join(seqArray, "")
-        if seqLen < 500000:
-            print "Added contig %s to database" % chromID
-            xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
-            xtGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            xtGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
  
-        header = currentLine
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
  
-    inFile.close()
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    print "Loading sequences"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    xtGenome = Genome("xtropicalis",  dbFile=db)
+    xtGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
      geneEntries = []
      xtGenome = Genome("xtropicalis", dbFile=db)
      geneFile = open(gFile, "r")
@@ -233,34 +226,41 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath):
      xtGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    xtGenome = Genome("xtropicalis",  dbFile=db)
-    xtGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
      xtGenome = Genome("xtropicalis", dbFile=db)
-    xtGenome.createIndices()
-
-
-def buildXtropicalisDB(db=geneDB):
-    genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
-    chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
-    chromoOutPath = "/X_tropicalis/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
  
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+        seq = string.join(seqArray, "")
+        if seqLen < 500000:
+            print "Added contig %s to database" % chromID
+            xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
+            xtGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            xtGenome.addChromosomeEntry(chromID, outFileName, "file")
  
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+        header = currentLine
  
-    print "Loading sequences" 
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    xtGenome = Genome("xtropicalis", dbFile=db)
+    xtGenome.createIndices()
diff --git a/findall.py b/findall.py

index f8907a66c2db6d3b36a4ac9a13872554c1f1d9ca..827ee503ba7956b37143481beabae171a35f7619 100755 (executable)
--- a/findall.py
+++ b/findall.py
@@ -927,6 +927,7 @@ def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, cu
  def regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
      regionPasses = False
      if regionPassesCriteria(regionFinder, region.numReads, region.foldRatio, regionLength):
+        #TODO: here is where the test dataset is failing
          if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
              regionPasses = True
  
@@ -1008,4 +1009,4 @@ def getBestShiftInDict(shiftDict):
  
  
  if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
+\ No newline at end of file
author	Sean Upchurch <sau@caltech.edu>
	Mon, 15 Jul 2013 19:19:18 +0000 (12:19 -0700)
committer	Sean Upchurch <sau@caltech.edu>
	Mon, 15 Jul 2013 19:19:18 +0000 (12:19 -0700)
bamPreprocessing.py		patch \| blob \| history
cistematic/genomes/__init__.py		patch \| blob \| history
cistematic/genomes/athaliana.py		patch \| blob \| history
cistematic/genomes/btaurus.py		patch \| blob \| history
cistematic/genomes/cbrenneri.py		patch \| blob \| history
cistematic/genomes/cbriggsae.py		patch \| blob \| history
cistematic/genomes/celegans.py		patch \| blob \| history
cistematic/genomes/cfamiliaris.py		patch \| blob \| history
cistematic/genomes/cremanei.py		patch \| blob \| history
cistematic/genomes/dmelanogaster.py		patch \| blob \| history
cistematic/genomes/drerio.py		patch \| blob \| history
cistematic/genomes/ecaballus.py		patch \| blob \| history
cistematic/genomes/ggallus.py		patch \| blob \| history
cistematic/genomes/hsapiens.py		patch \| blob \| history
cistematic/genomes/mdomestica.py		patch \| blob \| history
cistematic/genomes/mmusculus.py		patch \| blob \| history
cistematic/genomes/rnorvegicus.py		patch \| blob \| history
cistematic/genomes/scerevisiae.py		patch \| blob \| history
cistematic/genomes/spurpuratus.py		patch \| blob \| history
cistematic/genomes/xtropicalis.py		patch \| blob \| history
findall.py		patch \| blob \| history