first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / mdomestica.py
index 516e784bb7eaedd005a4776029731dd91aae576b..e1fc731dd8e39f88bcbc7d2500ad84cfc7b04fdc 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,51 +33,44 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot
 
 
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    mdGenome = Genome("mdomestica", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+def buildMdomesticaDB(db=geneDB):
+    genePath = "%s/download/mondom/genscan.txt" % cisRoot
+    chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
+    chromoOutPath = "/M_domestica/"
 
-        seq = string.join(seqArray, "")
-        if seqLen < 500000:
-            print "Added contig %s to database" % chromID
-            mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
-            mdGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            mdGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
 
-        header = currentLine
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
 
-    inFile.close()
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
+
+    print "Loading chromosomes"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    mdGenome = Genome("mdomestica",  dbFile=db)
+    mdGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
     geneEntries = []
     mdGenome = Genome("mdomestica", dbFile=db)
     geneFile = open(gFile, "r")
@@ -103,7 +96,7 @@ def loadGeneEntries(db, gFile):
 
 def loadGeneAnnotations(db, annotPath):
     geneAnnotations = []
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
     mdGenome = Genome("mdomestica", dbFile=db)
     for line in annotFile:
         try:
@@ -191,34 +184,41 @@ def loadGeneFeatures(db, gfile):
     mdGenome.addFeatureEntryBatch(insertArray)
 
 
-def createDBFile(db):
-    mdGenome = Genome("mdomestica",  dbFile=db)
-    mdGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
     mdGenome = Genome("mdomestica", dbFile=db)
-    mdGenome.createIndices()
-
-
-def buildMdomesticaDB(db=geneDB):
-    genePath = "%s/download/mondom/genscan.txt" % cisRoot
-    chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
-    chromoOutPath = "/M_domestica/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
 
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+        seq = string.join(seqArray, "")
+        if seqLen < 500000:
+            print "Added contig %s to database" % chromID
+            mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
+            mdGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            mdGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+        header = currentLine
 
-    print "Loading chromosomes"
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    mdGenome = Genome("mdomestica", dbFile=db)
+    mdGenome.createIndices()