first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / drerio.py
index 7541db8d9043e2f4a082c73f6f938eb4ff639d9c..f26884eea87d0d463037a0a91f0ae768d974edbd 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -34,48 +34,40 @@ from cistematic.core.geneinfo import geneinfoDB
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/D_rerio/drerio.genedb" % cisRoot
 
 
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    drGenome = Genome("drerio", dbFile=db)
-    files = os.listdir(chromPath)
-    for filename in files:
-        inFile = open("%s/%s" % (chromPath, filename), "r")
-        header = inFile.readline()
-        while header != "":
-            seqArray = []
-            seqLen = 0
-            chromID = header.strip()[1:]
-            currentLine = inFile.readline()
-            while currentLine != "" and currentLine[0] != ">":
-                lineSeq = currentLine.strip()
-                seqLen += len(lineSeq)
-                seqArray.append(lineSeq)
-                currentLine = inFile.readline()
+def buildDrerioDB(db=geneDB):
+    """ genes and annotations are from UCSC (dr3).
+    """
+    #genePath = "%s/download/xenoRefFlat.txt" % cisRoot
+    chromoPath = "%s/download/dr3" % cisRoot
+    chromoOutPath = "/D_rerio/"
+    print "Creating database %s" % db
+    createDBFile(db)
 
-            seq = string.join(seqArray, "")
-            if seqLen < 250000:
-                print "Added contig %s to database" % chromID
-                drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen))
-                drGenome.addChromosomeEntry(chromID, chromID, "db")
-            else:
-                outFileName = "%s%s.bin" % (chromOutPath, chromID)
-                outFile = open("%s%s" % (cisRoot, outFileName), "w")
-                outFile.write(seq)
-                outFile.close()
-                print "Added contig file %s to database" % outFileName
-                drGenome.addChromosomeEntry(chromID, outFileName, "file")
+    #print "Adding gene entries"
+    #loadGeneEntries(db, genePath)
 
-            header = currentLine
+    #print "Adding gene features"
+    #loadGeneFeatures(db, genePath)
 
-        inFile.close()
+    print "Loading chromosomes"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    drGenome = Genome("drerio",  dbFile=db)
+    drGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
@@ -216,35 +208,43 @@ def loadGeneFeatures(db, gfile):
     drGenome.addFeatureEntryBatch(insertArray)
 
 
-def createDBFile(db):
-    drGenome = Genome("drerio",  dbFile=db)
-    drGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
     drGenome = Genome("drerio", dbFile=db)
-    drGenome.createIndices()
-
-
-def buildDrerioDB(db=geneDB):
-    """ genes and annotations are from UCSC (dr3). 
-    """
-    #genePath = "%s/download/xenoRefFlat.txt" % cisRoot
-    chromoPath = "%s/download/dr3" % cisRoot
-    chromoOutPath = "/D_rerio/"
-    print "Creating database %s" % db
-    createDBFile(db)
+    files = os.listdir(chromPath)
+    for filename in files:
+        inFile = open("%s/%s" % (chromPath, filename), "r")
+        header = inFile.readline()
+        while header != "":
+            seqArray = []
+            seqLen = 0
+            chromID = header.strip()[1:]
+            currentLine = inFile.readline()
+            while currentLine != "" and currentLine[0] != ">":
+                lineSeq = currentLine.strip()
+                seqLen += len(lineSeq)
+                seqArray.append(lineSeq)
+                currentLine = inFile.readline()
 
-    #print "Adding gene entries"
-    #loadGeneEntries(db, genePath)
+            seq = string.join(seqArray, "")
+            if seqLen < 250000:
+                print "Added contig %s to database" % chromID
+                drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen))
+                drGenome.addChromosomeEntry(chromID, chromID, "db")
+            else:
+                outFileName = "%s%s.bin" % (chromOutPath, chromID)
+                outFile = open("%s%s" % (cisRoot, outFileName), "w")
+                outFile.write(seq)
+                outFile.close()
+                print "Added contig file %s to database" % outFileName
+                drGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    #print "Adding gene features"
-    #loadGeneFeatures(db, genePath)
+            header = currentLine
 
-    print "Loading chromosomes"
-    loadChromosome(db, chromoPath, chromoOutPath)
+        inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    drGenome = Genome("drerio", dbFile=db)
+    drGenome.createIndices()