first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / cremanei.py
index 86e5991b112b69a095760b5eebf056de0621c0a1..3bc6b8f6a0dd9c7ab6aa61a8ce3bf8e00b21fd5c 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,51 +33,39 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
 geneDB = "%s/C_remanei/cremanei.genedb" % cisRoot
 
 
-def loadChromosomes(db, inPath, chromOutPath):
-    crGenome = Genome("cremanei", dbFile=db)
-    scontigList = os.listdir(inPath)
-    for scontig in scontigList:
-        seq = ''
-        seqArray = []
-        seqLen = 0
-        inFile = open("%s/%s" % (inPath, scontig), "r")
-        index = 0
-        header = inFile.readline()
-        chromID = header.strip()[1:]
-        while header != "":
-            seqArray = []
-            seqLen = 0
-            currentLine = inFile.readline()
-            while currentLine != "" and currentLine[0] != ">":
-                lineSeq = currentLine.strip()
-                seqLen += len(lineSeq)
-                seqArray.append(lineSeq)
-                currentLine = inFile.readline()
+def buildCremaneiDB(db=geneDB):
+    gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
+    chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
+    chromoOutPath = "/C_remanei/"
 
-            seq = string.join(seqArray, "")
-            if seqLen < 100000:
-                print "Added contig %s to database" % chromID
-                crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
-                crGenome.addChromosomeEntry(chromID, chromID, "db")
-            else:
-                outFileName = "%s%s.bin" % (chromOutPath, chromID)
-                outFile = open("%s%s" % (cisRoot, outFileName), "w")
-                outFile.write(seq)
-                outFile.close()
-                print "Added contig file %s to database" % outFileName
-                crGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Creating database %s" % db
+    createDBFile(db)
 
-            index += 1
-            header = currentLine
+    print "Adding gene entries"
+    loadGeneEntries(db, gffPath)
 
-        inFile.close()
+    print "Adding feature entries"
+    loadFeatureEntries(db, gffPath)
+
+    print "Loading genomic sequence"
+    loadChromosomes(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    crGenome = Genome("cremanei", version="CR20050824",  dbFile=db)
+    crGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gffFile):
@@ -155,34 +143,46 @@ def loadFeatureEntries(db, gffFile):
     crGenome.addFeatureEntryBatch(featureEntries)
 
 
-def createDBFile(db):
-    crGenome = Genome("cremanei", version="CR20050824",  dbFile=db)
-    crGenome.createGeneDB(db)
-
-
-def createDBindices(db):
-    crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
-    crGenome.createIndices()
-
-
-def buildCremaneiDB(db=geneDB):
-    gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
-    chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
-    chromoOutPath = "/C_remanei/"
-
-    print "Creating database %s" % db
-    createDBFile(db)
+def loadChromosomes(db, inPath, chromOutPath):
+    crGenome = Genome("cremanei", dbFile=db)
+    scontigList = os.listdir(inPath)
+    for scontig in scontigList:
+        seq = ''
+        seqArray = []
+        seqLen = 0
+        inFile = open("%s/%s" % (inPath, scontig), "r")
+        index = 0
+        header = inFile.readline()
+        chromID = header.strip()[1:]
+        while header != "":
+            seqArray = []
+            seqLen = 0
+            currentLine = inFile.readline()
+            while currentLine != "" and currentLine[0] != ">":
+                lineSeq = currentLine.strip()
+                seqLen += len(lineSeq)
+                seqArray.append(lineSeq)
+                currentLine = inFile.readline()
 
-    print "Adding gene entries"
-    loadGeneEntries(db, gffPath)
+            seq = string.join(seqArray, "")
+            if seqLen < 100000:
+                print "Added contig %s to database" % chromID
+                crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
+                crGenome.addChromosomeEntry(chromID, chromID, "db")
+            else:
+                outFileName = "%s%s.bin" % (chromOutPath, chromID)
+                outFile = open("%s%s" % (cisRoot, outFileName), "w")
+                outFile.write(seq)
+                outFile.close()
+                print "Added contig file %s to database" % outFileName
+                crGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    print "Adding feature entries"
-    loadFeatureEntries(db, gffPath)
+            index += 1
+            header = currentLine
 
-    print "Loading genomic sequence" 
-    loadChromosomes(db, chromoPath, chromoOutPath)
+        inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
+    crGenome.createIndices()