first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / btaurus.py
index c8a01b60d3978d0ebe56272b7f4b87edb62f8a5f..d703bf5f28b13e2b6a80e0216039384a242f8a6c 100644 (file)
@@ -1,7 +1,7 @@
 ###########################################################################
 #                                                                         #
 # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
 #    * California Institute of Technology                                 #
 #                                                                         #
 #    All Rights Reserved.                                                 #
@@ -33,52 +33,44 @@ from cistematic.genomes import Genome
 from os import environ
 
 if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
 else:
     cisRoot = "/proj/genome"
 
-geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot 
+geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot
 
 
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    btGenome = Genome("btaurus", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
+def buildBtaurusDB(db=geneDB):
+    genePath = "%s/download/bt2/genscan.txt" % cisRoot
+    chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
+    chromoOutPath = "/B_taurus/"
 
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+    print "Creating database %s" % db
+    createDBFile(db)
 
-        seq = string.join(seqArray, "")
-        if seqLen < 500000:
-            print "Added contig %s to database" % chromID
-            btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
-            btGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            btGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
 
-        header = currentLine
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
 
-    inFile.close()
+    print "Loading sequences"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    btGenome = Genome("btaurus",  dbFile=db)
+    btGenome.createGeneDB(db)
 
 
 def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
     geneEntries = []
     btGenome = Genome("btaurus", dbFile=db)
     geneFile = open(gFile, "r")
@@ -103,24 +95,6 @@ def loadGeneEntries(db, gFile):
     btGenome.addGeneEntryBatch(geneEntries)
 
 
-def loadGeneAnnotations(db, annotPath):
-    geneAnnotations = []
-    annotFile = open(annotPath, "r")
-    btGenome = Genome("btaurus", dbFile=db)
-    for line in annotFile:
-        try:
-            cols = line.split("\t")
-            locID = cols[0]
-            geneDesc = cols[6]
-            if len(locID) > 0:
-                geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
-        except:
-            pass
-
-    print "Adding %d annotations" % len(geneAnnotations)
-    btGenome.addAnnotationBatch(geneAnnotations)
-
-
 def loadGeneFeatures(db, gfile):
     geneFile = open(gfile, "r")
     senseArray = {"+": "F",
@@ -194,11 +168,29 @@ def loadGeneFeatures(db, gfile):
     btGenome.addFeatureEntryBatch(insertArray)
 
 
+def loadGeneAnnotations(db, annotPath):
+    geneAnnotations = []
+    annotFile = open(annotPath, "r")
+    btGenome = Genome("btaurus", dbFile=db)
+    for line in annotFile:
+        try:
+            cols = line.split("\t")
+            locID = cols[0]
+            geneDesc = cols[6]
+            if len(locID) > 0:
+                geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
+        except:
+            pass
+
+    print "Adding %d annotations" % len(geneAnnotations)
+    btGenome.addAnnotationBatch(geneAnnotations)
+
+
 def loadGeneOntology(db, goPath, goDefPath, annotPath):
     btGenome = Genome("btaurus", dbFile=db)
     goDefFile = open(goDefPath, "r")
     goFile = open(goPath, "r")
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
     annotEntries = annotFile.readlines()
     annotFile.close()
     goDefEntries = goDefFile.readlines()
@@ -238,34 +230,42 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath):
     btGenome.addGoInfoBatch(goArray)
 
 
-def createDBFile(db):
-    btGenome = Genome("btaurus",  dbFile=db)
-    btGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
     btGenome = Genome("btaurus", dbFile=db)
-    btGenome.createIndices()
-
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
 
-def buildBtaurusDB(db=geneDB):
-    genePath = "%s/download/bt2/genscan.txt" % cisRoot
-    chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
-    chromoOutPath = "/B_taurus/"
-    
-    print "Creating database %s" % db
-    createDBFile(db)
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
 
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+        seq = string.join(seqArray, "")
+        if seqLen < 500000:
+            print "Added contig %s to database" % chromID
+            btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
+            btGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            btGenome.addChromosomeEntry(chromID, outFileName, "file")
 
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+        header = currentLine
 
-    print "Loading sequences" 
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
 
-    print "Creating Indices"
-    createDBindices(db)
 
-    print "Finished creating database %s" % db
\ No newline at end of file
+def createDBindices(db):
+    btGenome = Genome("btaurus", dbFile=db)
+    btGenome.createIndices()