first pass cleanup of cistematic/genomes; change bamPreprocessing

[erange.git] / cistematic / genomes / btaurus.py
diff --git a/cistematic/genomes/btaurus.py b/cistematic/genomes/btaurus.py

index c8a01b60d3978d0ebe56272b7f4b87edb62f8a5f..d703bf5f28b13e2b6a80e0216039384a242f8a6c 100644 (file)
--- a/cistematic/genomes/btaurus.py
+++ b/cistematic/genomes/btaurus.py
@@ -1,7 +1,7 @@
  ###########################################################################
  #                                                                         #
  # C O P Y R I G H T   N O T I C E                                         #
-#  Copyright (c) 2003-10 by:                                              #
+#  Copyright (c) 2003-13 by:                                              #
  #    * California Institute of Technology                                 #
  #                                                                         #
  #    All Rights Reserved.                                                 #
@@ -33,52 +33,44 @@ from cistematic.genomes import Genome
  from os import environ
  
  if environ.get("CISTEMATIC_ROOT"):
-    cisRoot = environ.get("CISTEMATIC_ROOT") 
+    cisRoot = environ.get("CISTEMATIC_ROOT")
  else:
      cisRoot = "/proj/genome"
  
-geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot 
+geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot
  
  
-def loadChromosome(db, chromPath, chromOutPath):
-    seqArray = []
-    seqLen = 0
-    btGenome = Genome("btaurus", dbFile=db)
-    inFile = open(chromPath, "r")
-    header = inFile.readline()
-    while header != "":
-        seqArray = []
-        seqLen = 0
-        chromID = header.strip()[1:]
-        currentLine = inFile.readline()
+def buildBtaurusDB(db=geneDB):
+    genePath = "%s/download/bt2/genscan.txt" % cisRoot
+    chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
+    chromoOutPath = "/B_taurus/"
  
-        while currentLine != "" and currentLine[0] != ">":
-            lineSeq = currentLine.strip()
-            seqLen += len(lineSeq)
-            seqArray.append(lineSeq)
-            currentLine = inFile.readline()
+    print "Creating database %s" % db
+    createDBFile(db)
  
-        seq = string.join(seqArray, "")
-        if seqLen < 500000:
-            print "Added contig %s to database" % chromID
-            btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
-            btGenome.addChromosomeEntry(chromID, chromID, "db")
-        else:
-            outFileName = "%s%s.bin" % (chromOutPath, chromID)
-            outFile = open("%s%s" % (cisRoot, outFileName), "w")
-            outFile.write(seq)
-            outFile.close()
-            print "Added contig file %s to database" % outFileName
-            btGenome.addChromosomeEntry(chromID, outFileName, "file")
+    print "Adding gene entries"
+    loadGeneEntries(db, genePath)
  
-        header = currentLine
+    print "Adding gene features"
+    loadGeneFeatures(db, genePath)
  
-    inFile.close()
+    print "Loading sequences"
+    loadChromosome(db, chromoPath, chromoOutPath)
+
+    print "Creating Indices"
+    createDBindices(db)
+
+    print "Finished creating database %s" % db
+
+
+def createDBFile(db):
+    btGenome = Genome("btaurus",  dbFile=db)
+    btGenome.createGeneDB(db)
  
  
  def loadGeneEntries(db, gFile):
-    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
-    """
+    #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
+
      geneEntries = []
      btGenome = Genome("btaurus", dbFile=db)
      geneFile = open(gFile, "r")
@@ -103,24 +95,6 @@ def loadGeneEntries(db, gFile):
      btGenome.addGeneEntryBatch(geneEntries)
  
  
-def loadGeneAnnotations(db, annotPath):
-    geneAnnotations = []
-    annotFile = open(annotPath, "r")
-    btGenome = Genome("btaurus", dbFile=db)
-    for line in annotFile:
-        try:
-            cols = line.split("\t")
-            locID = cols[0]
-            geneDesc = cols[6]
-            if len(locID) > 0:
-                geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
-        except:
-            pass
-
-    print "Adding %d annotations" % len(geneAnnotations)
-    btGenome.addAnnotationBatch(geneAnnotations)
-
-
  def loadGeneFeatures(db, gfile):
      geneFile = open(gfile, "r")
      senseArray = {"+": "F",
@@ -194,11 +168,29 @@ def loadGeneFeatures(db, gfile):
      btGenome.addFeatureEntryBatch(insertArray)
  
  
+def loadGeneAnnotations(db, annotPath):
+    geneAnnotations = []
+    annotFile = open(annotPath, "r")
+    btGenome = Genome("btaurus", dbFile=db)
+    for line in annotFile:
+        try:
+            cols = line.split("\t")
+            locID = cols[0]
+            geneDesc = cols[6]
+            if len(locID) > 0:
+                geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
+        except:
+            pass
+
+    print "Adding %d annotations" % len(geneAnnotations)
+    btGenome.addAnnotationBatch(geneAnnotations)
+
+
  def loadGeneOntology(db, goPath, goDefPath, annotPath):
      btGenome = Genome("btaurus", dbFile=db)
      goDefFile = open(goDefPath, "r")
      goFile = open(goPath, "r")
-    annotFile = open(annotPath, "r")  
+    annotFile = open(annotPath, "r")
      annotEntries = annotFile.readlines()
      annotFile.close()
      goDefEntries = goDefFile.readlines()
@@ -238,34 +230,42 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath):
      btGenome.addGoInfoBatch(goArray)
  
  
-def createDBFile(db):
-    btGenome = Genome("btaurus",  dbFile=db)
-    btGenome.createGeneDB(db)
-
-
-def createDBindices(db):
+def loadChromosome(db, chromPath, chromOutPath):
+    seqArray = []
+    seqLen = 0
      btGenome = Genome("btaurus", dbFile=db)
-    btGenome.createIndices()
-
+    inFile = open(chromPath, "r")
+    header = inFile.readline()
+    while header != "":
+        seqArray = []
+        seqLen = 0
+        chromID = header.strip()[1:]
+        currentLine = inFile.readline()
  
-def buildBtaurusDB(db=geneDB):
-    genePath = "%s/download/bt2/genscan.txt" % cisRoot
-    chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
-    chromoOutPath = "/B_taurus/"
-    
-    print "Creating database %s" % db
-    createDBFile(db)
+        while currentLine != "" and currentLine[0] != ">":
+            lineSeq = currentLine.strip()
+            seqLen += len(lineSeq)
+            seqArray.append(lineSeq)
+            currentLine = inFile.readline()
  
-    print "Adding gene entries"
-    loadGeneEntries(db, genePath)
+        seq = string.join(seqArray, "")
+        if seqLen < 500000:
+            print "Added contig %s to database" % chromID
+            btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
+            btGenome.addChromosomeEntry(chromID, chromID, "db")
+        else:
+            outFileName = "%s%s.bin" % (chromOutPath, chromID)
+            outFile = open("%s%s" % (cisRoot, outFileName), "w")
+            outFile.write(seq)
+            outFile.close()
+            print "Added contig file %s to database" % outFileName
+            btGenome.addChromosomeEntry(chromID, outFileName, "file")
  
-    print "Adding gene features"
-    loadGeneFeatures(db, genePath)
+        header = currentLine
  
-    print "Loading sequences" 
-    loadChromosome(db, chromoPath, chromoOutPath)
+    inFile.close()
  
-    print "Creating Indices"
-    createDBindices(db)
  
-    print "Finished creating database %s" % db
-\ No newline at end of file
+def createDBindices(db):
+    btGenome = Genome("btaurus", dbFile=db)
+    btGenome.createIndices()