1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Equus Caballus
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/E_caballus/ecaballus.genedb" % cisRoot
43 def buildHorseDB(db=geneDB):
44 genePath = "%s/download/seq_gene.md" % cisRoot
46 print "Creating database %s" % db
49 print "Adding gene entries"
50 loadGeneEntries(db, genePath)
52 print "Adding gene features"
53 loadGeneFeatures(db, genePath)
55 chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
56 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
57 "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
60 for chromID in chromList:
61 print "Loading chromosome %s" % chromID
62 chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
63 loadChromosome(db, chromID, chromPath, "/E_caballus/chromo%s.bin" % chromID)
65 print "Creating Indices"
68 print "Finished creating database %s" % db
72 ecGenome = Genome("ecaballus", dbFile=db)
73 ecGenome.createGeneDB(db)
76 def loadGeneEntries(db, gFile):
77 #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
81 ecGenome = Genome("ecaballus", dbFile=db)
82 geneFile = open(gFile, "r")
85 cols = line.split("\t")
86 if cols[11].strip() != "GENE":
89 name = cols[10].split(":")
91 if gid == "" or gid in alreadySeen:
94 alreadySeen.append(gid)
95 start = int(cols[2]) - 1
96 stop = int(cols[3]) - 1
98 chrom = cols[1].strip()
104 geneID = ("ecaballus", gid)
106 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
108 print "Adding %d gene entries" % len(geneEntries)
109 ecGenome.addGeneEntryBatch(geneEntries)
112 def loadGeneFeatures(db, gFile):
113 """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
116 ecGenome = Genome("ecaballus", dbFile=db)
117 featureFile = open(gFile, "r")
118 featureFile.readline()
119 for line in featureFile:
120 cols = line.split("\t")
121 if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"]:
125 name = cols[10].split(":")
130 start = int(cols[2]) - 1
131 stop = int(cols[3]) - 1
133 chrom = cols[1].strip()
139 geneID = ("ecaballus", gid)
141 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
143 print "Adding %d feature entries" % len(featureEntries)
144 ecGenome.addFeatureEntryBatch(featureEntries)
147 def loadChromosome(db, chromID, chromPath, chromOut):
149 ecGenome = Genome("ecaballus", dbFile=db)
150 inFile = open(chromPath, "r")
151 line = inFile.readline()
153 seqArray.append(line.strip())
155 seq = string.join(seqArray, "")
158 print "Problems reading sequence from file"
160 print "writing to file %s" % chromOut
161 outFile = open("%s%s" % (cisRoot, chromOut), "w")
164 ecGenome.addChromosomeEntry(chromID, chromOut, "file")
167 def createDBindices(db):
168 ecGenome = Genome("ecaballus", dbFile=db)
169 ecGenome.createIndices()