first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / ecaballus.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Equus Caballus
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT")
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/E_caballus/ecaballus.genedb" % cisRoot
41
42
43 def buildHorseDB(db=geneDB):
44     genePath = "%s/download/seq_gene.md" % cisRoot
45
46     print "Creating database %s" % db
47     createDBFile(db)
48
49     print "Adding gene entries"
50     loadGeneEntries(db, genePath)
51
52     print "Adding gene features"
53     loadGeneFeatures(db, genePath)
54
55     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
56                  "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
57                  "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
58                  "31", "M", "X", "Un"
59     ]
60     for chromID in chromList:
61         print "Loading chromosome %s" % chromID
62         chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
63         loadChromosome(db, chromID, chromPath, "/E_caballus/chromo%s.bin" % chromID)
64
65     print "Creating Indices"
66     createDBindices(db)
67
68     print "Finished creating database %s" % db
69
70
71 def createDBFile(db):
72     ecGenome = Genome("ecaballus",  dbFile=db)
73     ecGenome.createGeneDB(db)
74
75
76 def loadGeneEntries(db, gFile):
77     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
78
79     geneEntries = []
80     alreadySeen = []
81     ecGenome = Genome("ecaballus", dbFile=db)
82     geneFile = open(gFile, "r")
83     geneFile.readline()
84     for line in geneFile:
85         cols = line.split("\t")
86         if cols[11].strip() != "GENE":
87             continue
88
89         name = cols[10].split(":")
90         gid = name[1]
91         if gid == "" or gid in alreadySeen:
92             continue
93
94         alreadySeen.append(gid)
95         start = int(cols[2]) - 1
96         stop = int(cols[3]) - 1
97         sense = cols[4]
98         chrom = cols[1].strip()
99         if sense == "+":
100             sense = "F"
101         else:
102             sense = "R"
103
104         geneID = ("ecaballus", gid)
105         gidVersion = 1
106         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
107
108     print "Adding %d gene entries" % len(geneEntries)
109     ecGenome.addGeneEntryBatch(geneEntries)
110
111
112 def loadGeneFeatures(db, gFile):
113     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
114     """
115     featureEntries = []
116     ecGenome = Genome("ecaballus", dbFile=db)
117     featureFile = open(gFile, "r")
118     featureFile.readline()
119     for line in featureFile:
120         cols = line.split("\t")
121         if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"]:
122             continue
123
124         fType = cols[11]
125         name = cols[10].split(":")
126         gid = name[1]
127         if gid == "":
128             continue
129
130         start = int(cols[2]) - 1
131         stop = int(cols[3]) - 1
132         sense = cols[4]
133         chrom = cols[1].strip()
134         if sense == "+":
135             sense = "F"
136         else:
137             sense = "R"
138
139         geneID = ("ecaballus", gid)
140         gidVersion = 1
141         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
142
143     print "Adding %d feature entries" % len(featureEntries)
144     ecGenome.addFeatureEntryBatch(featureEntries)
145
146
147 def loadChromosome(db, chromID, chromPath, chromOut):
148     seqArray = []
149     ecGenome = Genome("ecaballus", dbFile=db)
150     inFile = open(chromPath, "r")
151     line = inFile.readline()
152     for line in inFile:
153         seqArray.append(line.strip())
154
155     seq = string.join(seqArray, "")
156     seqLen = len(seq)
157     if seqLen < 1:
158         print "Problems reading sequence from file"
159
160     print "writing to file %s" % chromOut
161     outFile = open("%s%s" % (cisRoot, chromOut), "w")
162     outFile.write(seq)
163     outFile.close()
164     ecGenome.addChromosomeEntry(chromID, chromOut, "file")
165
166
167 def createDBindices(db):
168     ecGenome = Genome("ecaballus", dbFile=db)
169     ecGenome.createIndices()