first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / cfamiliaris.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Canis familiaris
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT")
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/C_familiaris/cfamiliaris.genedb" % cisRoot
41
42
43 def buildDogDB(db=geneDB):
44     genePath = "%s/download/seq_gene.md" % cisRoot
45     print "Creating database %s" % db
46     createDBFile(db)
47
48     print "Adding gene entries"
49     loadGeneEntries(db, genePath)
50
51     print "Adding gene features"
52     loadGeneFeatures(db, genePath)
53
54     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
55                  "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
56                  "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
57                  "31", "32", "33", "34", "35", "36", "37", "38", "X", "Un"
58     ]
59     for chromID in chromList:
60         print "Loading chromosome %s" % chromID
61         chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
62         loadChromosome(db, chromID, chromPath, "/C_familiaris/chromo%s.bin" % chromID)
63
64     print "Creating Indices"
65     createDBindices(db)
66
67     print "Finished creating database %s" % db
68
69
70 def createDBFile(db):
71     cfGenome = Genome("cfamiliaris",  dbFile=db)
72     cfGenome.createGeneDB(db)
73
74
75 def loadGeneEntries(db, gFile):
76     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
77
78     geneEntries = []
79     alreadySeen = []
80     cfGenome = Genome("cfamiliaris", dbFile=db)
81     geneFile = open(gFile, "r")
82     geneFile.readline()
83     for line in geneFile:
84         cols = line.split("\t")
85         if cols[11].strip() != "GENE":
86             continue
87
88         name = cols[10].split(":")
89         gid = name[1]
90         if gid == "" or gid in alreadySeen:
91             continue
92
93         alreadySeen.append(gid)
94         start = int(cols[2]) - 1
95         stop = int(cols[3]) - 1
96         sense = cols[4]
97         chrom = cols[1].strip()
98         if sense == "+":
99             sense = "F"
100         else:
101             sense = "R"
102
103         geneID = ("cfamiliaris", gid)
104         gidVersion = 1
105         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
106
107     print "Adding %d gene entries" % len(geneEntries)
108     cfGenome.addGeneEntryBatch(geneEntries)
109
110
111 def loadGeneFeatures(db, gFile):
112     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
113     """
114     featureEntries = []
115     cfGenome = Genome("cfamiliaris", dbFile=db)
116     featureFile = open(gFile, "r")
117     featureFile.readline()
118     for line in featureFile:
119         cols = line.split("\t")
120         if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"]:
121             continue
122
123         fType = cols[11]
124         name = cols[10].split(":")
125         gid = name[1]
126         if gid == "":
127             continue
128
129         start = int(cols[2]) - 1
130         stop = int(cols[3]) - 1
131         sense = cols[4]
132         chrom = cols[1].strip()
133         if sense == "+":
134             sense = "F"
135         else:
136             sense = "R"
137
138         geneID = ("cfamiliaris", gid)
139         gidVersion = 1
140         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
141
142     print "Adding %d feature entries" % len(featureEntries)
143     cfGenome.addFeatureEntryBatch(featureEntries)
144
145
146 def loadGeneAnnotations(db, annotPath):
147     geneAnnotations = []
148     annotFile = open(annotPath, "r")
149     cfGenome = Genome("cfamiliaris", dbFile=db)
150     for line in annotFile:
151         try:
152             cols = line.split("\t")
153             locID = cols[0]
154             geneDesc = cols[6]
155             if len(locID) > 0:
156                 geneAnnotations.append((("cfamiliaris", locID), string.replace(geneDesc.strip(), "'", "p")))
157         except:
158             pass
159
160     print "Adding %d annotations" % len(geneAnnotations)
161     cfGenome.addAnnotationBatch(geneAnnotations)
162
163
164 def loadGeneOntology(db, goPath, goDefPath):
165     cfGenome = Genome("cfamiliaris", dbFile=db)
166     goDefFile = open(goDefPath, "r")
167     goFile = open(goPath, "r")
168     idb = geneinfoDB()
169     goDefs = {}
170     goArray = []
171     for goDefEntry in goDefFile:
172         if goDefEntry[0] != "!":
173             cols = goDefEntry.split("\t")
174             goDefs[cols[0]] = (cols[1], cols[2].strip())
175
176     goEntries = goFile.readlines()
177     prevGID = ''
178     for entry in goEntries:
179         try:
180             fields = entry.split("\t")
181             if fields[0] != "9615":
182                 continue
183
184             locID = fields[1].strip()
185             gID = ("cfamiliaris", locID)
186             if prevGID != gID:
187                 prevGID = gID
188                 gene_name = ""
189                 synonyms = idb.geneIDSynonyms(gID)
190                 if len(synonyms) >0:
191                     for entry in synonyms:
192                         gene_name += ","
193                         gene_name += entry
194                 else:
195                     gene_name = " "
196
197             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
198         except:
199             print "locus ID %s could not be added" % locID
200             pass
201
202     print "adding %d go entries" % len(goArray)
203     cfGenome.addGoInfoBatch(goArray)
204
205
206 def loadChromosome(db, chromID, chromPath, chromOut):
207     seqArray = []
208     cfGenome = Genome("cfamiliaris", dbFile=db)
209     inFile = open(chromPath, "r")
210     line = inFile.readline()
211     for line in inFile:
212         seqArray.append(line.strip())
213
214     seq = string.join(seqArray, "")
215     seqLen = len(seq)
216     if seqLen < 1:
217         print "Problems reading sequence from file"
218
219     print "writing to file %s" % chromOut
220     outFile = open("%s%s" % (cisRoot, chromOut), "w")
221     outFile.write(seq)
222     outFile.close()
223     cfGenome.addChromosomeEntry(chromID, chromOut, "file")
224
225
226 def createDBindices(db):
227     cfGenome = Genome("cfamiliaris", dbFile=db)
228     cfGenome.createIndices()