erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / programs / fastcomp.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # fastcomp.py
31 from cistematic.programs import Program
32 import os, time, tempfile
33 from cistematic.core.motif import Motif
34 from cistematic.core import complement, sanitize
35 from os import environ
36
37 if environ.get("CISTEMATIC_TEMP"):
38     cisTemp = environ.get("CISTEMATIC_TEMP")
39 else:
40     cisTemp = "/tmp"
41
42 tempfile.tempdir = cisTemp
43
44
45 class Fastcomp(Program):
46     fastcompPath = Program.programRoot + "/fastcomp/"
47     motifs = []
48     argDict = {}
49     seq1name = ""
50     seq2name = ""
51     pairdir = ""
52     argLetter = {"windowsize": "-w", "threshold": "-t"}
53
54
55     def getSettings(self):
56         return self.argDict
57
58
59     def setSettings(self, settings):
60         self.motifs = []
61         self.argDict = settings
62
63
64     def buildInputFiles(self):
65         """ given a Fasta file (via the constructor or the inputFile() method), grab the 
66             first two sequences and save them as two temporary sequences.
67         """
68         line = " "
69         seq1 = ""
70         seq2 = ""
71         inFile = open(self.inputFilePath, "r")
72         try:
73             while line[0] != ">":
74                 line = inFile.readline()
75
76             self.seq1name = line[:-1]
77             line = inFile.readline()
78             while line[0] != ">":
79                 seq1 += line.strip()
80                 line = inFile.readline()
81
82             self.seq2name = line[:-1]
83             line = inFile.readline()
84             while line[0] != ">":
85                 seq2 += line.strip()
86                 line = inFile.readline()
87         except:
88             if len(seq2) > 0:
89                 pass
90             else:
91                 print "Error processing input file"
92                 return ("", "")
93
94         inFile.close()
95         self.fastdir = tempfile.mktemp()
96         try:
97             os.mkdir(self.fastdir)
98         except:
99             self.fastdir = cisTemp
100
101         self.inputFile1 = self.fastdir + "/seq1.fsa"
102         self.inputFile2 = self.fastdir + "/seq2.fsa"
103         seq1 = sanitize(seq1)
104         seq2 = sanitize(seq2)
105         seq1File = open(self.inputFile1, "w")
106         seq1File.write(seq1 + "\n")
107         seq1File.close()
108         seq2File = open(self.inputFile2, "w")
109         seq2File.write(seq2 + "\n")
110         seq2File.close()
111
112         return (seq1, seq2)
113
114
115     def buildCommand(self):
116         self.buildInputFiles()
117         cmd = self.fastcompPath + "fastcomp -1 " + self.inputFile1 + " -2 " + self.inputFile2
118         for arg in ["windowsize", "threshold"]:
119             cmd = cmd + " " + self.argLetter[arg] + "  " + str(self.argDict[arg])
120
121         cmd += " -o %s/fastcomp.out" % self.fastdir
122         print "cmd is %s" % (cmd)
123
124         return cmd
125
126
127     def setWindowSize(self, wsize):
128         self.argDict["windowsize"] = str(wsize)
129
130
131     def setThreshold(self, thresh):
132         self.argDict["threshold"] = thresh
133
134
135     def checkDefaults(self):
136         if "threshold" not in self.argDict:
137             self.argDict["threshold"] = 0.9
138
139         if "windowsize" not in self.argDict:
140             self.argDict["windowsize"] = 10
141
142
143     # run the program - preferrably after some of the other options have been set
144     def run(self):
145         startTime = time.time()
146         self.checkDefaults()
147         self.contents = os.popen(Fastcomp.buildCommand(self)).readlines()
148         outFile = open( self.fastdir + "/fastcomp.out")
149         self.contents = outFile.readlines()
150         outFile.close()
151         try:
152             filenames = os.listdir(self.fastdir)
153             for entry in filenames:
154                 os.remove(self.fastdir + "/" + entry)
155             os.rmdir(self.fastdir)
156         except:
157             print "error cleaning up directory %s" % self.fastdir
158
159         stopTime = time.time()
160
161         print "\nThis run took %.3f - %.3f = %.3f seconds" % (startTime, stopTime, stopTime - startTime)
162
163
164     def getWindows(self, seqNum="1"):
165         results = []
166         for line in self.contents:
167             (seq1pos, seq2pos, matches, sense) = line[:-1].split("\t")
168             if seqNum == "1":
169                 results.append((seq1pos, seq2pos, matches, sense))
170             else:
171                 results.append((seq2pos, seq1pos, matches, sense))
172
173         return results
174
175
176     def getMotifs(self):
177         self.motifs = []
178         thePWM = []
179         motSize = self.argDict["windowsize"]
180         matrixRow = {"A": 0, "C": 1, "G": 2, "T": 3}
181         (seq1, seq2)  = self.buildInputFiles()
182         currentPos = -1
183         for line in self.contents:
184             (seq1pos, seq2pos, matches, sense) = line[:-1].split("\t")
185             if int(seq1pos) > currentPos:
186                 currentPos = int(seq1pos)
187                 if len(thePWM) > 0:
188                     for pos in range(len(thePWM)):
189                         thePWM[pos][matrixRow["A"]] /= seqNum
190                         thePWM[pos][matrixRow["C"]] /= seqNum
191                         thePWM[pos][matrixRow["G"]] /= seqNum
192                         thePWM[pos][matrixRow["T"]] /= seqNum
193
194                     self.motifs.append(Motif(self.tagID + "-FASTCOMP-" + str(len(self.motifs) + 1), "", thePWM, [], 1.0, str(self.argDict["threshold"])))
195
196                 thePWM = []
197                 motseq1 = seq1[currentPos:currentPos+motSize]
198                 seqNum = 1.0
199                 for pos in range(motSize):
200                     thePWM.append([0.0, 0.0, 0.0, 0.0])
201                     NT = motseq1[pos]
202                     thePWM[pos][matrixRow[NT]] += 1
203
204             if sense == "1":
205                 motseq2 = seq2[int(seq2pos):int(seq2pos) + motSize]
206             else:
207                 motseq2 = complement(seq2[int(seq2pos) + 1 - motSize:int(seq2pos) + 1], motSize)
208
209             for pos in range(motSize):
210                 NT = motseq2[pos]
211                 thePWM[pos][matrixRow[NT]] += 1
212
213             seqNum += 1.0
214
215         if len(thePWM) > 0:
216             for pos in range(len(thePWM)):
217                 thePWM[pos][matrixRow["A"]] /= seqNum
218                 thePWM[pos][matrixRow["C"]] /= seqNum
219                 thePWM[pos][matrixRow["G"]] /= seqNum
220                 thePWM[pos][matrixRow["T"]] /= seqNum
221
222             self.motifs.append(Motif(self.tagID + "-FASTCOMP-" + str(len(self.motifs) + 1), "", thePWM, [], 1.0, str(self.argDict["threshold"])))
223
224         return self.motifs