erange version 4.0a dev release
[erange.git] / stallCategory.py
1 #
2 #  stallCategory.py
3 #  ENRAGE
4 #
5
6 try:
7         import psyco
8         psyco.full()
9 except:
10         pass
11
12 import sys
13 import optparse
14 from commoncode import getConfigParser, getConfigOption, getConfigFloatOption
15
16 print "stallCategory: version 1.2"
17
18
19 def main(argv=None):
20     if not argv:
21         argv = sys.argv
22
23     usage = "usage: python %prog stalledPercentFile1 stalledPercentFile2 transcriptFile [--out oufile] [--statout statoutfile] [--expression level]"
24
25     parser = makeParser(usage)
26     (options, args) = parser.parse_args(argv[1:])
27
28     if len(args) < 3:
29         print usage
30         sys.exit(1)
31
32     infile1 = args[1]
33     infile2 = args[2]
34     transcriptFile = args[2]
35
36     stallCategory(infile1, infile2, transcriptFile, options.outFileName, options.statOutFileName, options.expressionLevel)
37
38
39 def makeParser(usage=""):
40     parser = optparse.OptionParser(usage=usage)
41     parser.add_option("--out", dest="outFileName")
42     parser.add_option("--statout", dest="statOutFileName")
43     parser.add_option("--expression", type="float", dest="expressionLevel")
44
45     configParser = getConfigParser()
46     section = "stallCategory"
47     outFileName = getConfigOption(configParser, section, "outFileName", None)
48     statOutFileName = getConfigOption(configParser, section, "statOutFileName", None)
49     expressionLevel = getConfigFloatOption(configParser, section, "expressionLevel", 0.9)
50
51     parser.set_defaults(outFileName=outFileName, statOutFileName=statOutFileName, expressionLevel=expressionLevel)
52
53     return parser
54
55
56 def stallCategory(inFile1Name, inFile2Name, transcriptFileName, outFileName=None, statOutFileName=None, expressionLevel=0.9):
57
58     infile1 = open(inFile1Name)
59     infile2 = open(inFile2Name)
60     transcriptFile = open(transcriptFileName)
61
62     writeOut = False
63     if outFileName is not None:
64         outfile = open(outFileName, "w")
65         outfile.write("gene\texpression\tratio1\tpromAmount1\ttotal1\trestRPKM1\tratio2\tpromAmount2\ttotal2\trestRPKM2\n")
66         writeOut = True
67
68     statWriteOut = False
69     if statOutFileName is not None:
70         statoutfile = open(statOutFileName, "w")
71         statoutfile.write("ExpressionR1R2Stalled1Stalled2\tCount\n")
72         statWriteOut = True
73
74     dictOne = {}
75     dictTwo = {}
76     expressionDict = {}
77
78     for line in infile1:
79         if "short" in line:
80             continue
81
82         fields = line.strip().split()
83         promAmount = float(fields[4]) + float(fields[5])
84         genelen = float(fields[3])/100
85         total = float(fields[2])
86         if total < 0.1:
87             total = 0.1
88
89         restRPKM = (total * (1. - promAmount/100.))/ (genelen - 0.6)
90         ratio = float(fields[-1])
91         dictOne[fields[1]] = (ratio, promAmount, total, restRPKM)
92
93     for line in infile2:
94         if "short" in line:
95             continue
96
97         fields = line.strip().split()
98         promAmount = float(fields[4]) + float(fields[5])
99         genelen = float(fields[3])/100
100         if promAmount == 0.:
101             promAmount = 0.1
102
103         total = float(fields[2])
104         if total < 0.1:
105             total = 0.1
106
107         restRPKM = (total * (1. - promAmount/100.))/ (genelen - 0.6)
108         ratio = float(fields[-1])
109         dictTwo[fields[1]] = (ratio, promAmount, total, restRPKM)
110
111     for line in transcriptFile:
112         (gene, transc, transcpercell) = line.strip().split()
113         expressionDict[gene] = float(transcpercell)
114
115     categoryList = []
116     categoryDict = {}
117     for atype in ["HH", "HL", "LH", "LL"]:
118         for expression in ["E", "N"]:
119             for cat1 in ["Y", "N"]:
120                 for cat2 in ["Y", "N"]:
121                     category = expression + cat1 + cat2 + atype
122                     categoryList.append(category)
123                     categoryDict[category] = []
124
125     for gene in dictOne:
126         if gene not in expressionDict:
127             if writeOut:
128                 print "%s is not in expressionDict - skipping" % gene
129
130             continue
131
132         expression = expressionDict[gene]
133         (ratio1, promAmount1, total1, restRPKM1) = dictOne[gene]
134         (ratio2, promAmount2, total2, restRPKM2) = dictTwo[gene]
135
136         if expression > expressionLevel:
137             category = "E"
138         else:
139             category = "N"
140
141         if total1 > 5.0:
142             category += "Y"
143         else:
144             category += "N"
145
146         if total2 > 5.0:
147             category += "Y"
148         else:
149             category += "N"
150
151         if ratio1 > 15:
152             category += "H"
153         else:
154             category += "L"
155
156         if ratio2 > 15:
157             category += "H"
158         else:
159             category += "L"
160
161         categoryDict[category].append(gene)
162         if writeOut:
163             outfile.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n" % (gene, expression, ratio1, promAmount1, total1, restRPKM1, ratio2, promAmount2, total2, restRPKM2, category)
164 )
165         else:
166             print "%s %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %s" % (gene, expression, ratio1, promAmount1, total1, restRPKM1, ratio2, promAmount2, total2, restRPKM2, category)
167
168     if writeOut:
169         outfile.close()
170
171     for category in categoryList:
172         if statWriteOut:
173             statoutfile.write("%s\t%d\n" % (category, len(categoryDict[category])))
174         else:
175             print "%s %d" % (category, len(categoryDict[category]))
176
177
178 if __name__ == "__main__":
179     main(sys.argv)