snapshot of 4.0a development. initial git repo commit
[erange.git] / stallCategory.py
1 #
2 #  stallCategory.py
3 #  ENRAGE
4 #
5
6 try:
7         import psyco
8         psyco.full()
9 except:
10         pass
11
12 import sys
13 import optparse
14
15 print "%prog: version 1.1"
16
17
18 def main(argv=None):
19     if not argv:
20         argv = sys.argv
21
22     usage = "usage: python %prog stalledPercentFile1 stalledPercentFile2 transcriptFile [--out oufile] [--statout statoutfile] [--expression level]"
23
24     parser = optparse.OptionParser(usage=usage)
25     parser.add_option("--out", dest="outFileName")
26     parser.add_option("--statout", dest="statOutFileName")
27     parser.add_option("--expression", type="float", dest="expressionLevel")
28     parser.set_defaults(outFileName=None, statOutFileName=None, expressionLevel=0.9)
29     (options, args) = parser.parse_args(argv[1:])
30
31     if len(args) < 3:
32         print usage
33         sys.exit(1)
34
35     infile1 = args[1]
36     infile2 = args[2]
37     transcriptFile = args[2]
38
39     stallCategory(infile1, infile2, transcriptFile, options.outFileName, options.statOutFileName, options.expressionLevel)
40
41
42 def stallCategory(inFile1Name, inFile2Name, transcriptFileName, outFileName=None, statOutFileName=None, expressionLevel=0.9):
43
44     infile1 = open(inFile1Name)
45     infile2 = open(inFile2Name)
46     transcriptFile = open(transcriptFileName)
47
48     writeOut = False
49     if outFileName is not None:
50         outfile = open(outFileName, "w")
51         outfile.write("gene\texpression\tratio1\tpromAmount1\ttotal1\trestRPKM1\tratio2\tpromAmount2\ttotal2\trestRPKM2\n")
52         writeOut = True
53
54     statWriteOut = False
55     if statOutFileName is not None:
56         statoutfile = open(statOutFileName, "w")
57         statoutfile.write("ExpressionR1R2Stalled1Stalled2\tCount\n")
58         statWriteOut = True
59
60     dictOne = {}
61     dictTwo = {}
62     expressionDict = {}
63
64     for line in infile1:
65         if "short" in line:
66             continue
67
68         fields = line.strip().split()
69         promAmount = float(fields[4]) + float(fields[5])
70         genelen = float(fields[3])/100
71         total = float(fields[2])
72         if total < 0.1:
73             total = 0.1
74
75         restRPKM = (total * (1. - promAmount/100.))/ (genelen - 0.6)
76         ratio = float(fields[-1])
77         dictOne[fields[1]] = (ratio, promAmount, total, restRPKM)
78
79     for line in infile2:
80         if "short" in line:
81             continue
82
83         fields = line.strip().split()
84         promAmount = float(fields[4]) + float(fields[5])
85         genelen = float(fields[3])/100
86         if promAmount == 0.:
87             promAmount = 0.1
88
89         total = float(fields[2])
90         if total < 0.1:
91             total = 0.1
92
93         restRPKM = (total * (1. - promAmount/100.))/ (genelen - 0.6)
94         ratio = float(fields[-1])
95         dictTwo[fields[1]] = (ratio, promAmount, total, restRPKM)
96
97     for line in transcriptFile:
98         (gene, transc, transcpercell) = line.strip().split()
99         expressionDict[gene] = float(transcpercell)
100
101     categoryList = []
102     categoryDict = {}
103     for atype in ["HH", "HL", "LH", "LL"]:
104         for expression in ["E", "N"]:
105             for cat1 in ["Y", "N"]:
106                 for cat2 in ["Y", "N"]:
107                     category = expression + cat1 + cat2 + atype
108                     categoryList.append(category)
109                     categoryDict[category] = []
110
111     for gene in dictOne:
112         if gene not in expressionDict:
113             if writeOut:
114                 print "%s is not in expressionDict - skipping" % gene
115
116             continue
117
118         expression = expressionDict[gene]
119         (ratio1, promAmount1, total1, restRPKM1) = dictOne[gene]
120         (ratio2, promAmount2, total2, restRPKM2) = dictTwo[gene]
121
122         if expression > expressionLevel:
123             category = "E"
124         else:
125             category = "N"
126
127         if total1 > 5.0:
128             category += "Y"
129         else:
130             category += "N"
131
132         if total2 > 5.0:
133             category += "Y"
134         else:
135             category += "N"
136
137         if ratio1 > 15:
138             category += "H"
139         else:
140             category += "L"
141
142         if ratio2 > 15:
143             category += "H"
144         else:
145             category += "L"
146
147         categoryDict[category].append(gene)
148         if writeOut:
149             outfile.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n" % (gene, expression, ratio1, promAmount1, total1, restRPKM1, ratio2, promAmount2, total2, restRPKM2, category)
150 )
151         else:
152             print "%s %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %s" % (gene, expression, ratio1, promAmount1, total1, restRPKM1, ratio2, promAmount2, total2, restRPKM2, category)
153
154     if writeOut:
155         outfile.close()
156
157     for category in categoryList:
158         if statWriteOut:
159             statoutfile.write("%s\t%d\n" % (category, len(categoryDict[category])))
160         else:
161             print "%s %d" % (category, len(categoryDict[category]))
162
163
164 if __name__ == "__main__":
165     main(sys.argv)