first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneStartBins.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys
8 from commoncode import getReadSense, getGeneInfoDict, getHeaderComment
9 import pysam
10 from cistematic.genomes import Genome
11
12 print "geneStartBins: version 2.1"
13 if len(sys.argv) < 4:
14     print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
15     print '\n\twhere regionSize is the optional maximum region in bp\n'
16     sys.exit(1)
17
18 genome = sys.argv[1]
19 hitfile =  sys.argv[2]
20 outfilename = sys.argv[3]
21 standardMinDist = 3000
22 if '-max' in sys.argv:
23     standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
24
25 if '-raw' in sys.argv:
26     normalize = False
27     normalizeBins = False
28 else:
29     normalize = True
30     normalizeBins = True    
31
32 doCache = False
33 if '-cache' in sys.argv:
34     doCache = True
35
36 bins = 10
37 standardMinThresh = standardMinDist / bins
38 bamfile = pysam.Samfile(hitfile, "rb")
39 readlen = getHeaderComment(bamfile.header, "ReadLength")
40 normalizationFactor = 1.0
41 if normalize:
42     totalCount = getHeaderComment(bamfile.header, "Total")
43     normalizationFactor = totalCount / 1000000.
44
45 hg = Genome(genome)
46 gidDict = {}
47 geneinfoDict = getGeneInfoDict(genome, cache=True)
48 featuresDict = hg.getallGeneFeatures()
49 outfile = open(outfilename,'w')
50 gidList = hg.allGIDs()
51 gidList.sort()
52 chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
53 for gid in gidList:
54     symbol = 'LOC' + gid
55     geneinfo = ''
56     featureList = []
57     try:
58         geneinfo = geneinfoDict[gid]
59         featureList = featuresDict[gid]
60         symbol = geneinfo[0][0]
61     except:
62         print geneinfo
63
64     newfeatureList = []
65     if len(featureList) == 0:
66         continue
67
68     for (ftype, chrom, start, stop, fsense) in featureList:
69         if (start, stop) not in newfeatureList:
70             newfeatureList.append((start, stop))
71
72     if chrom not in chromosomeList:
73         continue
74
75     newfeatureList.sort()
76     if len(newfeatureList) < 1:
77         continue
78
79     glen = standardMinDist / 2
80     if fsense == 'F':
81         nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
82         if nextGene < glen * 2:
83             glen = nextGene / 2
84
85         if glen < 1:
86             glen = 1
87
88         gstart = newfeatureList[0][0] - glen
89         if gstart < 0:
90             gstart = 0
91
92         gstop = newfeatureList[0][0]  + glen
93     else:
94         nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
95         if nextGene < glen * 2:
96             glen = nextGene / 2
97
98         if glen < 1:
99             glen = 1
100
101         gstart = newfeatureList[-1][1] - glen
102         gstop = newfeatureList[-1][1] + glen
103
104     tagCount = 0
105     if glen < standardMinDist / 2:
106         continue
107
108     binList = [0] * bins
109     for alignedread in bamfile.fetch(chrom):
110         tagStart = alignedread.pos - gstart
111         sense = getReadSense(alignedread)
112         weight = 1.0/alignedread.opt('NH')
113         if tagStart >= 2 * glen:
114             break
115
116         if tagStart > 0:
117             tagCount += weight
118             if fsense == 'R':
119                 # we are relying on python's integer division quirk
120                 binID = tagStart / standardMinThresh 
121                 binList[binID] += weight
122             else:
123                 rdist = 2 * glen - tagStart
124                 binID = rdist / standardMinThresh 
125                 binList[binID] += weight
126
127     if tagCount < 2:
128         continue
129
130     print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
131     outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
132     for binAmount in binList:
133         outfile.write('\t%d' % binAmount)
134
135     outfile.write('\n')
136
137 outfile.close()
138