first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / partition.py
1 #
2 #  partition.py
3 #  ENRAGE
4 #
5 """ usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [options]
6            where the regionfiles must be comma-separated with no white space
7            -minFeature controls the size of the smallest partition
8 """
9
10 try:
11     import psyco
12     psyco.full()
13 except:
14     pass
15
16 import sys
17 import string
18 import optparse
19 from commoncode import getMergedRegions, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
20
21 versionString = "partition: version 2.1"
22 print versionString
23
24
25 def main(argv=None):
26     if not argv:
27         argv = sys.argv
28
29     usage = "usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [options]"
30
31     parser = getParser(usage)
32     (options, args) = parser.parse_args(argv[1:])
33
34     if len(args) < 3:
35         print usage
36         sys.exit(1)
37
38     mergeID = args[0]
39     regionfiles = args[1]
40     outfilename = args[2]
41
42     if options.padregion:
43         print "padding %d bp on each side of a region" % options.padregion
44
45     if options.mergeregion:
46         print "merging regions closer than %d bp" % options.mergeregion
47
48     if options.locID:
49         print "using locations as region ID"
50
51     if options.ignoreRandom:
52         print "ignoring 'random' chromosomes"
53
54     partition(mergeID, regionfiles, outfilename, options.minFeature, options.cField,
55               options.padregion, options.locID, options.ignoreRandom, options.mergeregion,
56               options.merging, options.logfilename)
57
58
59 def getParser(usage):
60     parser = optparse.OptionParser(usage=usage)
61     parser.add_option("--minFeature", type="int", dest="minFeature",
62                       help="size of smallest partition")
63     parser.add_option("--chromField", type="int", dest="cField",
64                       help="num chromosome fields")
65     parser.add_option("--padregion", type="int", dest="padregion",
66                       help="padding on each side of region")
67     parser.add_option("--mergeregion", type="int", dest="mergeregion",
68                       help="bp threshold to merge regions")
69     parser.add_option("--nomerge", action="store_false", dest="merging",
70                       help="do not merge regions")
71     parser.add_option("--log", dest="logfilename",
72                       help="log file")
73     parser.add_option("--locID", action="store_true", dest="locID",
74                       help="use location as region ID")
75     parser.add_option("--norandom", action="store_true", dest="ignoreRandom",
76                       help="ignore 'random' chromosomes")
77
78     configParser = getConfigParser()
79     section = "partition"
80     minFeature = getConfigIntOption(configParser, section, "minFeature", 25)
81     cField = getConfigIntOption(configParser, section, "cField", 1)
82     padregion = getConfigIntOption(configParser, section, "padregion", 1)
83     locID = getConfigBoolOption(configParser, section, "locID", False)
84     ignoreRandom = getConfigBoolOption(configParser, section, "ignoreRandom", False)
85     mergeregion = getConfigIntOption(configParser, section, "mergeregion", 0)
86     merging = getConfigBoolOption(configParser, section, "merging", True)
87     logfilename = getConfigOption(configParser, section, "logfilename", "partition.log")
88
89     parser.set_defaults(minFeature=minFeature, cField=cField, padregion=padregion, locID=locID,
90                         ignoreRandom=ignoreRandom, mergeregion=mergeregion, merging=merging,
91                         logfilename=logfilename)
92
93     return parser
94
95
96 def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padregion=0,
97               locID=False, ignoreRandom=False, mergeregion=0, merging=True,
98               logfilename="partition.log"):
99
100     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
101
102     allregionsDict = {}
103     regionFileList = regionfiles.split(',')
104     numRegions = len(regionFileList)
105     chromList = []
106     for regionID in range(numRegions):
107         allregionsDict[regionID] = getMergedRegions(regionFileList[regionID], maxDist = mergeregion,
108                                                     minHits=-1, fullChrom=True, verbose=True, chromField=cField,
109                                                     doMerge=merging, pad=padregion)
110
111         for achrom in allregionsDict[regionID]:
112             if achrom not in chromList:
113                 chromList.append(achrom)
114             
115     outregionDict = {}
116
117     chromList = sorted(chromList)
118
119     for chrom in chromList:
120         if ignoreRandom and "random" in chrom:
121             continue
122
123         outregionDict[chrom] = []
124         pointList = []
125         for regionID in range(numRegions):
126             if chrom in allregionsDict[regionID]:
127                 for region in allregionsDict[regionID][chrom]:
128                     pointList.append(region.start)
129                     pointList.append(region.stop)
130
131         pointList.sort()
132         start = 0
133         for point in pointList:
134             if (point - start) > minFeature:
135                 outregionDict[chrom].append((start, point - 1, point - 1 - start))
136                 start = point
137
138     outfile = open(outfilename, "w")
139     if locID:
140         outfile.write("#chrom:start-stop\tchrom\tstart\tstop\tlength_kb\n")
141     else:
142         outfile.write("#labelID\tchrom\tstart\tstop\tlength_kb\n")
143
144     index = 0
145     for chrom in outregionDict:
146         for (start, stop, length) in outregionDict[chrom]:
147             index += 1
148             if locID:
149                 label = "%s:%d-%d" % (chrom, start, stop)
150             else:
151                 label = "%s%d" % (mergeID, index)
152
153             outfile.write("%s\t%s\t%d\t%d\t%.3f\n" % (label, chrom, start, stop, length/1000.))
154
155     message = "%s was partitioned into %d regions" % (mergeID, index)
156     print message
157     writeLog(logfilename, versionString, message)
158
159 if __name__ == "__main__":
160     main(sys.argv)