snapshot of 4.0a development. initial git repo commit
[erange.git] / partition.py
1 #
2 #  partition.py
3 #  ENRAGE
4 #
5 """ usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [--minFeature bp] [--padregion bp] [--mergeregion bp] [--nomerge] [--log altlogfile] [--locid] [--ignorerandom] [--chromField fieldNum]
6            where the regionfiles must be comma-separated with no white space
7            -minFeature controls the size of the smallest partition
8 """
9
10 try:
11     import psyco
12     psyco.full()
13 except:
14     pass
15
16 import sys, string, optparse
17 from commoncode import getMergedRegions, writeLog
18
19 versionString = '%s: version 2.0' % sys.argv[0]
20 print versionString
21
22
23 def main(argv=None):
24     if not argv:
25         argv = sys.argv
26
27     usage = "usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [options]"
28
29     parser = optparse.OptionParser(usage=usage)
30     parser.add_option("--minFeature", type="int", dest="minFeature",
31                       help="size of smallest partition")
32     parser.add_option("--chromField", type="int", dest="cField",
33                       help="num chromosome fields")
34     parser.add_option("--padregion", type="int", dest="padregion",
35                       help="padding on each side of region")
36     parser.add_option("--mergeregion", type="int", dest="mergeregion",
37                       help="bp threshold to merge regions")
38     parser.add_option("--nomerge", action="store_false", dest="merging",
39                       help="do not merge regions")
40     parser.add_option("--log", dest="logfilename",
41                       help="log file")
42     parser.add_option("--locID", action="store_true", dest="locID",
43                       help="use location as region ID")
44     parser.add_option("--norandom", action="store_true", dest="ignoreRandom",
45                       help="ignore 'random' chromosomes")
46     parser.set_defaults(minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log")
47     (options, args) = parser.parse_args(argv[1:])
48
49     if len(args) < 3:
50         print usage
51         sys.exit(1)
52
53     mergeID = args[0]
54     regionfiles = args[1]
55     outfilename = args[2]
56
57     if options.padregion:
58         print "padding %d bp on each side of a region" % options.padregion
59
60     if options.mergeregion:
61         print "merging regions closer than %d bp" % options.mergeregion
62
63     if options.locID:
64         print "using locations as region ID"
65
66     if options.ignoreRandom:
67         print "ignoring 'random' chromosomes"
68
69     partition(mergeID, regionfiles, outfilename, options.minFeature, options.cField, options.padregion, options.locID, options.ignoreRandom, options.mergeregion, options.merging, options.logfilename)
70
71
72 def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log"):
73
74     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
75
76     allregionsDict = {}
77     regionFileList = regionfiles.split(',')
78     numRegions = len(regionFileList)
79     chromList = []
80     for regionID in range(numRegions):
81         allregionsDict[regionID] = getMergedRegions(regionFileList[regionID], maxDist = mergeregion,  minHits=-1, fullChrom = True, verbose = True, chromField = cField, doMerge=merging, pad=padregion)
82         for achrom in allregionsDict[regionID]:
83             if achrom not in chromList:
84                 chromList.append(achrom)
85             
86     outregionDict = {}
87
88     chromList = sorted(chromList)
89
90     for chrom in chromList:
91         if ignoreRandom and 'random' in chrom:
92             continue
93
94         outregionDict[chrom] = []
95         pointList = []
96         for regionID in range(numRegions):
97             if chrom in allregionsDict[regionID]:
98                 for (rstart, rstop, rlength) in allregionsDict[regionID][chrom]:
99                     pointList.append(rstart)
100                     pointList.append(rstop)
101
102         pointList.sort()
103         start = 0
104         for point in pointList:
105             if (point - start) > minFeature:
106                 outregionDict[chrom].append((start, point - 1, point - 1 - start))
107                 start = point
108
109     outfile = open(outfilename, 'w')
110     if locID:
111         outfile.write('#chrom:start-stop\tchrom\tstart\tstop\tlength_kb\n')
112     else:
113         outfile.write('#labelID\tchrom\tstart\tstop\tlength_kb\n')
114
115     index = 0
116     for chrom in outregionDict:
117         for (start, stop, length) in outregionDict[chrom]:
118             index += 1
119             if locID:
120                 outfile.write("%s:%d-%d\t%s\t%d\t%d\t%.3f\n" % (chrom, start, stop, chrom, start, stop, length/1000.))
121             else:
122                 outfile.write("%s%d\t%s\t%d\t%d\t%.3f\n" % (mergeID, index, chrom, start, stop, length/1000.))
123
124     message = "%s was partitioned into %d regions" % (mergeID, index)
125     print message
126     writeLog(logfilename, versionString, message)
127
128 if __name__ == "__main__":
129     main(sys.argv)