makerdsfrombed.py

   1 #
   2 #  makerdsfrombed.py
   3 #  ENRAGE
   4 #
   5 #  Created by Ali Mortazavi on 6/21/08.
   6 #
   7 try:
   8     import psyco
   9     psyco.full()
  10 except:
  11     pass
  12
  13 import sys, string, optparse
  14 from commoncode import readDataset, writeLog
  15
  16 verstring = "%prog: version 2.1" % sys.argv[0]
  17 print verstring
  18
  19
  20 def main(argv=None):
  21     if not argv:
  22         argv = sys.argv
  23
  24     usage = "usage: python %prog label bedfile outrdsfile [--append] [--index] [propertyName::propertyValue] [--cache numPages]"
  25
  26     parser = optparse.OptionParser(usage=usage)
  27     parser.add_option("--append", action="store_false", dest="init")
  28     parser.add_option("--index", action="store_true", dest="doIndex")
  29     parser.add_option("--cache", type="int", dest="cachePages")
  30     parser.add_option("--RNA", action="store_true", dest="rnaDataType")
  31     parser.set_defaults(init=True, rnaDataType=False, doIndex=False, cachePages=100000)
  32     (options, args) = parser.parse_args(argv[1:])
  33
  34     if len(args) < 3:
  35         print usage
  36         print "\ntreats all imported reads as uniquely mapped\n"
  37         sys.exit(1)
  38
  39     label = args[0]
  40     filename = args[1]
  41     outdbname = args[2]
  42
  43     if options.rnaDataType:
  44         dataType = "RNA"
  45     else:
  46         dataType = "DNA"
  47
  48     propertyList = []
  49     for arg in args:
  50         if "::" in arg:
  51             (pname, pvalue) = arg.strip().split("::")
  52             propertyList.append((pname, pvalue))
  53
  54     makerdsfrombed(label, filename, outdbname, options.init, dataType, options.doIndex, options.cachePages, propertyList)
  55
  56
  57 def makerdsfrombed(label, filename, outdbname, init=True, dataType="DNA", doIndex=False, cachePages=100000, propertyList=[]):
  58     readsize = 0
  59     padsize = 0
  60     index = 0
  61     insertSize = 100000
  62
  63     writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
  64
  65     infile = open(filename,"r")
  66
  67     rds = readDataset(outdbname, init, dataType, verbose=True)
  68     if not init:
  69         rds.dropIndex()
  70
  71     #check that our cacheSize is better than the dataset's default cache size
  72     defaultCacheSize = rds.getDefaultCacheSize()
  73     if cachePages > defaultCacheSize:
  74         if init:
  75             rds.setDBcache(cachePages, default=True)
  76         else:
  77             rds.setDBcache(cachePages)
  78
  79     if len(propertyList) > 0:
  80         rds.insertMetadata(propertyList)
  81
  82     insertList = []
  83     for line in infile:
  84         if "track" in line:
  85             continue
  86
  87         fields = line.split()
  88         if readsize == 0:
  89             readsize = abs(int(fields[1]) - int(fields[2]))
  90             if init:
  91                 rds.insertMetadata([("readsize", readsize+1)])
  92                 rds.insertMetadata([("imported_from_bed", "True")])
  93
  94         chrom = fields[0]
  95         start = int(fields[1])
  96         stop = int(fields[2])
  97         sense = fields[5]
  98         readID = "%s-%s" % (label, str(index))
  99         insertList.append((readID, chrom, start, stop, sense, 1.0, "", ""))
 100         if index % insertSize == 0:
 101             rds.insertUniqs(insertList)
 102             insertList = []
 103             print ".",
 104             sys.stdout.flush()
 105
 106         index += 1
 107
 108     if len(insertList) > 0:
 109         rds.insertUniqs(insertList)
 110
 111     countString = "%d unique reads" % index
 112     print countString
 113
 114     writeLog(outdbname + ".log", verstring, countString)
 115
 116     if doIndex:
 117         print "building index...."
 118         if cachePages > defaultCacheSize:
 119             rds.setDBcache(cachePages)
 120             rds.buildIndex(cachePages)
 121         else:
 122             rds.buildIndex(defaultCacheSize)
 123
 124
 125 if __name__ == "__main__":
 126     main(sys.argv)