makerdsfrombowtie.py

   1 #
   2 #  makerdsfrombowtie.py
   3 #  ENRAGE
   4 #
   5 #  Created by Ali Mortazavi on 10/20/08.
   6 #
   7
   8 try:
   9     import psyco
  10     psyco.full()
  11 except:
  12     pass
  13
  14 import sys
  15 import string
  16 import optparse
  17 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
  18 import ReadDataset
  19
  20 verstring = "makerdsfrombowtie: version 4.2"
  21 print verstring
  22
  23 def main(argv=None):
  24     if not argv:
  25         argv = sys.argv
  26
  27     usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
  28
  29     parser = getParser(usage)
  30     (options, args) = parser.parse_args(argv[1:])
  31
  32     if len(args) < 3:
  33         print usage
  34         sys.exit(1)
  35
  36     label = args[0]
  37     filename = args[1]
  38     outdbname = args[2]
  39
  40     propertyList = []
  41     for arg in args:
  42         if "::" in arg:
  43             (pname, pvalue) = arg.strip().split("::")
  44             propertyList.append((pname, pvalue))
  45
  46     makerdsfrombowtie(label, filename, outdbname, options.genedatafilename, options.init,
  47                       options.doIndex, options.spacer, options.trimReadID, options.forceID,
  48                       options.flip, options.verbose, options.stripSpace, options.cachePages,
  49                       propertyList)
  50
  51
  52 def getParser(usage):
  53     parser = optparse.OptionParser(usage=usage)
  54     parser.add_option("--RNA", dest="genedatafilename")
  55     parser.add_option("--append", action="store_false", dest="init")
  56     parser.add_option("--index", action="store_true", dest="doIndex")
  57     parser.add_option("--spacer", type="int", dest="spacer")
  58     parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
  59     parser.add_option("--forcepair", type="int", dest="forceID")
  60     parser.add_option("--flip", action="store_true", dest="flip")
  61     parser.add_option("--verbose", action="store_true", dest="verbose")
  62     parser.add_option("--strip", action="store_true", dest="stripSpace")
  63     parser.add_option("--cache", type="int", dest="cachePages")
  64
  65     configParser = getConfigParser()
  66     section = "makerdsfrom bowtie"
  67     genedatafilename = getConfigOption(configParser, section, "genedatafilename", None)
  68     init = getConfigBoolOption(configParser, section, "init", True)
  69     doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
  70     spacer = getConfigIntOption(configParser, section, "spacer", 2)
  71     trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
  72     forceID = getConfigOption(configParser, section, "forceID", None)
  73     flip = getConfigBoolOption(configParser, section, "flip", False)
  74     verbose = getConfigBoolOption(configParser, section, "verbose", False)
  75     stripSpace = getConfigBoolOption(configParser, section, "stripSpace", False)
  76     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
  77
  78     parser.set_defaults(genedatafilename=genedatafilename, init=init, doIndex=doIndex, spacer=spacer,
  79                         trimReadID=trimReadID, forceID=forceID, flip=flip, verbose=verbose,
  80                         stripSpace=stripSpace, cachePages=cachePages)
  81
  82     return parser
  83
  84
  85 def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,
  86                       doIndex=False, spacer=2, trimReadID=True, forceID=None,
  87                       flip=False, verbose=False, stripSpace=False, cachePages=100000,
  88                       propertyList=[]):
  89
  90     delimiter = "|"
  91
  92     dataType = "DNA"
  93     if genedatafilename is not None:
  94         dataType = "RNA"
  95         genedatafile = open(genedatafilename)
  96
  97
  98     forcePair = False
  99     if forceID is not None:
 100         forcePair = True
 101     else:
 102         forceID = 0
 103
 104     maxBorder = 0
 105     index = 0
 106     insertSize = 100000
 107
 108     writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
 109
 110     geneDict = {}
 111     mapDict = {}
 112     if dataType == "RNA":
 113         for line in genedatafile:
 114             fields = line.strip().split("\t")
 115             blockCount = int(fields[7])
 116             if blockCount < 2:
 117                 continue
 118
 119             uname = fields[0]
 120             chrom = fields[1]
 121             sense = fields[2]
 122             chromstarts = fields[8][:-1].split(",")
 123             chromstops = fields[9][:-1].split(",")
 124             exonLengths = []
 125             totalLength = 0
 126             for index in range(blockCount):
 127                 chromstarts[index] = int(chromstarts[index])
 128                 chromstops[index] = int(chromstops[index])
 129                 exonLengths.append(chromstops[index] - chromstarts[index])
 130                 totalLength += exonLengths[index]
 131
 132             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
 133             mapDict[uname] = []
 134
 135         genedatafile.close()
 136
 137     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
 138
 139     #check that our cacheSize is better than the dataset's default cache size
 140     defaultCacheSize = rds.getDefaultCacheSize()
 141     if cachePages > defaultCacheSize:
 142         if init:
 143             rds.setDBcache(cachePages, default=True)
 144         else:
 145             rds.setDBcache(cachePages)
 146
 147     if not init and doIndex:
 148         try:
 149             if rds.hasIndex():
 150                 rds.dropIndex()
 151         except:
 152             if verbose:
 153                 print "couldn't drop Index"
 154
 155     if len(propertyList) > 0:
 156         rds.insertMetadata(propertyList)
 157
 158     # make some assumptions based on first read
 159     infile = open(filename, "r")
 160     line = infile.readline()
 161     if stripSpace:
 162         line = line.replace(" ","")
 163
 164     fields = line.split()
 165     readsize = len(fields[5])
 166     pairedTest = fields[0][-2:]
 167     paired = False
 168     if pairedTest in ["/1", "/2"] or forcePair:
 169         print "assuming reads are paired"
 170         paired = True
 171
 172
 173     print "read size: %d bp" % readsize
 174     if init:
 175         rds.insertMetadata([("readsize", readsize)])
 176         if paired:
 177             rds.insertMetadata([("paired", "True")])
 178
 179     if "bowtie_mapped" not in rds.getMetadata():
 180         rds.insertMetadata([("bowtie_mapped", "True")])
 181
 182     if dataType == "RNA" and "spacer" not in rds.getMetadata():
 183         rds.insertMetadata([("spacer", spacer)])
 184
 185     infile.close()
 186
 187     trim = -4
 188     if dataType == "RNA":
 189         maxBorder = readsize + trim
 190
 191     infile = open(filename, "r")
 192     prevID = ""
 193     readList = []
 194     uInsertList = []
 195     mInsertList = []
 196     sInsertList = []
 197     index = uIndex = mIndex = sIndex = lIndex = 0
 198     for line in infile:
 199         lIndex += 1
 200         if stripSpace:
 201             line = line.replace(" ","")
 202
 203         fields = line.strip().split()
 204         readID = fields[0]
 205         if trimReadID:
 206             readID = string.join(readID.split(":")[1:], ":")
 207
 208         if readID != prevID:
 209             listlen = len(readList)
 210             if trimReadID:
 211                 prevID = "%s-%s" % (label, prevID)
 212
 213             if forcePair:
 214                 prevID += "/%d" % forceID
 215
 216             if listlen == 1:
 217                 (sense, chrom, start, mismatches) = readList[0]
 218                 if flip:
 219                     if sense == "+":
 220                         sense = "-"
 221                     else:
 222                         sense = "+"
 223
 224                 if "|" not in chrom:
 225                     stop = start + readsize
 226                     uInsertList.append((prevID, chrom, start, stop, sense, 1.0, "", mismatches))
 227                     uIndex += 1
 228                 elif dataType == "RNA":
 229                     currentSplice = chrom
 230                     (model, spliceID, regionStart) = currentSplice.split(delimiter)
 231                     if model not in geneDict:
 232                         prevID = readID
 233                     else:
 234                         (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
 235                         spliceID = int(spliceID)
 236                         rstart = int(start) - spacer
 237                         lefthalf = maxBorder - rstart
 238                         if lefthalf < 1 or lefthalf > maxBorder:
 239                             prevID = readID
 240                         else:
 241                             righthalf = readsize - lefthalf
 242                             startL = int(regionStart)  + rstart
 243                             stopL = startL + lefthalf
 244                             startR = chromstarts[spliceID + 1]
 245                             stopR = chromstarts[spliceID + 1] + righthalf
 246                             sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches))
 247                             sIndex += 1
 248             elif listlen > 1:
 249                 prevID = "%s::%s" % (prevID, str(listlen))
 250                 mIndex += 1
 251                 # ignore multireads that can also map across splices
 252                 skip = False
 253                 for (sense, chrom, start, mismatches) in readList:
 254                     if "|" in chrom:
 255                         skip = True
 256
 257                 if not skip:
 258                     for (sense, chrom, start, mismatches) in readList:
 259                         stop = start + readsize
 260                         if flip:
 261                             if sense == "+":
 262                                 sense = "-"
 263                             else:
 264                                 sense = "+"
 265
 266                         mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, "", mismatches))
 267             else:
 268                 prevID = readID
 269
 270             if index % insertSize == 0:
 271                 rds.insertUniqs(uInsertList)
 272                 rds.insertMulti(mInsertList)
 273                 uInsertList = []
 274                 mInsertList = []
 275                 if dataType == "RNA":
 276                     rds.insertSplices(sInsertList)
 277                     sInsertList = []
 278
 279                 print ".",
 280                 sys.stdout.flush()
 281
 282             # start processing new read
 283             readList = []
 284             prevID = readID
 285             index += 1
 286
 287         # add the new read
 288         sense = fields[1]
 289         chrom = fields[2]
 290         # for eland compat, we are 1-based
 291         start = int(fields[3]) + 1
 292         mismatches = ""
 293         if ":" in fields[-1]:
 294             mismatches = decodeMismatches(fields[-1], sense)
 295
 296         readList.append((sense, chrom, start, mismatches))
 297         if lIndex % 1000000 == 0:
 298             print "processed %d lines" % lIndex
 299
 300     print "%d lines processed" % lIndex
 301
 302     if len(uInsertList) > 0:
 303         rds.insertUniqs(uInsertList)
 304
 305     if len(mInsertList) > 0:
 306         rds.insertMulti(mInsertList)
 307
 308     if len(sInsertList) > 0:
 309         rds.insertSplices(sInsertList)
 310
 311     combString = "%d unique reads" % uIndex
 312     combString += "\t%d multi reads" % mIndex
 313     if dataType == "RNA":
 314         combString += "\t%d spliced reads" % sIndex
 315
 316     print
 317     print combString.replace("\t", "\n")
 318
 319     writeLog("%s.log" % outdbname, verstring, combString)
 320
 321     if doIndex:
 322         print "building index...."
 323         if cachePages > defaultCacheSize:
 324             rds.setDBcache(cachePages)
 325             rds.buildIndex(cachePages)
 326         else:
 327             rds.buildIndex(defaultCacheSize)
 328
 329
 330 def decodeMismatches(mString, rsense):
 331     complement = {"A": "T",
 332                   "T": "A",
 333                   "C": "G",
 334                   "G": "C",
 335                   "N": "N"
 336     }
 337
 338     output = []
 339     mismatches = mString.split(",")
 340     for mismatch in mismatches:
 341         (pos,change) = mismatch.split(":")
 342         (genNT, readNT) = change.split(">")
 343         if rsense == "-":
 344             readNT = complement[readNT]
 345             genNT  = complement[genNT]
 346
 347         elandCompatiblePos = int(pos) + 1
 348         output.append("%s%d%s" % (readNT, elandCompatiblePos, genNT))
 349
 350     return string.join(output, ",")
 351
 352
 353 if __name__ == "__main__":
 354     main(sys.argv)