snapshot of 4.0a development. initial git repo commit
[erange.git] / makerdsfrombowtie.py
1 #
2 #  makerdsfrombowtie.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 10/20/08.
6 #
7
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys, string, optparse
15 from commoncode import readDataset, writeLog
16
17 verstring = "%prog: version 4.1"
18 print verstring
19
20 def main(argv=None):
21     if not argv:
22         argv = sys.argv
23
24     usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
25
26     parser = optparse.OptionParser(usage=usage)
27     parser.add_option("--RNA", dest="genedatafilename")
28     parser.add_option("--append", action="store_false", dest="init")
29     parser.add_option("--index", action="store_true", dest="doIndex")
30     parser.add_option("--spacer", type="int", dest="spacer")
31     parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
32     parser.add_option("--forcepair", type="int", dest="forceID")
33     parser.add_option("--flip", action="store_true", dest="flip")
34     parser.add_option("--verbose", action="store_true", dest="verbose")
35     parser.add_option("--strip", action="store_true", dest="stripSpace")
36     parser.add_option("--cache", type="int", dest="cachePages")
37     parser.set_defaults(genedatafilename=None, init=True, doIndex=False, spacer=2,
38                         trimReadID=True, forceID=None, flip=False, verbose=False,
39                         stripSpace=False, cachePages=100000)
40
41     (options, args) = parser.parse_args(argv[1:])
42
43     if len(args) < 3:
44         print usage
45         sys.exit(1)
46
47     label = args[0]
48     filename = args[1]
49     outdbname = args[2]
50
51     propertyList = []
52     for arg in args:
53         if "::" in arg:
54             (pname, pvalue) = arg.strip().split("::")
55             propertyList.append((pname, pvalue))
56
57     makerdsfrombowtie(label, filename, outdbname, options.genedatafilename, options.init,
58                       options.doIndex, options.spacer, options.trimReadID, options.forceID,
59                       options.flip, options.verbose, options.stripSpace, options.cachePages,
60                       propertyList)
61
62
63 def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,
64                       doIndex=False, spacer=2, trimReadID=True, forceID=None,
65                       flip=False, verbose=False, stripSpace=False, cachePages=100000,
66                       propertyList=[]):
67
68     delimiter = "|"
69
70     dataType = "DNA"
71     if genedatafilename is not None:
72         dataType = "RNA"
73         genedatafile = open(genedatafilename)
74
75
76     forcePair = False
77     if forceID is not None:
78         forcePair = True
79     else:
80         forceID = 0
81
82     maxBorder = 0
83     index = 0
84     insertSize = 100000
85
86     writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
87
88     geneDict = {}
89     mapDict = {}
90     if dataType == "RNA":
91         for line in genedatafile:
92             fields = line.strip().split("\t")
93             blockCount = int(fields[7])
94             if blockCount < 2:
95                 continue
96
97             uname = fields[0]
98             chrom = fields[1]
99             sense = fields[2]
100             chromstarts = fields[8][:-1].split(",")
101             chromstops = fields[9][:-1].split(",")
102             exonLengths = []
103             totalLength = 0
104             for index in range(blockCount):
105                 chromstarts[index] = int(chromstarts[index])
106                 chromstops[index] = int(chromstops[index])
107                 exonLengths.append(chromstops[index] - chromstarts[index])
108                 totalLength += exonLengths[index]
109
110             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
111             mapDict[uname] = []
112
113         genedatafile.close()
114
115     rds = readDataset(outdbname, init, dataType, verbose=True)
116
117     #check that our cacheSize is better than the dataset's default cache size
118     defaultCacheSize = rds.getDefaultCacheSize()
119     if cachePages > defaultCacheSize:
120         if init:
121             rds.setDBcache(cachePages, default=True)
122         else:
123             rds.setDBcache(cachePages)
124
125     if not init and doIndex:
126         try:
127             if rds.hasIndex():
128                 rds.dropIndex()
129         except:
130             if verbose:
131                 print "couldn't drop Index"
132
133     if len(propertyList) > 0:
134         rds.insertMetadata(propertyList)
135
136     # make some assumptions based on first read
137     infile = open(filename, "r")
138     line = infile.readline()
139     if stripSpace:
140         line = line.replace(" ","")
141
142     fields = line.split()
143     readsize = len(fields[5])
144     pairedTest = fields[0][-2:]
145     paired = False
146     if pairedTest in ["/1", "/2"] or forcePair:
147         print "assuming reads are paired"
148         paired = True
149
150
151     print "read size: %d bp" % readsize
152     if init:
153         rds.insertMetadata([("readsize", readsize)])
154         if paired:
155             rds.insertMetadata([("paired", "True")])
156
157     if "bowtie_mapped" not in rds.getMetadata():
158         rds.insertMetadata([("bowtie_mapped", "True")])
159
160     if dataType == "RNA" and "spacer" not in rds.getMetadata():
161         rds.insertMetadata([("spacer", spacer)])
162
163     infile.close()
164
165     trim = -4
166     if dataType == "RNA":
167         maxBorder = readsize + trim
168
169     infile = open(filename, "r")
170     prevID = ""
171     readList = []
172     uInsertList = []
173     mInsertList = []
174     sInsertList = []
175     index = uIndex = mIndex = sIndex = lIndex = 0
176     for line in infile:
177         lIndex += 1
178         if stripSpace:
179             line = line.replace(" ","")
180
181         fields = line.strip().split()
182         readID = fields[0]
183         if trimReadID:
184             readID = string.join(readID.split(":")[1:], ":")
185
186         if readID != prevID:
187             listlen = len(readList)
188             if trimReadID:
189                 prevID = "%s-%s" % (label, prevID)
190
191             if forcePair:
192                 prevID += "/%d" % forceID 
193
194             if listlen == 1:
195                 (sense, chrom, start, mismatches) = readList[0]
196                 if flip:
197                     if sense == "+":
198                         sense = "-"
199                     else:
200                         sense = "+"
201
202                 if "|" not in chrom:
203                     stop = start + readsize
204                     uInsertList.append((prevID, chrom, start, stop, sense, 1.0, "", mismatches))
205                     uIndex += 1
206                 elif dataType == "RNA":
207                     currentSplice = chrom
208                     (model, spliceID, regionStart) = currentSplice.split(delimiter)
209                     if model not in geneDict:
210                         prevID = readID
211                     else:
212                         (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
213                         spliceID = int(spliceID)
214                         rstart = int(start) - spacer
215                         lefthalf = maxBorder - rstart
216                         if lefthalf < 1 or lefthalf > maxBorder:
217                             prevID = readID
218                         else:
219                             righthalf = readsize - lefthalf
220                             startL = int(regionStart)  + rstart
221                             stopL = startL + lefthalf
222                             startR = chromstarts[spliceID + 1]
223                             stopR = chromstarts[spliceID + 1] + righthalf
224                             sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches))
225                             sIndex += 1
226             elif listlen > 1:
227                 prevID = "%s::%s" % (prevID, str(listlen))
228                 mIndex += 1
229                 # ignore multireads that can also map across splices
230                 skip = False
231                 for (sense, chrom, start, mismatches) in readList:
232                     if "|" in chrom:
233                         skip = True
234
235                 if not skip:
236                     for (sense, chrom, start, mismatches) in readList:
237                         stop = start + readsize
238                         if flip:
239                             if sense == "+":
240                                 sense = "-"
241                             else:
242                                 sense = "+"
243
244                         mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, "", mismatches))
245             else:
246                 prevID = readID
247
248             if index % insertSize == 0:
249                 rds.insertUniqs(uInsertList)
250                 rds.insertMulti(mInsertList)
251                 uInsertList = []
252                 mInsertList = []
253                 if dataType == "RNA":
254                     rds.insertSplices(sInsertList)
255                     sInsertList = []
256
257                 print ".",
258                 sys.stdout.flush()
259
260             # start processing new read
261             readList = []
262             prevID = readID
263             index += 1
264
265         # add the new read
266         sense = fields[1]
267         chrom = fields[2]
268         # for eland compat, we are 1-based
269         start = int(fields[3]) + 1
270         mismatches = ""
271         if ":" in fields[-1]:
272             mismatches = decodeMismatches(fields[-1], sense)
273
274         readList.append((sense, chrom, start, mismatches))
275         if lIndex % 1000000 == 0:
276             print "processed %d lines" % lIndex
277
278     print "%d lines processed" % lIndex
279
280     if len(uInsertList) > 0:
281         rds.insertUniqs(uInsertList)
282
283     if len(mInsertList) > 0:
284         rds.insertMulti(mInsertList)
285
286     if len(sInsertList) > 0:
287         rds.insertSplices(sInsertList)
288
289     combString = "%d unique reads" % uIndex
290     combString += "\t%d multi reads" % mIndex
291     if dataType == "RNA":
292         combString += "\t%d spliced reads" % sIndex
293
294     print
295     print combString.replace("\t", "\n")
296
297     writeLog("%s.log" % outdbname, verstring, combString)
298
299     if doIndex:
300         print "building index...."
301         if cachePages > defaultCacheSize:
302             rds.setDBcache(cachePages)
303             rds.buildIndex(cachePages)
304         else:
305             rds.buildIndex(defaultCacheSize)
306
307
def decodeMismatches(mString, rsense):
    """Convert a bowtie mismatch descriptor to eland-style mismatch tokens.

    mString is bowtie's comma-separated list of '<pos>:<refNT>><readNT>'
    entries with 0-based positions; rsense is the read strand, '+' or '-'.
    For '-' reads both nucleotides are complemented.  Returns a
    comma-separated string of '<readNT><pos+1><refNT>' tokens (positions
    1-based for eland compatibility).
    """
    complement = {"A": "T",
                  "T": "A",
                  "C": "G",
                  "G": "C",
                  "N": "N"
    }

    output = []
    mismatches = mString.split(",")
    for mismatch in mismatches:
        (pos,change) = mismatch.split(":")
        (genNT, readNT) = change.split(">")
        if rsense == "-":
            # alignment reported on the reverse strand: complement both bases
            readNT = complement[readNT]
            genNT  = complement[genNT]

        # bowtie positions are 0-based, eland's are 1-based
        elandCompatiblePos = int(pos) + 1
        output.append("%s%d%s" % (readNT, elandCompatiblePos, genNT))

    # str.join instead of string.join: the latter is deprecated in Python 2
    # and removed in Python 3; behavior is identical
    return ",".join(output)
329
330
# run as a script: hand the raw argv straight to main()
if __name__ == "__main__":
    main(sys.argv)