X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=makerdsfrombowtie.py;h=eadb66b83d59e140b0402a9ea76dd351219abad0;hp=3534a8827c9538fdc05ba193cd792ef7bf5aab72;hb=HEAD;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/makerdsfrombowtie.py b/makerdsfrombowtie.py index 3534a88..eadb66b 100755 --- a/makerdsfrombowtie.py +++ b/makerdsfrombowtie.py @@ -11,10 +11,13 @@ try: except: pass -import sys, string, optparse -from commoncode import readDataset, writeLog +import sys +import string +import optparse +from commoncode import writeLog, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption +import ReadDataset -verstring = "%prog: version 4.1" +verstring = "makerdsfrombowtie: version 4.2" print verstring def main(argv=None): @@ -23,21 +26,7 @@ def main(argv=None): usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--RNA", dest="genedatafilename") - parser.add_option("--append", action="store_false", dest="init") - parser.add_option("--index", action="store_true", dest="doIndex") - parser.add_option("--spacer", type="int", dest="spacer") - parser.add_option("--rawreadID", action="store_false", dest="trimReadID") - parser.add_option("--forcepair", type="int", dest="forceID") - parser.add_option("--flip", action="store_true", dest="flip") - parser.add_option("--verbose", action="store_true", dest="verbose") - parser.add_option("--strip", action="store_true", dest="stripSpace") - parser.add_option("--cache", type="int", dest="cachePages") - parser.set_defaults(genedatafilename=None, init=True, doIndex=False, spacer=2, - trimReadID=True, forceID=None, flip=False, verbose=False, - stripSpace=False, cachePages=100000) - + parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 3: @@ -60,34 +49,51 @@ def main(argv=None): propertyList) +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--RNA", dest="genedatafilename") + parser.add_option("--append", action="store_false", dest="init") + parser.add_option("--index", action="store_true", dest="doIndex") + parser.add_option("--spacer", type="int", dest="spacer") + parser.add_option("--rawreadID", action="store_false", dest="trimReadID") + parser.add_option("--forcepair", type="int", dest="forceID") + parser.add_option("--flip", action="store_true", dest="flip") + parser.add_option("--verbose", action="store_true", dest="verbose") + parser.add_option("--strip", action="store_true", dest="stripSpace") + parser.add_option("--cache", type="int", dest="cachePages") + + configParser = getConfigParser() + section = "makerdsfrom bowtie" + genedatafilename = getConfigOption(configParser, section, "genedatafilename", None) + init = getConfigBoolOption(configParser, section, "init", True) + doIndex = getConfigBoolOption(configParser, section, "doIndex", False) + spacer = getConfigIntOption(configParser, section, "spacer", 2) + trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True) + forceID = getConfigOption(configParser, section, "forceID", None) + flip = getConfigBoolOption(configParser, section, "flip", False) + verbose = getConfigBoolOption(configParser, section, "verbose", False) + stripSpace = getConfigBoolOption(configParser, section, "stripSpace", False) + cachePages = getConfigIntOption(configParser, section, "cachePages", 100000) + + parser.set_defaults(genedatafilename=genedatafilename, init=init, doIndex=doIndex, spacer=spacer, + trimReadID=trimReadID, forceID=forceID, flip=flip, verbose=verbose, + stripSpace=stripSpace, cachePages=cachePages) + + return parser + + def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True, doIndex=False, spacer=2, trimReadID=True, forceID=None, flip=False, verbose=False, stripSpace=False, cachePages=100000, propertyList=[]): - delimiter = "|" + writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:])) + geneDict = {} dataType = "DNA" if genedatafilename is not None: dataType = "RNA" genedatafile = open(genedatafilename) - - - forcePair = False - if forceID is not None: - forcePair = True - else: - forceID = 0 - - maxBorder = 0 - index = 0 - insertSize = 100000 - - writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:])) - - geneDict = {} - mapDict = {} - if dataType == "RNA": for line in genedatafile: fields = line.strip().split("\t") blockCount = int(fields[7]) @@ -108,11 +114,10 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr totalLength += exonLengths[index] geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths) - mapDict[uname] = [] genedatafile.close() - rds = readDataset(outdbname, init, dataType, verbose=True) + rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True) #check that our cacheSize is better than the dataset's default cache size defaultCacheSize = rds.getDefaultCacheSize() @@ -142,12 +147,17 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr fields = line.split() readsize = len(fields[5]) pairedTest = fields[0][-2:] + forcePair = False + if forceID is not None: + forcePair = True + else: + forceID = 0 + paired = False if pairedTest in ["/1", "/2"] or forcePair: print "assuming reads are paired" paired = True - print "read size: %d bp" % readsize if init: rds.insertMetadata([("readsize", readsize)]) @@ -162,8 +172,9 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr infile.close() - trim = -4 + maxBorder = 0 if dataType == "RNA": + trim = -4 maxBorder = readsize + trim infile = open(filename, "r") @@ -173,6 +184,8 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr mInsertList = [] sInsertList = [] index = uIndex = mIndex = sIndex = lIndex = 0 + delimiter = "|" + insertSize = 100000 for line in infile: lIndex += 1 if stripSpace: