prev | Draft Version 560 (Thu Dec 1 09:18:36 2005) | next |
"*"
in the shell's *.txt
re
module, then use re.search(pattern, text)
pattern
is a regular expression that describes what you're looking for
text
is the string you're searching in
pattern
?
Pattern | Matches | Doesn't Match | Explanation |
---|---|---|---|
⌈a*⌋ | "" , "a" , "aa" , … | "A" , "b" | ⌈*⌋ means “zero or more”; matching is case sensitive |
⌈b+⌋ | "b" , "bb" , … | "" | ⌈+⌋ means “one or more” |
⌈ab?c⌋ | "ac" , "abc" | "a" , "abbc" | ⌈?⌋ means “optional” (zero or one) |
⌈[abc]⌋ | "a" , "b" , or "c" | "ab" , "d" | ⌈[…]⌋ means “one character from a set” |
⌈[a-c]⌋ | "a" , "b" , or "c" | | Character ranges can be abbreviated |
⌈[abc]*⌋ | "" , "ac" , "baabcab" , … | | Operators can be combined: zero or more choices from "a" , "b" , or "c" |
re.search
looks for a match anywhere in the textimport re pattern = 'a[bc]*' for text in ['b', 'ab', 'accb', 'mad']: if re.search(pattern, text): print '"%s" matches "%s"' % (pattern, text) else: print '"%s" does not match "%s"' % (pattern, text)
"a[bc]*" does not match "b" "a[bc]*" matches "ab" "a[bc]*" matches "accb" "a[bc]*" matches "mad"
⌈a[bc]*⌋
matches an "a"
, followed by zero or more of either "b"
or "c"
"b"
because there's no leading "a"
"ab"
and "accb"
"mad"
?
re.search
looks for a match anywhere in text
"m"
, then ⌈a⌋
matches "a"
, and ⌈[bc]*⌋
matches the empty string
re.search
looks anywhere in the line, how to find blank lines?
"x \n"
or " x\n"
blank
⌈^⌋
matches the beginning of the string
⌈$⌋
matches the endimport sys, re # Nothing but space, tab, carriage return, newline from start to end pattern = '^[ \t\r\n]*$' # Count matches in one file/stream. def count(filename, instream): count = 0 for line in instream: if re.search(pattern, line): count += 1 print '%s %d' % (filename, count) # Only standard instream? if len(sys.argv) == 1: count('<stdin>', sys.stdin) else: for filename in sys.argv[1:]: instream = open(filename, 'r') count(filename, instream) instream.close()
"^"
or "*"
?
"\"
in front of it⌈\$⌋
matches a literal "$"
, and ⌈\\⌋
matches a literal "\"
"\\$"
and "\\\\"
"\t"
is a tab character, which matches a tab character
"\\t"
is the two-character sequence ⌈\t⌋
, which also matches a tab character
"\"
is also used in shorthand notation for common character sets
⌈[^abc]⌋
means “anything except the characters in this set”
⌈.⌋
means “any character except the end of line”
⌈[^\n]⌋
⌈\b⌋
anchors the match to a break between word and non-word characters
⌈^⌋
and ⌈$⌋
, doesn't consume any actual charactersload D 1 10 : sub A B jlt A 20
import sys
import re

# Report every line number that appears more than once on standard input.

# start of line, optional spaces, digits, more optional spaces, colon
numbered = '^\\s*\\d+\\s*:'

# Line numbers seen so far (the dict is used as a set; values are unused).
seen = {}
for line in sys.stdin:
    if re.search(numbered, line):
        # NOTE: taking the first whitespace-separated field is naive:
        # '2 :'.split() gives ['2', ':'] but '2:'.split() gives ['2:'],
        # so "2 :" and "2:" are recorded as different numbers.
        num = line.split()[0]
        if num in seen:
            # print(...) with one argument behaves the same on Python 2 and 3.
            print(num)
        else:
            seen[num] = True
'2 :'.split()
gives ['2', ':']
, but '2:'.split()
gives ['2:']
re.search
is actually a match object that records what matched, and where
mo.group()
returns the whole string that matched the RE
mo.start()
and mo.end()
are the indices of the match's locationimport re text = 'abbcb' for pattern in ['b+', 'bc*', 'b+c+']: mo = re.search(pattern, text) print '%s / %s => "%s" (%d, %d)' % \ (pattern, text, mo.group(), mo.start(), mo.end())
b+ / abbcb => "bb" (1, 3) bc* / abbcb => "b" (1, 2) b+c+ / abbcb => "bbc" (1, 4)
mo.group(3)
is the text that matched the third subexpression, mo.start(3)
is where it startedimport sys, re # start of line, optional spaces, digits, more optional spaces, colon numbered = '^\\s*(\\d+)\\s*:' seen = {} for line in sys.stdin: mo = re.search(numbered, line) if mo: num = mo.group(1) if num in seen: print num else: seen[num] = True
def reverse(instream, outstream):
    """Copy instream to outstream, swapping the numbers on two-number lines.

    A line consisting of exactly two runs of digits (optional leading
    whitespace, required whitespace between them, optional trailing
    whitespace) is written as "second<TAB>first" followed by a newline.
    Every other line is echoed unchanged.

    instream  -- iterable of text lines (e.g. an open file)
    outstream -- object with a write() method
    """
    # optional spaces, number, required spaces, number, optional spaces
    # (every backslash is doubled so the string holds no invalid escapes)
    cols = '^\\s*(\\d+)\\s+(\\d+)\\s*$'
    for line in instream:
        mo = re.match(cols, line)
        # If match, reverse numbers
        if mo:
            a, b = mo.group(1), mo.group(2)
            outstream.write('%s\t%s\n' % (b, a))
        # If no match, echo line (it already carries its own newline)
        else:
            outstream.write(line)
# Self-test: run reverse() over a small fixture and compare with the
# expected output.
if __name__ == '__main__':
    fixture = '''\
# Leading comment followed by blank line

10 20
30\t40\t
50
60 70 80
\t90 100
'''
    expected = '''\
# Leading comment followed by blank line

20\t10
40\t30
50
60 70 80
100\t90
'''
    # cStringIO only exists on Python 2; fall back to io.StringIO elsewhere.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO
    instream = StringIO(fixture)
    outstream = StringIO()
    reverse(instream, outstream)
    assert outstream.getvalue() == expected
re.compile(pattern)
to get the compiled RE
re
module
matcher.search(text)
searches text
for matches to the RE that was compiled to create matcher
def findAll(instream, outstream):
    """Write each Title Case word in instream to outstream, one per line.

    A word here is an uppercase ASCII letter followed by zero or more
    lowercase ASCII letters, delimited by word breaks on both sides.

    instream  -- iterable of text lines (e.g. an open file)
    outstream -- object with a write() method
    """
    # Group 1 is the word; group 2 is the rest of the line after it.
    matcher = re.compile('\\b([A-Z][a-z]*)\\b(.*)')
    for line in instream:
        mo = matcher.search(line)
        while mo:
            outstream.write(mo.group(1) + '\n')
            # Keep searching in whatever followed the word just reported.
            mo = matcher.search(mo.group(2))
# Self-test: run findAll() over a small fixture, compare with the expected
# output, then display both.
if __name__ == '__main__':
    fixture = '''\
This has several "Title Case" words
on Each Line (Some in parentheses).
'''
    expected = '''\
This
Title
Case
Each
Line
Some
'''
    # cStringIO only exists on Python 2; fall back to io.StringIO elsewhere.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO
    instream = StringIO(fixture)
    outstream = StringIO()
    findAll(instream, outstream)
    assert outstream.getvalue() == expected
    # print(...) with one argument behaves the same on Python 2 and 3.
    print('INPUT')
    print(fixture)
    print('OUTPUT')
    print(expected)
import re


#- start:findAll
def findAll(instream, outstream):
    """Write each Title Case word in instream to outstream, one per line.

    A word here is an uppercase ASCII letter followed by zero or more
    lowercase ASCII letters, delimited by word breaks on both sides.

    instream  -- iterable of text lines (e.g. an open file)
    outstream -- object with a write() method
    """
    # Group 1 is the word; group 2 is the rest of the line after it.
    matcher = re.compile('\\b([A-Z][a-z]*)\\b(.*)')
    for line in instream:
        mo = matcher.search(line)
        while mo:
            outstream.write(mo.group(1) + '\n')
            # Keep searching in whatever followed the word just reported.
            mo = matcher.search(mo.group(2))
#- end:findAll


# start:test
if __name__ == '__main__':
    fixture = '''\
This has several "Title Case" words
on Each Line (Some in parentheses).
'''
    expected = '''\
This
Title
Case
Each
Line
Some
'''
    # cStringIO only exists on Python 2; fall back to io.StringIO elsewhere.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO
    instream = StringIO(fixture)
    outstream = StringIO()
    findAll(instream, outstream)
    assert outstream.getvalue() == expected
    # print(...) with one argument behaves the same on Python 2 and 3.
    print('INPUT')
    print(fixture)
    print('OUTPUT')
    print(expected)
# end:test
re
module
Method | Purpose | Example | Result |
---|---|---|---|
split | Split a string on a pattern. | re.split('\\s*,\\s*', 'a, b ,c , d') | ['a', 'b', 'c', 'd'] |
findall | Find all matches for a pattern. | re.findall('\\b[A-Z][a-z]*', 'Some words in Title Case.') | ['Some', 'Title', 'Case'] |
sub | Replace matches with new text. | re.sub('\\d+', 'NUM', 'If 123 is 456') | "If NUM is NUM" |
java.util.regex
package contains two classes:
Pattern
: a compiled regular expression
Matcher
: the result of a matchpublic static String matchMiddle(String data) { String result = null; Pattern p = Pattern.compile("a(b|c)d"); Matcher m = p.matcher(data); if (m.matches()) { result = m.group(1); } return result; }
# Print the name and value of every "Name: value" header line in mail.txt.
open(MAIL, '<', 'mail.txt') or die "Cannot open mail.txt: $!";
while (<MAIL>) {
    # The name is everything before the first colon; the value is the rest
    # of the line after an optional single space.
    if (($name, $value) = /^([^:]+): ?(.+)$/) {
        print "Message header $name is $value\n";
    }
}
close(MAIL);
⌈|⌋
for either/or
⌈ab|cd⌋
matches either "ab"
or "cd"
⌈a(b|c)d⌋
matches either "abd"
or "acd"
⌈pat{N}⌋
to match exactly N occurrences of a pattern
⌈pat{M,N}⌋
matches between M and N occurrences
⌈\d{2,3}⌋
matches "19"
or "207"
, but not "3"
or "4567"
"456"
⌈^\d{2,3}⌋
won't
Andrew Kuchling's Python Regular Expression HOWTO
Exercise 17.1:
By default, regular expression matches are
greedy: the first term in the RE
matches as much as it can, then the second part, and so on. As a
result, if you apply the RE ⌈X(.*)X(.*)⌋
to the string
"XaX and XbX"
, the first group will contain "aX and Xb"
,
and the second group will be empty.
It's also possible to make REs match
reluctantly, i.e., to have the
parts match as little as possible, rather than as much. Find out
how to do this, and then modify the RE in the previous paragraph
so that the first group winds up containing "a"
, and the
second group " and XbX"
.
prev | Copyright © 2005, Python Software Foundation. See License for details. | next |