diff -r ddc95a9bc2e0 Doc/library/re.rst
--- a/Doc/library/re.rst Wed Sep 07 13:18:40 2016 +0200
+++ b/Doc/library/re.rst Wed Sep 07 17:52:28 2016 +0300
@@ -237,6 +237,16 @@ The special characters are:
*cannot* be retrieved after performing a match or referenced later in the
pattern.
+``(?imsx-imsx:...)``
+ (Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``,
+ optionally followed by ``'-'`` followed by one or more letters from the
+ same set.) The letters set or remove the corresponding flags:
+ :const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S`
+ (dot matches all), and :const:`re.X` (verbose), for the part of the
+ expression. (The flags are described in :ref:`contents-of-module-re`.)
+
+ .. versionadded:: 3.6
+
``(?P<name>...)``
Similar to regular parentheses, but the substring matched by the group is
accessible via the symbolic group name *name*. Group names must be valid
diff -r ddc95a9bc2e0 Doc/whatsnew/3.6.rst
--- a/Doc/whatsnew/3.6.rst Wed Sep 07 13:18:40 2016 +0200
+++ b/Doc/whatsnew/3.6.rst Wed Sep 07 17:52:28 2016 +0300
@@ -520,6 +520,15 @@ Protocol version 4 already supports this
Storchaka in :issue:`24164`.)
+re
+--
+
+Added support for modifier spans in regular expressions. Examples:
+``'(?i:p)ython'`` matches ``'python'`` and ``'Python'``, but not ``'PYTHON'``;
+``'(?i)g(?-i:v)r'`` matches ``'GvR'`` and ``'gvr'``, but not ``'GVR'``.
+(Contributed by Serhiy Storchaka in :issue:`433028`.)
+
+
readline
--------
diff -r ddc95a9bc2e0 Lib/re.py
--- a/Lib/re.py Wed Sep 07 13:18:40 2016 +0200
+++ b/Lib/re.py Wed Sep 07 17:52:28 2016 +0300
@@ -352,7 +352,7 @@ class Scanner:
for phrase, action in lexicon:
gid = s.opengroup()
p.append(sre_parse.SubPattern(s, [
- (SUBPATTERN, (gid, sre_parse.parse(phrase, flags))),
+ (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
]))
s.closegroup(gid, p[-1])
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
diff -r ddc95a9bc2e0 Lib/sre_compile.py
--- a/Lib/sre_compile.py Wed Sep 07 13:18:40 2016 +0200
+++ b/Lib/sre_compile.py Wed Sep 07 17:52:28 2016 +0300
@@ -71,7 +71,8 @@ def _compile(code, pattern, flags):
ASSERT_CODES = _ASSERT_CODES
if (flags & SRE_FLAG_IGNORECASE and
not (flags & SRE_FLAG_LOCALE) and
- flags & SRE_FLAG_UNICODE):
+ flags & SRE_FLAG_UNICODE and
+ not (flags & SRE_FLAG_ASCII)):
fixes = _ignorecase_fixes
else:
fixes = None
@@ -137,14 +138,15 @@ def _compile(code, pattern, flags):
else:
emit(MIN_UNTIL)
elif op is SUBPATTERN:
- if av[0]:
+ group, add_flags, del_flags, p = av
+ if group:
emit(MARK)
- emit((av[0]-1)*2)
- # _compile_info(code, av[1], flags)
- _compile(code, av[1], flags)
- if av[0]:
+ emit((group-1)*2)
+ # _compile_info(code, p, (flags | add_flags) & ~del_flags)
+ _compile(code, p, (flags | add_flags) & ~del_flags)
+ if group:
emit(MARK)
- emit((av[0]-1)*2+1)
+ emit((group-1)*2+1)
elif op in SUCCESS_CODES:
emit(op)
elif op in ASSERT_CODES:
@@ -172,7 +174,7 @@ def _compile(code, pattern, flags):
av = AT_MULTILINE.get(av, av)
if flags & SRE_FLAG_LOCALE:
av = AT_LOCALE.get(av, av)
- elif flags & SRE_FLAG_UNICODE:
+ elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
av = AT_UNICODE.get(av, av)
emit(av)
elif op is BRANCH:
@@ -193,7 +195,7 @@ def _compile(code, pattern, flags):
emit(op)
if flags & SRE_FLAG_LOCALE:
av = CH_LOCALE[av]
- elif flags & SRE_FLAG_UNICODE:
+ elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
av = CH_UNICODE[av]
emit(av)
elif op is GROUPREF:
@@ -237,7 +239,7 @@ def _compile_charset(charset, flags, cod
elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE:
emit(CH_LOCALE[av])
- elif flags & SRE_FLAG_UNICODE:
+ elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
emit(CH_UNICODE[av])
else:
emit(av)
@@ -414,14 +416,16 @@ def _get_literal_prefix(pattern):
prefix = []
prefixappend = prefix.append
prefix_skip = None
- got_all = True
for op, av in pattern.data:
if op is LITERAL:
prefixappend(av)
elif op is SUBPATTERN:
- prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1])
+ group, add_flags, del_flags, p = av
+ if add_flags & SRE_FLAG_IGNORECASE:
+ break
+ prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
if prefix_skip is None:
- if av[0] is not None:
+ if group is not None:
prefix_skip = len(prefix)
elif prefix_skip1 is not None:
prefix_skip = len(prefix) + prefix_skip1
@@ -429,32 +433,35 @@ def _get_literal_prefix(pattern):
if not got_all:
break
else:
- got_all = False
break
- return prefix, prefix_skip, got_all
+ else:
+ return prefix, prefix_skip, True
+ return prefix, prefix_skip, False
def _get_charset_prefix(pattern):
charset = [] # not used
charsetappend = charset.append
if pattern.data:
op, av = pattern.data[0]
- if op is SUBPATTERN and av[1]:
- op, av = av[1][0]
- if op is LITERAL:
- charsetappend((op, av))
- elif op is BRANCH:
- c = []
- cappend = c.append
- for p in av[1]:
- if not p:
- break
- op, av = p[0]
- if op is LITERAL:
- cappend((op, av))
+ if op is SUBPATTERN:
+ group, add_flags, del_flags, p = av
+ if p and not (add_flags & SRE_FLAG_IGNORECASE):
+ op, av = p[0]
+ if op is LITERAL:
+ charsetappend((op, av))
+ elif op is BRANCH:
+ c = []
+ cappend = c.append
+ for p in av[1]:
+ if not p:
+ break
+ op, av = p[0]
+ if op is LITERAL:
+ cappend((op, av))
+ else:
+ break
else:
- break
- else:
- charset = c
+ charset = c
elif op is BRANCH:
c = []
cappend = c.append
diff -r ddc95a9bc2e0 Lib/sre_parse.py
--- a/Lib/sre_parse.py Wed Sep 07 13:18:40 2016 +0200
+++ b/Lib/sre_parse.py Wed Sep 07 17:52:28 2016 +0300
@@ -65,6 +65,12 @@ FLAGS = {
"u": SRE_FLAG_UNICODE,
}
+GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
+ SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
+
+class Verbose(Exception):
+ pass
+
class Pattern:
# master pattern object. keeps track of global attributes
def __init__(self):
@@ -184,7 +190,7 @@ class SubPattern:
lo = lo + i
hi = hi + j
elif op is SUBPATTERN:
- i, j = av[1].getwidth()
+ i, j = av[-1].getwidth()
lo = lo + i
hi = hi + j
elif op in _REPEATCODES:
@@ -395,7 +401,7 @@ def _escape(source, escape, state):
pass
raise source.error("bad escape %s" % escape, len(escape))
-def _parse_sub(source, state, nested=True):
+def _parse_sub(source, state, verbose, nested=True):
# parse an alternation: a|b|c
items = []
@@ -403,7 +409,7 @@ def _parse_sub(source, state, nested=Tru
sourcematch = source.match
start = source.tell()
while True:
- itemsappend(_parse(source, state))
+ itemsappend(_parse(source, state, verbose))
if not sourcematch("|"):
break
@@ -445,10 +451,10 @@ def _parse_sub(source, state, nested=Tru
subpattern.append((BRANCH, (None, items)))
return subpattern
-def _parse_sub_cond(source, state, condgroup):
- item_yes = _parse(source, state)
+def _parse_sub_cond(source, state, condgroup, verbose):
+ item_yes = _parse(source, state, verbose)
if source.match("|"):
- item_no = _parse(source, state)
+ item_no = _parse(source, state, verbose)
if source.next == "|":
raise source.error("conditional backref with more than two branches")
else:
@@ -457,7 +463,7 @@ def _parse_sub_cond(source, state, condg
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern
-def _parse(source, state):
+def _parse(source, state, verbose):
# parse a simple pattern
subpattern = SubPattern(state)
@@ -467,7 +473,6 @@ def _parse(source, state):
sourcematch = source.match
_len = len
_ord = ord
- verbose = state.flags & SRE_FLAG_VERBOSE
while True:
@@ -621,6 +626,8 @@ def _parse(source, state):
group = True
name = None
condgroup = None
+ add_flags = 0
+ del_flags = 0
if sourcematch("?"):
# options
char = sourceget()
@@ -682,7 +689,7 @@ def _parse(source, state):
lookbehindgroups = state.lookbehindgroups
if lookbehindgroups is None:
state.lookbehindgroups = state.groups
- p = _parse_sub(source, state)
+ p = _parse_sub(source, state, verbose)
if dir )', 'unknown extension ?', 1)
diff -r ddc95a9bc2e0 Misc/NEWS
--- a/Misc/NEWS Wed Sep 07 13:18:40 2016 +0200
+++ b/Misc/NEWS Wed Sep 07 17:52:28 2016 +0300
@@ -91,6 +91,8 @@ Core and Builtins
Library
-------
+- Issue #433028: Added support for modifier spans in regular expressions.
+
- Issue #16113: Add SHA-3 and SHAKE support to hashlib module.
- Issue #27776: The :func:`os.urandom` function does now block on Linux 3.17