import regex_helper from django
This commit is contained in:
parent
8ea92461dc
commit
dd3b8277d0
1 changed files with 366 additions and 0 deletions
366
weboob/tools/regex_helper.py
Normal file
366
weboob/tools/regex_helper.py
Normal file
|
|
@ -0,0 +1,366 @@
|
|||
# Copyright (c) Django Software Foundation and individual contributors.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification,
|
||||
# are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of Django nor the names of its contributors may be used
|
||||
# to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Functions for reversing a regular expression (used in reverse URL resolving).
|
||||
Used internally by Django and not intended for external use.
|
||||
|
||||
This is not, and is not intended to be, a complete reg-exp decompiler. It
|
||||
should be good enough for a large class of URLS, however.
|
||||
"""
|
||||
|
||||
# Mapping of an escape character to a representative of that class. So, e.g.,
|
||||
# "\w" is replaced by "x" in a reverse URL. A value of None means to ignore
|
||||
# this sequence. Any missing key is mapped to itself.
|
||||
ESCAPE_MAPPINGS = {
|
||||
"A": None,
|
||||
"b": None,
|
||||
"B": None,
|
||||
"d": u"0",
|
||||
"D": u"x",
|
||||
"s": u" ",
|
||||
"S": u"x",
|
||||
"w": u"x",
|
||||
"W": u"!",
|
||||
"Z": None,
|
||||
}
|
||||
|
||||
class Choice(list):
|
||||
"""
|
||||
Used to represent multiple possibilities at this point in a pattern string.
|
||||
We use a distinguished type, rather than a list, so that the usage in the
|
||||
code is clear.
|
||||
"""
|
||||
|
||||
class Group(list):
|
||||
"""
|
||||
Used to represent a capturing group in the pattern string.
|
||||
"""
|
||||
|
||||
class NonCapture(list):
|
||||
"""
|
||||
Used to represent a non-capturing group in the pattern string.
|
||||
"""
|
||||
|
||||
def normalize(pattern):
|
||||
"""
|
||||
Given a reg-exp pattern, normalizes it to a list of forms that suffice for
|
||||
reverse matching. This does the following:
|
||||
|
||||
(1) For any repeating sections, keeps the minimum number of occurrences
|
||||
permitted (this means zero for optional groups).
|
||||
(2) If an optional group includes parameters, include one occurrence of
|
||||
that group (along with the zero occurrence case from step (1)).
|
||||
(3) Select the first (essentially an arbitrary) element from any character
|
||||
class. Select an arbitrary character for any unordered class (e.g. '.'
|
||||
or '\w') in the pattern.
|
||||
(5) Ignore comments and any of the reg-exp flags that won't change
|
||||
what we construct ("iLmsu"). "(?x)" is an error, however.
|
||||
(6) Raise an error on all other non-capturing (?...) forms (e.g.
|
||||
look-ahead and look-behind matches) and any disjunctive ('|')
|
||||
constructs.
|
||||
|
||||
Django's URLs for forward resolving are either all positional arguments or
|
||||
all keyword arguments. That is assumed here, as well. Although reverse
|
||||
resolving can be done using positional args when keyword args are
|
||||
specified, the two cannot be mixed in the same reverse() call.
|
||||
"""
|
||||
# Do a linear scan to work out the special features of this pattern. The
|
||||
# idea is that we scan once here and collect all the information we need to
|
||||
# make future decisions.
|
||||
result = []
|
||||
non_capturing_groups = []
|
||||
consume_next = True
|
||||
pattern_iter = next_char(iter(pattern))
|
||||
num_args = 0
|
||||
|
||||
# A "while" loop is used here because later on we need to be able to peek
|
||||
# at the next character and possibly go around without consuming another
|
||||
# one at the top of the loop.
|
||||
try:
|
||||
ch, escaped = pattern_iter.next()
|
||||
except StopIteration:
|
||||
return zip([u''], [[]])
|
||||
|
||||
try:
|
||||
while True:
|
||||
if escaped:
|
||||
result.append(ch)
|
||||
elif ch == '.':
|
||||
# Replace "any character" with an arbitrary representative.
|
||||
result.append(u".")
|
||||
elif ch == '|':
|
||||
# FIXME: One day we'll should do this, but not in 1.0.
|
||||
raise NotImplementedError
|
||||
elif ch == "^":
|
||||
pass
|
||||
elif ch == '$':
|
||||
break
|
||||
elif ch == ')':
|
||||
# This can only be the end of a non-capturing group, since all
|
||||
# other unescaped parentheses are handled by the grouping
|
||||
# section later (and the full group is handled there).
|
||||
#
|
||||
# We regroup everything inside the capturing group so that it
|
||||
# can be quantified, if necessary.
|
||||
start = non_capturing_groups.pop()
|
||||
inner = NonCapture(result[start:])
|
||||
result = result[:start] + [inner]
|
||||
elif ch == '[':
|
||||
# Replace ranges with the first character in the range.
|
||||
ch, escaped = pattern_iter.next()
|
||||
result.append(ch)
|
||||
ch, escaped = pattern_iter.next()
|
||||
while escaped or ch != ']':
|
||||
ch, escaped = pattern_iter.next()
|
||||
elif ch == '(':
|
||||
# Some kind of group.
|
||||
ch, escaped = pattern_iter.next()
|
||||
if ch != '?' or escaped:
|
||||
# A positional group
|
||||
name = "_%d" % num_args
|
||||
num_args += 1
|
||||
result.append(Group(((u"%%(%s)s" % name), name)))
|
||||
walk_to_end(ch, pattern_iter)
|
||||
else:
|
||||
ch, escaped = pattern_iter.next()
|
||||
if ch in "iLmsu#":
|
||||
# All of these are ignorable. Walk to the end of the
|
||||
# group.
|
||||
walk_to_end(ch, pattern_iter)
|
||||
elif ch == ':':
|
||||
# Non-capturing group
|
||||
non_capturing_groups.append(len(result))
|
||||
elif ch != 'P':
|
||||
# Anything else, other than a named group, is something
|
||||
# we cannot reverse.
|
||||
raise ValueError("Non-reversible reg-exp portion: '(?%s'" % ch)
|
||||
else:
|
||||
ch, escaped = pattern_iter.next()
|
||||
if ch not in ('<', '='):
|
||||
raise ValueError("Non-reversible reg-exp portion: '(?P%s'" % ch)
|
||||
# We are in a named capturing group. Extra the name and
|
||||
# then skip to the end.
|
||||
if ch == '<':
|
||||
terminal_char = '>'
|
||||
# We are in a named backreference.
|
||||
else:
|
||||
terminal_char = ')'
|
||||
name = []
|
||||
ch, escaped = pattern_iter.next()
|
||||
while ch != terminal_char:
|
||||
name.append(ch)
|
||||
ch, escaped = pattern_iter.next()
|
||||
param = ''.join(name)
|
||||
# Named backreferences have already consumed the
|
||||
# parenthesis.
|
||||
if terminal_char != ')':
|
||||
result.append(Group(((u"%%(%s)s" % param), param)))
|
||||
walk_to_end(ch, pattern_iter)
|
||||
else:
|
||||
result.append(Group(((u"%%(%s)s" % param), None)))
|
||||
elif ch in "*?+{":
|
||||
# Quanitifers affect the previous item in the result list.
|
||||
count, ch = get_quantifier(ch, pattern_iter)
|
||||
if ch:
|
||||
# We had to look ahead, but it wasn't need to compute the
|
||||
# quanitifer, so use this character next time around the
|
||||
# main loop.
|
||||
consume_next = False
|
||||
|
||||
if count == 0:
|
||||
if contains(result[-1], Group):
|
||||
# If we are quantifying a capturing group (or
|
||||
# something containing such a group) and the minimum is
|
||||
# zero, we must also handle the case of one occurrence
|
||||
# being present. All the quantifiers (except {0,0},
|
||||
# which we conveniently ignore) that have a 0 minimum
|
||||
# also allow a single occurrence.
|
||||
result[-1] = Choice([None, result[-1]])
|
||||
else:
|
||||
result.pop()
|
||||
elif count > 1:
|
||||
result.extend([result[-1]] * (count - 1))
|
||||
else:
|
||||
# Anything else is a literal.
|
||||
result.append(ch)
|
||||
|
||||
if consume_next:
|
||||
ch, escaped = pattern_iter.next()
|
||||
else:
|
||||
consume_next = True
|
||||
except StopIteration:
|
||||
pass
|
||||
except NotImplementedError:
|
||||
# A case of using the disjunctive form. No results for you!
|
||||
return zip([u''], [[]])
|
||||
|
||||
return zip(*flatten_result(result))
|
||||
|
||||
def next_char(input_iter):
|
||||
"""
|
||||
An iterator that yields the next character from "pattern_iter", respecting
|
||||
escape sequences. An escaped character is replaced by a representative of
|
||||
its class (e.g. \w -> "x"). If the escaped character is one that is
|
||||
skipped, it is not returned (the next character is returned instead).
|
||||
|
||||
Yields the next character, along with a boolean indicating whether it is a
|
||||
raw (unescaped) character or not.
|
||||
"""
|
||||
for ch in input_iter:
|
||||
if ch != '\\':
|
||||
yield ch, False
|
||||
continue
|
||||
ch = input_iter.next()
|
||||
representative = ESCAPE_MAPPINGS.get(ch, ch)
|
||||
if representative is None:
|
||||
continue
|
||||
yield representative, True
|
||||
|
||||
def walk_to_end(ch, input_iter):
|
||||
"""
|
||||
The iterator is currently inside a capturing group. We want to walk to the
|
||||
close of this group, skipping over any nested groups and handling escaped
|
||||
parentheses correctly.
|
||||
"""
|
||||
if ch == '(':
|
||||
nesting = 1
|
||||
else:
|
||||
nesting = 0
|
||||
for ch, escaped in input_iter:
|
||||
if escaped:
|
||||
continue
|
||||
elif ch == '(':
|
||||
nesting += 1
|
||||
elif ch == ')':
|
||||
if not nesting:
|
||||
return
|
||||
nesting -= 1
|
||||
|
||||
def get_quantifier(ch, input_iter):
|
||||
"""
|
||||
Parse a quantifier from the input, where "ch" is the first character in the
|
||||
quantifier.
|
||||
|
||||
Returns the minimum number of occurences permitted by the quantifier and
|
||||
either None or the next character from the input_iter if the next character
|
||||
is not part of the quantifier.
|
||||
"""
|
||||
if ch in '*?+':
|
||||
try:
|
||||
ch2, escaped = input_iter.next()
|
||||
except StopIteration:
|
||||
ch2 = None
|
||||
if ch2 == '?':
|
||||
ch2 = None
|
||||
if ch == '+':
|
||||
return 1, ch2
|
||||
return 0, ch2
|
||||
|
||||
quant = []
|
||||
while ch != '}':
|
||||
ch, escaped = input_iter.next()
|
||||
quant.append(ch)
|
||||
quant = quant[:-1]
|
||||
values = ''.join(quant).split(',')
|
||||
|
||||
# Consume the trailing '?', if necessary.
|
||||
try:
|
||||
ch, escaped = input_iter.next()
|
||||
except StopIteration:
|
||||
ch = None
|
||||
if ch == '?':
|
||||
ch = None
|
||||
return int(values[0]), ch
|
||||
|
||||
def contains(source, inst):
|
||||
"""
|
||||
Returns True if the "source" contains an instance of "inst". False,
|
||||
otherwise.
|
||||
"""
|
||||
if isinstance(source, inst):
|
||||
return True
|
||||
if isinstance(source, NonCapture):
|
||||
for elt in source:
|
||||
if contains(elt, inst):
|
||||
return True
|
||||
return False
|
||||
|
||||
def flatten_result(source):
|
||||
"""
|
||||
Turns the given source sequence into a list of reg-exp possibilities and
|
||||
their arguments. Returns a list of strings and a list of argument lists.
|
||||
Each of the two lists will be of the same length.
|
||||
"""
|
||||
if source is None:
|
||||
return [u''], [[]]
|
||||
if isinstance(source, Group):
|
||||
if source[1] is None:
|
||||
params = []
|
||||
else:
|
||||
params = [source[1]]
|
||||
return [source[0]], [params]
|
||||
result = [u'']
|
||||
result_args = [[]]
|
||||
pos = last = 0
|
||||
for pos, elt in enumerate(source):
|
||||
if isinstance(elt, basestring):
|
||||
continue
|
||||
piece = u''.join(source[last:pos])
|
||||
if isinstance(elt, Group):
|
||||
piece += elt[0]
|
||||
param = elt[1]
|
||||
else:
|
||||
param = None
|
||||
last = pos + 1
|
||||
for i in range(len(result)):
|
||||
result[i] += piece
|
||||
if param:
|
||||
result_args[i].append(param)
|
||||
if isinstance(elt, (Choice, NonCapture)):
|
||||
if isinstance(elt, NonCapture):
|
||||
elt = [elt]
|
||||
inner_result, inner_args = [], []
|
||||
for item in elt:
|
||||
res, args = flatten_result(item)
|
||||
inner_result.extend(res)
|
||||
inner_args.extend(args)
|
||||
new_result = []
|
||||
new_args = []
|
||||
for item, args in zip(result, result_args):
|
||||
for i_item, i_args in zip(inner_result, inner_args):
|
||||
new_result.append(item + i_item)
|
||||
new_args.append(args[:] + i_args)
|
||||
result = new_result
|
||||
result_args = new_args
|
||||
if pos >= last:
|
||||
piece = u''.join(source[last:])
|
||||
for i in range(len(result)):
|
||||
result[i] += piece
|
||||
return result, result_args
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue