mirror of
https://gitee.com/openharmony/third_party_jinja2
synced 2025-02-17 05:58:10 +00:00
speed up urlize matching
This commit is contained in:
parent
eeca0fecc3
commit
ef658dc3b6
10
CHANGES.rst
10
CHANGES.rst
@ -1,5 +1,15 @@
|
||||
.. currentmodule:: jinja2
|
||||
|
||||
Version 2.11.3
|
||||
--------------
|
||||
|
||||
Unreleased
|
||||
|
||||
- Improve the speed of the ``urlize`` filter by reducing regex
|
||||
backtracking. Email matching requires a word character at the start
|
||||
of the domain part, and only word characters in the TLD. :pr:`1343`
|
||||
|
||||
|
||||
Version 2.11.2
|
||||
--------------
|
||||
|
||||
|
@ -6,6 +6,8 @@ import warnings
|
||||
from collections import deque
|
||||
from random import choice
|
||||
from random import randrange
|
||||
from string import ascii_letters as _letters
|
||||
from string import digits as _digits
|
||||
from threading import Lock
|
||||
|
||||
from markupsafe import escape
|
||||
@ -16,20 +18,6 @@ from ._compat import string_types
|
||||
from ._compat import text_type
|
||||
from ._compat import url_quote
|
||||
|
||||
_word_split_re = re.compile(r"(\s+)")
|
||||
_punctuation_re = re.compile(
|
||||
"^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$"
|
||||
% (
|
||||
"|".join(map(re.escape, ("(", "<", "<"))),
|
||||
"|".join(map(re.escape, (".", ",", ")", ">", "\n", ">"))),
|
||||
)
|
||||
)
|
||||
_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
|
||||
_striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)")
|
||||
_entity_re = re.compile(r"&([^;]+);")
|
||||
_letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
_digits = "0123456789"
|
||||
|
||||
# special singleton representing missing values for the runtime
|
||||
missing = type("MissingType", (), {"__repr__": lambda x: "missing"})()
|
||||
|
||||
@ -210,48 +198,65 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
|
||||
and (x[:limit] + (len(x) >= limit and "..." or ""))
|
||||
or x
|
||||
)
|
||||
words = _word_split_re.split(text_type(escape(text)))
|
||||
words = re.split(r"(\s+)", text_type(escape(text)))
|
||||
rel_attr = rel and ' rel="%s"' % text_type(escape(rel)) or ""
|
||||
target_attr = target and ' target="%s"' % escape(target) or ""
|
||||
|
||||
for i, word in enumerate(words):
|
||||
match = _punctuation_re.match(word)
|
||||
head, middle, tail = "", word, ""
|
||||
match = re.match(r"^([(<]|<)+", middle)
|
||||
|
||||
if match:
|
||||
lead, middle, trail = match.groups()
|
||||
if middle.startswith("www.") or (
|
||||
"@" not in middle
|
||||
and not middle.startswith("http://")
|
||||
and not middle.startswith("https://")
|
||||
and len(middle) > 0
|
||||
and middle[0] in _letters + _digits
|
||||
and (
|
||||
middle.endswith(".org")
|
||||
or middle.endswith(".net")
|
||||
or middle.endswith(".com")
|
||||
)
|
||||
):
|
||||
middle = '<a href="http://%s"%s%s>%s</a>' % (
|
||||
middle,
|
||||
rel_attr,
|
||||
target_attr,
|
||||
trim_url(middle),
|
||||
)
|
||||
if middle.startswith("http://") or middle.startswith("https://"):
|
||||
middle = '<a href="%s"%s%s>%s</a>' % (
|
||||
middle,
|
||||
rel_attr,
|
||||
target_attr,
|
||||
trim_url(middle),
|
||||
)
|
||||
if (
|
||||
"@" in middle
|
||||
and not middle.startswith("www.")
|
||||
and ":" not in middle
|
||||
and _simple_email_re.match(middle)
|
||||
):
|
||||
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
|
||||
if lead + middle + trail != word:
|
||||
words[i] = lead + middle + trail
|
||||
head = match.group()
|
||||
middle = middle[match.end() :]
|
||||
|
||||
# Unlike lead, which is anchored to the start of the string,
|
||||
# need to check that the string ends with any of the characters
|
||||
# before trying to match all of them, to avoid backtracking.
|
||||
if middle.endswith((")", ">", ".", ",", "\n", ">")):
|
||||
match = re.search(r"([)>.,\n]|>)+$", middle)
|
||||
|
||||
if match:
|
||||
tail = match.group()
|
||||
middle = middle[: match.start()]
|
||||
|
||||
if middle.startswith("www.") or (
|
||||
"@" not in middle
|
||||
and not middle.startswith("http://")
|
||||
and not middle.startswith("https://")
|
||||
and len(middle) > 0
|
||||
and middle[0] in _letters + _digits
|
||||
and (
|
||||
middle.endswith(".org")
|
||||
or middle.endswith(".net")
|
||||
or middle.endswith(".com")
|
||||
)
|
||||
):
|
||||
middle = '<a href="http://%s"%s%s>%s</a>' % (
|
||||
middle,
|
||||
rel_attr,
|
||||
target_attr,
|
||||
trim_url(middle),
|
||||
)
|
||||
|
||||
if middle.startswith("http://") or middle.startswith("https://"):
|
||||
middle = '<a href="%s"%s%s>%s</a>' % (
|
||||
middle,
|
||||
rel_attr,
|
||||
target_attr,
|
||||
trim_url(middle),
|
||||
)
|
||||
|
||||
if (
|
||||
"@" in middle
|
||||
and not middle.startswith("www.")
|
||||
and ":" not in middle
|
||||
and re.match(r"^\S+@\w[\w.-]*\.\w+$", middle)
|
||||
):
|
||||
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
|
||||
|
||||
words[i] = head + middle + tail
|
||||
|
||||
return u"".join(words)
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user