speed up urlize matching

This commit is contained in:
David Lord 2021-01-31 07:54:40 -08:00
parent eeca0fecc3
commit ef658dc3b6
No known key found for this signature in database
GPG Key ID: 7A1C87E3F5BC42A8
2 changed files with 66 additions and 51 deletions

View File

@ -1,5 +1,15 @@
.. currentmodule:: jinja2
Version 2.11.3
--------------
Unreleased
- Improve the speed of the ``urlize`` filter by reducing regex
backtracking. Email matching requires a word character at the start
of the domain part, and only word characters in the TLD. :pr:`1343`
Version 2.11.2
--------------

View File

@ -6,6 +6,8 @@ import warnings
from collections import deque
from random import choice
from random import randrange
from string import ascii_letters as _letters
from string import digits as _digits
from threading import Lock
from markupsafe import escape
@ -16,20 +18,6 @@ from ._compat import string_types
from ._compat import text_type
from ._compat import url_quote
_word_split_re = re.compile(r"(\s+)")
_punctuation_re = re.compile(
"^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$"
% (
"|".join(map(re.escape, ("(", "<", "&lt;"))),
"|".join(map(re.escape, (".", ",", ")", ">", "\n", "&gt;"))),
)
)
_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
_striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)")
_entity_re = re.compile(r"&([^;]+);")
_letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
_digits = "0123456789"
# special singleton representing missing values for the runtime
missing = type("MissingType", (), {"__repr__": lambda x: "missing"})()
@ -210,48 +198,65 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
and (x[:limit] + (len(x) >= limit and "..." or ""))
or x
)
words = _word_split_re.split(text_type(escape(text)))
words = re.split(r"(\s+)", text_type(escape(text)))
rel_attr = rel and ' rel="%s"' % text_type(escape(rel)) or ""
target_attr = target and ' target="%s"' % escape(target) or ""
for i, word in enumerate(words):
match = _punctuation_re.match(word)
head, middle, tail = "", word, ""
match = re.match(r"^([(<]|&lt;)+", middle)
if match:
lead, middle, trail = match.groups()
if middle.startswith("www.") or (
"@" not in middle
and not middle.startswith("http://")
and not middle.startswith("https://")
and len(middle) > 0
and middle[0] in _letters + _digits
and (
middle.endswith(".org")
or middle.endswith(".net")
or middle.endswith(".com")
)
):
middle = '<a href="http://%s"%s%s>%s</a>' % (
middle,
rel_attr,
target_attr,
trim_url(middle),
)
if middle.startswith("http://") or middle.startswith("https://"):
middle = '<a href="%s"%s%s>%s</a>' % (
middle,
rel_attr,
target_attr,
trim_url(middle),
)
if (
"@" in middle
and not middle.startswith("www.")
and ":" not in middle
and _simple_email_re.match(middle)
):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
if lead + middle + trail != word:
words[i] = lead + middle + trail
head = match.group()
middle = middle[match.end() :]
# Unlike lead, which is anchored to the start of the string,
# need to check that the string ends with any of the characters
# before trying to match all of them, to avoid backtracking.
if middle.endswith((")", ">", ".", ",", "\n", "&gt;")):
match = re.search(r"([)>.,\n]|&gt;)+$", middle)
if match:
tail = match.group()
middle = middle[: match.start()]
if middle.startswith("www.") or (
"@" not in middle
and not middle.startswith("http://")
and not middle.startswith("https://")
and len(middle) > 0
and middle[0] in _letters + _digits
and (
middle.endswith(".org")
or middle.endswith(".net")
or middle.endswith(".com")
)
):
middle = '<a href="http://%s"%s%s>%s</a>' % (
middle,
rel_attr,
target_attr,
trim_url(middle),
)
if middle.startswith("http://") or middle.startswith("https://"):
middle = '<a href="%s"%s%s>%s</a>' % (
middle,
rel_attr,
target_attr,
trim_url(middle),
)
if (
"@" in middle
and not middle.startswith("www.")
and ":" not in middle
and re.match(r"^\S+@\w[\w.-]*\.\w+$", middle)
):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
words[i] = head + middle + tail
return u"".join(words)