You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1583 lines
57 KiB

"""CSS matcher."""
from __future__ import annotations
from datetime import datetime
from . import util
import re
from . import css_types as ct
import unicodedata
import bs4 # type: ignore[import]
from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401
# Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')
# Relationships
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'
# Relationships for :has() (forward looking)
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE
DIR_MAP = {
'ltr': ct.SEL_DIR_LTR,
'rtl': ct.SEL_DIR_RTL,
'auto': 0
}
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')
MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
class _FakeParent:
"""
Fake parent class.
When we have a fragment with no `BeautifulSoup` document object,
we can't evaluate `nth` selectors properly. Create a temporary
fake parent so we can traverse the root element as a child.
"""
def __init__(self, element: bs4.Tag) -> None:
"""Initialize."""
self.contents = [element]
def __len__(self) -> bs4.PageElement:
"""Length."""
return len(self.contents)
class _DocumentNav:
"""Navigate a Beautiful Soup document."""
@classmethod
def assert_valid_input(cls, tag: Any) -> None:
"""Check if valid input tag or document."""
# Fail on unexpected types.
if not cls.is_tag(tag):
raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}")
@staticmethod
def is_doc(obj: bs4.Tag) -> bool:
"""Is `BeautifulSoup` object."""
return isinstance(obj, bs4.BeautifulSoup)
@staticmethod
def is_tag(obj: bs4.PageElement) -> bool:
"""Is tag."""
return isinstance(obj, bs4.Tag)
@staticmethod
def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover
"""Is declaration."""
return isinstance(obj, bs4.Declaration)
@staticmethod
def is_cdata(obj: bs4.PageElement) -> bool:
"""Is CDATA."""
return isinstance(obj, bs4.CData)
@staticmethod
def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover
"""Is processing instruction."""
return isinstance(obj, bs4.ProcessingInstruction)
@staticmethod
def is_navigable_string(obj: bs4.PageElement) -> bool:
"""Is navigable string."""
return isinstance(obj, bs4.NavigableString)
@staticmethod
def is_special_string(obj: bs4.PageElement) -> bool:
"""Is special string."""
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
@classmethod
def is_content_string(cls, obj: bs4.PageElement) -> bool:
"""Check if node is content string."""
return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
@staticmethod
def create_fake_parent(el: bs4.Tag) -> _FakeParent:
"""Create fake parent for a given element."""
return _FakeParent(el)
@staticmethod
def is_xml_tree(el: bs4.Tag) -> bool:
"""Check if element (or document) is from a XML tree."""
return bool(el._is_xml)
def is_iframe(self, el: bs4.Tag) -> bool:
"""Check if element is an `iframe`."""
return bool(
((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and
self.is_html_tag(el) # type: ignore[attr-defined]
)
def is_root(self, el: bs4.Tag) -> bool:
"""
Return whether element is a root element.
We check that the element is the root of the tree (which we have already pre-calculated),
and we check if it is the root element under an `iframe`.
"""
root = self.root and self.root is el # type: ignore[attr-defined]
if not root:
parent = self.get_parent(el)
root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
return root
def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]:
"""Get contents or contents in reverse."""
if not no_iframe or not self.is_iframe(el):
yield from el.contents
def get_children(
self,
el: bs4.Tag,
start: int | None = None,
reverse: bool = False,
tags: bool = True,
no_iframe: bool = False
) -> Iterator[bs4.PageElement]:
"""Get children."""
if not no_iframe or not self.is_iframe(el):
last = len(el.contents) - 1
if start is None:
index = last if reverse else 0
else:
index = start
end = -1 if reverse else last + 1
incr = -1 if reverse else 1
if 0 <= index <= last:
while index != end:
node = el.contents[index]
index += incr
if not tags or self.is_tag(node):
yield node
def get_descendants(
self,
el: bs4.Tag,
tags: bool = True,
no_iframe: bool = False
) -> Iterator[bs4.PageElement]:
"""Get descendants."""
if not no_iframe or not self.is_iframe(el):
next_good = None
for child in el.descendants:
if next_good is not None:
if child is not next_good:
continue
next_good = None
is_tag = self.is_tag(child)
if no_iframe and is_tag and self.is_iframe(child):
if child.next_sibling is not None:
next_good = child.next_sibling
else:
last_child = child
while self.is_tag(last_child) and last_child.contents:
last_child = last_child.contents[-1]
next_good = last_child.next_element
yield child
if next_good is None:
break
# Coverage isn't seeing this even though it's executed
continue # pragma: no cover
if not tags or is_tag:
yield child
def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag:
"""Get parent."""
parent = el.parent
if no_iframe and parent is not None and self.is_iframe(parent):
parent = None
return parent
@staticmethod
def get_tag_name(el: bs4.Tag) -> str | None:
"""Get tag."""
return cast('str | None', el.name)
@staticmethod
def get_prefix_name(el: bs4.Tag) -> str | None:
"""Get prefix."""
return cast('str | None', el.prefix)
@staticmethod
def get_uri(el: bs4.Tag) -> str | None:
"""Get namespace `URI`."""
return cast('str | None', el.namespace)
@classmethod
def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
"""Get next sibling tag."""
sibling = el.next_sibling
while tags and not cls.is_tag(sibling) and sibling is not None:
sibling = sibling.next_sibling
return sibling
@classmethod
def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
"""Get previous sibling tag."""
sibling = el.previous_sibling
while tags and not cls.is_tag(sibling) and sibling is not None:
sibling = sibling.previous_sibling
return sibling
@staticmethod
def has_html_ns(el: bs4.Tag) -> bool:
"""
Check if element has an HTML namespace.
This is a bit different than whether a element is treated as having an HTML namespace,
like we do in the case of `is_html_tag`.
"""
ns = getattr(el, 'namespace') if el else None # noqa: B009
return bool(ns and ns == NS_XHTML)
@staticmethod
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]:
"""Return namespace and attribute name without the prefix."""
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
@classmethod
def normalize_value(cls, value: Any) -> str | Sequence[str]:
"""Normalize the value to be a string or list of strings."""
# Treat `None` as empty string.
if value is None:
return ''
# Pass through strings
if (isinstance(value, str)):
return value
# If it's a byte string, convert it to Unicode, treating it as UTF-8.
if isinstance(value, bytes):
return value.decode("utf8")
# BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
if isinstance(value, Sequence):
new_value = []
for v in value:
if not isinstance(v, (str, bytes)) and isinstance(v, Sequence):
# This is most certainly a user error and will crash and burn later.
# To keep things working, we'll do what we do with all objects,
# And convert them to strings.
new_value.append(str(v))
else:
# Convert the child to a string
new_value.append(cast(str, cls.normalize_value(v)))
return new_value
# Try and make anything else a string
return str(value)
@classmethod
def get_attribute_by_name(
cls,
el: bs4.Tag,
name: str,
default: str | Sequence[str] | None = None
) -> str | Sequence[str] | None:
"""Get attribute by name."""
value = default
if el._is_xml:
try:
value = cls.normalize_value(el.attrs[name])
except KeyError:
pass
else:
for k, v in el.attrs.items():
if util.lower(k) == name:
value = cls.normalize_value(v)
break
return value
@classmethod
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]:
"""Iterate attributes."""
for k, v in el.attrs.items():
yield k, cls.normalize_value(v)
@classmethod
def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
"""Get classes."""
classes = cls.get_attribute_by_name(el, 'class', [])
if isinstance(classes, str):
classes = RE_NOT_WS.findall(classes)
return cast(Sequence[str], classes)
def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
"""Get text."""
return ''.join(
[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
)
def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
"""Get Own Text."""
return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
class Inputs:
"""Class for parsing and validating input items."""
@staticmethod
def validate_day(year: int, month: int, day: int) -> bool:
"""Validate day."""
max_days = LONG_MONTH
if month == FEB:
max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
elif month in MONTHS_30:
max_days = SHORT_MONTH
return 1 <= day <= max_days
@staticmethod
def validate_week(year: int, week: int) -> bool:
"""Validate week."""
max_week = datetime.strptime(f"{12}-{31}-{year}", "%m-%d-%Y").isocalendar()[1]
if max_week == 1:
max_week = 53
return 1 <= week <= max_week
@staticmethod
def validate_month(month: int) -> bool:
"""Validate month."""
return 1 <= month <= 12
@staticmethod
def validate_year(year: int) -> bool:
"""Validate year."""
return 1 <= year
@staticmethod
def validate_hour(hour: int) -> bool:
"""Validate hour."""
return 0 <= hour <= 23
@staticmethod
def validate_minutes(minutes: int) -> bool:
"""Validate minutes."""
return 0 <= minutes <= 59
@classmethod
def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
"""Parse the input value."""
parsed = None # type: tuple[float, ...] | None
if value is None:
return value
if itype == "date":
m = RE_DATE.match(value)
if m:
year = int(m.group('year'), 10)
month = int(m.group('month'), 10)
day = int(m.group('day'), 10)
if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
parsed = (year, month, day)
elif itype == "month":
m = RE_MONTH.match(value)
if m:
year = int(m.group('year'), 10)
month = int(m.group('month'), 10)
if cls.validate_year(year) and cls.validate_month(month):
parsed = (year, month)
elif itype == "week":
m = RE_WEEK.match(value)
if m:
year = int(m.group('year'), 10)
week = int(m.group('week'), 10)
if cls.validate_year(year) and cls.validate_week(year, week):
parsed = (year, week)
elif itype == "time":
m = RE_TIME.match(value)
if m:
hour = int(m.group('hour'), 10)
minutes = int(m.group('minutes'), 10)
if cls.validate_hour(hour) and cls.validate_minutes(minutes):
parsed = (hour, minutes)
elif itype == "datetime-local":
m = RE_DATETIME.match(value)
if m:
year = int(m.group('year'), 10)
month = int(m.group('month'), 10)
day = int(m.group('day'), 10)
hour = int(m.group('hour'), 10)
minutes = int(m.group('minutes'), 10)
if (
cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
cls.validate_hour(hour) and cls.validate_minutes(minutes)
):
parsed = (year, month, day, hour, minutes)
elif itype in ("number", "range"):
m = RE_NUM.match(value)
if m:
parsed = (float(m.group('value')),)
return parsed
class CSSMatch(_DocumentNav):
"""Perform CSS matching."""
def __init__(
self,
selectors: ct.SelectorList,
scope: bs4.Tag,
namespaces: ct.Namespaces | None,
flags: int
) -> None:
"""Initialize."""
self.assert_valid_input(scope)
self.tag = scope
self.cached_meta_lang = [] # type: list[tuple[str, str]]
self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]]
self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]]
self.selectors = selectors
self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str]
self.flags = flags
self.iframe_restrict = False
# Find the root element for the whole tree
doc = scope
parent = self.get_parent(doc)
while parent:
doc = parent
parent = self.get_parent(doc)
root = None
if not self.is_doc(doc):
root = doc
else:
for child in self.get_children(doc):
root = child
break
self.root = root
self.scope = scope if scope is not doc else root
self.has_html_namespace = self.has_html_ns(root)
# A document can be both XML and HTML (XHTML)
self.is_xml = self.is_xml_tree(doc)
self.is_html = not self.is_xml or self.has_html_namespace
def supports_namespaces(self) -> bool:
"""Check if namespaces are supported in the HTML type."""
return self.is_xml or self.has_html_namespace
def get_tag_ns(self, el: bs4.Tag) -> str:
"""Get tag namespace."""
if self.supports_namespaces():
namespace = ''
ns = self.get_uri(el)
if ns:
namespace = ns
else:
namespace = NS_XHTML
return namespace
def is_html_tag(self, el: bs4.Tag) -> bool:
"""Check if tag is in HTML namespace."""
return self.get_tag_ns(el) == NS_XHTML
def get_tag(self, el: bs4.Tag) -> str | None:
"""Get tag."""
name = self.get_tag_name(el)
return util.lower(name) if name is not None and not self.is_xml else name
def get_prefix(self, el: bs4.Tag) -> str | None:
"""Get prefix."""
prefix = self.get_prefix_name(el)
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
def find_bidi(self, el: bs4.Tag) -> int | None:
"""Get directionality from element text."""
for node in self.get_children(el, tags=False):
# Analyze child text nodes
if self.is_tag(node):
# Avoid analyzing certain elements specified in the specification.
direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
if (
self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or
not self.is_html_tag(node) or
direction is not None
):
continue # pragma: no cover
# Check directionality of this node's text
value = self.find_bidi(node)
if value is not None:
return value
# Direction could not be determined
continue # pragma: no cover
# Skip `doctype` comments, etc.
if self.is_special_string(node):
continue
# Analyze text nodes for directionality.
for c in node:
bidi = unicodedata.bidirectional(c)
if bidi in ('AL', 'R', 'L'):
return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
return None
def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
"""Filter the language tags."""
match = True
lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
ranges = lang_range.split('-')
subtags = lang_tag.lower().split('-')
length = len(ranges)
slength = len(subtags)
rindex = 0
sindex = 0
r = ranges[rindex]
s = subtags[sindex]
# Empty specified language should match unspecified language attributes
if length == 1 and slength == 1 and not r and r == s:
return True
# Primary tag needs to match
if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):
match = False
rindex += 1
sindex += 1
# Match until we run out of ranges
while match and rindex < length:
r = ranges[rindex]
try:
s = subtags[sindex]
except IndexError:
# Ran out of subtags,
# but we still have ranges
match = False
continue
# Empty range
if not r:
match = False
continue
# Matched range
elif s == r:
rindex += 1
# Implicit wildcard cannot match
# singletons
elif len(s) == 1:
match = False
continue
# Implicitly matched, so grab next subtag
sindex += 1
return match
def match_attribute_name(
self,
el: bs4.Tag,
attr: str,
prefix: str | None
) -> str | Sequence[str] | None:
"""Match attribute name and return value if it exists."""
value = None
if self.supports_namespaces():
value = None
# If we have not defined namespaces, we can't very well find them, so don't bother trying.
if prefix:
ns = self.namespaces.get(prefix)
if ns is None and prefix != '*':
return None
else:
ns = None
for k, v in self.iter_attributes(el):
# Get attribute parts
namespace, name = self.split_namespace(el, k)
# Can't match a prefix attribute as we haven't specified one to match
# Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
if ns is None:
if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
value = v
break
# Coverage is not finding this even though it is executed.
# Adding a print statement before this (and erasing coverage) causes coverage to find the line.
# Ignore the false positive message.
continue # pragma: no cover
# We can't match our desired prefix attribute as the attribute doesn't have a prefix
if namespace is None or ns != namespace and prefix != '*':
continue
# The attribute doesn't match.
if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
continue
value = v
break
else:
for k, v in self.iter_attributes(el):
if util.lower(attr) != util.lower(k):
continue
value = v
break
return value
def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
"""Match the namespace of the element."""
match = True
namespace = self.get_tag_ns(el)
default_namespace = self.namespaces.get('')
tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix)
# We must match the default namespace if one is not provided
if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
match = False
# If we specified `|tag`, we must not have a namespace.
elif (tag.prefix is not None and tag.prefix == '' and namespace):
match = False
# Verify prefix matches
elif (
tag.prefix and
tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
):
match = False
return match
def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
"""Match attributes."""
match = True
if attributes:
for a in attributes:
temp = self.match_attribute_name(el, a.attribute, a.prefix)
pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
if temp is None:
match = False
break
value = temp if isinstance(temp, str) else ' '.join(temp)
if pattern is None:
continue
elif pattern.match(value) is None:
match = False
break
return match
def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
"""Match tag name."""
name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
return not (
name is not None and
name not in (self.get_tag(el), '*')
)
def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
"""Match the tag."""
match = True
if tag is not None:
# Verify namespace
if not self.match_namespace(el, tag):
match = False
if not self.match_tagname(el, tag):
match = False
return match
def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match past relationship."""
found = False
# I don't think this can ever happen, but it makes `mypy` happy
if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
return found
if relation[0].rel_type == REL_PARENT:
parent = self.get_parent(el, no_iframe=self.iframe_restrict)
while not found and parent:
found = self.match_selectors(parent, relation)
parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
elif relation[0].rel_type == REL_CLOSE_PARENT:
parent = self.get_parent(el, no_iframe=self.iframe_restrict)
if parent:
found = self.match_selectors(parent, relation)
elif relation[0].rel_type == REL_SIBLING:
sibling = self.get_previous(el)
while not found and sibling:
found = self.match_selectors(sibling, relation)
sibling = self.get_previous(sibling)
elif relation[0].rel_type == REL_CLOSE_SIBLING:
sibling = self.get_previous(el)
if sibling and self.is_tag(sibling):
found = self.match_selectors(sibling, relation)
return found
def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
"""Match future child."""
match = False
if recursive:
children = self.get_descendants # type: Callable[..., Iterator[bs4.Tag]]
else:
children = self.get_children
for child in children(parent, no_iframe=self.iframe_restrict):
match = self.match_selectors(child, relation)
if match:
break
return match
def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match future relationship."""
found = False
# I don't think this can ever happen, but it makes `mypy` happy
if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
return found
if relation[0].rel_type == REL_HAS_PARENT:
found = self.match_future_child(el, relation, True)
elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
found = self.match_future_child(el, relation)
elif relation[0].rel_type == REL_HAS_SIBLING:
sibling = self.get_next(el)
while not found and sibling:
found = self.match_selectors(sibling, relation)
sibling = self.get_next(sibling)
elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
sibling = self.get_next(el)
if sibling and self.is_tag(sibling):
found = self.match_selectors(sibling, relation)
return found
def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match relationship to other elements."""
found = False
if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None:
return found
if relation[0].rel_type.startswith(':'):
found = self.match_future_relations(el, relation)
else:
found = self.match_past_relations(el, relation)
return found
def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
"""Match element's ID."""
found = True
for i in ids:
if i != self.get_attribute_by_name(el, 'id', ''):
found = False
break
return found
def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
"""Match element's classes."""
current_classes = self.get_classes(el)
found = True
for c in classes:
if c not in current_classes:
found = False
break
return found
def match_root(self, el: bs4.Tag) -> bool:
"""Match element as root."""
is_root = self.is_root(el)
if is_root:
sibling = self.get_previous(el, tags=False)
while is_root and sibling is not None:
if (
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
self.is_cdata(sibling)
):
is_root = False
else:
sibling = self.get_previous(sibling, tags=False)
if is_root:
sibling = self.get_next(el, tags=False)
while is_root and sibling is not None:
if (
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
self.is_cdata(sibling)
):
is_root = False
else:
sibling = self.get_next(sibling, tags=False)
return is_root
def match_scope(self, el: bs4.Tag) -> bool:
"""Match element as scope."""
return self.scope is el
def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
"""Match tag type for `nth` matches."""
return (
(self.get_tag(child) == self.get_tag(el)) and
(self.get_tag_ns(child) == self.get_tag_ns(el))
)
def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool:
"""Match `nth` elements."""
matched = True
for n in nth:
matched = False
if n.selectors and not self.match_selectors(el, n.selectors):
break
parent = self.get_parent(el)
if parent is None:
parent = self.create_fake_parent(el)
last = n.last
last_index = len(parent) - 1
index = last_index if last else 0
relative_index = 0
a = n.a
b = n.b
var = n.n
count = 0
count_incr = 1
factor = -1 if last else 1
idx = last_idx = a * count + b if var else a
# We can only adjust bounds within a variable index
if var:
# Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
# Otherwise, increment to try to get in bounds.
adjust = None
while idx < 1 or idx > last_index:
if idx < 0:
diff_low = 0 - idx
if adjust is not None and adjust == 1:
break
adjust = -1
count += count_incr
idx = last_idx = a * count + b if var else a
diff = 0 - idx
if diff >= diff_low:
break
else:
diff_high = idx - last_index
if adjust is not None and adjust == -1:
break
adjust = 1
count += count_incr
idx = last_idx = a * count + b if var else a
diff = idx - last_index
if diff >= diff_high:
break
diff_high = diff
# If a < 0, our count is working backwards, so floor the index by increasing the count.
# Find the count that yields the lowest, in bound value and use that.
# Lastly reverse count increment so that we'll increase our index.
lowest = count
if a < 0:
while idx >= 1:
lowest = count
count += count_incr
idx = last_idx = a * count + b if var else a
count_incr = -1
count = lowest
idx = last_idx = a * count + b if var else a
# Evaluate elements while our calculated nth index is still in range
while 1 <= idx <= last_index + 1:
child = None
# Evaluate while our child index is still in range.
for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
index += factor
if not self.is_tag(child):
continue
# Handle `of S` in `nth-child`
if n.selectors and not self.match_selectors(child, n.selectors):
continue
# Handle `of-type`
if n.of_type and not self.match_nth_tag_type(el, child):
continue
relative_index += 1
if relative_index == idx:
if child is el:
matched = True
else:
break
if child is el:
break
if child is el:
break
last_idx = idx
count += count_incr
if count < 0:
# Count is counting down and has now ventured into invalid territory.
break
idx = a * count + b if var else a
if last_idx == idx:
break
if not matched:
break
return matched
def match_empty(self, el: bs4.Tag) -> bool:
"""Check if element is empty (if requested)."""
is_empty = True
for child in self.get_children(el, tags=False):
if self.is_tag(child):
is_empty = False
break
elif self.is_content_string(child) and RE_NOT_EMPTY.search(child):
is_empty = False
break
return is_empty
def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
"""Match selectors."""
match = True
for sel in selectors:
if not self.match_selectors(el, sel):
match = False
return match
def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
"""Match element if it contains text."""
match = True
content = None # type: str | Sequence[str] | None
for contain_list in contains:
if content is None:
if contain_list.own:
content = self.get_own_text(el, no_iframe=self.is_html)
else:
content = self.get_text(el, no_iframe=self.is_html)
found = False
for text in contain_list.text:
if contain_list.own:
for c in content:
if text in c:
found = True
break
if found:
break
else:
if text in content:
found = True
break
if not found:
match = False
return match
def match_default(self, el: bs4.Tag) -> bool:
"""Match default."""
match = False
# Find this input's form
form = None
parent = self.get_parent(el, no_iframe=True)
while parent and form is None:
if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
form = parent
else:
parent = self.get_parent(parent, no_iframe=True)
# Look in form cache to see if we've already located its default button
found_form = False
for f, t in self.cached_default_forms:
if f is form:
found_form = True
if t is el:
match = True
break
# We didn't have the form cached, so look for its default button
if not found_form:
for child in self.get_descendants(form, no_iframe=True):
name = self.get_tag(child)
# Can't do nested forms (haven't figured out why we never hit this)
if name == 'form': # pragma: no cover
break
if name in ('input', 'button'):
v = self.get_attribute_by_name(child, 'type', '')
if v and util.lower(v) == 'submit':
self.cached_default_forms.append((form, child))
if el is child:
match = True
break
return match
def match_indeterminate(self, el: bs4.Tag) -> bool:
"""Match default."""
match = False
name = cast(str, self.get_attribute_by_name(el, 'name'))
def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
"""Find this input's form."""
form = None
parent = self.get_parent(el, no_iframe=True)
while form is None:
if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
form = parent
break
last_parent = parent
parent = self.get_parent(parent, no_iframe=True)
if parent is None:
form = last_parent
break
return form
form = get_parent_form(el)
# Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
found_form = False
for f, n, i in self.cached_indeterminate_forms:
if f is form and n == name:
found_form = True
if i is True:
match = True
break
# We didn't have the form cached, so validate that the radio button is indeterminate
if not found_form:
checked = False
for child in self.get_descendants(form, no_iframe=True):
if child is el:
continue
tag_name = self.get_tag(child)
if tag_name == 'input':
is_radio = False
check = False
has_name = False
for k, v in self.iter_attributes(child):
if util.lower(k) == 'type' and util.lower(v) == 'radio':
is_radio = True
elif util.lower(k) == 'name' and v == name:
has_name = True
elif util.lower(k) == 'checked':
check = True
if is_radio and check and has_name and get_parent_form(child) is form:
checked = True
break
if checked:
break
if not checked:
match = True
self.cached_indeterminate_forms.append((form, name, match))
return match
def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
"""Match languages."""
match = False
has_ns = self.supports_namespaces()
root = self.root
has_html_namespace = self.has_html_namespace
# Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
parent = el
found_lang = None
last = None
while not found_lang:
has_html_ns = self.has_html_ns(parent)
for k, v in self.iter_attributes(parent):
attr_ns, attr = self.split_namespace(parent, k)
if (
((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
(
has_ns and not has_html_ns and attr_ns == NS_XML and
(util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
)
):
found_lang = v
break
last = parent
parent = self.get_parent(parent, no_iframe=self.is_html)
if parent is None:
root = last
has_html_namespace = self.has_html_ns(root)
parent = last
break
# Use cached meta language.
if found_lang is None and self.cached_meta_lang:
for cache in self.cached_meta_lang:
if root is cache[0]:
found_lang = cache[1]
# If we couldn't find a language, and the document is HTML, look to meta to determine language.
if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
# Find head
found = False
for tag in ('html', 'head'):
found = False
for child in self.get_children(parent, no_iframe=self.is_html):
if self.get_tag(child) == tag and self.is_html_tag(child):
found = True
parent = child
break
if not found: # pragma: no cover
break
# Search meta tags
if found:
for child in parent:
if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
c_lang = False
content = None
for k, v in self.iter_attributes(child):
if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
c_lang = True
if util.lower(k) == 'content':
content = v
if c_lang and content:
found_lang = content
self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
break
if found_lang is not None:
break
if found_lang is None:
self.cached_meta_lang.append((cast(str, root), ''))
# If we determined a language, compare.
if found_lang is not None:
for patterns in langs:
match = False
for pattern in patterns:
if self.extended_language_filter(pattern, cast(str, found_lang)):
match = True
if not match:
break
return match
def match_dir(self, el: bs4.Tag, directionality: int) -> bool:
"""Check directionality."""
# If we have to match both left and right, we can't match either.
if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
return False
if el is None or not self.is_html_tag(el):
return False
# Element has defined direction of left to right or right to left
direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
if direction not in (None, 0):
return direction == directionality
# Element is the document element (the root) and no direction assigned, assume left to right.
is_root = self.is_root(el)
if is_root and direction is None:
return ct.SEL_DIR_LTR == directionality
# If `input[type=telephone]` and no direction is assigned, assume left to right.
name = self.get_tag(el)
is_input = name == 'input'
is_textarea = name == 'textarea'
is_bdi = name == 'bdi'
itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
if is_input and itype == 'tel' and direction is None:
return ct.SEL_DIR_LTR == directionality
# Auto handling for text inputs
if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
if is_textarea:
value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node))
else:
value = cast(str, self.get_attribute_by_name(el, 'value', ''))
if value:
for c in value:
bidi = unicodedata.bidirectional(c)
if bidi in ('AL', 'R', 'L'):
direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
return direction == directionality
# Assume left to right
return ct.SEL_DIR_LTR == directionality
elif is_root:
return ct.SEL_DIR_LTR == directionality
return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
# Auto handling for `bdi` and other non text inputs.
if (is_bdi and direction is None) or direction == 0:
direction = self.find_bidi(el)
if direction is not None:
return direction == directionality
elif is_root:
return ct.SEL_DIR_LTR == directionality
return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
# Match parents direction
return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
def match_range(self, el: bs4.Tag, condition: int) -> bool:
"""
Match range.
Behavior is modeled after what we see in browsers. Browsers seem to evaluate
if the value is out of range, and if not, it is in range. So a missing value
will not evaluate out of range; therefore, value is in range. Personally, I
feel like this should evaluate as neither in or out of range.
"""
out_of_range = False
itype = util.lower(self.get_attribute_by_name(el, 'type'))
mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))
# There is no valid min or max, so we cannot evaluate a range
if mn is None and mx is None:
return False
value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))
if value is not None:
if itype in ("date", "datetime-local", "month", "week", "number", "range"):
if mn is not None and value < mn:
out_of_range = True
if not out_of_range and mx is not None and value > mx:
out_of_range = True
elif itype == "time":
if mn is not None and mx is not None and mn > mx:
# Time is periodic, so this is a reversed/discontinuous range
if value < mn and value > mx:
out_of_range = True
else:
if mn is not None and value < mn:
out_of_range = True
if not out_of_range and mx is not None and value > mx:
out_of_range = True
return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
def match_defined(self, el: bs4.Tag) -> bool:
"""
Match defined.
`:defined` is related to custom elements in a browser.
- If the document is XML (not XHTML), all tags will match.
- Tags that are not custom (don't have a hyphen) are marked defined.
- If the tag has a prefix (without or without a namespace), it will not match.
This is of course requires the parser to provide us with the proper prefix and namespace info,
if it doesn't, there is nothing we can do.
"""
name = self.get_tag(el)
return (
name is not None and (
name.find('-') == -1 or
name.find(':') != -1 or
self.get_prefix(el) is not None
)
)
def match_placeholder_shown(self, el: bs4.Tag) -> bool:
"""
Match placeholder shown according to HTML spec.
- text area should be checked if they have content. A single newline does not count as content.
"""
match = False
content = self.get_text(el)
if content in ('', '\n'):
match = True
return match
def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
"""Check if element matches one of the selectors."""
match = False
is_not = selectors.is_not
is_html = selectors.is_html
# Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
if is_html:
namespaces = self.namespaces
iframe_restrict = self.iframe_restrict
self.namespaces = {'html': NS_XHTML}
self.iframe_restrict = True
if not is_html or self.is_html:
for selector in selectors:
match = is_not
# We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
if isinstance(selector, ct.SelectorNull):
continue
# Verify tag matches
if not self.match_tag(el, selector.tag):
continue
# Verify tag is defined
if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
continue
# Verify element is root
if selector.flags & ct.SEL_ROOT and not self.match_root(el):
continue
# Verify element is scope
if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
continue
# Verify element has placeholder shown
if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
continue
# Verify `nth` matches
if not self.match_nth(el, selector.nth):
continue
if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
continue
# Verify id matches
if selector.ids and not self.match_id(el, selector.ids):
continue
# Verify classes match
if selector.classes and not self.match_classes(el, selector.classes):
continue
# Verify attribute(s) match
if not self.match_attributes(el, selector.attributes):
continue
# Verify ranges
if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
continue
# Verify language patterns
if selector.lang and not self.match_lang(el, selector.lang):
continue
# Verify pseudo selector patterns
if selector.selectors and not self.match_subselectors(el, selector.selectors):
continue
# Verify relationship selectors
if selector.relation and not self.match_relations(el, selector.relation):
continue
# Validate that the current default selector match corresponds to the first submit button in the form
if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
continue
# Validate that the unset radio button is among radio buttons with the same name in a form that are
# also not set.
if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
continue
# Validate element directionality
if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
continue
# Validate that the tag contains the specified text.
if selector.contains and not self.match_contains(el, selector.contains):
continue
match = not is_not
break
# Restore actual namespaces being used for external selector lists
if is_html:
self.namespaces = namespaces
self.iframe_restrict = iframe_restrict
return match
def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
"""Match all tags under the targeted tag."""
lim = None if limit < 1 else limit
for child in self.get_descendants(self.tag):
if self.match(child):
yield child
if lim is not None:
lim -= 1
if lim < 1:
break
def closest(self) -> bs4.Tag | None:
"""Match closest ancestor."""
current = self.tag
closest = None
while closest is None and current is not None:
if self.match(current):
closest = current
else:
current = self.get_parent(current)
return closest
def filter(self) -> list[bs4.Tag]: # noqa A001
"""Filter tag's children."""
return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]
def match(self, el: bs4.Tag) -> bool:
"""Match."""
return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
class SoupSieve(ct.Immutable):
"""Compiled Soup Sieve selector matching object."""
pattern: str
selectors: ct.SelectorList
namespaces: ct.Namespaces | None
custom: dict[str, str]
flags: int
__slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
def __init__(
self,
pattern: str,
selectors: ct.SelectorList,
namespaces: ct.Namespaces | None,
custom: ct.CustomSelectors | None,
flags: int
):
"""Initialize."""
super().__init__(
pattern=pattern,
selectors=selectors,
namespaces=namespaces,
custom=custom,
flags=flags
)
def match(self, tag: bs4.Tag) -> bool:
"""Match."""
return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)
def closest(self, tag: bs4.Tag) -> bs4.Tag:
"""Match closest ancestor."""
return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()
def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001
"""
Filter.
`CSSMatch` can cache certain searches for tags of the same document,
so if we are given a tag, all tags are from the same document,
and we can take advantage of the optimization.
Any other kind of iterable could have tags from different documents or detached tags,
so for those, we use a new `CSSMatch` for each item in the iterable.
"""
if CSSMatch.is_tag(iterable):
return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
else:
return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
def select_one(self, tag: bs4.Tag) -> bs4.Tag:
"""Select a single tag."""
tags = self.select(tag, limit=1)
return tags[0] if tags else None
def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
"""Select the specified tags."""
return list(self.iselect(tag, limit))
def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
"""Iterate the specified tags."""
yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit)
def __repr__(self) -> str: # pragma: no cover
"""Representation."""
return (
f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "
f"custom={self.custom!r}, flags={self.flags!r})"
)
__str__ = __repr__
ct.pickle_register(SoupSieve)