"""
HTML markup utilities for ``collective.html2blocks``.
This module provides functions for parsing, normalizing, and extracting
information from HTML markup, including grouping inline elements, filtering,
normalizing, and extracting table and style information.
Example:
.. code-block:: python
from collective.html2blocks.utils import markup
soup = markup.parse_source('<p>Hello <b>world</b></p>')
children = markup.all_children(soup)
"""
from .inline import ALLOW_EMPTY_ELEMENTS
from .inline import INLINE_ELEMENTS
from bs4 import BeautifulSoup
from bs4.element import Comment
from bs4.element import NavigableString
from bs4.element import PageElement
from collections.abc import Iterable
from collective.html2blocks._types import Tag
from urllib import parse
def _group_inline_elements(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Wrap each run of consecutive inline top-level nodes in a ``<p>`` tag.

    The direct children of ``soup`` are visited in order. Inline nodes
    (``span`` counts as inline here) are moved into a paragraph created just
    before the first node of the run. The first non-inline node ends the run;
    if the paragraph built so far contains nothing but a single newline, it is
    removed again.

    Args:
        soup (BeautifulSoup): The soup to process. It is modified in place.

    Returns:
        BeautifulSoup: The same soup, with inline runs grouped.
    """
    paragraph = None
    # Walk a snapshot: nodes are moved around while we iterate.
    for node in list(soup.children):
        if is_inline(node, True):
            if not paragraph:
                # Start a new run right where its first node currently sits.
                paragraph = soup.new_tag("p")
                node.insert_before(paragraph)
            paragraph.append(node.extract())
            continue
        if paragraph:
            # A block-level node closes the current run.
            if paragraph.text == "\n":
                paragraph.extract()
            paragraph = None
    return soup
def _filter_children(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Drop top-level comments and newline-only strings, unwrapping a lone div.

    After the cleanup, if the only remaining top-level node is a ``<div>``,
    its children are moved into a fresh soup and that soup is filtered the
    same way (so nested wrapping divs are unwrapped too).

    Args:
        soup (BeautifulSoup): The soup to process. It is modified in place.

    Returns:
        BeautifulSoup: The filtered soup; a new object when a wrapping div
        was unwrapped, otherwise the one passed in.
    """
    for node in list(soup.children):
        if isinstance(node, Comment):
            node.extract()
        elif isinstance(node, NavigableString) and node.text == "\n":
            node.extract()

    remaining = list(soup.children)
    if len(remaining) != 1:
        return soup
    only_child = remaining[0]
    if not (isinstance(only_child, Tag) and only_child.name == "div"):
        return soup

    # If there is only a wrapping div, return its (filtered) children.
    unwrapped = BeautifulSoup("", features="html.parser")
    for inner in list(only_child.children):
        unwrapped.append(inner.extract())
    return _filter_children(unwrapped)
def _normalize_html(soup: BeautifulSoup, block_level_tags: Iterable[str] = ()):
    """
    Run the normalization passes over the soup, in a fixed order.

    The passes are: collapse redundantly nested tags, remove empty tags, then
    rebuild paragraphs so that block-level tags get a paragraph of their own.

    Args:
        soup (BeautifulSoup): The soup to normalize. It is modified in place.
        block_level_tags (Iterable[str], optional): Tag names treated as
            block-level when paragraphs are split. Defaults to ``()``.

    Returns:
        BeautifulSoup: The same soup, normalized.
    """
    # The first two passes only need the soup itself.
    for cleanup_pass in (_recursively_simplify, _remove_empty_tags):
        cleanup_pass(soup)
    _wrap_all_paragraphs(soup, block_level_tags)
    return soup
def _recursively_simplify(tag: Tag):
    """
    Collapse a tag that only wraps an identical tag, depth first.

    Descendants are simplified first. Then, if ``tag`` has exactly one child
    and that child is a tag with the same name and the same attributes,
    ``tag`` is replaced by the child and the child is simplified again.

    Args:
        tag (Tag): The tag to simplify. The tree is modified in place.
    """
    for node in list(tag.children):
        if isinstance(node, Tag):
            _recursively_simplify(node)

    if len(tag.contents) != 1:
        return
    only_child = tag.contents[0]
    if not isinstance(only_child, Tag):
        return
    if tag.name != only_child.name or tag.attrs != only_child.attrs:
        return

    # ``<b><b>x</b></b>`` -> ``<b>x</b>``: keep the inner tag, drop the outer.
    tag.replace_with(only_child)
    _recursively_simplify(only_child)
def is_empty(tag: Tag | NavigableString) -> bool:
    """
    Tell whether a tag or string carries no content.

    A string is empty when it is whitespace only. A tag is empty when its
    name is not in ``ALLOW_EMPTY_ELEMENTS`` and it has no contents, no string
    and no attributes.

    Args:
        tag (Tag | NavigableString): The tag or string to check.

    Returns:
        bool: ``True`` if empty, ``False`` otherwise.
    """
    if isinstance(tag, NavigableString):
        return not tag.strip()
    if tag.name in ALLOW_EMPTY_ELEMENTS:
        # Elements such as these are meaningful even without content.
        return False
    return not (tag.contents or tag.string or tag.attrs)
def is_ignorable(el: PageElement) -> bool:
    """
    Tell whether an element can be skipped when trimming content.

    Whitespace-only strings are ignorable, and so is any tag whose name is in
    ``ALLOW_EMPTY_ELEMENTS``.

    Args:
        el (PageElement): The element to check.

    Returns:
        bool: ``True`` if ignorable, ``False`` otherwise.
    """
    if isinstance(el, NavigableString) and not el.strip():
        return True
    return isinstance(el, Tag) and el.name in ALLOW_EMPTY_ELEMENTS
def _remove_trailing_allowed_empty_recursive(tag: Tag) -> list[PageElement]:
    """
    Remove trailing allowed-empty elements from a tag and its descendants.

    Child tags are processed first. Then, for as long as the last direct
    child of ``tag`` is a tag whose name is in ``ALLOW_EMPTY_ELEMENTS``, that
    child is destroyed. A trailing string (even a whitespace-only one) stops
    the trimming.

    Args:
        tag (Tag): The tag to process. The tree is modified in place.

    Returns:
        list[PageElement]: A copy of the tag's direct contents after removal.
    """
    # ``find_all()`` yields tags only, never strings, so there is no string
    # case to handle in this loop. Whitespace strings are left untouched on
    # purpose: removing them would glue adjacent inline elements together.
    for child in tag.find_all(recursive=False):
        if isinstance(child, Tag):
            _remove_trailing_allowed_empty_recursive(child)

    while (
        tag.contents
        and isinstance(tag.contents[-1], Tag)
        and tag.contents[-1].name in ALLOW_EMPTY_ELEMENTS
    ):
        tag.contents[-1].decompose()
    return list(tag.contents)
def _remove_empty_tags(soup: BeautifulSoup):
    """
    Remove empty tags from the soup and tidy up every paragraph.

    First, every tag for which ``is_empty`` holds is destroyed. Then each
    ``<p>`` has its ignorable leading nodes and its trailing allowed-empty
    elements removed; a paragraph left with nothing but ignorable nodes is
    destroyed as well.

    Args:
        soup (BeautifulSoup): The soup to process. It is modified in place.
    """
    # Pass 1: drop empty tags (allowed-empty elements are kept by is_empty).
    for node in list(soup.find_all()):
        if isinstance(node, (Tag, NavigableString)) and is_empty(node):
            node.decompose()

    # Pass 2: clean up paragraphs.
    for paragraph in list(soup.find_all("p")):
        if not isinstance(paragraph, Tag):
            continue
        # Peel ignorable nodes off the front, one at a time.
        while paragraph.contents and is_ignorable(paragraph.contents[0]):
            paragraph.contents[0].extract()
        remaining = _remove_trailing_allowed_empty_recursive(paragraph)
        # Nothing meaningful left: the paragraph itself goes away.
        if all(is_ignorable(node) for node in remaining):
            paragraph.decompose()
def _wrap_all_paragraphs(soup: BeautifulSoup, block_level_tags: Iterable[str]):
    """
    Rebuild every paragraph so block-level tags sit in their own paragraph.

    Each ``<p>`` is handed to ``_split_paragraph``; when that returns new
    paragraphs, they are inserted after the original, which is then
    destroyed.

    Args:
        soup (BeautifulSoup): The soup to process. It is modified in place.
        block_level_tags (Iterable[str]): Tag names to split paragraphs by.
            Any iterable is accepted, including one-shot iterators.
    """
    # Materialize once: the names are tested for membership for every child
    # of every paragraph, which would exhaust a generator on first use.
    block_tags = frozenset(block_level_tags)
    for p_tag in list(soup.find_all("p")):
        if not isinstance(p_tag, Tag):
            continue
        new_elements = _split_paragraph(p_tag, block_tags)
        # NOTE(review): a paragraph that yields no pieces is left untouched;
        # confirm this is the intended handling for that case.
        if new_elements:
            p_tag.insert_after(*new_elements)
            p_tag.decompose()
def _get_root_soup(tag: Tag) -> BeautifulSoup:
    """
    Climb the ``parent`` chain until the owning soup object is reached.

    Args:
        tag (Tag): The tag to start from (it may itself be the soup).

    Returns:
        BeautifulSoup: The root soup object.

    Raises:
        ValueError: If the chain ends without reaching a ``BeautifulSoup``.
    """
    node = tag
    while node is not None:
        if isinstance(node, BeautifulSoup):
            return node
        node = node.parent
    raise ValueError("Could not find root BeautifulSoup object")
def _split_paragraph(p_tag: Tag, block_level_tags: Iterable[str]) -> list[Tag]:
    """
    Split a paragraph into several paragraphs around block-level tags.

    The paragraph's children are visited in order. Each child whose tag name
    is in ``block_level_tags`` gets a new paragraph of its own; the runs of
    other children between them are gathered into new paragraphs too. An
    ``img`` listed as block-level but lacking a ``src`` is skipped entirely.
    Children are moved out of ``p_tag`` into the new paragraphs.

    Args:
        p_tag (Tag): The paragraph tag to split.
        block_level_tags (Iterable[str]): Tag names to split by. Any iterable
            is accepted, including one-shot iterators.

    Returns:
        list[Tag]: The new paragraph tags, in document order. Empty when the
        paragraph has nothing to move.
    """
    soup = _get_root_soup(p_tag)
    # Materialize once: membership is tested for every child, which would
    # consume a generator and makes tuple/list lookups linear.
    block_tags = frozenset(block_level_tags)
    new_elements: list[Tag] = []
    buffer: list[PageElement] = []

    def flush_buffer():
        """Move the pending inline run into a new paragraph, if any."""
        if not buffer:
            return
        p = soup.new_tag("p")
        for item in buffer:
            p.append(item)
        new_elements.append(p)
        buffer.clear()

    for child in list(p_tag.contents):
        if isinstance(child, Tag) and child.name in block_tags:
            if child.name == "img" and not child.get("src"):
                # An image without a source has nothing to show; drop it.
                continue
            flush_buffer()
            p = soup.new_tag("p")
            p.append(child)
            new_elements.append(p)
        else:
            buffer.append(child)
    flush_buffer()
    return new_elements
def parse_source(
    source: str,
    filter_: bool = True,
    group: bool = True,
    normalize: bool = True,
    block_level_tags: Iterable[str] = (),
) -> Tag:
    """
    Parse HTML source into a soup and run the enabled clean-up stages.

    The source is stripped of surrounding whitespace and parsed with
    ``html.parser``. The stages then run in this order, each only if its flag
    is set: normalize, filter, group.

    Args:
        source (str): The HTML source to parse.
        filter_ (bool, optional): Whether to filter children.
            Defaults to ``True``.
        group (bool, optional): Whether to group inline elements.
            Defaults to ``True``.
        normalize (bool, optional): Whether to normalize the HTML.
            Defaults to ``True``.
        block_level_tags (Iterable[str], optional): Block-level tag names
            passed to the normalization stage. Defaults to ``()``.

    Returns:
        Tag: The parsed (and, as requested, processed) soup object.

    Example:
        .. code-block:: python

            soup = parse_source("<p>Hello <b>world</b></p>")
    """
    # Leading/trailing line breaks would otherwise become stray text nodes.
    soup = BeautifulSoup(source.strip(), features="html.parser")
    if normalize:
        soup = _normalize_html(soup, block_level_tags)
    if filter_:
        soup = _filter_children(soup)
    return _group_inline_elements(soup) if group else soup
def all_children(
    element: PageElement | Tag, allow_tags: list[str] | None = None
) -> list[PageElement]:
    """
    List the direct children of an element, optionally keeping some tags only.

    An element without a ``children`` attribute yields an empty list. When
    ``allow_tags`` is ``None`` or empty, every child is returned; otherwise
    only children whose ``name`` is in ``allow_tags`` are kept.

    Args:
        element (PageElement | Tag): The element to get children from.
        allow_tags (list[str], optional): Tag names to include.
            Defaults to ``None``.

    Returns:
        list[PageElement]: The (possibly filtered) child elements, in order.
    """
    found: list[PageElement] = list(getattr(element, "children", []))
    if not allow_tags:
        return found
    # Nodes without a ``name`` (e.g. plain strings) never match.
    return [node for node in found if getattr(node, "name", None) in allow_tags]
def styles(element: Tag) -> dict:
    """
    Parse the ``style`` attribute of an element into a dictionary.

    The attribute is split into declarations on ``;`` and each declaration
    into property and value on its first ``:``, so values that contain a
    colon themselves (such as ``url(http://...)``) are kept intact. Property
    names and values are stripped of surrounding whitespace. Declarations
    without a colon are skipped.

    Args:
        element (Tag): The element to parse styles from.

    Returns:
        dict: Mapping of style property name to value; empty when the
        element has no ``style`` attribute.
    """
    parsed = {}
    raw_style = str(element.get("style", ""))
    for declaration in raw_style.split(";"):
        name, separator, value = declaration.partition(":")
        if not separator:
            # Malformed style info (or the empty tail after a final ``;``).
            continue
        parsed[name.strip()] = value.strip()
    return parsed
def css_classes(element: Tag) -> list[str]:
    """
    Return the CSS classes of an element as a list.

    Args:
        element (Tag): The element to get classes from.

    Returns:
        list[str]: The class names. Empty when the element has no ``class``
        attribute. A ``class`` value that is a plain string rather than a
        list is split on whitespace.
    """
    attr = element.get("class")
    if attr is None:
        # No class attribute: report no classes rather than ``["None"]``.
        return []
    if isinstance(attr, list):
        return attr
    return str(attr).split()
def is_inline(element: PageElement, include_span: bool = False) -> bool:
    """
    Tell whether an element is treated as inline content.

    Strings are always inline. A tag is inline when its name is in
    ``INLINE_ELEMENTS``, or when it is a ``span`` and ``include_span`` is
    set. Anything that is neither a string nor a tag is not inline.

    Args:
        element (PageElement): The element to check.
        include_span (bool, optional): Whether to treat ``span`` as inline.
            Defaults to ``False``.

    Returns:
        bool: ``True`` if inline, ``False`` otherwise.
    """
    if isinstance(element, NavigableString):
        return True
    if not isinstance(element, Tag):
        return False
    tag_name = element.name
    return (include_span and tag_name == "span") or tag_name in INLINE_ELEMENTS
def extract_rows_and_possible_blocks(
    table_element: Tag, tags_to_extract: list[str]
) -> tuple[list[tuple[Tag, bool]], list[Tag]]:
    """
    Pull selected tags out of a table, then list the rows that remain.

    Every descendant matching one of ``tags_to_extract`` is detached from the
    table first. The remaining ``<tr>`` elements are then collected, each
    paired with a flag telling whether its direct parent is a ``<thead>``.

    Args:
        table_element (Tag): The table element to process. It is modified in
            place.
        tags_to_extract (list[str]): Tag names to detach from the table.

    Returns:
        tuple[list[tuple[Tag, bool]], list[Tag]]: The ``(row, in_thead)``
        pairs, and the detached elements.
    """
    detached = []
    for tag_name in tags_to_extract:
        detached.extend(
            match.extract() for match in table_element.find_all(tag_name)
        )

    # Rows are gathered only after extraction, so rows that lived inside a
    # detached element are not reported.
    rows = [
        (row, row.parent.name == "thead")
        for row in table_element.find_all("tr")
        if isinstance(row.parent, Tag)
    ]
    return rows, detached
def table_cell_type(cell: Tag, is_header: bool = False) -> str:
    """
    Classify a table cell as ``header`` or ``data``.

    Only a ``td`` cell outside a header context is ``data``; everything else
    is ``header``.

    Args:
        cell (Tag): The table cell element.
        is_header (bool, optional): Whether the cell is known to be a header
            cell regardless of its tag. Defaults to ``False``.

    Returns:
        str: ``header`` or ``data``.
    """
    if not is_header and cell.name == "td":
        return "data"
    return "header"
def url_from_iframe(element: Tag) -> str:
    """
    Return the ``src`` URL of an iframe element.

    Args:
        element (Tag): The element to inspect.

    Returns:
        str: The ``src`` attribute as a string, or ``""`` when the element is
        not an ``iframe`` or has no ``src``.
    """
    if element.name != "iframe":
        return ""
    return str(element.get("src", ""))
def cleanse_url(url: str) -> str:
    """
    Clean up a URL by decoding escaped ampersands and re-serializing it.

    Only the ``&amp;`` entity is decoded. A general HTML unescape is
    deliberately not used: it would also rewrite query keys that merely look
    like legacy entities (for example ``&copy=``). The result is then passed
    through ``urllib.parse`` and rebuilt.

    Args:
        url (str): The URL to clean.

    Returns:
        str: The cleansed URL.
    """
    raw_url = url.replace("&amp;", "&")
    return parse.urlparse(raw_url).geturl()