Source code for collective.html2blocks.utils.markup

"""
HTML markup utilities for ``collective.html2blocks``.

This module provides functions for parsing and normalizing HTML markup:
grouping inline elements into paragraphs, filtering out comments and empty
nodes, simplifying redundant nesting, and extracting table, style, and URL
information.

Example:
    .. code-block:: python

        from collective.html2blocks.utils import markup
        soup = markup.parse_source('<p>Hello <b>world</b></p>')
        children = markup.all_children(soup)
"""

from .inline import ALLOW_EMPTY_ELEMENTS
from .inline import INLINE_ELEMENTS
from bs4 import BeautifulSoup
from bs4.element import Comment
from bs4.element import NavigableString
from bs4.element import PageElement
from collections.abc import Iterable
from collective.html2blocks._types import Tag
from urllib import parse


def _group_inline_elements(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Group inline elements into paragraphs in the soup.

    Args:
        soup (BeautifulSoup): The soup to process.

    Returns:
        BeautifulSoup: The modified soup with inline elements grouped.
    """
    wrapper = None
    children = list(soup.children)
    for element in children:
        if is_inline(element, include_span=True):
            # First inline node of a run: open a new <p> wrapper
            if not wrapper:
                wrapper = soup.new_tag("p")
                element.insert_before(wrapper)
            wrapper.append(element.extract())
        elif wrapper:
            # A block-level node ends the current run; discard wrappers
            # that collected only a newline
            if wrapper.text == "\n":
                wrapper.extract()
            wrapper = None
    return soup
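

# Illustrative sketch (not from the original module; assumes <div> is not in
# INLINE_ELEMENTS): grouping wraps a run of inline siblings into a synthetic
# <p>, so downstream block converters only ever see block-level children.
#
#   >>> soup = BeautifulSoup("Hello <b>world</b><div>block</div>", "html.parser")
#   >>> str(_group_inline_elements(soup))
#   '<p>Hello <b>world</b></p><div>block</div>'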


def _filter_children(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Filter out comments and empty elements from the soup.

    Args:
        soup (BeautifulSoup): The soup to process.

    Returns:
        BeautifulSoup: The filtered soup.
    """
    children = list(soup.children)
    for child in children:
        if isinstance(child, Comment) or (
            isinstance(child, NavigableString) and child.text == "\n"
        ):
            child.extract()
    children = list(soup.children)
    if (
        len(children) == 1
        and isinstance(children[0], Tag)
        and children[0].name == "div"
    ):
        # If there is only a wrapping div, return its children
        new_soup = BeautifulSoup("", features="html.parser")
        internal_ = list(children[0].children)
        for child in internal_:
            child = child.extract()
            new_soup.append(child)
        soup = _filter_children(new_soup)
    return soup
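

# Illustrative sketch (not from the original module): filtering drops comments
# and bare newlines, and unwraps a single top-level <div>.
#
#   >>> soup = BeautifulSoup("<!-- note --><div><p>one</p><p>two</p></div>", "html.parser")
#   >>> str(_filter_children(soup))
#   '<p>one</p><p>two</p>'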


def _normalize_html(
    soup: BeautifulSoup, block_level_tags: Iterable[str] = ()
) -> BeautifulSoup:
    """
    Normalize HTML by simplifying, removing empty tags, and wrapping paragraphs.

    Args:
        soup (BeautifulSoup): The soup to normalize.
        block_level_tags (Iterable[str], optional): Block-level tags to wrap.
                                                    Defaults to ``()``.

    Returns:
        BeautifulSoup: The normalized soup.
    """
    _recursively_simplify(soup)
    _remove_empty_tags(soup)
    _wrap_all_paragraphs(soup, block_level_tags)
    return soup


def _recursively_simplify(tag: Tag):
    """
    Recursively simplify nested tags with identical names and attributes.

    Args:
        tag (Tag): The tag to simplify.
    """
    for child in list(tag.children):
        if isinstance(child, Tag):
            _recursively_simplify(child)

    if len(tag.contents) == 1 and isinstance(tag.contents[0], Tag):
        child = tag.contents[0]
        if tag.name == child.name and tag.attrs == child.attrs:
            tag.replace_with(child)
            _recursively_simplify(child)
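

# Illustrative sketch (not from the original module): identical nested tags
# with identical attributes collapse into a single tag.
#
#   >>> soup = BeautifulSoup("<b><b>bold</b></b>", "html.parser")
#   >>> _recursively_simplify(soup)
#   >>> str(soup)
#   '<b>bold</b>'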


def is_empty(tag: Tag | NavigableString) -> bool:
    """
    Check if a tag or string is empty.

    A tag counts as empty when it is not an allowed empty element and has
    no contents, string, or attributes.

    Args:
        tag (Tag | NavigableString): The tag or string to check.

    Returns:
        bool: ``True`` if empty, ``False`` otherwise.
    """
    if isinstance(tag, NavigableString):
        return tag.strip() == ""
    return (
        tag.name not in ALLOW_EMPTY_ELEMENTS
        and not tag.contents
        and not tag.string
        and not tag.attrs
    )
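

# Illustrative sketch (not from the original module; assumes "p" is not
# listed in ALLOW_EMPTY_ELEMENTS):
#
#   >>> soup = BeautifulSoup("<p></p><p>text</p>", "html.parser")
#   >>> [is_empty(p) for p in soup.find_all("p")]
#   [True, False]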


def is_ignorable(el: PageElement) -> bool:
    """
    Check if an element is ignorable (empty string or allowed empty tag).

    Args:
        el (PageElement): The element to check.

    Returns:
        bool: ``True`` if ignorable, ``False`` otherwise.
    """
    return (isinstance(el, NavigableString) and not el.strip()) or (
        isinstance(el, Tag) and el.name in ALLOW_EMPTY_ELEMENTS
    )


def _remove_trailing_allowed_empty_recursive(tag: Tag):
    """
    Remove trailing allowed empty elements recursively from a tag.

    Args:
        tag (Tag): The tag to process.

    Returns:
        list: The contents after removal.
    """
    for child in tag.find_all(recursive=False):
        if isinstance(child, Tag):
            _remove_trailing_allowed_empty_recursive(child)
        elif isinstance(child, NavigableString) and child.strip() == "":
            child.extract()
    contents = list(tag.contents)
    while (
        contents
        and isinstance(contents[-1], Tag)
        and contents[-1].name in ALLOW_EMPTY_ELEMENTS
    ):
        contents[-1].decompose()
        contents = list(tag.contents)
    return contents


def _remove_empty_tags(soup: BeautifulSoup):
    """
    Remove all empty tags from the soup, except allowed empty elements.

    Args:
        soup (BeautifulSoup): The soup to process.
    """
    # Remove all empty tags (excluding allowed empty elements)
    for element in list(soup.find_all()):
        if isinstance(element, Tag | NavigableString) and is_empty(element):
            element.decompose()
    # Clean up paragraphs
    for p in list(soup.find_all("p")):
        if not isinstance(p, Tag):
            continue
        contents: list[PageElement] = list(p.contents)
        # Remove ignorable leading content
        while contents and is_ignorable(contents[0]):
            contents[0].extract()
            contents = list(p.contents)
        contents = _remove_trailing_allowed_empty_recursive(p)
        # Remove paragraph if now empty
        if not any(c for c in contents if not is_ignorable(c)):
            p.decompose()


def _wrap_all_paragraphs(soup: BeautifulSoup, block_level_tags: Iterable[str]):
    """
    Wrap all paragraphs in the soup, splitting as needed by block-level tags.

    Args:
        soup (BeautifulSoup): The soup to process.
        block_level_tags (Iterable[str]): Block-level tags to split by.
    """
    for p_tag in list(soup.find_all("p")):
        if not isinstance(p_tag, Tag):
            continue
        new_elements = _split_paragraph(p_tag, block_level_tags)
        if new_elements:
            p_tag.insert_after(*new_elements)
            p_tag.decompose()


def _get_root_soup(tag: Tag) -> BeautifulSoup:
    """
    Get the root BeautifulSoup object for a tag.

    Args:
        tag (Tag): The tag to find the root for.

    Returns:
        BeautifulSoup: The root soup object.
    """
    parent = tag
    while parent is not None and not isinstance(parent, BeautifulSoup):
        parent = parent.parent
    if parent is None:
        raise ValueError("Could not find root BeautifulSoup object")
    return parent


def _split_paragraph(p_tag: Tag, block_level_tags: Iterable[str]) -> list[Tag]:
    """
    Split a paragraph tag into multiple paragraphs by block-level tags.

    Args:
        p_tag (Tag): The paragraph tag to split.
        block_level_tags (Iterable[str]): Block-level tags to split by.

    Returns:
        list[Tag]: List of new paragraph tags.
    """
    soup = _get_root_soup(p_tag)
    new_elements: list[Tag] = []
    buffer: list[Tag] = []

    def flush_buffer():
        if buffer:
            p = soup.new_tag("p")
            for item in buffer:
                p.append(item)
            new_elements.append(p)
            buffer.clear()

    for child in list(p_tag.contents):
        if isinstance(child, Tag) and child.name in block_level_tags:
            if child.name == "img" and not child.get("src"):
                continue
            flush_buffer()
            p = soup.new_tag("p")
            p.append(child)
            new_elements.append(p)
        else:
            buffer.append(child)
    flush_buffer()
    return new_elements
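

# Illustrative sketch (not from the original module): splitting a paragraph
# on a block-level tag yields one <p> per segment.
#
#   >>> soup = BeautifulSoup('<p>before<img src="a.png"/>after</p>', "html.parser")
#   >>> [str(p) for p in _split_paragraph(soup.p, ("img",))]
#   ['<p>before</p>', '<p><img src="a.png"/></p>', '<p>after</p>']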


def parse_source(
    source: str,
    filter_: bool = True,
    group: bool = True,
    normalize: bool = True,
    block_level_tags: Iterable[str] = (),
) -> Tag:
    """
    Parse HTML source and return a normalized soup object.

    Args:
        source (str): The HTML source to parse.
        filter_ (bool, optional): Whether to filter children. Defaults to ``True``.
        group (bool, optional): Whether to group inline elements. Defaults to ``True``.
        normalize (bool, optional): Whether to normalize HTML. Defaults to ``True``.
        block_level_tags (Iterable[str], optional): Block-level tags. Defaults to ``()``.

    Returns:
        Tag: The parsed and normalized soup object.

    Example:
        .. code-block:: python

            soup = parse_source("<p>Hello <b>world</b></p>")
    """  # noqa: E501
    # Remove leading and trailing whitespace, including linebreaks
    source = source.strip()
    soup = BeautifulSoup(source, features="html.parser")
    if normalize:
        soup = _normalize_html(soup, block_level_tags)
    if filter_:
        soup = _filter_children(soup)
    if group:
        soup = _group_inline_elements(soup)
    return soup
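

# Illustrative sketch (not from the original module; assumes "p" is not in
# INLINE_ELEMENTS): a single wrapping <div> is unwrapped during filtering.
#
#   >>> str(parse_source("<div><p>Hello <b>world</b></p></div>"))
#   '<p>Hello <b>world</b></p>'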


def all_children(
    element: PageElement | Tag, allow_tags: list[str] | None = None
) -> list[PageElement]:
    """
    Return a list of all children of an element, optionally filtered by tag names.

    Args:
        element (PageElement | Tag): The element to get children from.
        allow_tags (list[str], optional): List of tag names to include.
                                          Defaults to ``None``.

    Returns:
        list[PageElement]: List of child elements.
    """
    raw_children: list[PageElement] = list(getattr(element, "children", []))
    if allow_tags:
        children = [
            child
            for child in raw_children
            if getattr(child, "name", None) in allow_tags
        ]
    else:
        children = raw_children
    return children
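

# Illustrative sketch (not from the original module): non-matching children,
# including bare strings, are filtered out when allow_tags is given.
#
#   >>> soup = BeautifulSoup("<ul><li>one</li>text<li>two</li></ul>", "html.parser")
#   >>> len(all_children(soup.ul, allow_tags=["li"]))
#   2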


def styles(element: Tag) -> dict:
    """
    Parse style attributes from an element into a dictionary.

    Args:
        element (Tag): The element to parse styles from.

    Returns:
        dict: Dictionary of style properties.
    """
    styles = {}
    raw_styles = str(element.get("style", "")).split(";")
    for raw_item in raw_styles:
        item = [i.strip() for i in raw_item.split(":")]
        if len(item) != 2:
            # Malformed style info
            continue
        styles[item[0]] = item[1]
    return styles
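

# Illustrative sketch (not from the original module): style declarations
# parse into a plain dict; malformed entries are skipped.
#
#   >>> soup = BeautifulSoup('<p style="color: red; font-weight:bold">x</p>', "html.parser")
#   >>> styles(soup.p)
#   {'color': 'red', 'font-weight': 'bold'}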


def css_classes(element: Tag) -> list[str]:
    """
    Return a list of CSS classes from an element.

    Args:
        element (Tag): The element to get classes from.

    Returns:
        list[str]: List of CSS class names.
    """
    attr = element.get("class")
    if attr is None:
        # No class attribute at all: avoid returning ["None"]
        return []
    return attr if isinstance(attr, list) else [str(attr)]


def is_inline(element: PageElement, include_span: bool = False) -> bool:
    """
    Check if an element is considered inline.

    Args:
        element (PageElement): The element to check.
        include_span (bool, optional): Whether to treat span as inline.
                                       Defaults to ``False``.

    Returns:
        bool: ``True`` if inline, ``False`` otherwise.
    """
    if isinstance(element, NavigableString):
        return True
    if not isinstance(element, Tag):
        return False
    if include_span and element.name == "span":
        return True
    return element.name in INLINE_ELEMENTS


def extract_rows_and_possible_blocks(
    table_element: Tag, tags_to_extract: list[str]
) -> tuple[list[tuple[Tag, bool]], list[Tag]]:
    """
    Extract rows and possible blocks from a table element.

    Args:
        table_element (Tag): The table element to process.
        tags_to_extract (list[str]): List of tag names to extract.

    Returns:
        tuple[list[tuple[Tag, bool]], list[Tag]]: Rows and extracted blocks.
    """
    unbound_elements = []
    for tag_name in tags_to_extract:
        for match in table_element.find_all(tag_name):
            unbound_elements.append(match.extract())
    rows = []
    for el in table_element.find_all("tr"):
        parent = el.parent
        if isinstance(parent, Tag):
            rows.append((el, parent.name == "thead"))
    return rows, unbound_elements
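

# Illustrative sketch (not from the original module): rows are flagged as
# header rows when their parent is a <thead>.
#
#   >>> table = BeautifulSoup(
#   ...     "<table><thead><tr><th>h</th></tr></thead>"
#   ...     "<tbody><tr><td>d</td></tr></tbody></table>",
#   ...     "html.parser",
#   ... ).table
#   >>> rows, unbound = extract_rows_and_possible_blocks(table, [])
#   >>> [is_header for _row, is_header in rows]
#   [True, False]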


def table_cell_type(cell: Tag, is_header: bool = False) -> str:
    """
    Get the type of a table cell (``header`` or ``data``).

    Args:
        cell (Tag): The table cell element.
        is_header (bool, optional): Whether the cell is a header.
                                    Defaults to ``False``.

    Returns:
        str: ``header`` or ``data``.
    """
    if is_header:
        return "header"
    return "data" if cell.name == "td" else "header"


def extract_plaintext(element: Tag) -> str:
    """
    Extract plaintext from an element, handling lists specially.

    Args:
        element (Tag): The element to extract text from.

    Returns:
        str: The extracted plaintext.
    """
    plaintext = element.text.strip()
    tag_name = element.name
    if tag_name in ("ol", "ul"):
        plaintext = " ".join([c.text.strip() for c in element.children])
    return plaintext
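

# Illustrative sketch (not from the original module): list items are joined
# with spaces instead of being concatenated.
#
#   >>> soup = BeautifulSoup("<ul><li>one</li><li>two</li></ul>", "html.parser")
#   >>> extract_plaintext(soup.ul)
#   'one two'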


def url_from_iframe(element: Tag) -> str:
    """
    Parse an iframe element and return its ``src`` URL.

    Args:
        element (Tag): The iframe element.

    Returns:
        str: The ``src`` URL of the iframe.
    """
    src = ""
    if element.name == "iframe":
        src = element.get("src", "")
    return str(src)


def cleanse_url(url: str) -> str:
    """
    Clean up a URL by decoding HTML entities and normalizing.

    Args:
        url (str): The URL to clean.

    Returns:
        str: The cleansed URL.
    """
    raw_url = url.replace("&amp;", "&")
    parsed = parse.urlparse(raw_url)
    return parsed.geturl()
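

# Illustrative sketch (not from the original module): entity-encoded
# ampersands are decoded and the URL is re-serialized.
#
#   >>> cleanse_url("https://example.com/watch?v=1&amp;t=2")
#   'https://example.com/watch?v=1&t=2'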