Skip to content

chunklet-py Documentation

html_2_md

chunklet.document_chunker.converters.html_2_md

Functions:

html_to_md –

Convert HTML content to Markdown, remove hrefs from links, and truncate long URLs.

html_to_md

html_to_md(
    file_path: str | Path = None,
    raw_text: str | None = None,
    max_url_length: int = 150,
) -> str

Convert HTML content to Markdown, remove hrefs from links, and truncate long URLs.

Parameters:

file_path
(str | Path, default: None ) –

Path to the html file.
raw_text
(str, default: None ) –

Raw HTML text. If both file_path and raw_text are provided, raw_text is used.
max_url_length
(int, default: 150 ) –

The maximum length of a URL. Defaults to 150.

Returns:

str ( str ) –

The full text content in Markdown.

Source code in src/chunklet/document_chunker/converters/html_2_md.py

def html_to_md(
    file_path: str | Path | None = None,
    raw_text: str | None = None,
    max_url_length: int = 150,
) -> str:
    """
    Convert HTML content to Markdown, collapse extra blank lines, and truncate long URLs.

    Args:
        file_path (str | Path, optional): Path to the html file. Read as UTF-8;
            bytes that cannot be decoded are dropped.
        raw_text (str, optional): Raw HTML text. If both file_path and raw_text are
            provided, raw_text is used. An empty string counts as "not provided".
        max_url_length (int): The maximum length of a URL inside a Markdown link or
            image. Longer URLs are cut and end with "..." so that the result is
            exactly this long. For limits below 3 there is no room for the ellipsis
            and the URL is simply cut to the limit. Defaults to 150.

    Returns:
        str: The full text content in Markdown.

    Raises:
        ImportError: If the converter `md` is unavailable (it is None).
        ValueError: If neither file_path nor raw_text is provided.
    """
    if md is None:
        raise ImportError(
            "The 'markdownify' library is not installed. "
            "Please install it with 'pip install markdownify' or install the document processing extras "
            "with 'pip install 'chunklet-py[document]''"
        )

    if raw_text:
        markdown_content = md(raw_text)
    elif file_path:
        # errors="ignore" is a deliberate best-effort read of badly encoded files.
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            markdown_content = md(f.read())
    else:
        raise ValueError("Either file_path or raw_text must be provided.")

    # Normalize consecutive newlines that are more than 2
    markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)

    ellipsis = "..."

    # Truncate long URLs in Markdown links or images
    def truncate_url(match: re.Match) -> str:
        prefix, url = match.group(1), match.group(2)
        if len(url) > max_url_length:
            if max_url_length >= len(ellipsis):
                url = url[: max_url_length - len(ellipsis)] + ellipsis
            else:
                # A limit smaller than the ellipsis would make the slice bound
                # negative (cutting from the wrong end and producing a result
                # longer than the limit), so fall back to a plain hard cut.
                url = url[: max(max_url_length, 0)]
        return f"{prefix}({url})"

    return re.sub(r"(!?\[[^\]]*\])\((.*?)\)", truncate_url, markdown_content)