Convert LaTeX code to Markdown-style plain text.
Parameters:
- file_path (str | Path) – Path to the LaTeX file.
Returns:
- str – The full text content in Markdown.
Source code in src/chunklet/document_chunker/converters/latex_2_md.py
def latex_to_md(file_path: str | Path) -> str:
    """
    Read a LaTeX file and render it as Markdown-flavoured plain text.

    The LaTeX source is flattened to text with ``LatexNodes2Text``; every
    "§" marker in that output (together with one optional trailing dot) is
    then rewritten to "#", and runs of consecutive newlines are collapsed
    to a single blank line.

    Args:
        file_path (str | Path): Location of the LaTeX file to convert.

    Returns:
        str: The converted document text in Markdown.

    Raises:
        ImportError: If ``LatexNodes2Text`` is ``None``, i.e. the optional
            'pylatexenc' dependency could not be loaded.
    """
    # Guard first: nothing below works without the optional dependency.
    if LatexNodes2Text is None:
        raise ImportError(
            "The 'pylatexenc' library is not installed. "
            "Please install it with 'pip install 'pylatexenc>=2.10'' or install the document processing extras "
            "with 'pip install 'chunklet-py[document]''"
        )

    # Undecodable bytes are dropped rather than raising (errors="ignore").
    with open(file_path, encoding="utf-8", errors="ignore") as source:
        raw_latex = source.read()

    # LaTeX -> plain text
    plain_text = LatexNodes2Text().latex_to_text(raw_latex)

    # Turn each "§" (plus an optional following ".") into a "#" so that the
    # section markers in the converted text become Markdown heading markers.
    section_marker = r"§\.?"
    with_headings = re.sub(section_marker, "#", plain_text)

    # Trim the edges, then squeeze any run of two or more newlines down to
    # exactly one blank line.
    newline_run = r"\n{2,}"
    return re.sub(newline_run, "\n\n", with_headings.strip())
|