Skip to content

chunklet.document_chunker.registry

Classes:

CustomProcessorRegistry

Methods:

  • clear

    Clears all registered processors from the registry.

  • extract_data

    Processes a file using a processor registered for the given file extension.

  • is_registered

    Check if a document processor is registered for the given file extension.

  • register

    Register a document processor callback for one or more file extensions.

  • unregister

    Remove document processor(s) from the registry.

Attributes:

  • processors

    Returns a shallow copy of the dictionary of registered processors.

processors property

processors

Returns a shallow copy of the dictionary of registered processors.

This prevents external modification of the internal registry state.

clear

clear() -> None

Clears all registered processors from the registry.

Source code in src/chunklet/document_chunker/registry.py
def clear(self) -> None:
    """
    Remove every registered processor, leaving the registry empty.

    The internal mapping is emptied in place; it is not replaced by a new object.
    """
    registry = self._processors
    registry.clear()

extract_data

extract_data(
    file_path: str, ext: str
) -> tuple[ReturnType, str]

Processes a file using a processor registered for the given file extension.

Parameters:

  • file_path

    (str) –

    The path to the file.

  • ext

    (str) –

    The file extension.

Returns:

  • tuple[ReturnType, str]

    A tuple containing the extracted data and the name of the processor used.

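Raises:

  • CallbackError

    If the processor callback fails or returns the wrong type.

  • InvalidInputError

    If no processor is registered for the extension.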

Examples:

>>> from chunklet.document_chunker.registry import CustomProcessorRegistry
>>> registry = CustomProcessorRegistry()
>>> @registry.register(".txt", name="my_txt_processor")
... def process_txt(file_path: str) -> tuple[str, dict]:
...     with open(file_path, 'r') as f:
...         content = f.read()
...     return content, {"source": file_path}
>>> # Assuming 'sample.txt' exists with some content
>>> # result, processor_name = registry.extract_data("sample.txt", ".txt")
>>> # print(f"Extracted by {processor_name}: {result[0][:20]}...")
Source code in src/chunklet/document_chunker/registry.py
@validate_input
def extract_data(self, file_path: str, ext: str) -> tuple[ReturnType, str]:
    """
    Processes a file using a processor registered for the given file extension.

    The processor registered under ``ext`` is looked up, called with ``file_path``,
    and its return value is then validated against ``ReturnType``.

    Args:
        file_path (str): The path to the file.
        ext (str): The file extension.

    Returns:
        tuple[ReturnType, str]: A tuple containing the extracted data and the name of the processor used.

    Raises:
        CallbackError: If the processor callback raises, or if its return value
            does not validate against ``ReturnType``.
        InvalidInputError: If no processor is registered for the extension.

    Examples:
        >>> from chunklet.document_chunker.registry import CustomProcessorRegistry
        >>> registry = CustomProcessorRegistry()
        >>> @registry.register(".txt", name="my_txt_processor")
        ... def process_txt(file_path: str) -> tuple[str, dict]:
        ...     with open(file_path, 'r') as f:
        ...         content = f.read()
        ...     return content, {"source": file_path}
        >>> # Assuming 'sample.txt' exists with some content
        >>> # result, processor_name = registry.extract_data("sample.txt", ".txt")
        >>> # print(f"Extracted by {processor_name}: {result[0][:20]}...")
    """
    processor_info = self._processors.get(ext)
    if not processor_info:
        # The hint mirrors the direct-call form of `register`, which takes the
        # callback positionally; it has no `callback=` keyword.
        raise InvalidInputError(
            f"No document processor registered for file extension '{ext}'.\n"
            f"💡Hint: Use `register(your_function, '{ext}')` first."
        )

    name, callback = processor_info

    # Run the callback in its own try block so that any exception it raises
    # (including a ValidationError coming from the callback's own internals)
    # is reported as a callback failure rather than as a wrong return type.
    try:
        result = callback(file_path)
    except Exception as e:
        raise CallbackError(
            f"Processor '{name}' for extension '{ext}' raised an exception.\nDetails: {e}"
        ) from None

    # Validate the return type
    try:
        validator = TypeAdapter(ReturnType)
        validator.validate_python(result)
    except ValidationError as e:
        e.subtitle = f"{name} result"
        e.hint = (
            "💡Hint: Make sure your processor returns a tuple of (text/texts, metadata_dict)."
            " An empty dict can be provided if there's no metadata."
        )

        raise CallbackError(f"{pretty_errors(e)}.\n") from None
    except Exception as e:
        # Any other failure while validating is still surfaced as a CallbackError,
        # matching the error type callers already handle.
        raise CallbackError(
            f"Processor '{name}' for extension '{ext}' raised an exception.\nDetails: {e}"
        ) from None

    return result, name

is_registered

is_registered(ext: str) -> bool

Check if a document processor is registered for the given file extension.

Source code in src/chunklet/document_chunker/registry.py
@validate_input
def is_registered(self, ext: str) -> bool:
    """
    Report whether the registry holds a processor for the given file extension.

    Args:
        ext (str): The file extension to look up.

    Returns:
        bool: True if ``ext`` is a key of the registry, False otherwise.
    """
    registry = self._processors
    return ext in registry

register

register(*args: Any, name: str | None = None)

Register a document processor callback for one or more file extensions.

This method can be used in two ways:

  1. As a decorator: @registry.register(".json", ".xml", name="my_processor") def my_processor(file_path): ...

  2. As a direct function call: registry.register(my_processor, ".json", ".xml", name="my_processor")

Parameters:

  • *args

    (Any, default: () ) –

    The arguments, which can be either (ext1, ext2, ...) for a decorator or (callback, ext1, ext2, ...) for a direct call.

  • name

    (str | None, default: None ) –

    The name of the processor. If None, attempts to use the callback's name.

Source code in src/chunklet/document_chunker/registry.py
def register(self, *args: Any, name: str | None = None):
    """
    Register a document processor callback for one or more file extensions.

    This method can be used in two ways:
    1. As a decorator:
        @registry.register(".json", ".xml", name="my_processor")
        def my_processor(file_path):
            ...

    2. As a direct function call:
        registry.register(my_processor, ".json", ".xml", name="my_processor")

    Args:
        *args: The arguments, which can be either (ext1, ext2, ...) for a decorator
               or (callback, ext1, ext2, ...) for a direct call.
        name (str | None): The name of the processor. If None, attempts to use the callback's name.
    """
    if not args:
        raise ValueError(
            "At least one file extension or a callback must be provided."
        )

    if callable(args[0]):
        # Direct call: register(callback, ext1, ext2, ...)
        callback = args[0]
        exts = args[1:]
        if not exts:
            raise ValueError(
                "At least one file extension must be provided for the callback."
            )
        self._register_logic(exts, callback, name)
        return callback
    else:
        # Decorator: @register(ext1, ext2, ...)
        exts = args

        def decorator(cb: Callable):
            self._register_logic(exts, cb, name)
            return cb

        return decorator

unregister

unregister(*exts: str) -> None

Remove document processor(s) from the registry.

Parameters:

  • *exts

    (str, default: () ) –

    File extensions to remove.

Source code in src/chunklet/document_chunker/registry.py
@validate_input
def unregister(self, *exts: str) -> None:
    """
    Drop the processors registered under the given file extensions.

    Extensions with no registered processor are skipped without raising.

    Args:
        *exts: File extensions whose processors should be removed.
    """
    registry = self._processors
    for extension in exts:
        # The default argument keeps pop() from raising KeyError on unknown extensions.
        registry.pop(extension, None)