Skip to content

chunklet.code_chunker._code_structure_extractor

Code Structure Extractor

Internal module for extracting code structures from source code. Split from CodeChunker for modularity.

Classes:

CodeStructureExtractor

CodeStructureExtractor(verbose: bool = False)

Internal class for extracting structural units from source code.

Methods:

Source code in src/chunklet/code_chunker/_code_structure_extractor.py
@validate_input
def __init__(self, verbose: bool = False):
    """
    Initialize the extractor.

    Args:
        verbose (bool): Stored on the instance as ``self.verbose``.
            When true, ``extract_code_structure`` logs the number of
            extracted blocks. Defaults to False.
    """
    self.verbose = verbose

extract_code_structure

extract_code_structure(
    source: str | Path,
    include_comments: bool,
    docstring_mode: str,
) -> tuple[list[dict], tuple[int, ...]]

Preprocess and parse source into individual snippet boxes.

This function-first extraction identifies functions as primary units while implicitly handling other structures within the function context.

Parameters:

  • source

    (str | Path) –

    Raw code string or Path to source file.

  • include_comments

    (bool) –

    Whether to include comments in output.

  • docstring_mode

    (Literal["summary", "all", "excluded"]) –

    How to handle docstrings.

Returns:

  • tuple[list[dict], tuple[int, ...]]

    A tuple containing the list of extracted code structure boxes and the line lengths.

Source code in src/chunklet/code_chunker/_code_structure_extractor.py
def extract_code_structure(
    self,
    source: str | Path,
    include_comments: bool,
    docstring_mode: str,
) -> tuple[list[dict], tuple[int, ...]]:
    """
    Preprocess and parse source into individual snippet boxes.

    This function-first extraction identifies functions as primary units
    while implicitly handling other structures within the function context.

    The source is read, preprocessed, and then scanned one line at a time.
    Lines carrying a ``(-- TAG -->) `` marker are delegated to
    ``_handle_annotated_line``; every other line is accumulated into the
    current structure, which is flushed into a snippet whenever a
    non-blank line returns to (or below) the tracked indentation level.

    Args:
        source (str | Path): Raw code string or Path to source file.
        include_comments (bool): Whether to include comments in output.
        docstring_mode (Literal["summary", "all", "excluded"]): How to handle docstrings.

    Returns:
        tuple[list[dict], tuple[int, ...]]: A tuple containing the list of extracted code structure boxes and the line lengths.
        Returns ``([], ())`` when the source that was read is empty.
    """
    source_code = self._read_source(source)
    if not source_code:
        # Nothing to parse: return the empty equivalents of both results.
        return [], ()

    # The preprocessed text replaces the raw text for the rest of the scan.
    # NOTE(review): the name suggests cumulative (running-total) line
    # lengths, while the docstring only says "line lengths" -- confirm
    # against _preprocess.
    source_code, cumulative_lengths = self._preprocess(
        source_code, include_comments, docstring_mode
    )

    # Mutable scan state shared with the helper methods:
    #   curr_struct   -- CodeLine entries of the block being accumulated
    #   last_indent   -- indentation threshold used by the block-end test
    #   inside_func   -- flag reset here on block end (set elsewhere)
    #   snippet_dicts -- finished snippets, filled by _flush_snippet
    state = {
        "curr_struct": [],
        "last_indent": 0,
        "inside_func": False,
        "snippet_dicts": [],
    }
    # Side storage handed to the helpers; missing keys start as empty lists.
    buffer = defaultdict(list)

    # Line numbers are 1-based and refer to the preprocessed source.
    for line_no, line in enumerate(source_code.splitlines(), start=1):
        # Count of leading whitespace characters (a tab counts as one).
        indent_level = len(line) - len(line.lstrip())

        # Detect annotated lines: the literal marker "(-- TAG -->) " with an
        # upper-case TAG, anywhere in the line.
        # NOTE(review): presumably these markers are inserted by
        # _preprocess -- verify.
        matched = re.search(r"\(-- ([A-Z]+) -->\) ", line)
        if matched:
            self._handle_annotated_line(
                line=line,
                line_no=line_no,
                indent_level=indent_level,
                matched=matched,
                buffer=buffer,
                state=state,
            )
            # Annotated lines are never added to curr_struct in this loop.
            continue

        # Manage block accumulation

        # func_start is the text matched by FUNCTION_DECLARATION at the
        # start of the line, or None when the line is not a declaration.
        func_start = FUNCTION_DECLARATION.match(line)
        func_start = func_start.group(0) if func_start else None

        # Called for every non-annotated line, before the block-end test.
        # NOTE(review): this passes the original `source` argument (string
        # or Path), not the preprocessed `source_code` -- confirm intended.
        self._handle_block_start(
            line=line,
            indent_level=indent_level,
            buffer=buffer,
            state=state,
            source=source,
            func_start=func_start,
        )

        if not state["curr_struct"]:  # Fresh block
            state["curr_struct"] = [
                CodeLine(
                    line_no,
                    line,
                    indent_level,
                    func_start,
                )
            ]
            continue

        # A block ends on a non-blank line whose indentation is at or below
        # last_indent, unless the line matches OPENER or CLOSURE.
        if (
            line.strip()
            and indent_level <= state["last_indent"]
            and not (OPENER.match(line) or CLOSURE.match(line))
        ):  # Block end
            self._flush_snippet(
                state["curr_struct"], state["snippet_dicts"], buffer
            )
            state["last_indent"] = 0
            state["inside_func"] = False

        # The current line is appended even right after a flush.
        # NOTE(review): presumably _flush_snippet empties curr_struct in
        # place so this line starts the next block -- verify.
        state["curr_struct"].append(
            CodeLine(line_no, line, indent_level, func_start)
        )

    # Append last snippet
    if state["curr_struct"]:
        self._flush_snippet(state["curr_struct"], state["snippet_dicts"], buffer)

    snippet_dicts = self._post_processing(state["snippet_dicts"])
    if self.verbose:
        logger.info(
            "Extracted {} structural blocks from source", len(snippet_dicts)
        )

    return snippet_dicts, cumulative_lengths