Skip to content

Chunking API

LangChainChunker

LangChainChunker(
    method: Literal["recursive", "character", "token"] = "recursive",
    chunk_size: int = 2048,
    chunk_overlap: int = 256,
    **kwargs: Any
)

Bases: BaseChunker[Document]

Wrapper for LangChain TextSplitter.

Parameters:

  • method (Literal['recursive', 'character', 'token'], default: "recursive" ) –

    Selects which LangChain TextSplitter implementation performs the chunking.

  • chunk_size (int, default: 2048 ) –

    Maximum size of a single chunk that is returned.

  • chunk_overlap (int, default: 256 ) –

    Overlap in characters between chunks.

Other Parameters:

  • separators (list[str]) –

    List of separator strings used to split the text into chunks.

Source code in ai4rag/rag/chunking/langchain_chunker.py
def __init__(
    self,
    method: Literal["recursive", "character", "token"] = "recursive",
    chunk_size: int = 2048,
    chunk_overlap: int = 256,
    **kwargs: Any,
) -> None:
    """Configure the wrapper around a LangChain ``TextSplitter``.

    Parameters
    ----------
    method : Literal["recursive", "character", "token"], default "recursive"
        Which TextSplitter implementation performs the chunking.
    chunk_size : int, default 2048
        Maximum size of a single returned chunk.
    chunk_overlap : int, default 256
        Overlap in characters between consecutive chunks.
    **kwargs : Any
        May carry ``separators`` (list[str]); any other keys are
        silently ignored.
    """
    # Fallback separator priority used when the caller supplies none.
    default_separators = ["\n\n", r"(?<=\. )", "\n", " ", ""]

    self.method = method
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap
    self.separators = kwargs.pop("separators", default_separators)
    # Build the underlying splitter once, up front, from the settings above.
    self._text_splitter = self._get_text_splitter()

Functions

to_dict

to_dict() -> dict[str, Any]

Return dictionary that can be used to recreate an instance of the LangChainChunker.

Source code in ai4rag/rag/chunking/langchain_chunker.py
def to_dict(self) -> dict[str, Any]:
    """
    Return dictionary that can be used to recreate an instance of the LangChainChunker.

    Includes ``separators`` so that ``from_dict(chunker.to_dict())``
    reproduces the full configuration; ``__init__`` accepts it back
    through ``**kwargs``. Internal state (``_text_splitter``) is
    deliberately excluded, as it is rebuilt by the constructor.
    """
    # Whitelist of constructor-relevant attributes; everything else in
    # __dict__ is derived state and must not be serialized.
    params = (
        "method",
        "chunk_size",
        "chunk_overlap",
        "separators",
    )

    return {k: v for k, v in self.__dict__.items() if k in params}

from_dict classmethod

from_dict(d: dict[str, Any]) -> LangChainChunker

Create an instance from the dictionary.

Source code in ai4rag/rag/chunking/langchain_chunker.py
@classmethod
def from_dict(cls, d: dict[str, Any]) -> "LangChainChunker":
    """Build a chunker from a mapping of constructor keyword arguments.

    Inverse of ``to_dict``: every key/value pair in ``d`` is forwarded
    to ``__init__`` as a keyword argument.
    """
    kwargs = dict(d)  # defensive copy; caller's mapping stays untouched
    return cls(**kwargs)

split_documents

split_documents(documents: Sequence[Document]) -> list[Document]

Split series of documents into smaller chunks based on the provided chunker settings. Each chunk has metadata that includes the document_id, sequence_number, and start_index.

Parameters:

  • documents (Sequence[Document]) –

    Sequence of elements that contain context in a text format.

Returns:

  • list[Document]

    List of documents split into smaller chunks.

Source code in ai4rag/rag/chunking/langchain_chunker.py
def split_documents(self, documents: Sequence[Document]) -> list[Document]:
    """
    Split series of documents into smaller chunks based on the provided
    chunker settings. Each chunk has metadata that includes the document_id,
    sequence_number, and start_index.

    Parameters
    ----------
    documents : Sequence[Document]
        Sequence of elements that contain context in a text format.

    Returns
    -------
    list[Document]
        List of documents split into smaller chunks.
    """
    # Ensure every input document carries a document_id before splitting,
    # then chunk and stamp each chunk with its sequence number.
    self._set_document_id_in_metadata_if_missing(documents)
    return self._set_sequence_number_in_metadata(
        self._text_splitter.split_documents(documents)
    )