Module `zep_python.document`

Expand source code

from zep_python.document.collections import DocumentCollection
from zep_python.document.models import Document

__all__ = [
    "Document",
    "DocumentCollection",
]

Sub-modules

zep_python.document.client
zep_python.document.collections
zep_python.document.models

Classes

class Document (**data: Any)

Represents a document base.

Attributes

uuid : Optional[str]: The unique identifier of the document.
created_at : Optional[datetime]: The timestamp of when the document was created.
updated_at : Optional[datetime]: The timestamp of when the document was last updated.
document_id : Optional[str]: The unique identifier of the document (name or some id).
content : str: The content of the document.
metadata : Optional[Dict[str, Any]]: Any additional metadata associated with the document.
is_embedded : Optional[bool]: Whether the document has an embedding.
embedding : Optional[List[float]]: The embedding of the document.
score : Optional[float]: The normed score of the search result. Available only when the document is returned as part of a query result.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Expand source code

class Document(BaseModel):
    """
    Represents a document base.

    Attributes
    ----------
    uuid : Optional[str]
        The unique identifier of the document.
    created_at : Optional[datetime]
        The timestamp of when the document was created.
    updated_at : Optional[datetime]
        The timestamp of when the document was last updated.
    document_id : Optional[str]
        The unique identifier of the document (name or some id).
        Validated to be at most 100 characters long.
    content : str
        The content of the document. Required; must contain at least
        one character.
    metadata : Optional[Dict[str, Any]]
        Any additional metadata associated with the document. Defaults
        to an empty dict when not supplied.
    is_embedded : Optional[bool]
        Whether the document has an embedding.
    embedding : Optional[List[float]]
        The embedding of the document.
    score : Optional[float]
        The normed score of the search result. Available only
        when the document is returned as part of a query result.
    """

    uuid: Optional[str] = None
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None
    # Optional identifier, capped at 100 characters by validation.
    document_id: Optional[str] = Field(default=None, max_length=100)
    # Required (no default); the empty string is rejected by validation.
    content: str = Field(..., min_length=1)
    # default_factory builds a new dict per instance instead of sharing one.
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
    is_embedded: Optional[bool] = None
    embedding: Optional[List[float]] = None
    score: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Returns a dictionary representation of the document.

        This is a thin wrapper that delegates to ``model_dump()``.

        Returns
        -------
        Dict[str, Any]
            A dictionary containing the attributes of the document.
        """
        return self.model_dump()

Ancestors

pydantic.main.BaseModel

Class variables

var content : str
var created_at : Optional[datetime.datetime]
var document_id : Optional[str]
var embedding : Optional[List[float]]
var is_embedded : Optional[bool]
var metadata : Optional[Dict[str, Any]]
var model_computed_fields
var model_config
var model_fields
var score : Optional[float]
var updated_at : Optional[datetime.datetime]
var uuid : Optional[str]

Methods

def to_dict(self) ‑> Dict[str, Any]

Returns a dictionary representation of the document.

Returns

Dict[str, Any]: A dictionary containing the attributes of the document.

Expand source code

def to_dict(self) -> Dict[str, Any]:
    """
    Serialize the document into a plain dictionary.

    Returns
    -------
    Dict[str, Any]
        The document's attributes, exactly as produced by ``model_dump()``.
    """
    as_dict = self.model_dump()
    return as_dict

class DocumentCollection (aclient: Optional[httpx.AsyncClient] = None, client: Optional[httpx.Client] = None, **kwargs: Any)

Represents a collection of documents.

Attributes

uuid : str: The unique identifier of the collection.
created_at : Optional[datetime]: The timestamp of when the collection was created.
updated_at : Optional[datetime]: The timestamp of when the collection was last updated.
name : str: The unique name of the collection.
description : Optional[str]: The description of the collection.
metadata : Optional[Dict[str, Any]]: Any additional metadata associated with the collection.
embedding_dimensions : int: The dimensions of the embedding model.
is_auto_embedded : bool: Flag to indicate whether the documents in the collection should be automatically embedded by Zep. (Default: True)
is_indexed : bool: Flag indicating whether an index has been created for this collection.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Expand source code

class DocumentCollection(DocumentCollectionModel):
    __doc__ = DocumentCollectionModel.__doc__ or ""

    _client: Optional[httpx.Client] = PrivateAttr(default=None)
    _aclient: Optional[httpx.AsyncClient] = PrivateAttr(default=None)

    def __init__(
        self,
        aclient: Optional[httpx.AsyncClient] = None,
        client: Optional[httpx.Client] = None,
        **kwargs: Any,
    ) -> None:
        """
        Build the collection model and attach the HTTP clients.

        Parameters
        ----------
        aclient : Optional[httpx.AsyncClient]
            Client used by the asynchronous (``a``-prefixed) methods.
        client : Optional[httpx.Client]
            Client used by the synchronous methods.
        **kwargs : Any
            Field values forwarded to the parent model's initializer.
        """
        # The model must be initialized before private attributes are set.
        super().__init__(**kwargs)
        self._client, self._aclient = client, aclient

    @property
    def status(self) -> str:
        """
        Report whether the collection is ready for search.

        Returns
        -------
        str
            `ready`: The collection holds documents and every one of them
            has been embedded.

            `pending`: The collection is empty or still has documents
            that are not embedded.
        """
        total = self.document_count
        if not total:
            # No documents (zero or unset count) is never "ready".
            return "pending"

        fully_embedded = self.document_embedded_count == total
        return "ready" if fully_embedded else "pending"

    async def aadd_documents(
        self,
        documents: List[Document],
        batch_size: int = DEFAULT_BATCH_SIZE,
    ) -> List[str]:
        """
        Asynchronously upload documents to this collection in batches.

        Parameters
        ----------
        documents : List[Document]
            A list of Document objects representing the documents to create.
        batch_size : int, optional
            The number of documents to upload in each batch. Defaults to
            ``DEFAULT_BATCH_SIZE``.

        Returns
        -------
        List[str]
            The UUIDs of the created documents, collected across all batches.

        Raises
        ------
        ValueError
            If no async client is attached to the collection, or if
            ``documents`` is None.
        APIError
            If the API response format is unexpected.
        """
        http = self._aclient
        if not http:
            raise ValueError(
                "Can only add documents once a collection has been created"
            )

        if documents is None:
            raise ValueError("document list must be provided")

        created: List[str] = []
        for chunk in generate_batches(documents, batch_size):
            endpoint = f"/collections/{urllib.parse.quote_plus(self.name)}/documents"
            reply = await http.post(endpoint, json=chunk)
            handle_response(reply)
            # Each batch response is the list of UUIDs created by that batch.
            created += reply.json()

        return created

    def add_documents(
        self,
        documents: List[Document],
        batch_size: int = DEFAULT_BATCH_SIZE,
    ) -> List[str]:
        """
        Upload documents to this collection in batches.

        Parameters
        ----------
        documents : List[Document]
            A list of Document objects representing the documents to create.
        batch_size : int, optional
            The number of documents to upload in each batch. Defaults to
            ``DEFAULT_BATCH_SIZE``.

        Returns
        -------
        List[str]
            The UUIDs of the created documents, collected across all batches.

        Raises
        ------
        ValueError
            If no client is attached to the collection, or if ``documents``
            is None.
        APIError
            If the API response format is unexpected.
        """
        http = self._client
        if not http:
            raise ValueError(
                "Can only add documents once a collection has been created"
            )

        if documents is None:
            raise ValueError("document list must be provided")

        created: List[str] = []
        for chunk in generate_batches(documents, batch_size):
            endpoint = f"/collections/{urllib.parse.quote_plus(self.name)}/documents"
            reply = http.post(endpoint, json=chunk)
            handle_response(reply)
            # Each batch response is the list of UUIDs created by that batch.
            created += reply.json()

        return created

    async def aupdate_document(
        self,
        uuid: str,
        document_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Asynchronously update document by UUID.

        At least one of ``document_id`` or ``metadata`` must be supplied.

        Parameters
        ----------
        uuid : str
            The UUID of the document to update.
        document_id : Optional[str]
            The document_id of the document.
        metadata : Optional[Dict[str, Any]]
            The metadata of the document.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If no async client is attached to the collection, if ``uuid``
            is None or blank, or if neither ``document_id`` nor
            ``metadata`` is provided.

        NotFoundError
            If the document is not found.

        APIError
            If the API response format is unexpected.
        """
        if not self._aclient:
            raise ValueError(
                "Can only update documents once a collection has been retrieved or"
                " created"
            )

        # Blank UUIDs are rejected as well, consistent with the delete/get
        # document methods; otherwise the request would be sent to a path
        # ending in ".../documents/uuid/".
        if uuid is None or uuid.strip() == "":
            raise ValueError("document uuid must be provided")

        if document_id is None and metadata is None:
            raise ValueError("document_id or metadata must be provided")

        payload = filter_dict({"document_id": document_id, "metadata": metadata})

        response = await self._aclient.patch(
            f"/collections/{urllib.parse.quote_plus(self.name)}/documents/uuid/{uuid}",
            json=payload,
        )

        handle_response(response)

    def update_document(
        self,
        uuid: str,
        document_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Update document by UUID.

        At least one of ``document_id`` or ``metadata`` must be supplied.

        Parameters
        ----------
        uuid : str
            The UUID of the document to update.
        document_id : Optional[str]
            The document_id of the document.
        metadata : Optional[Dict[str, Any]]
            The metadata of the document.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If no client is attached to the collection, if ``uuid`` is None
            or blank, or if neither ``document_id`` nor ``metadata`` is
            provided.

        NotFoundError
            If the document is not found.

        APIError
            If the API response format is unexpected.
        """
        if not self._client:
            raise ValueError(
                "Can only update documents once a collection has been retrieved or"
                " created"
            )

        # Blank UUIDs are rejected as well, consistent with the delete/get
        # document methods; otherwise the request would be sent to a path
        # ending in ".../documents/uuid/".
        if uuid is None or uuid.strip() == "":
            raise ValueError("document uuid must be provided")

        if document_id is None and metadata is None:
            raise ValueError("document_id or metadata must be provided")

        payload = filter_dict({"document_id": document_id, "metadata": metadata})

        response = self._client.patch(
            f"/collections/{urllib.parse.quote_plus(self.name)}/documents/uuid/{uuid}",
            json=payload,
        )

        handle_response(response)

    async def adelete_document(self, uuid: str) -> None:
        """
        Asynchronously remove a single document from the collection.

        Parameters
        ----------
        uuid: str
            The uuid of the document to be deleted.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If no async client is attached to the collection, or if
            ``uuid`` is None or blank.

        NotFoundError
            If the document is not found.

        APIError
            If the API response format is unexpected.
        """
        http = self._aclient
        if not http:
            raise ValueError(
                "Can only delete a document once a collection has been retrieved"
            )

        uuid_missing = uuid is None or uuid.strip() == ""
        if uuid_missing:
            raise ValueError("document uuid must be provided")

        collection = urllib.parse.quote_plus(self.name)
        reply = await http.delete(
            f"/collections/{collection}/documents/uuid/{uuid}",
        )
        handle_response(reply)

    def delete_document(self, uuid: str) -> None:
        """
        Remove a single document from the collection.

        Parameters
        ----------
        uuid: str
            The uuid of the document to be deleted.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If no client is attached to the collection, or if ``uuid`` is
            None or blank.

        NotFoundError
            If the document is not found.

        APIError
            If the API response format is unexpected.
        """
        http = self._client
        if not http:
            raise ValueError(
                "Can only delete a document once a collection has been retrieved"
            )

        uuid_missing = uuid is None or uuid.strip() == ""
        if uuid_missing:
            raise ValueError("document uuid must be provided")

        collection = urllib.parse.quote_plus(self.name)
        reply = http.delete(
            f"/collections/{collection}/documents/uuid/{uuid}",
        )
        handle_response(reply)

    async def aget_document(self, uuid: str) -> Document:
        """
        Asynchronously fetch a single document by its uuid.

        Parameters
        ----------
        uuid: str
            The uuid of the document to get.

        Returns
        -------
        Document
            The retrieved document, built from the JSON response body.

        Raises
        ------
        ValueError
            If no async client is attached to the collection, or if
            ``uuid`` is None or blank.

        NotFoundError
            If the document is not found.

        APIError
            If the API response format is unexpected.
        """
        http = self._aclient
        if not http:
            raise ValueError(
                "Can only get a document once a collection has been retrieved"
            )

        uuid_missing = uuid is None or uuid.strip() == ""
        if uuid_missing:
            raise ValueError("document uuid must be provided")

        collection = urllib.parse.quote_plus(self.name)
        reply = await http.get(
            f"/collections/{collection}/documents/uuid/{uuid}",
        )
        handle_response(reply)

        fields = reply.json()
        return Document(**fields)

    def get_document(self, uuid: str) -> Document:
        """
        Fetch a single document by its uuid.

        Parameters
        ----------
        uuid: str
            The uuid of the document to get.

        Returns
        -------
        Document
            The retrieved document, built from the JSON response body.

        Raises
        ------
        ValueError
            If no client is attached to the collection, or if ``uuid`` is
            None or blank.

        NotFoundError
            If the document is not found.

        APIError
            If the API response format is unexpected.
        """
        http = self._client
        if not http:
            raise ValueError(
                "Can only get a document once a collection has been retrieved"
            )

        uuid_missing = uuid is None or uuid.strip() == ""
        if uuid_missing:
            raise ValueError("document uuid must be provided")

        collection = urllib.parse.quote_plus(self.name)
        reply = http.get(
            f"/collections/{collection}/documents/uuid/{uuid}",
        )
        handle_response(reply)

        fields = reply.json()
        return Document(**fields)

    async def aget_documents(self, uuids: List[str]) -> List[Document]:
        """
        Asynchronously gets a list of documents.

        Emits a warning (``LARGE_BATCH_WARNING``) when more than
        ``LARGE_BATCH_WARNING_LIMIT`` uuids are requested in one call.

        Parameters
        ----------
        uuids: List[str]
            The list of document uuids to get. Must not be empty.

        Returns
        -------
        List[Document]
            The list of document objects.

        Raises
        ------
        ValueError
            If no async client is attached to the collection, or if
            ``uuids`` is None or empty.

        APIError
            If the API response format is unexpected.
        """
        if not self._aclient:
            raise ValueError(
                "Can only get documents once a collection has been retrieved"
            )

        # Truthiness already covers both None and the empty list.
        if not uuids:
            raise ValueError("document uuids must be provided")

        if len(uuids) > LARGE_BATCH_WARNING_LIMIT:
            # stacklevel=2 attributes the warning to the caller's line.
            warnings.warn(LARGE_BATCH_WARNING, stacklevel=2)

        response = await self._aclient.post(
            f"/collections/{urllib.parse.quote_plus(self.name)}/documents/list/get",
            json={"uuids": uuids},
        )

        handle_response(response)

        return [Document(**document) for document in response.json()]

    def get_documents(self, uuids: List[str]) -> List[Document]:
        """
        Gets a list of documents.

        Emits a warning (``LARGE_BATCH_WARNING``) when more than
        ``LARGE_BATCH_WARNING_LIMIT`` uuids are requested in one call.

        Parameters
        ----------
        uuids: List[str]
            The list of document uuids to get. Must not be empty.

        Returns
        -------
        List[Document]
            The list of document objects.

        Raises
        ------
        ValueError
            If no client is attached to the collection, or if ``uuids`` is
            None or empty.

        APIError
            If the API response format is unexpected.
        """
        if not self._client:
            raise ValueError(
                "Can only get documents once a collection has been retrieved"
            )

        # Truthiness already covers both None and the empty list.
        if not uuids:
            raise ValueError("document uuids must be provided")

        if len(uuids) > LARGE_BATCH_WARNING_LIMIT:
            # stacklevel=2 attributes the warning to the caller's line.
            warnings.warn(LARGE_BATCH_WARNING, stacklevel=2)

        response = self._client.post(
            f"/collections/{urllib.parse.quote_plus(self.name)}/documents/list/get",
            json={"uuids": uuids},
        )

        handle_response(response)

        return [Document(**document) for document in response.json()]

    async def asearch_return_query_vector(
        self,
        text: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        limit: Optional[int] = None,
        search_type: Optional[str] = None,
        mmr_lambda: Optional[float] = None,
    ) -> Tuple[List[Document], List[float]]:
        """
        Asynchronously search the collection and also return the query vector.

        One of text or metadata must be provided.

        Parameters
        ----------
        text : Optional[str], optional
            The search text.
        metadata : Optional[Dict[str, Any]], optional
            Document metadata to filter on.
        limit : Optional[int], optional
            Limit the number of returned documents. Only sent to the API
            when it is a positive number.
        search_type : Optional[str], optional
            The type of search to perform. Defaults to "similarity".
        mmr_lambda : Optional[float], optional
            The lambda parameter for the MMR Reranking Algorithm.

        Returns
        -------
        Tuple[List[Document], List[float]]
            The matching documents and the ``query_vector`` value from the
            response. Both are empty lists when the API answers with 404.

        Raises
        ------
        ValueError
            If no async client is attached to the collection, if neither
            ``text`` nor ``metadata`` is provided, or if ``text`` is not a
            string.
        APIError
            If the API response format is unexpected or there's an error from the API.
        """
        if not self._aclient:
            raise ValueError(
                "Can only search documents once a collection has been retrieved"
            )

        if text is None and metadata is None:
            raise ValueError("One of text or metadata must be provided.")

        if text is not None and not isinstance(text, str):
            raise ValueError("Text must be a string.")

        search_type_value = SearchType(search_type or "similarity")

        payload = DocumentSearchPayload(
            text=text,
            metadata=metadata,
            search_type=search_type_value,
            mmr_lambda=mmr_lambda,
        )

        url = f"/collections/{urllib.parse.quote_plus(self.name)}/search"
        # A missing or non-positive limit is omitted from the query string.
        params = {"limit": limit} if limit is not None and limit > 0 else {}

        response = await self._aclient.post(
            url,
            params=params,
            json=payload.model_dump(exclude_none=True, exclude_unset=True),
        )

        # If the collection is not found, return an empty list
        if response.status_code == 404:
            return [], []

        # Otherwise, handle the response for other errors
        handle_response(response)

        # Decode the body once rather than re-parsing it for each field.
        body = response.json()

        return (
            [Document(**document) for document in body["results"]],
            body["query_vector"],
        )

    async def asearch(
        self,
        text: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        limit: Optional[int] = None,
        search_type: Optional[str] = None,
        mmr_lambda: Optional[float] = None,
    ) -> List[Document]:
        """
        Asynchronously search the documents of this collection.

        One of text or metadata must be provided. This delegates to
        ``asearch_return_query_vector`` and discards the query vector.

        Returns an empty list if no documents are found.

        Parameters
        ----------
        text : Optional[str], optional
            The search text.
        metadata : Optional[Dict[str, Any]], optional
            Document metadata to filter on.
        limit : Optional[int], optional
            Limit the number of returned documents.
        search_type : Optional[str], optional
            The type of search to perform. Defaults to "similarity".
            Must be one of "similarity" or "mmr".
        mmr_lambda : Optional[float], optional
            The lambda parameter for the MMR Reranking Algorithm.

        Returns
        -------
        List[Document]
            The list of documents that match the search criteria.

        Raises
        ------
        APIError
            If the API response format is unexpected or there's an error from the API.
        """
        criteria = {
            "text": text,
            "metadata": metadata,
            "limit": limit,
            "search_type": search_type,
            "mmr_lambda": mmr_lambda,
        }
        matches, _query_vector = await self.asearch_return_query_vector(**criteria)
        return matches

    def search_return_query_vector(
        self,
        text: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        limit: Optional[int] = None,
        search_type: Optional[str] = None,
        mmr_lambda: Optional[float] = None,
    ) -> Tuple[List[Document], List[float]]:
        """
        Search the collection and also return the query vector.

        One of text or metadata must be provided.

        Parameters
        ----------
        text : Optional[str], optional
            The search text.
        metadata : Optional[Dict[str, Any]], optional
            Document metadata to filter on.
        limit : Optional[int], optional
            Limit the number of returned documents. Only sent to the API
            when it is a positive number.
        search_type : Optional[str], optional
            The type of search to perform. Defaults to "similarity".
        mmr_lambda : Optional[float], optional
            The lambda parameter for the MMR Reranking Algorithm.

        Returns
        -------
        Tuple[List[Document], List[float]]
            The matching documents and the ``query_vector`` value from the
            response. Both are empty lists when the API answers with 404.

        Raises
        ------
        ValueError
            If no client is attached to the collection, if neither ``text``
            nor ``metadata`` is provided, or if ``text`` is not a string.
        APIError
            If the API response format is unexpected or there's an error from the API.
        """
        if not self._client:
            raise ValueError(
                "Can only search documents once a collection has been retrieved"
            )

        # Previously written as the chained comparison "text is None is None",
        # which only worked by accident of comparison chaining.
        if text is None and metadata is None:
            raise ValueError("One of text or metadata must be provided.")

        if text is not None and not isinstance(text, str):
            raise ValueError("Text must be a string.")

        search_type_value = SearchType(search_type or "similarity")

        payload = DocumentSearchPayload(
            text=text,
            metadata=metadata,
            search_type=search_type_value,
            mmr_lambda=mmr_lambda,
        )

        url = f"/collections/{urllib.parse.quote_plus(self.name)}/search"
        # A missing or non-positive limit is omitted from the query string.
        params = {"limit": limit} if limit is not None and limit > 0 else {}

        response = self._client.post(
            url,
            params=params,
            json=payload.model_dump(exclude_none=True, exclude_unset=True),
        )

        # If the collection is not found, return an empty list
        if response.status_code == 404:
            return [], []

        # Otherwise, handle the response for other errors
        handle_response(response)

        # Decode the body once rather than re-parsing it for each field.
        body = response.json()

        return (
            [Document(**document) for document in body["results"]],
            body["query_vector"],
        )

    def search(
        self,
        text: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        limit: Optional[int] = None,
        search_type: Optional[str] = None,
        mmr_lambda: Optional[float] = None,
    ) -> List[Document]:
        """
        Search the documents of this collection.

        One of text or metadata must be provided. This delegates to
        ``search_return_query_vector`` and discards the query vector.

        Returns an empty list if no documents are found.

        Parameters
        ----------
        text : Optional[str], optional
            The search text.
        metadata : Optional[Dict[str, Any]], optional
            Document metadata to filter on.
        limit : Optional[int], optional
            Limit the number of returned documents.
        search_type : Optional[str], optional
            The type of search to perform. Defaults to "similarity".
            Must be one of "similarity" or "mmr".
        mmr_lambda : Optional[float], optional
            The lambda parameter for the MMR Reranking Algorithm.

        Returns
        -------
        List[Document]
            The list of documents that match the search criteria.

        Raises
        ------
        APIError
            If the API response format is unexpected or there's an error from the API.
        """
        criteria = {
            "text": text,
            "metadata": metadata,
            "limit": limit,
            "search_type": search_type,
            "mmr_lambda": mmr_lambda,
        }
        matches, _query_vector = self.search_return_query_vector(**criteria)
        return matches

Ancestors

DocumentCollectionModel
pydantic.main.BaseModel

Class variables

var model_computed_fields
var model_config
var model_fields

Instance variables

var status : str

Get the status of the collection.

Returns

str

The status of the collection.

ready: All documents have been embedded and the collection is ready for search.

pending: The collection is still processing.

Expand source code

@property
def status(self) -> str:
    """
    Report whether the collection is ready for search.

    Returns
    -------
    str
        `ready`: The collection holds documents and every one of them
        has been embedded.

        `pending`: The collection is empty or still has documents
        that are not embedded.
    """
    total = self.document_count
    if not total:
        # No documents (zero or unset count) is never "ready".
        return "pending"

    fully_embedded = self.document_embedded_count == total
    return "ready" if fully_embedded else "pending"

Methods

async def aadd_documents(self, documents: List[Document], batch_size: int = 1000) ‑> List[str]

Asynchronously create documents.

Parameters

documents : List[Document]: A list of Document objects representing the documents to create.
batch_size : int, optional: The number of documents to upload in each batch. Defaults to 1000.

Returns

List[str]: The UUIDs of the created documents.

Raises

APIError: If the API response format is unexpected.

Expand source code

async def aadd_documents(
    self,
    documents: List[Document],
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> List[str]:
    """
    Asynchronously upload documents to this collection in batches.

    Parameters
    ----------
    documents : List[Document]
        A list of Document objects representing the documents to create.
    batch_size : int, optional
        The number of documents to upload in each batch. Defaults to
        ``DEFAULT_BATCH_SIZE``.

    Returns
    -------
    List[str]
        The UUIDs of the created documents, collected across all batches.

    Raises
    ------
    ValueError
        If no async client is attached to the collection, or if
        ``documents`` is None.
    APIError
        If the API response format is unexpected.
    """
    http = self._aclient
    if not http:
        raise ValueError(
            "Can only add documents once a collection has been created"
        )

    if documents is None:
        raise ValueError("document list must be provided")

    created: List[str] = []
    for chunk in generate_batches(documents, batch_size):
        endpoint = f"/collections/{urllib.parse.quote_plus(self.name)}/documents"
        reply = await http.post(endpoint, json=chunk)
        handle_response(reply)
        # Each batch response is the list of UUIDs created by that batch.
        created += reply.json()

    return created

def add_documents(self, documents: List[Document], batch_size: int = 1000) ‑> List[str]

Create documents.

Parameters

documents : List[Document]: A list of Document objects representing the documents to create.
batch_size : int, optional: The number of documents to upload in each batch. Defaults to 1000.

Returns

List[str]: The UUIDs of the created documents.

Raises

APIError: If the API response format is unexpected.

Expand source code

def add_documents(
    self,
    documents: List[Document],
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> List[str]:
    """
    Upload documents to this collection in batches.

    Parameters
    ----------
    documents : List[Document]
        A list of Document objects representing the documents to create.
    batch_size : int, optional
        The number of documents to upload in each batch. Defaults to
        ``DEFAULT_BATCH_SIZE``.

    Returns
    -------
    List[str]
        The UUIDs of the created documents, collected across all batches.

    Raises
    ------
    ValueError
        If no client is attached to the collection, or if ``documents``
        is None.
    APIError
        If the API response format is unexpected.
    """
    http = self._client
    if not http:
        raise ValueError(
            "Can only add documents once a collection has been created"
        )

    if documents is None:
        raise ValueError("document list must be provided")

    created: List[str] = []
    for chunk in generate_batches(documents, batch_size):
        endpoint = f"/collections/{urllib.parse.quote_plus(self.name)}/documents"
        reply = http.post(endpoint, json=chunk)
        handle_response(reply)
        # Each batch response is the list of UUIDs created by that batch.
        created += reply.json()

    return created

async def adelete_document(self, uuid: str) ‑> None

Asynchronously delete document.

Parameters

uuid : str: The uuid of the document to be deleted.

Returns

None

Raises

NotFoundError: If the document is not found.
APIError: If the API response format is unexpected.

Expand source code

async def adelete_document(self, uuid: str) -> None:
    """
    Asynchronously remove a single document from the collection.

    Parameters
    ----------
    uuid: str
        The uuid of the document to be deleted.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no async client is attached to the collection, or if
        ``uuid`` is None or blank.

    NotFoundError
        If the document is not found.

    APIError
        If the API response format is unexpected.
    """
    http = self._aclient
    if not http:
        raise ValueError(
            "Can only delete a document once a collection has been retrieved"
        )

    uuid_missing = uuid is None or uuid.strip() == ""
    if uuid_missing:
        raise ValueError("document uuid must be provided")

    collection = urllib.parse.quote_plus(self.name)
    reply = await http.delete(
        f"/collections/{collection}/documents/uuid/{uuid}",
    )
    handle_response(reply)

async def aget_document(self, uuid: str) ‑> Document

Asynchronously gets a document.

Parameters

uuid : str: The uuid of the document to get.

Returns

Document: The retrieved document.

Raises

NotFoundError: If the document is not found.
APIError: If the API response format is unexpected.

Expand source code

async def aget_document(self, uuid: str) -> Document:
    """
    Asynchronously fetch a single document by its uuid.

    Parameters
    ----------
    uuid: str
        The uuid of the document to get.

    Returns
    -------
    Document
        The retrieved document, built from the JSON response body.

    Raises
    ------
    ValueError
        If no async client is attached to the collection, or if
        ``uuid`` is None or blank.

    NotFoundError
        If the document is not found.

    APIError
        If the API response format is unexpected.
    """
    http = self._aclient
    if not http:
        raise ValueError(
            "Can only get a document once a collection has been retrieved"
        )

    uuid_missing = uuid is None or uuid.strip() == ""
    if uuid_missing:
        raise ValueError("document uuid must be provided")

    collection = urllib.parse.quote_plus(self.name)
    reply = await http.get(
        f"/collections/{collection}/documents/uuid/{uuid}",
    )
    handle_response(reply)

    fields = reply.json()
    return Document(**fields)

async def aget_documents(self, uuids: List[str]) ‑> List[Document]

Asynchronously gets a list of documents.

Parameters

uuids : List[str]: The list of document uuids to get.

Returns

List[Document]: The list of document objects.

Raises

APIError: If the API response format is unexpected.

Expand source code

async def aget_documents(self, uuids: List[str]) -> List[Document]:
    """
    Asynchronously gets a list of documents.

    Emits a warning (``LARGE_BATCH_WARNING``) when more than
    ``LARGE_BATCH_WARNING_LIMIT`` uuids are requested in one call.

    Parameters
    ----------
    uuids: List[str]
        The list of document uuids to get. Must not be empty.

    Returns
    -------
    List[Document]
        The list of document objects.

    Raises
    ------
    ValueError
        If no async client is attached to the collection, or if
        ``uuids`` is None or empty.

    APIError
        If the API response format is unexpected.
    """
    if not self._aclient:
        raise ValueError(
            "Can only get documents once a collection has been retrieved"
        )

    # Truthiness already covers both None and the empty list.
    if not uuids:
        raise ValueError("document uuids must be provided")

    if len(uuids) > LARGE_BATCH_WARNING_LIMIT:
        # stacklevel=2 attributes the warning to the caller's line.
        warnings.warn(LARGE_BATCH_WARNING, stacklevel=2)

    response = await self._aclient.post(
        f"/collections/{urllib.parse.quote_plus(self.name)}/documents/list/get",
        json={"uuids": uuids},
    )

    handle_response(response)

    return [Document(**document) for document in response.json()]

async def asearch(self, text: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, limit: Optional[int] = None, search_type: Optional[str] = None, mmr_lambda: Optional[float] = None) ‑> List[Document]

Async search over documents in a collection based on provided search criteria. One of text or metadata must be provided.

Returns an empty list if no documents are found.

Parameters

text : Optional[str], optional: The search text.
metadata : Optional[Dict[str, Any]], optional: Document metadata to filter on.
limit : Optional[int], optional: Limit the number of returned documents.
search_type : Optional[str], optional: The type of search to perform. Defaults to "similarity". Must be one of "similarity" or "mmr".
mmr_lambda : Optional[float], optional: The lambda parameter for the MMR Reranking Algorithm.

Returns

List[Document]: The list of documents that match the search criteria.

Raises

APIError: If the API response format is unexpected or there's an error from the API.

Expand source code

async def asearch(
    self,
    text: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    limit: Optional[int] = None,
    search_type: Optional[str] = None,
    mmr_lambda: Optional[float] = None,
) -> List[Document]:
    """
    Asynchronously search the documents in this collection.

    One of text or metadata must be provided. The work is delegated to
    asearch_return_query_vector; the query vector it returns is discarded.

    Returns an empty list if no documents are found.

    Parameters
    ----------
    text : Optional[str], optional
        The search text.
    metadata : Optional[Dict[str, Any]], optional
        Document metadata to filter on.
    limit : Optional[int], optional
        Limit the number of returned documents.
    search_type : Optional[str], optional
        The type of search to perform. Defaults to "similarity".
        Must be one of "similarity" or "mmr".
    mmr_lambda : Optional[float], optional
        The lambda parameter for the MMR Reranking Algorithm.

    Returns
    -------
    List[Document]
        The documents that match the search criteria.

    Raises
    ------
    APIError
        If the API response format is unexpected or there's an error from the API.
    """
    documents, _query_vector = await self.asearch_return_query_vector(
        text=text,
        metadata=metadata,
        limit=limit,
        search_type=search_type,
        mmr_lambda=mmr_lambda,
    )
    return documents

async def asearch_return_query_vector(self, text: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, limit: Optional[int] = None, search_type: Optional[str] = None, mmr_lambda: Optional[float] = None) ‑> Tuple[List[Document], List[float]]

Expand source code

async def asearch_return_query_vector(
    self,
    text: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    limit: Optional[int] = None,
    search_type: Optional[str] = None,
    mmr_lambda: Optional[float] = None,
) -> Tuple[List[Document], List[float]]:
    """
    Asynchronously search the collection and also return the query vector.

    One of text or metadata must be provided.

    Parameters
    ----------
    text : Optional[str], optional
        The search text.
    metadata : Optional[Dict[str, Any]], optional
        Document metadata to filter on.
    limit : Optional[int], optional
        Limit the number of returned documents. Only sent to the API when
        it is a positive number.
    search_type : Optional[str], optional
        The type of search to perform. Defaults to "similarity".
    mmr_lambda : Optional[float], optional
        The lambda parameter for the MMR Reranking Algorithm.

    Returns
    -------
    Tuple[List[Document], List[float]]
        The matching documents and the "query_vector" value from the
        response. Both are empty lists when the API answers with a 404.

    Raises
    ------
    ValueError
        If the collection has no async client attached, if neither text
        nor metadata is given, or if text is not a string.
    APIError
        If the API response format is unexpected or there's an error from the API.
    """
    if not self._aclient:
        raise ValueError(
            "Can only search documents once a collection has been retrieved"
        )

    if text is None and metadata is None:
        raise ValueError("One of text or metadata must be provided.")

    if text is not None and not isinstance(text, str):
        raise ValueError("Text must be a string.")

    search_type_value = SearchType(search_type or "similarity")

    payload = DocumentSearchPayload(
        text=text,
        metadata=metadata,
        search_type=search_type_value,
        mmr_lambda=mmr_lambda,
    )

    url = f"/collections/{urllib.parse.quote_plus(self.name)}/search"
    params = {"limit": limit} if limit is not None and limit > 0 else {}

    response = await self._aclient.post(
        url,
        params=params,
        json=payload.model_dump(exclude_none=True, exclude_unset=True),
    )

    # If the collection is not found, return an empty list
    if response.status_code == 404:
        return [], []

    # Otherwise, handle the response for other errors
    handle_response(response)

    # Decode the body once; both halves of the result come from it.
    body = response.json()

    return (
        [Document(**document) for document in body["results"]],
        body["query_vector"],
    )

async def aupdate_document(self, uuid: str, document_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None) ‑> None

Asynchronously update document by UUID.

Parameters

uuid : str: The UUID of the document to update.
document_id : Optional[str]: The document_id of the document.
metadata : Optional[Dict[str, Any]]: The metadata of the document.

Returns

None

Raises

NotFoundError: If the document is not found.
APIError: If the API response format is unexpected.

Expand source code

async def aupdate_document(
    self,
    uuid: str,
    document_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Asynchronously update document by UUID.

    Parameters
    ----------
    uuid : str
        The UUID of the document to update. Must not be None or blank.
    document_id : Optional[str]
        The document_id of the document.
    metadata : Optional[Dict[str, Any]]
        The metadata of the document.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the collection has no async client attached, if uuid is None
        or blank, or if neither document_id nor metadata is given.

    NotFoundError
        If the document is not found.

    APIError
        If the API response format is unexpected.
    """
    if not self._aclient:
        raise ValueError(
            "Can only update documents once a collection has been retrieved or"
            " created"
        )

    # A blank uuid would produce a request to ".../documents/uuid/", so it is
    # rejected up front. str() keeps non-string uuid values working.
    if uuid is None or not str(uuid).strip():
        raise ValueError("document uuid must be provided")

    if document_id is None and metadata is None:
        raise ValueError("document_id or metadata must be provided")

    # NOTE(review): filter_dict presumably drops the None-valued entries so
    # only supplied fields are patched; confirm against the helper.
    payload = filter_dict({"document_id": document_id, "metadata": metadata})

    response = await self._aclient.patch(
        f"/collections/{urllib.parse.quote_plus(self.name)}/documents/uuid/{uuid}",
        json=payload,
    )

    handle_response(response)

def delete_document(self, uuid: str) ‑> None

Delete document.

Parameters

uuid : str: The uuid of the document to be deleted.

Returns

None

Raises

NotFoundError: If the document is not found.
APIError: If the API response format is unexpected.

Expand source code

def delete_document(self, uuid: str) -> None:
    """
    Delete a single document from this collection.

    Parameters
    ----------
    uuid: str
        The uuid of the document to be deleted. Must not be None or blank.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the collection has no client attached, or if uuid is None or
        blank.

    NotFoundError
        If the document is not found.

    APIError
        If the API response format is unexpected.
    """
    client = self._client
    if not client:
        raise ValueError(
            "Can only delete a document once a collection has been retrieved"
        )

    uuid_missing = uuid is None or uuid.strip() == ""
    if uuid_missing:
        raise ValueError("document uuid must be provided")

    collection_path = urllib.parse.quote_plus(self.name)
    response = client.delete(
        f"/collections/{collection_path}/documents/uuid/{uuid}",
    )

    handle_response(response)

def get_document(self, uuid: str) ‑> Document

Gets a document.

Parameters

uuid : str: The uuid of the document to get.

Returns

Document: The retrieved document.

Raises

NotFoundError: If the document is not found.
APIError: If the API response format is unexpected.

Expand source code

def get_document(self, uuid: str) -> Document:
    """
    Fetch a single document from this collection by uuid.

    Parameters
    ----------
    uuid: str
        The uuid of the document to get. Must not be None or blank.

    Returns
    -------
    Document
        The retrieved document, built from the API's JSON response.

    Raises
    ------
    ValueError
        If the collection has no client attached, or if uuid is None or
        blank.

    NotFoundError
        If the document is not found.

    APIError
        If the API response format is unexpected.
    """
    client = self._client
    if not client:
        raise ValueError(
            "Can only get a document once a collection has been retrieved"
        )

    uuid_missing = uuid is None or uuid.strip() == ""
    if uuid_missing:
        raise ValueError("document uuid must be provided")

    collection_path = urllib.parse.quote_plus(self.name)
    response = client.get(
        f"/collections/{collection_path}/documents/uuid/{uuid}",
    )

    handle_response(response)

    fields = response.json()
    return Document(**fields)

def get_documents(self, uuids: List[str]) ‑> List[Document]

Gets a list of documents.

Parameters

uuids : List[str]: The list of document uuids to get.

Returns

List[Document]: The list of document objects.

Raises

APIError: If the API response format is unexpected.

Expand source code

def get_documents(self, uuids: List[str]) -> List[Document]:
    """
    Fetch several documents from this collection by uuid.

    Parameters
    ----------
    uuids: List[str]
        The uuids of the documents to fetch. Must not be empty.

    Returns
    -------
    List[Document]
        One Document built from each entry of the API's JSON response.

    Raises
    ------
    ValueError
        If the collection has no client attached, or if no uuids were
        supplied.
    APIError
        If the API response format is unexpected.
    """
    if not self._client:
        raise ValueError(
            "Can only get documents once a collection has been retrieved"
        )

    if not uuids:
        raise ValueError("document uuids must be provided")

    # Oversized batches are still sent; the caller only gets a warning.
    if len(uuids) > LARGE_BATCH_WARNING_LIMIT:
        warnings.warn(LARGE_BATCH_WARNING, stacklevel=2)

    collection_path = urllib.parse.quote_plus(self.name)
    response = self._client.post(
        f"/collections/{collection_path}/documents/list/get",
        json={"uuids": uuids},
    )

    handle_response(response)

    raw_documents = response.json()
    return [Document(**fields) for fields in raw_documents]

def model_post_init(self: BaseModel, __context: Any) ‑> None

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args

self: The BaseModel instance.
__context: The context.

Expand source code

def init_private_attributes(self: BaseModel, __context: Any) -> None:
    """Populate the model's private-attribute storage if it is not set yet.

    Written to be used like a BaseModel method. The context argument is
    accepted because that's what pydantic-core passes when calling it; it is
    not read here.

    Args:
        self: The BaseModel instance.
        __context: The context.
    """
    # Leave an already-initialised instance untouched.
    if getattr(self, '__pydantic_private__', None) is not None:
        return

    defaults = {}
    for attr_name, attr in self.__private_attributes__.items():
        value = attr.get_default()
        # Attributes without a default are simply left out of the mapping.
        if value is not PydanticUndefined:
            defaults[attr_name] = value

    object_setattr(self, '__pydantic_private__', defaults)

def search(self, text: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, limit: Optional[int] = None, search_type: Optional[str] = None, mmr_lambda: Optional[float] = None) ‑> List[Document]

Searches over documents in a collection based on provided search criteria. One of text or metadata must be provided.

Returns an empty list if no documents are found.

Parameters

text : Optional[str], optional: The search text.
metadata : Optional[Dict[str, Any]], optional: Document metadata to filter on.
limit : Optional[int], optional: Limit the number of returned documents.
search_type : Optional[str], optional: The type of search to perform. Defaults to "similarity". Must be one of "similarity" or "mmr".
mmr_lambda : Optional[float], optional: The lambda parameter for the MMR Reranking Algorithm.

Returns

List[Document]: The list of documents that match the search criteria.

Raises

APIError: If the API response format is unexpected or there's an error from the API.

Expand source code

def search(
    self,
    text: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    limit: Optional[int] = None,
    search_type: Optional[str] = None,
    mmr_lambda: Optional[float] = None,
) -> List[Document]:
    """
    Search the documents in this collection.

    One of text or metadata must be provided. The work is delegated to
    search_return_query_vector; the query vector it returns is discarded.

    Returns an empty list if no documents are found.

    Parameters
    ----------
    text : Optional[str], optional
        The search text.
    metadata : Optional[Dict[str, Any]], optional
        Document metadata to filter on.
    limit : Optional[int], optional
        Limit the number of returned documents.
    search_type : Optional[str], optional
        The type of search to perform. Defaults to "similarity".
        Must be one of "similarity" or "mmr".
    mmr_lambda : Optional[float], optional
        The lambda parameter for the MMR Reranking Algorithm.

    Returns
    -------
    List[Document]
        The documents that match the search criteria.

    Raises
    ------
    APIError
        If the API response format is unexpected or there's an error from the API.
    """
    documents, _query_vector = self.search_return_query_vector(
        text=text,
        metadata=metadata,
        limit=limit,
        search_type=search_type,
        mmr_lambda=mmr_lambda,
    )
    return documents

def search_return_query_vector(self, text: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, limit: Optional[int] = None, search_type: Optional[str] = None, mmr_lambda: Optional[float] = None) ‑> Tuple[List[Document], List[float]]

Expand source code

def search_return_query_vector(
    self,
    text: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    limit: Optional[int] = None,
    search_type: Optional[str] = None,
    mmr_lambda: Optional[float] = None,
) -> Tuple[List[Document], List[float]]:
    """
    Search the collection and also return the query vector.

    One of text or metadata must be provided.

    Parameters
    ----------
    text : Optional[str], optional
        The search text.
    metadata : Optional[Dict[str, Any]], optional
        Document metadata to filter on.
    limit : Optional[int], optional
        Limit the number of returned documents. Only sent to the API when
        it is a positive number.
    search_type : Optional[str], optional
        The type of search to perform. Defaults to "similarity".
    mmr_lambda : Optional[float], optional
        The lambda parameter for the MMR Reranking Algorithm.

    Returns
    -------
    Tuple[List[Document], List[float]]
        The matching documents and the "query_vector" value from the
        response. Both are empty lists when the API answers with a 404.

    Raises
    ------
    ValueError
        If the collection has no client attached, if neither text nor
        metadata is given, or if text is not a string.
    APIError
        If the API response format is unexpected or there's an error from the API.
    """
    if not self._client:
        raise ValueError(
            "Can only search documents once a collection has been retrieved"
        )

    if text is None and metadata is None:
        raise ValueError("One of text or metadata must be provided.")

    if text is not None and not isinstance(text, str):
        raise ValueError("Text must be a string.")

    search_type_value = SearchType(search_type or "similarity")

    payload = DocumentSearchPayload(
        text=text,
        metadata=metadata,
        search_type=search_type_value,
        mmr_lambda=mmr_lambda,
    )

    url = f"/collections/{urllib.parse.quote_plus(self.name)}/search"
    params = {"limit": limit} if limit is not None and limit > 0 else {}

    response = self._client.post(
        url,
        params=params,
        json=payload.model_dump(exclude_none=True, exclude_unset=True),
    )

    # If the collection is not found, return an empty list
    if response.status_code == 404:
        return [], []

    # Otherwise, handle the response for other errors
    handle_response(response)

    # Decode the body once; both halves of the result come from it.
    body = response.json()

    return (
        [Document(**document) for document in body["results"]],
        body["query_vector"],
    )

def update_document(self, uuid: str, document_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None) ‑> None

Update document by UUID.

Parameters

uuid : str: The UUID of the document to update.
document_id : Optional[str]: The document_id of the document.
metadata : Optional[Dict[str, Any]]: The metadata of the document.

Returns

None

Raises

NotFoundError: If the document is not found.
APIError: If the API response format is unexpected.

Expand source code

def update_document(
    self,
    uuid: str,
    document_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Update document by UUID.

    Parameters
    ----------
    uuid : str
        The UUID of the document to update. Must not be None or blank.
    document_id : Optional[str]
        The document_id of the document.
    metadata : Optional[Dict[str, Any]]
        The metadata of the document.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the collection has no client attached, if uuid is None or
        blank, or if neither document_id nor metadata is given.

    NotFoundError
        If the document is not found.

    APIError
        If the API response format is unexpected.
    """
    if not self._client:
        raise ValueError(
            "Can only update documents once a collection has been retrieved or"
            " created"
        )

    # A blank uuid would produce a request to ".../documents/uuid/", so it is
    # rejected up front. str() keeps non-string uuid values working.
    if uuid is None or not str(uuid).strip():
        raise ValueError("document uuid must be provided")

    if document_id is None and metadata is None:
        raise ValueError("document_id or metadata must be provided")

    # NOTE(review): filter_dict presumably drops the None-valued entries so
    # only supplied fields are patched; confirm against the helper.
    payload = filter_dict({"document_id": document_id, "metadata": metadata})

    response = self._client.patch(
        f"/collections/{urllib.parse.quote_plus(self.name)}/documents/uuid/{uuid}",
        json=payload,
    )

    handle_response(response)

Inherited members

DocumentCollectionModel:
- to_dict