Source code for ibm_watsonx_ai.foundation_models.ilab.documents

#  -----------------------------------------------------------------------------------------
#  (C) Copyright IBM Corp. 2025.
#  https://opensource.org/licenses/BSD-3-Clause
#  -----------------------------------------------------------------------------------------

from __future__ import annotations

from typing import TYPE_CHECKING
import logging

from ibm_watsonx_ai._wrappers import requests
from ibm_watsonx_ai.foundation_models.ilab.helper import wait_for_run_finish, BaseRuns
from ibm_watsonx_ai.helpers.connections import (
    DataConnection,
)
from ibm_watsonx_ai.wml_client_error import WMLClientError
from ibm_watsonx_ai.wml_resource import WMLResource

if TYPE_CHECKING:
    from ibm_watsonx_ai import APIClient


class DocumentExtraction:
    """Class of InstructLab document extraction.

    Represents a single document extraction run. The run identifier is kept
    in :attr:`id`; it is ``None`` until it is assigned by the code that
    starts or looks up the run.
    """

    # Identifier of the run; ``None`` means the run has not been started yet.
    id: str | None

    def __init__(self, name: str, api_client: APIClient):
        self.name = name
        self._client = api_client
        self._href_definitions = self._client.service_instance._href_definitions
        # A bare class-level annotation does not create the attribute, so set
        # it explicitly: the "not started" guards below must see ``None``
        # rather than fail with AttributeError.
        self.id = None

    def _require_run_id(self) -> str:
        """Return the run id, raising ``WMLClientError`` if it is not set."""
        if self.id is None:
            raise WMLClientError("Run in not started, operation cannot be performed.")
        return self.id

    def get_run_details(self) -> dict:
        """Get document extraction details

        :return: details of document extraction
        :rtype: dict

        :raises WMLClientError: if the run id has not been set
        """
        run_id = self._require_run_id()

        response = requests.get(
            url=self._href_definitions.get_document_extraction_href(run_id),
            params=self._client._params(),
            headers=self._client._get_headers(),
        )

        return self._client.repository._handle_response(
            200, "getting documents extraction details", response
        )

    def get_run_status(self) -> str:
        """Get document extraction status

        Reads ``entity.status.state`` from the run details.

        :return: status of document extraction
        :rtype: str

        :raises WMLClientError: if the run id has not been set
        """
        return self.get_run_details()["entity"].get("status", {}).get("state")

    def delete_run(self) -> str:
        """Delete document extraction run

        Sends a DELETE request for the run with the ``hard_delete=true``
        query parameter added to the client's default parameters.

        :return: result of handling the response (non-JSON handling is requested)
        :rtype: str

        :raises WMLClientError: if the run id has not been set
        """
        run_id = self._require_run_id()

        params = self._client._params()
        params["hard_delete"] = "true"

        response = requests.delete(
            url=self._href_definitions.get_document_extraction_href(run_id),
            params=params,
            headers=self._client._get_headers(),
        )

        return self._client.repository._handle_response(
            204, "deleting of document extraction", response, json_response=False
        )

    def cancel_run(self) -> str:
        """Cancel document extraction run

        Sends a DELETE request for the run using only the client's default
        query parameters (no ``hard_delete`` flag, unlike :meth:`delete_run`).

        :return: result of handling the response (non-JSON handling is requested)
        :rtype: str

        :raises WMLClientError: if the run id has not been set
        """
        run_id = self._require_run_id()

        response = requests.delete(
            url=self._href_definitions.get_document_extraction_href(run_id),
            params=self._client._params(),
            headers=self._client._get_headers(),
        )

        return self._client.repository._handle_response(
            204, "cancelation of documents extraction", response, json_response=False
        )
class DocumentExtractionsRuns(BaseRuns):
    """Class of InstructLab document extraction runs.

    Initialises the ``BaseRuns`` machinery with the document extractions
    endpoint taken from the client's href definitions.
    """

    def __init__(self, api_client: APIClient):
        href_definitions = api_client.service_instance._href_definitions
        runs_url = href_definitions.get_document_extractions_href()
        BaseRuns.__init__(self, api_client, runs_url)

    def get_document_extraction(
        self, document_extraction_id: str
    ) -> DocumentExtraction:
        """Get document extraction object

        The run details are fetched first; the run name is read from their
        ``metadata.name`` field and the given id is attached to the new object.

        :param document_extraction_id: id of document extraction object
        :type document_extraction_id: str

        :returns: document extraction object
        :rtype: DocumentExtraction
        """
        details = self.get_run_details(document_extraction_id)
        metadata = details.get("metadata", {})

        extraction = DocumentExtraction(metadata.get("name"), self._client)
        extraction.id = document_extraction_id
        return extraction
class DocumentExtractions(WMLResource):
    """Class of InstructLab document extraction module."""

    _logger = logging.getLogger(__name__)

    def __init__(self, ilab_tuner_name: str, api_client: APIClient):
        WMLResource.__init__(self, "document extractions", api_client)
        self.ilab_tuner_name = ilab_tuner_name
        self._client = api_client
        self._href_definitions = self._client.service_instance._href_definitions

    def extract(
        self,
        *,
        name: str | None = None,
        document_references: list[DataConnection],
        results_reference: DataConnection,
        background_mode: bool = False,
    ) -> DocumentExtraction:
        """Extract .md document from given .pdf document

        :param name: document extraction run name, defaults to
            ``"<ilab_tuner_name> - Documents Extraction"`` when not given or empty
        :type name: str, optional

        :param document_references: .pdf document location
        :type document_references: list[DataConnection]

        :param results_reference: .md file extraction location
        :type results_reference: DataConnection

        :param background_mode: indicator if the method will run in the background, async or sync
        :type background_mode: bool, optional

        :returns: document extraction run
        :rtype: DocumentExtraction
        """
        doc = DocumentExtraction(
            name or f"{self.ilab_tuner_name} - Documents Extraction",
            self._client,
        )

        payload = {
            "name": doc.name,
            "document_references": [
                doc_ref.to_dict() for doc_ref in document_references
            ],
            "results_reference": results_reference.to_dict(),
        }

        # The scope id is sent in the request body, so it is dropped from the
        # query parameters. A default is passed to ``pop`` so that a missing
        # key in the dict returned by ``_params()`` does not raise KeyError.
        params = self._client._params()
        if self._client.default_project_id:
            payload["project_id"] = self._client.default_project_id
            params.pop("project_id", None)
        elif self._client.default_space_id:
            payload["space_id"] = self._client.default_space_id
            params.pop("space_id", None)

        response = requests.post(
            url=self._href_definitions.get_document_extractions_href(),
            json=payload,
            params=params,
            headers=self._client._get_headers(),
        )
        res = self._handle_response(201, "running documents extraction", response)

        doc.id = res["metadata"]["id"]

        # Unless background mode was requested, hand the run over to
        # ``wait_for_run_finish`` before returning it.
        if not background_mode:
            wait_for_run_finish(
                asked_object=doc,
                res_name="Document extraction",
                logger=self._logger,
            )

        return doc

    def runs(self) -> DocumentExtractionsRuns:
        """Get the historical runs.

        :returns: runs object
        :rtype: DocumentExtractionsRuns
        """
        return DocumentExtractionsRuns(self._client)