Source code for ibm_watsonx_ai.foundation_models.ilab.documents
# -----------------------------------------------------------------------------------------
# (C) Copyright IBM Corp. 2025.
# https://opensource.org/licenses/BSD-3-Clause
# -----------------------------------------------------------------------------------------
from __future__ import annotations
from typing import TYPE_CHECKING
import logging
from ibm_watsonx_ai._wrappers import requests
from ibm_watsonx_ai.foundation_models.ilab.helper import wait_for_run_finish, BaseRuns
from ibm_watsonx_ai.helpers.connections import (
DataConnection,
)
from ibm_watsonx_ai.wml_client_error import WMLClientError
from ibm_watsonx_ai.wml_resource import WMLResource
if TYPE_CHECKING:
from ibm_watsonx_ai import APIClient
[docs]
class DocumentExtraction:
    """Class of InstructLab document extraction.

    Represents a single document extraction run. The run is addressed by
    ``id``, which stays ``None`` until it is assigned from outside (the
    constructor never sets it).
    """

    # Default to None so the "run not started" guards below can actually
    # fire; with a bare annotation the attribute would not exist at all and
    # ``self.id is None`` would raise AttributeError instead.
    id: str | None = None

    def __init__(self, name: str, api_client: APIClient):
        self.name = name
        self._client = api_client
        self._href_definitions = self._client.service_instance._href_definitions

    def _require_run_id(self) -> str:
        """Return the run id, raising ``WMLClientError`` when no id is set."""
        if self.id is None:
            raise WMLClientError("Run in not started, operation cannot be performed.")
        return self.id

    def get_run_details(self) -> dict:
        """Get document extraction details

        :return: details of document extraction
        :rtype: dict

        :raises WMLClientError: if the run id is not set
        """
        run_id = self._require_run_id()

        response = requests.get(
            url=self._href_definitions.get_document_extraction_href(run_id),
            params=self._client._params(),
            headers=self._client._get_headers(),
        )

        return self._client.repository._handle_response(
            200, "getting documents extraction details", response
        )

    def get_run_status(self) -> str:
        """Get document extraction status

        :return: status of document extraction, read from
            ``details["entity"]["status"]["state"]``; ``None`` when the
            ``status`` or ``state`` key is missing
        :rtype: str

        :raises WMLClientError: if the run id is not set
        """
        return self.get_run_details()["entity"].get("status", {}).get("state")

    def delete_run(self) -> str:
        """Delete document extraction run

        Sends a DELETE request for the run with ``hard_delete=true`` added to
        the query parameters.

        :return: result of handling the 204 response (non-JSON)
        :rtype: str

        :raises WMLClientError: if the run id is not set
        """
        run_id = self._require_run_id()

        params = self._client._params()
        params["hard_delete"] = "true"

        response = requests.delete(
            url=self._href_definitions.get_document_extraction_href(run_id),
            params=params,
            headers=self._client._get_headers(),
        )

        return self._client.repository._handle_response(
            204, "deleting of document extraction", response, json_response=False
        )

    def cancel_run(self) -> str:
        """Cancel document extraction run

        Sends the same DELETE request as :meth:`delete_run`, but without the
        ``hard_delete`` query parameter.

        :return: result of handling the 204 response (non-JSON)
        :rtype: str

        :raises WMLClientError: if the run id is not set
        """
        run_id = self._require_run_id()

        response = requests.delete(
            url=self._href_definitions.get_document_extraction_href(run_id),
            params=self._client._params(),
            headers=self._client._get_headers(),
        )

        return self._client.repository._handle_response(
            204, "cancelation of documents extraction", response, json_response=False
        )
[docs]
class DocumentExtractionsRuns(BaseRuns):
    """Class of InstructLab document extraction runs.

    A ``BaseRuns`` specialisation initialised with the document extractions
    endpoint URL.
    """

    def __init__(self, api_client: APIClient):
        href_definitions = api_client.service_instance._href_definitions
        super().__init__(api_client, href_definitions.get_document_extractions_href())

    def get_document_extraction(
        self, document_extraction_id: str
    ) -> DocumentExtraction:
        """Get document extraction object

        :param document_extraction_id: id of document extraction object
        :type document_extraction_id: str

        :returns: document extraction object whose ``id`` is set to
            ``document_extraction_id``
        :rtype: DocumentExtraction
        """
        details = self.get_run_details(document_extraction_id)

        # The run name is read from the "metadata" section; it ends up as
        # None when either the section or the "name" key is absent.
        metadata = details.get("metadata", {})
        extraction = DocumentExtraction(metadata.get("name"), self._client)
        extraction.id = document_extraction_id
        return extraction
[docs]
class DocumentExtractions(WMLResource):
    """Class of InstructLab document extraction module.

    Starts new document extraction runs and exposes the historical ones.
    """

    _logger = logging.getLogger(__name__)

    def __init__(self, ilab_tuner_name: str, api_client: APIClient):
        WMLResource.__init__(self, "document extractions", api_client)
        self.ilab_tuner_name = ilab_tuner_name
        self._client = api_client
        self._href_definitions = self._client.service_instance._href_definitions

    def extract(
        self,
        *,
        name: str | None = None,
        document_references: list[DataConnection],
        results_reference: DataConnection,
        background_mode: bool = False,
    ) -> DocumentExtraction:
        """Extract .md document from given .pdf document

        :param name: document extraction run name, defaults to
            ``"<ilab_tuner_name> - Documents Extraction"`` when not given or empty
        :type name: str, optional

        :param document_references: .pdf document location
        :type document_references: list[DataConnection]

        :param results_reference: .md file extraction location
        :type results_reference: DataConnection

        :param background_mode: indicator if the method will run in the background, async or sync
        :type background_mode: bool, optional

        :returns: document extraction run
        :rtype: DocumentExtraction
        """
        doc = DocumentExtraction(
            name if name else f"{self.ilab_tuner_name} - Documents Extraction",
            self._client,
        )

        payload = {
            "name": doc.name,
            "document_references": [
                doc_ref.to_dict() for doc_ref in document_references
            ],
            "results_reference": results_reference.to_dict(),
        }

        # The scoping id is sent in the request body, so it is removed from
        # the query parameters. pop() gets a default so a params dict that
        # does not carry the key cannot abort the call with a KeyError.
        params = self._client._params()
        if self._client.default_project_id:
            payload["project_id"] = self._client.default_project_id
            params.pop("project_id", None)
        elif self._client.default_space_id:
            payload["space_id"] = self._client.default_space_id
            params.pop("space_id", None)

        response = requests.post(
            url=self._href_definitions.get_document_extractions_href(),
            json=payload,
            params=params,
            headers=self._client._get_headers(),
        )

        res = self._handle_response(201, "running documents extraction", response)
        # The id from the create response is what later status/delete/cancel
        # calls on the returned object use to address the run.
        doc.id = res["metadata"]["id"]

        if not background_mode:
            # Foreground mode: hand the run to wait_for_run_finish before
            # returning it to the caller.
            wait_for_run_finish(
                asked_object=doc,
                res_name="Document extraction",
                logger=self._logger,
            )

        return doc

    def runs(self) -> DocumentExtractionsRuns:
        """Get the historical runs.

        :returns: runs object
        :rtype: DocumentExtractionsRuns
        """
        return DocumentExtractionsRuns(self._client)