Source code for ibm_watsonx_ai.foundation_models.extractions.text_extractions
# -----------------------------------------------------------------------------------------
# (C) Copyright IBM Corp. 2024.
# https://opensource.org/licenses/BSD-3-Clause
# -----------------------------------------------------------------------------------------
from __future__ import annotations
from typing import TYPE_CHECKING, Literal
from ibm_watsonx_ai.wml_client_error import (
InvalidMultipleArguments,
WMLClientError,
UnexpectedType,
InvalidValue,
)
from ibm_watsonx_ai.wml_resource import WMLResource
from ibm_watsonx_ai._wrappers import requests
from ibm_watsonx_ai.helpers import DataConnection
if TYPE_CHECKING:
from ibm_watsonx_ai import APIClient, Credentials
import pandas
[docs]
class TextExtractions(WMLResource):
    """Instantiate the Text Extraction service.

    :param credentials: credentials to the Watson Machine Learning instance
    :type credentials: Credentials, optional

    :param project_id: ID of the Watson Studio project, defaults to None
    :type project_id: str, optional

    :param space_id: ID of the Watson Studio space, defaults to None
    :type space_id: str, optional

    :param api_client: initialized APIClient object with a set project ID or space ID. If passed, ``credentials`` and ``project_id``/``space_id`` are not required, defaults to None
    :type api_client: APIClient, optional

    :raises InvalidMultipleArguments: raised if neither ``credentials`` nor ``api_client`` is provided, or if the client is created from ``credentials`` and neither ``space_id`` nor ``project_id`` is provided
    :raises WMLClientError: raised if ``CLOUD_PLATFORM_SPACES`` is not set on the client and the CPD version is less than 5.0

    .. note::
        If both ``credentials`` and ``api_client`` are passed, a new client is created
        from ``credentials`` and ``api_client`` is ignored. If both ``space_id`` and
        ``project_id`` are passed, ``space_id`` is used.

    .. code-block:: python

        from ibm_watsonx_ai import Credentials
        from ibm_watsonx_ai.foundation_models.extractions import TextExtractions

        extraction = TextExtractions(
            credentials=Credentials(
                api_key="***",
                url="https://us-south.ml.cloud.ibm.com",
            ),
            project_id="*****",
        )

    """

    def __init__(
        self,
        credentials: Credentials | None = None,
        project_id: str | None = None,
        space_id: str | None = None,
        api_client: APIClient | None = None,
    ) -> None:
        if credentials is not None:
            # Runtime import: at module level APIClient is only imported
            # under TYPE_CHECKING.
            from ibm_watsonx_ai import APIClient

            self._client = APIClient(credentials)
        elif api_client is not None:
            self._client = api_client
        else:
            raise InvalidMultipleArguments(
                params_names_list=["credentials", "api_client"],
                reason="None of the arguments were provided.",
            )

        if space_id is not None:
            self._client.set.default_space(space_id)
        elif project_id is not None:
            self._client.set.default_project(project_id)
        elif credentials is not None:
            # The client in use was created from ``credentials`` in this
            # constructor, so no project or space has been set on it here.
            # Checking ``credentials`` (rather than ``api_client``) also covers
            # the case where both were passed and ``api_client`` was ignored.
            raise InvalidMultipleArguments(
                params_names_list=["space_id", "project_id"],
                reason="None of the arguments were provided.",
            )

        if not self._client.CLOUD_PLATFORM_SPACES and self._client.CPD_version < 5.0:
            raise WMLClientError(error_msg="Operation is unsupported for this release.")

        WMLResource.__init__(self, __name__, self._client)

    def run_job(
        self,
        document_reference: DataConnection,
        results_reference: DataConnection,
        steps: dict | None = None,
        results_format: Literal["json", "markdown"] = "json",
    ) -> dict:
        """Start a request to extract text and metadata from a document.

        :param document_reference: reference to the document in the bucket from which text will be extracted
        :type document_reference: DataConnection

        :param results_reference: reference to the location in the bucket where results will be saved
        :type results_reference: DataConnection

        :param steps: steps for the text extraction pipeline, defaults to None
        :type steps: dict | None, optional

        :param results_format: results format for the text extraction, defaults to "json"
        :type results_format: Literal["json", "markdown"], optional

        :raises UnexpectedType: raised if ``document_reference`` or ``results_reference`` is not a ``DataConnection``
        :raises ValueError: raised if ``results_format`` is neither "json" nor "markdown"

        :return: raw response from the server with the text extraction job details
        :rtype: dict

        **Example:**

        .. code-block:: python

            from ibm_watsonx_ai.metanames import TextExtractionsMetaNames
            from ibm_watsonx_ai.helpers import DataConnection, S3Location

            document_reference = DataConnection(
                connection_asset_id="<connection_id>",
                location=S3Location(bucket="<bucket_name>", path="path/to/file"),
            )

            results_reference = DataConnection(
                connection_asset_id="<connection_id>",
                location=S3Location(bucket="<bucket_name>", path="path/to/file"),
            )

            response = extraction.run_job(
                document_reference=document_reference,
                results_reference=results_reference,
                steps={
                    TextExtractionsMetaNames.OCR: {
                        "process_image": True,
                        "languages_list": ["en", "fr"],
                    },
                    TextExtractionsMetaNames.TABLE_PROCESSING: {"enabled": True},
                },
                results_format="markdown",
            )

        """
        # Both references must be DataConnection objects; the document
        # reference is checked first.
        for el_name, reference in (
            ("document_reference", document_reference),
            ("results_reference", results_reference),
        ):
            if not isinstance(reference, DataConnection):
                raise UnexpectedType(
                    el_name=el_name,
                    expected_type=DataConnection,
                    actual_type=type(reference),
                )

        TextExtractions._validate_type(steps, "steps", dict, False)

        payload: dict = {}

        # The job is scoped to the client's default project if one is set,
        # otherwise to its default space.
        if self._client.default_project_id is not None:
            payload["project_id"] = self._client.default_project_id
        elif self._client.default_space_id is not None:
            payload["space_id"] = self._client.default_space_id

        payload["document_reference"] = document_reference._to_dict()
        payload["results_reference"] = results_reference._to_dict()

        if steps is not None:
            payload["steps"] = steps

        # The output format is selected by the presence of an (empty)
        # assembly entry in the payload.
        if results_format == "json":
            payload["assembly_json"] = {}
        elif results_format == "markdown":
            payload["assembly_md"] = {}
        else:
            raise ValueError(
                "Incorrect results format provided. Only 'json' and 'markdown' are supported."
            )

        response = requests.post(
            url=self._client.service_instance._href_definitions.get_text_extraction_href(),
            json=payload,
            params=self._client._params(skip_for_create=True, skip_userfs=True),
            headers=self._client._get_headers(),
        )

        return self._handle_response(201, "run_job", response)

    def list_jobs(self, limit: int | None = None) -> pandas.DataFrame:
        """List text extraction jobs. If limit is None, all jobs will be listed.

        :param limit: limit number of fetched records, defaults to None
        :type limit: int | None, optional

        :return: pandas DataFrame with the columns ``EXTRACTION_ID``, ``CREATED`` and ``STATUS``, one row per text extraction job
        :rtype: pandas.DataFrame

        **Example:**

        .. code-block:: python

            extraction.list_jobs()

        """
        import pandas

        # Flattened job-detail field -> column name in the returned table.
        column_names = {
            "metadata.id": "EXTRACTION_ID",
            "metadata.created_at": "CREATED",
            "entity.results.status": "STATUS",
        }

        details = self.get_job_details(limit=limit)
        normalized = pandas.json_normalize(details["resources"])

        # ``reindex`` keeps only the selected fields and creates any that are
        # absent from the response, so the table always has all three columns.
        return normalized.reindex(columns=list(column_names)).rename(
            columns=column_names
        )

    def get_job_details(
        self, extraction_id: str | None = None, limit: int | None = None
    ) -> dict:
        """Return text extraction job details. If `extraction_id` is None, returns the details of all text extraction jobs.

        :param extraction_id: ID of the text extraction job, defaults to None
        :type extraction_id: str | None, optional

        :param limit: limit number of fetched records, must be in the range <1, 200>; only used when `extraction_id` is None, defaults to None
        :type limit: int | None, optional

        :raises InvalidValue: raised if `extraction_id` is None and `limit` is outside the range <1, 200>

        :return: details of the text extraction job
        :rtype: dict

        **Example:**

        .. code-block:: python

            extraction.get_job_details(extraction_id="<extraction_id>")

        """
        TextExtractions._validate_type(extraction_id, "extraction_id", str, False)

        # ``limit`` only applies when listing all jobs; it is ignored when a
        # single job is requested by ID.
        extra_params: dict = {}
        if extraction_id is None and limit is not None:
            if limit < 1 or limit > 200:
                raise InvalidValue(
                    value_name="limit",
                    reason=f"The given value {limit} is not in the range <1, 200>",
                )
            extra_params["limit"] = limit

        url = self._client.service_instance._href_definitions.get_text_extraction_href()
        if extraction_id is not None:
            url += f"/{extraction_id}"

        # TODO: pagination is not yet implemented
        response = requests.get(
            url=url,
            params=self._client._params(skip_userfs=True) | extra_params,
            headers=self._client._get_headers(),
        )

        return self._handle_response(200, "get_job_details", response)

    def delete_job(self, extraction_id: str) -> Literal["SUCCESS"]:
        """Delete a text extraction job.

        :param extraction_id: ID of the text extraction job to delete
        :type extraction_id: str

        :return: return "SUCCESS" if the deletion succeeds
        :rtype: str

        **Example:**

        .. code-block:: python

            extraction.delete_job(extraction_id="<extraction_id>")

        """
        TextExtractions._validate_type(extraction_id, "extraction_id", str, True)

        # The request always asks for a hard delete.
        params = self._client._params(skip_userfs=True)
        params.update({"hard_delete": True})

        response = requests.delete(
            url=self._client.service_instance._href_definitions.get_text_extraction_href()
            + f"/{extraction_id}",
            params=params,
            headers=self._client._get_headers(),
        )

        # NOTE(review): the returned value is whatever ``_handle_response``
        # yields for an expected 204 status - confirm it is "SUCCESS".
        return self._handle_response(204, "delete_job", response)  # type: ignore[return-value]

    def get_results_reference(self, extraction_id: str) -> DataConnection:
        """Get a `DataConnection` instance that is a reference to the results stored on COS.

        :param extraction_id: ID of text extraction job
        :type extraction_id: str

        :raises WMLClientError: raised if the job details contain no results reference

        :return: location of the Data Connection to text extraction job results
        :rtype: DataConnection

        **Example:**

        .. code-block:: python

            results_reference = extraction.get_results_reference(extraction_id="<extraction_id>")

        """
        TextExtractions._validate_type(extraction_id, "extraction_id", str, True)

        job_details = self.get_job_details(extraction_id=extraction_id)
        results_reference = job_details.get("entity", {}).get("results_reference")

        # Fail with a clear message instead of passing None on to
        # ``DataConnection._from_dict``.
        if results_reference is None:
            raise WMLClientError(
                error_msg=(
                    f"Text extraction job '{extraction_id}' has no results "
                    "reference in its details."
                )
            )

        data_conn = DataConnection._from_dict(results_reference)
        # Attach this instance's client to the returned connection.
        data_conn.set_client(self._client)

        return data_conn

    @staticmethod
    def get_id(extraction_details: dict) -> str:
        """Get the unique ID of a stored extraction request.

        :param extraction_details: metadata of the stored extraction
        :type extraction_details: dict

        :return: unique ID of the stored extraction request
        :rtype: str

        **Example:**

        .. code-block:: python

            extraction_details = extraction.get_job_details(extraction_id)
            extraction_id = extraction.get_id(extraction_details)

        """
        TextExtractions._validate_type(
            extraction_details, "extraction_details", dict, True
        )

        # The ID is read from ``extraction_details["metadata"]["id"]``.
        return WMLResource._get_required_element_from_dict(
            extraction_details, "extraction_details", ["metadata", "id"]
        )