Source code for ibm_watsonx_ai.foundation_models.inference.audio_model_inference

#  -----------------------------------------------------------------------------------------
#  (C) Copyright IBM Corp. 2025.
#  https://opensource.org/licenses/BSD-3-Clause
#  -----------------------------------------------------------------------------------------
from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING

from ibm_watsonx_ai.wml_client_error import InvalidMultipleArguments
from ibm_watsonx_ai.wml_resource import WMLResource

if TYPE_CHECKING:
    from ibm_watsonx_ai import APIClient, Credentials



[docs]
class AudioModelInference(WMLResource):
    """
    Instantiate the audio model interface

    :param model: type of model to use
    :type model: str, optional

    :param credentials: credentials for the Watson Machine Learning instance
    :type credentials: Credentials or dict, optional

    :param project_id: ID of the Watson Studio project
    :type project_id: str, optional

    :param space_id: ID of the Watson Studio space
    :type space_id: str, optional

    :param verify: You can pass one of the following as verify:

        * the path to a CA_BUNDLE file
        * the path of directory with certificates of trusted CAs
        * `True` - default path to truststore will be taken
        * `False` - no verification will be made
    :type verify: bool | str | Path, optional

    :param api_client: initialized APIClient object with a set project ID or space ID. If passed, ``credentials`` and ``project_id``/``space_id`` are not required.
    :type api_client: APIClient, optional

    **Example:**

    .. code-block:: python

        from ibm_watsonx_ai import Credentials
        from ibm_watsonx_ai.foundation_models import AudioModelInference

        audio_model = AudioModelInference(
            model="<AUDIO MODEL>",
            credentials=Credentials(
                api_key=IAM_API_KEY, url="https://us-south.ml.cloud.ibm.com"
            ),
            project_id=project_id,
        )

    """

    def __init__(
        self,
        model: str,
        credentials: Credentials | None = None,
        project_id: str | None = None,
        space_id: str | None = None,
        verify: bool | str | Path | None = None,
        api_client: APIClient | None = None,
    ) -> None:
        self.model = model

        if isinstance(verify, Path):
            verify = str(verify)

        if credentials:
            from ibm_watsonx_ai import APIClient

            self._client = APIClient(credentials, verify=verify)
        elif api_client:
            self._client = api_client
        else:
            raise InvalidMultipleArguments(
                params_names_list=["credentials", "api_client"],
                reason="None of the arguments were provided.",
            )

        if space_id:
            self._client.set.default_space(space_id)
        elif project_id:
            self._client.set.default_project(project_id)
        elif not api_client:
            raise InvalidMultipleArguments(
                params_names_list=["space_id", "project_id"],
                reason="None of the arguments were provided.",
            )

        WMLResource.__init__(self, __name__, self._client)


[docs]
    def transcribe(
        self,
        file_path: str | Path,
        language: str | None = None,
    ) -> dict:
        """
        Transcribe audio into text.

        :param file_path: The path to the audio file to transcribe
        :type file_path: str, Path, required

        :param language: Target language to which to transcribe, e.g. 'fr' for French. Default is English.
        :type language: str, optional

        **Example:**

        .. code-block:: python

            file_path = "sample_audio.mp3"

            response = audio_model.transcribe(file_path=file_path)

        """
        self._client._check_if_either_is_set()

        if isinstance(file_path, str):
            file_path = Path(file_path)

        headers = self._client._get_headers()
        headers.pop("Content-Type", None)

        if self._client.default_space_id:
            headers["Authorization"] += f";space_id={self._client.default_space_id}"

        elif self._client.default_project_id:
            headers["Authorization"] += f";project_id={self._client.default_project_id}"

        payload: dict = {"model": self.model}
        if language is not None:
            payload["language"] = language

        with file_path.open("rb") as file:
            files = {"file": (file_path.name, file, "multipart/form-data")}
            response = self._client.httpx_client.post(
                url=self._client._href_definitions.get_audio_transcriptions_href(),
                data=payload,
                files=files,
                headers=headers,
            )

        return self._handle_response(200, "transcribe", response)