Source code for ibm_watson_machine_learning.helpers.connections.connections

__all__ = [
    "DataConnection",
    "S3Connection",
    "ConnectionAsset",
    "S3Location",
    "FSLocation",
    "AssetLocation",
    "CP4DAssetLocation",
    "WMLSAssetLocation",
    "WSDAssetLocation",
    "CloudAssetLocation",
    "DeploymentOutputAssetLocation",
    "NFSConnection",
    "NFSLocation",
    "ConnectionAssetLocation",
    "DatabaseLocation",
    "ContainerLocation"
]

#  -----------------------------------------------------------------------------------------
#  (C) Copyright IBM Corp. 2020-2024.
#  https://opensource.org/licenses/BSD-3-Clause
#  -----------------------------------------------------------------------------------------

import io
import os
import uuid
import copy
import sys
from copy import deepcopy
from typing import Union, Tuple, List, TYPE_CHECKING, Optional
from warnings import warn

from ibm_boto3 import resource
from ibm_botocore.client import ClientError
from pandas import DataFrame
import pandas as pd
import ibm_watson_machine_learning._wrappers.requests as requests
from ibm_watson_machine_learning.utils.autoai.enums import PredictionType, DataConnectionTypes
from ibm_watson_machine_learning.utils.autoai.errors import (
    MissingAutoPipelinesParameters, UseWMLClient, MissingCOSStudioConnection, MissingProjectLib,
    HoldoutSplitNotSupported, InvalidCOSCredentials, MissingLocalAsset, InvalidIdType, NotWSDEnvironment,
    NotExistingCOSResource, InvalidDataAsset, CannotReadSavedRemoteDataBeforeFit, NoAutomatedHoldoutSplit
)

import numpy as np
from ibm_watson_machine_learning.utils.autoai.utils import all_logging_disabled, try_import_autoai_libs, \
    try_import_autoai_ts_libs
from ibm_watson_machine_learning.utils.autoai.watson_studio import get_project
from ibm_watson_machine_learning.data_loaders.datasets.experiment import DEFAULT_SAMPLING_TYPE, DEFAULT_SAMPLE_SIZE_LIMIT
from ibm_watson_machine_learning.wml_client_error import MissingValue, ApiRequestFailure, WMLClientError
from ibm_watson_machine_learning.utils.autoai.errors import ContainerTypeNotSupported
from ibm_watson_machine_learning.messages.messages import Messages
from .base_connection import BaseConnection
from .base_data_connection import BaseDataConnection
from .base_location import BaseLocation

if TYPE_CHECKING:
    from ibm_watson_machine_learning.workspace import WorkSpace


class DataConnection(BaseDataConnection):
    """Data Storage Connection class needed for WML training metadata (input data).

    :param connection: connection parameters of specific type
    :type connection: NFSConnection or ConnectionAsset, optional

    :param location: required location parameters of specific type
    :type location: Union[S3Location, FSLocation, AssetLocation]

    :param data_join_node_name: name(s) for node(s):

        - `None` - data file name will be used as node name
        - str - it will become the node name
        - list[str] - multiple names passed, several nodes will have the same data connection
          (used for excel files with multiple sheets)

    :type data_join_node_name: None or str or list[str], optional

    :param data_asset_id: data asset ID if DataConnection should point to a data asset
    :type data_asset_id: str, optional
    """

    def __init__(self,
                 location: Union['S3Location', 'FSLocation', 'AssetLocation', 'CP4DAssetLocation',
                                 'WMLSAssetLocation', 'WSDAssetLocation', 'CloudAssetLocation',
                                 'NFSLocation', 'DeploymentOutputAssetLocation', 'ConnectionAssetLocation',
                                 'DatabaseLocation', 'ContainerLocation'] = None,
                 connection: Optional[Union['S3Connection', 'NFSConnection', 'ConnectionAsset']] = None,
                 data_join_node_name: Union[str, List[str]] = None,
                 data_asset_id: str = None,
                 connection_asset_id: str = None,
                 **kwargs):

        if data_asset_id is None and location is None:
            raise MissingValue('location or data_asset_id', reason="Provide 'location' or 'data_asset_id'.")

        elif data_asset_id is not None and location is not None:
            raise ValueError("'data_asset_id' and 'location' cannot be specified together.")

        elif data_asset_id is not None:
            location = AssetLocation(asset_id=data_asset_id)

            if kwargs.get('model_location') is not None:
                location._model_location = kwargs['model_location']

            if kwargs.get('training_status') is not None:
                location._training_status = kwargs['training_status']

        elif connection_asset_id is not None and isinstance(location, (S3Location, DatabaseLocation, NFSLocation)):
            connection = ConnectionAsset(connection_id=connection_asset_id)

        elif connection_asset_id is None and connection is None and isinstance(
                location, (S3Location, DatabaseLocation, NFSLocation)):
            raise ValueError("'connection_asset_id' and 'connection' cannot be empty together when 'location' is "
                             "[S3Location, DatabaseLocation, NFSLocation].")

        super().__init__()

        self.connection = connection
        self.location = location

        # TODO: remove S3 implementation
        if isinstance(connection, S3Connection):
            self.type = DataConnectionTypes.S3

        elif isinstance(connection, ConnectionAsset):
            self.type = DataConnectionTypes.CA

            # note: we expect a `file_name` keyword for CA pointing to COS or NFS
            if isinstance(self.location, (S3Location, NFSLocation)):
                self.location.file_name = self.location.path
                del self.location.path

                if isinstance(self.location, NFSLocation):
                    del self.location.id
            # --- end note

        elif isinstance(location, FSLocation):
            self.type = DataConnectionTypes.FS

        elif isinstance(location, ContainerLocation):
            self.type = DataConnectionTypes.CN

        elif isinstance(location, (AssetLocation, CP4DAssetLocation, WMLSAssetLocation, CloudAssetLocation,
                                   WSDAssetLocation, DeploymentOutputAssetLocation)):
            self.type = DataConnectionTypes.DS

        self.auto_pipeline_params = {}  # note: needed parameters for recreation of autoai holdout split
        self._wml_client = None
        self.__wml_client = None  # only for getter/setter for AssetLocation href
        self._run_id = None
        self._obm = False
        self._obm_cos_path = None
        self._test_data = False
        self._user_holdout_exists = False

        # note: make data connection id as a location path for OBM + KB
        if data_join_node_name is None:
            # TODO: remove S3 implementation
            if self.type == DataConnectionTypes.S3 or (
                    self.type == DataConnectionTypes.CA and hasattr(location, 'file_name')):
                self.id = location.get_location()

            else:
                self.id = None

        else:
            self.id = data_join_node_name
        # --- end note

    # note: client as property and setter for dynamic href creation for AssetLocation
    @property
    def _wml_client(self):
        return self.__wml_client

    @_wml_client.setter
    def _wml_client(self, var):
        self.__wml_client = var

        if isinstance(self.location, (AssetLocation, WSDAssetLocation)):
            self.location.wml_client = self.__wml_client

        if getattr(var, 'project_type', None) == 'local_git_storage':
            self.location.userfs = True
    def set_client(self, wml_client):
        """Set an initialized wml client on the connection to enable write/read operations against the service.

        :param wml_client: WML client used to connect to the service
        :type wml_client: APIClient

        **Example**

        .. code-block:: python

            DataConnection.set_client(wml_client)

        """
        self._wml_client = wml_client
# --- end note
    @classmethod
    def from_studio(cls, path: str) -> List['DataConnection']:
        """Create DataConnections from the credentials stored (connected) in Watson Studio. Only for COS.

        :param path: path in COS bucket to the training dataset
        :type path: str

        :return: list with DataConnection objects
        :rtype: list[DataConnection]

        **Example**

        .. code-block:: python

            data_connections = DataConnection.from_studio(path='iris_dataset.csv')

        """
        try:
            from project_lib import Project

        except ModuleNotFoundError:
            raise MissingProjectLib("Missing project_lib package.")

        else:
            data_connections = []

            for name, value in globals().items():
                if isinstance(value, Project):
                    connections = value.get_connections()

                    if connections:
                        for connection in connections:
                            asset_id = connection['asset_id']
                            connection_details = value.get_connection(asset_id)

                            if ('url' in connection_details and 'access_key' in connection_details
                                    and 'secret_key' in connection_details and 'bucket' in connection_details):
                                data_connections.append(
                                    cls(connection=ConnectionAsset(connection_id=connection_details['id']),
                                        location=ConnectionAssetLocation(bucket=connection_details['bucket'],
                                                                         file_name=path))
                                )

            if data_connections:
                return data_connections

            else:
                raise MissingCOSStudioConnection(
                    "There is no COS Studio connection. "
                    "Please create a COS connection from the UI and insert "
                    "the cell with project API connection (Insert project token).")
def _subdivide_connection(self): if type(self.id) is str or not self.id: return [self] else: def cpy(new_id): child = copy.copy(self) child.id = new_id return child return [cpy(id) for id in self.id] def _to_dict(self) -> dict: """Convert DataConnection object to dictionary representation. :return: DataConnection dictionary representation :rtype: dict """ if self.id and type(self.id) is list: raise InvalidIdType(list) _dict = {"type": self.type} # note: for OBM (id of DataConnection if an OBM node name) if self.id is not None: _dict['id'] = self.id # --- end note if self.connection is not None: _dict['connection'] = deepcopy(self.connection.to_dict()) else: _dict['connection'] = {} try: _dict['location'] = deepcopy(self.location.to_dict()) except AttributeError: _dict['location'] = {} # note: convert userfs to string - training service requires it as string if hasattr(self.location, 'userfs'): _dict['location']['userfs'] = str(getattr(self.location, 'userfs', False)).lower() # end note return _dict def __repr__(self): return str(self._to_dict()) def __str__(self): return str(self._to_dict()) @classmethod def _from_dict(cls, _dict: dict) -> 'DataConnection': """Create a DataConnection object from dictionary. :param _dict: a dictionary data structure with information about data connection reference :type _dict: dict :return: DataConnection object :rtype: DataConnection """ # TODO: remove S3 implementation if _dict['type'] == DataConnectionTypes.S3: warn(message="S3 DataConnection is deprecated! Please use data_asset_id instead.") data_connection: 'DataConnection' = cls( connection=S3Connection( access_key_id=_dict['connection']['access_key_id'], secret_access_key=_dict['connection']['secret_access_key'], endpoint_url=_dict['connection']['endpoint_url'] ), location=S3Location( bucket=_dict['location']['bucket'], path=_dict['location']['path'] ) ) elif _dict['type'] == DataConnectionTypes.FS: data_connection: 'DataConnection' = cls( location=FSLocation._set_path(path=_dict['location']['path']) ) elif _dict['type'] == DataConnectionTypes.CA: if _dict['location'].get('file_name') is not None and _dict['location'].get('bucket'): data_connection: 'DataConnection' = cls( connection_asset_id=_dict['connection']['id'], location=S3Location( bucket=_dict['location']['bucket'], path=_dict['location']['file_name'] ) ) elif _dict['location'].get('path') is not None and _dict['location'].get('bucket'): data_connection: 'DataConnection' = cls( connection_asset_id=_dict['connection']['id'], location=S3Location( bucket=_dict['location']['bucket'], path=_dict['location']['path'] ) ) elif _dict['location'].get('schema_name') and _dict['location'].get('table_name'): data_connection: 'DataConnection' = cls( connection_asset_id=_dict['connection']['id'], location=DatabaseLocation(schema_name=_dict['location']['schema_name'], table_name=_dict['location']['table_name'], catalog_name=_dict['location'].get('catalog_name') ) ) else: if 'asset_id' in _dict['connection']: data_connection: 'DataConnection' = cls( connection=NFSConnection(asset_id=_dict['connection']['asset_id']), location=NFSLocation(path=_dict['location']['path']) ) else: if _dict['location'].get('file_name') is not None: data_connection: 'DataConnection' = cls( connection_asset_id=_dict['connection']['id'], location=NFSLocation(path=_dict['location']['file_name']) ) else: data_connection: 'DataConnection' = cls( connection_asset_id=_dict['connection']['id'], location=NFSLocation(path=_dict['location']['path']) ) elif _dict['type'] == 
DataConnectionTypes.CN: data_connection: 'DataConnection' = cls( location=ContainerLocation(path=_dict['location']['path']) ) else: data_connection: 'DataConnection' = cls( location=AssetLocation._set_path(href=_dict['location']['href']) ) if _dict.get('id'): data_connection.id = _dict['id'] if _dict['location'].get('userfs'): if str(_dict['location'].get('userfs', 'false')).lower() in ['true', '1']: data_connection.location.userfs = True else: data_connection.location.userfs = False return data_connection def _recreate_holdout( self, data: 'DataFrame', with_holdout_split: bool = True ) -> Union[Tuple['DataFrame', 'DataFrame'], Tuple['DataFrame', 'DataFrame', 'DataFrame', 'DataFrame']]: """This method tries to recreate holdout data.""" if self.auto_pipeline_params.get('prediction_columns') is not None: # timeseries try_import_autoai_ts_libs() from autoai_ts_libs.utils.holdout_utils import make_holdout_split # Note: When lookback window is auto detected there is need to get the detected value from training details if self.auto_pipeline_params.get('lookback_window') == -1 or self.auto_pipeline_params.get('lookback_window') is None: ts_metrics = self._wml_client.training.get_details(self.auto_pipeline_params.get('run_id'), _internal=True)['entity']['status']['metrics'] final_ts_state_name = "after_final_pipelines_generation" for metric in ts_metrics: if metric['context']['intermediate_model']['process'] == final_ts_state_name: self.auto_pipeline_params['lookback_window'] = metric['context']['timeseries']['lookback_window'] break # Note: imputation is not supported X_train, X_holdout, y_train, y_holdout, _, _, _, _ = make_holdout_split( dataset=data, target_columns=self.auto_pipeline_params.get('prediction_columns'), learning_type="forecasting", test_size=self.auto_pipeline_params.get('holdout_size'), lookback_window=self.auto_pipeline_params.get('lookback_window'), feature_columns=self.auto_pipeline_params.get('feature_columns'), timestamp_column=self.auto_pipeline_params.get('timestamp_column_name'), # n_jobs=None, # tshirt_size=None, return_only_holdout=False ) X_columns = self.auto_pipeline_params.get('feature_columns') if self.auto_pipeline_params.get('feature_columns') else self.auto_pipeline_params['prediction_columns'] X_train = DataFrame(X_train, columns=X_columns) X_holdout = DataFrame(X_holdout, columns=X_columns) y_train = DataFrame(y_train, columns=self.auto_pipeline_params['prediction_columns']) y_holdout = DataFrame(y_holdout, columns=self.auto_pipeline_params['prediction_columns']) return X_train, X_holdout, y_train, y_holdout elif self.auto_pipeline_params.get('feature_columns') is not None: # timeseries anomaly detection try_import_autoai_ts_libs() from autoai_ts_libs.utils.holdout_utils import make_holdout_split from autoai_ts_libs.utils.constants import LEARNING_TYPE_TIMESERIES_ANOMALY_PREDICTION # Note: imputation is not supported X_train, X_holdout, y_train, y_holdout, _, _, _, _ = make_holdout_split( dataset=data, learning_type=LEARNING_TYPE_TIMESERIES_ANOMALY_PREDICTION, test_size=self.auto_pipeline_params.get('holdout_size'), # lookback_window=self.auto_pipeline_params.get('lookback_window'), feature_columns=self.auto_pipeline_params.get('feature_columns'), timestamp_column=self.auto_pipeline_params.get('timestamp_column_name'), # n_jobs=None, # tshirt_size=None, return_only_holdout=False ) X_columns = self.auto_pipeline_params['feature_columns'] y_column = ['anomaly_label'] X_train = DataFrame(X_train, columns=X_columns) X_holdout = DataFrame(X_holdout, 
columns=X_columns) y_train = DataFrame(y_train, columns=y_column) y_holdout = DataFrame(y_holdout, columns=y_column) return X_train, X_holdout, y_train, y_holdout else: if sys.version_info >= (3, 10): try_import_autoai_libs(minimum_version='1.14.0') else: try_import_autoai_libs(minimum_version='1.12.14') from autoai_libs.utils.holdout_utils import make_holdout_split, numpy_split_on_target_values from autoai_libs.utils.sampling_utils import numpy_sample_rows data.replace([np.inf, -np.inf], np.nan, inplace=True) data.drop_duplicates(inplace=True) data.dropna(subset=[self.auto_pipeline_params['prediction_column']], inplace=True) dfy = data[self.auto_pipeline_params['prediction_column']] data.drop(columns=[self.auto_pipeline_params['prediction_column']], inplace=True) y_column = [self.auto_pipeline_params['prediction_column']] X_columns = data.columns if self._test_data or not with_holdout_split: return data, dfy else: ############################ # REMOVE MISSING ROWS # from autoai_libs.utils.holdout_utils import numpy_remove_missing_target_rows # Remove (and save) the rows of X and y for which the target variable has missing values data, dfy, _, _, _, _ = numpy_remove_missing_target_rows( y=dfy, X=data ) # End of REMOVE MISSING ROWS # ################################### ################# # SAMPLING # # Get a sample of the rows if requested and applicable # (check for sampling is performed inside this function) try: data, dfy, _ = numpy_sample_rows( X=data, y=dfy, train_sample_rows_test_size=self.auto_pipeline_params['train_sample_rows_test_size'], learning_type=self.auto_pipeline_params['prediction_type'], return_sampled_indices=True ) # Note: we have a silent error here (the old core behaviour) # sampling is not performed as 'train_sample_rows_test_size' is bigger than data rows count # TODO: can we throw an error instead? except ValueError as e: if 'between' in str(e): pass else: raise e # End of SAMPLING # ######################## # Perform holdout split try: X_train, X_holdout, y_train, y_holdout, _, _ = make_holdout_split( x=data, y=dfy, learning_type=self.auto_pipeline_params['prediction_type'], fairness_info=self.auto_pipeline_params.get('fairness_info', None), test_size=self.auto_pipeline_params.get('holdout_size') if self.auto_pipeline_params.get('holdout_size') is not None else 0.1, return_only_holdout=False, time_ordered_data=self.auto_pipeline_params.get('time_ordered_data') ) except (TypeError, KeyError): if self.auto_pipeline_params.get('time_ordered_data'): warn("Outdated autoai_libs - time_ordered_data parameter is not supported. Please update autoai_libs to version >=1.16.2") X_train, X_holdout, y_train, y_holdout, _, _ = make_holdout_split( x=data, y=dfy, learning_type=self.auto_pipeline_params['prediction_type'], fairness_info=self.auto_pipeline_params.get('fairness_info', None), test_size=self.auto_pipeline_params.get('holdout_size') if self.auto_pipeline_params.get('holdout_size') is not None else 0.1, return_only_holdout=False ) X_train = DataFrame(X_train, columns=X_columns) X_holdout = DataFrame(X_holdout, columns=X_columns) y_train = DataFrame(y_train, columns=y_column) y_holdout = DataFrame(y_holdout, columns=y_column) return X_train, X_holdout, y_train, y_holdout
[docs] def read(self, with_holdout_split: bool = False, csv_separator: str = ',', excel_sheet: Union[str, int] = None, encoding: Optional[str] = 'utf-8', raw: Optional[bool] = False, binary: Optional[bool] = False, read_to_file: Optional[str] = None, number_of_batch_rows: Optional[int] = None, sampling_type: Optional[str] = None, sample_size_limit: Optional[int] = None, sample_rows_limit: Optional[int] = None, sample_percentage_limit: Optional[float] = None, **kwargs) -> Union['DataFrame', Tuple['DataFrame', 'DataFrame'], bytes]: """Download dataset stored in remote data storage. Returns batch up to 1GB. :param with_holdout_split: if `True`, data will be split to train and holdout dataset as it was by AutoAI :type with_holdout_split: bool, optional :param csv_separator: separator / delimiter for CSV file :type csv_separator: str, optional :param excel_sheet: excel file sheet name to use, only use when xlsx file is an input, support for number of the sheet is deprecated :type excel_sheet: str, optional :param encoding: encoding type of the CSV :type encoding: str, optional :param raw: if `False` there wil be applied simple data preprocessing (the same as in the backend), if `True`, data will be not preprocessed :type raw: bool, optional :param binary: indicates to retrieve data in binary mode, the result will be a python binary type variable :type binary: bool, optional :param read_to_file: stream read data to file under path specified as value of this parameter, use this parameter to prevent keeping data in-memory :type read_to_file: str, optional :param number_of_batch_rows: number of rows to read in each batch when reading from flight connection :type number_of_batch_rows: int, optional :param sampling_type: a sampling strategy how to read the data :type sampling_type: str, optional :param sample_size_limit: upper limit for overall data that should be downloaded in bytes, default: 1 GB :type sample_size_limit: int, optional :param sample_rows_limit: upper limit for overall data that should be downloaded in number of rows :type sample_rows_limit: int, optional :param sample_percentage_limit: upper limit for overall data that should be downloaded in percent of all dataset, this parameter is ignored, when `sampling_type` parameter is set to `first_n_records`, must be a float number between 0 and 1 :type sample_percentage_limit: float, optional .. note:: If more than one of: `sample_size_limit`, `sample_rows_limit`, `sample_percentage_limit` are set, then downloaded data is limited to the lowest threshold. :return: one of: - pandas.DataFrame contains dataset from remote data storage : Xy_train - Tuple[pandas.DataFrame, pandas.DataFrame, pandas.DataFrame, pandas.DataFrame] : X_train, X_holdout, y_train, y_holdout - Tuple[pandas.DataFrame, pandas.DataFrame] : X_test, y_test containing training data and holdout data from remote storage - bytes object, auto holdout split from backend (only train data provided) **Examples** .. code-block:: python train_data_connections = optimizer.get_data_connections() data = train_data_connections[0].read() # all train data # or X_train, X_holdout, y_train, y_holdout = train_data_connections[0].read(with_holdout_split=True) # train and holdout data User provided train and test data: .. 
code-block:: python optimizer.fit(training_data_reference=[DataConnection], training_results_reference=DataConnection, test_data_reference=DataConnection) test_data_connection = optimizer.get_test_data_connections() X_test, y_test = test_data_connection.read() # only holdout data # and train_data_connections = optimizer.get_data_connections() data = train_connections[0].read() # only train data """ # enables flight automatically for CP4D 4.0.x, 4.5.x try: use_flight = kwargs.get( 'use_flight', bool((self._wml_client is not None or 'USER_ACCESS_TOKEN' in os.environ or 'RUNTIME_ENV_ACCESS_TOKEN_FILE' in os.environ) and self._wml_client.CPD_version)) except: use_flight = False return_data_as_iterator = kwargs.get('return_data_as_iterator', False) sampling_type = sampling_type if sampling_type is not None else DEFAULT_SAMPLING_TYPE enable_sampling = kwargs.get('enable_sampling', True) total_size_limit = sample_size_limit if sample_size_limit is not None else kwargs.get('total_size_limit', DEFAULT_SAMPLE_SIZE_LIMIT) total_nrows_limit = sample_rows_limit total_percentage_limit = sample_percentage_limit if sample_percentage_limit is not None else 1.0 # Deprecation of excel_sheet as number: if isinstance(excel_sheet, int): warn( message="Support for excel sheet as number of the sheet (int) is deprecated! Please set excel sheet with name of the sheet.") flight_parameters = kwargs.get('flight_parameters', {}) impersonate_header = kwargs.get('impersonate_header', None) if with_holdout_split and self._user_holdout_exists: # when this connection is training one raise NoAutomatedHoldoutSplit(reason="Experiment was run based on user defined holdout dataset.") # note: experiment metadata is used only in autogen notebooks experiment_metadata = kwargs.get('experiment_metadata') # note: process subsampling stats flag _return_subsampling_stats = kwargs.get("_return_subsampling_stats", False) if experiment_metadata is not None: self.auto_pipeline_params['train_sample_rows_test_size'] = experiment_metadata.get( 'train_sample_rows_test_size') self.auto_pipeline_params['prediction_column'] = experiment_metadata.get('prediction_column') self.auto_pipeline_params['prediction_columns'] = experiment_metadata.get('prediction_columns') self.auto_pipeline_params['holdout_size'] = experiment_metadata.get('holdout_size') self.auto_pipeline_params['prediction_type'] = experiment_metadata['prediction_type'] self.auto_pipeline_params['fairness_info'] = experiment_metadata.get('fairness_info') self.auto_pipeline_params['lookback_window'] = experiment_metadata.get('lookback_window') self.auto_pipeline_params['timestamp_column_name'] = experiment_metadata.get('timestamp_column_name') self.auto_pipeline_params['feature_columns'] = experiment_metadata.get('feature_columns') self.auto_pipeline_params['time_ordered_data'] = experiment_metadata.get('time_ordered_data') # note: check for cloud if 'training_result_reference' in experiment_metadata: if isinstance(experiment_metadata['training_result_reference'].location, (S3Location, AssetLocation)): run_id = experiment_metadata['training_result_reference'].location._training_status.split('/')[-2] # WMLS else: run_id = experiment_metadata['training_result_reference'].location.path.split('/')[-3] self.auto_pipeline_params['run_id'] = run_id if self._test_data: csv_separator = experiment_metadata.get('test_data_csv_separator', csv_separator) excel_sheet = experiment_metadata.get('test_data_excel_sheet', excel_sheet) encoding = experiment_metadata.get('test_data_encoding', encoding) else: 
csv_separator = experiment_metadata.get('csv_separator', csv_separator) excel_sheet = experiment_metadata.get('excel_sheet', excel_sheet) encoding = experiment_metadata.get('encoding', encoding) if self.type == DataConnectionTypes.DS or self.type == DataConnectionTypes.CA: if self._wml_client is None: try: from project_lib import Project except ModuleNotFoundError: raise ConnectionError( "This functionality can be run only on Watson Studio or with wml_client passed to connection. " "Please initialize WML client using `DataConnection.set_client(wml_client)` function " "to be able to use this functionality.") if (with_holdout_split or self._test_data) and not self.auto_pipeline_params.get('prediction_type', False): raise MissingAutoPipelinesParameters( self.auto_pipeline_params, reason=f"To be able to recreate an original holdout split, you need to schedule a training job or " f"if you are using historical runs, just call historical_optimizer.get_data_connections()") # note: allow to read data at any time elif (('csv_separator' not in self.auto_pipeline_params and 'encoding' not in self.auto_pipeline_params) or csv_separator != ',' or encoding != 'utf-8'): self.auto_pipeline_params['csv_separator'] = csv_separator self.auto_pipeline_params['encoding'] = encoding # --- end note # note: excel_sheet in params only if it is not None (not specified): if excel_sheet: self.auto_pipeline_params['excel_sheet'] = excel_sheet # --- end note # note: set default quote character for flight (later applicable only for csv files stored in S3) self.auto_pipeline_params['quote_character'] = 'double_quote' # --- end note data = DataFrame() headers = None if self._wml_client is None: token = self._get_token_from_environment() if token is not None: headers = {'Authorization': f'Bearer {token}'} elif impersonate_header is not None: headers = self._wml_client._get_headers() headers['impersonate'] = impersonate_header if self.type == DataConnectionTypes.S3: raise ConnectionError( f"S3 DataConnection is deprecated! Please use data_asset_id instead.") elif self.type == DataConnectionTypes.DS: if use_flight and not self._obm: from ibm_watson_machine_learning.utils.utils import is_lib_installed is_lib_installed(lib_name='pyarrow', minimum_version='3.0.0', install=True) from pyarrow.flight import FlightError _iam_id = None if headers and headers.get('impersonate'): _iam_id = headers.get('impersonate', {}).get('iam_id') self._wml_client._iam_id = _iam_id try: if self._check_if_connection_asset_is_s3(): # note: update flight parameters only if `connection_properties` was not set earlier # (e.x. 
by wml/autoi) if not flight_parameters.get('connection_properties'): flight_parameters = self._update_flight_parameters_with_connection_details(flight_parameters) data = self._download_data_from_flight_service(data_location=self, binary=binary, read_to_file=read_to_file, flight_parameters=flight_parameters, headers=headers, enable_sampling=enable_sampling, sampling_type=sampling_type, number_of_batch_rows=number_of_batch_rows, return_data_as_iterator=return_data_as_iterator, _return_subsampling_stats=_return_subsampling_stats, total_size_limit=total_size_limit, total_nrows_limit=total_nrows_limit, total_percentage_limit=total_percentage_limit ) except (ConnectionError, FlightError, ApiRequestFailure) as download_data_error: # note: try to download normal data asset either directly from cams or from mounted NFS # to keep backward compatibility if ( self._wml_client and ( ( self._is_data_asset_normal() and self._is_size_acceptable() ) or self._is_data_asset_nfs() ) and ( "Found non-unique column index" not in str(download_data_error) ) ): import warnings warnings.warn(str(download_data_error), Warning) data = self._download_training_data_from_data_asset_storage() else: raise download_data_error # backward compatibility else: try: with all_logging_disabled(): if self._check_if_connection_asset_is_s3(): cos_client = self._init_cos_client() if self._obm: data = self._download_obm_data_from_cos(cos_client=cos_client) else: data = self._download_data_from_cos(cos_client=cos_client, binary=binary) else: data = self._download_training_data_from_data_asset_storage() except NotImplementedError as e: raise e except FileNotFoundError as e: raise e except Exception as e: # do not try Flight if we are on the cloud if self._wml_client is not None: if not self._wml_client.ICP: raise e elif os.environ.get('USER_ACCESS_TOKEN') is None and os.environ.get('RUNTIME_ENV_ACCESS_TOKEN_FILE') is None: raise CannotReadSavedRemoteDataBeforeFit() data = self._download_data_from_flight_service(data_location=self, binary=binary, read_to_file=read_to_file, flight_parameters=flight_parameters, headers=headers, enable_sampling=enable_sampling, sampling_type=sampling_type, number_of_batch_rows=number_of_batch_rows, return_data_as_iterator=return_data_as_iterator, _return_subsampling_stats=_return_subsampling_stats, total_size_limit=total_size_limit, total_nrows_limit=total_nrows_limit, total_percentage_limit=total_percentage_limit) elif self.type == DataConnectionTypes.FS: if self._obm: data = self._download_obm_data_from_file_system() else: data = self._download_training_data_from_file_system() elif self.type == DataConnectionTypes.CA or self.type == DataConnectionTypes.CN: if getattr(self._wml_client, 'ICP', False) and self.type == DataConnectionTypes.CN: raise ContainerTypeNotSupported() # block Container type on CPD if use_flight and not self._obm: # Workaround for container connection type, we need to fetch COS details from space/project if self.type == DataConnectionTypes.CN: # note: update flight parameters only if `connection_properties` was not set earlier # (e.x. 
by wml/autoi) if not flight_parameters.get('connection_properties'): flight_parameters = self._update_flight_parameters_with_connection_details(flight_parameters) data = self._download_data_from_flight_service(data_location=self, binary=binary, read_to_file=read_to_file, flight_parameters=flight_parameters, headers=headers, enable_sampling=enable_sampling, sampling_type=sampling_type, number_of_batch_rows=number_of_batch_rows, return_data_as_iterator=return_data_as_iterator, _return_subsampling_stats=_return_subsampling_stats, total_size_limit=total_size_limit, total_nrows_limit=total_nrows_limit, total_percentage_limit=total_percentage_limit) else: # backward compatibility try: with all_logging_disabled(): if self._check_if_connection_asset_is_s3(): cos_client = self._init_cos_client() try: if self._obm: data = self._download_obm_data_from_cos(cos_client=cos_client) else: data = self._download_data_from_cos(cos_client=cos_client, binary=binary) except Exception as cos_access_exception: raise ConnectionError( f"Unable to access data object in cloud object storage with credentials supplied. " f"Error: {cos_access_exception}") else: data = self._download_data_from_nfs_connection() except Exception as e: # do not try Flight is we are on the cloud if self._wml_client is not None: if not self._wml_client.ICP: raise e elif os.environ.get('USER_ACCESS_TOKEN') is None and os.environ.get('RUNTIME_ENV_ACCESS_TOKEN_FILE') is None: raise CannotReadSavedRemoteDataBeforeFit() data = self._download_data_from_flight_service(data_location=self, binary=binary, read_to_file=read_to_file, flight_parameters=flight_parameters, headers=headers, enable_sampling=enable_sampling, sampling_type=sampling_type, number_of_batch_rows=number_of_batch_rows, _return_subsampling_stats=_return_subsampling_stats, total_size_limit=total_size_limit, total_nrows_limit=total_nrows_limit, total_percentage_limit=total_percentage_limit) if getattr(self._wml_client, '_internal', False): pass # don't remove additional params if client is used internally else: # note: remove additional params and inline credentials added by _check_if_connection_asset_is_s3: [delattr(self.connection, attr) for attr in ['secret_access_key', 'access_key_id', 'endpoint_url', 'cos_type'] if hasattr(self.connection, attr)] # end note # create data statistics if data were not downloaded with flight: if not isinstance(data, tuple) and _return_subsampling_stats: data = (data, {"data_batch_size": sys.getsizeof(data), "data_batch_nrows": len(data)}) if binary: return data if raw or (self.auto_pipeline_params.get('prediction_column') is None and self.auto_pipeline_params.get('prediction_columns') is None and self.auto_pipeline_params.get('feature_columns') is None): return data else: if with_holdout_split: # when this connection is training one if return_data_as_iterator: raise WMLClientError("The flags `return_data_as_iterator` and `with_holdout_split` cannot be set both in the same time.") if _return_subsampling_stats: X_train, X_holdout, y_train, y_holdout = self._recreate_holdout(data=data[0]) return X_train, X_holdout, y_train, y_holdout, data[1] else: X_train, X_holdout, y_train, y_holdout = self._recreate_holdout(data=data) return X_train, X_holdout, y_train, y_holdout else: # when this data connection is a test / holdout one if return_data_as_iterator: return data if _return_subsampling_stats: if self.auto_pipeline_params.get('prediction_columns') or \ not self.auto_pipeline_params.get('prediction_column') or \ 
(self.auto_pipeline_params.get('prediction_column') and self.auto_pipeline_params.get( 'prediction_column') not in data[0].columns): # timeseries dataset does not have prediction columns. Whole data set is returned: test_X = data return test_X else: test_X, test_y = self._recreate_holdout(data=data[0], with_holdout_split=False) test_X[self.auto_pipeline_params.get('prediction_column', 'prediction_column')] = test_y return test_X, data[1] else: # when this data connection is a test / holdout one and no subsampling stats are needed if self.auto_pipeline_params.get('prediction_columns') or \ not self.auto_pipeline_params.get('prediction_column') or \ (self.auto_pipeline_params.get('prediction_column') and self.auto_pipeline_params.get( 'prediction_column') not in data.columns): # timeseries dataset does not have prediction columns. Whole data set is returned: test_X = data else: test_X, test_y = self._recreate_holdout(data=data, with_holdout_split=False) test_X[self.auto_pipeline_params.get('prediction_column', 'prediction_column')] = test_y return test_X # return one dataframe
[docs] def write(self, data: Union[str, 'DataFrame'], remote_name: str = None, **kwargs) -> None: """Upload file to a remote data storage. :param data: local path to the dataset or pandas.DataFrame with data :type data: str :param remote_name: name that dataset should be stored with in remote data storage :type remote_name: str """ # enables flight automatically for CP4D 4.0.x use_flight = kwargs.get( 'use_flight', bool((self._wml_client is not None or 'USER_ACCESS_TOKEN' in os.environ or 'RUNTIME_ENV_ACCESS_TOKEN_FILE' in os.environ) and self._wml_client.CPD_version)) flight_parameters = kwargs.get('flight_parameters', {}) impersonate_header = kwargs.get('impersonate_header', None) headers = None if self._wml_client is None: token = self._get_token_from_environment() if token is None: raise ConnectionError("WML client missing. Please initialize WML client and pass it to " "DataConnection._wml_client property to be able to use this functionality.") else: headers = {'Authorization': f'Bearer {token}'} elif impersonate_header is not None: headers = self._wml_client._get_headers() headers['impersonate'] = impersonate_header # TODO: Remove S3 implementation if self.type == DataConnectionTypes.S3: raise ConnectionError("S3 DataConnection is deprecated! Please use data_asset_id instead.") elif self.type == DataConnectionTypes.CA or self.type == DataConnectionTypes.CN: if getattr(self._wml_client, 'ICP', False) and self.type == DataConnectionTypes.CN: raise ContainerTypeNotSupported() # block Container type on CPD if self._check_if_connection_asset_is_s3(): # do not try Flight if we are on the cloud if self._wml_client is not None and not self._wml_client.ICP and not use_flight: # CLOUD if remote_name is None and self._to_dict().get('location', {}).get('path'): updated_remote_name = data.split('/')[-1] else: updated_remote_name = self._get_path_with_remote_name(self._to_dict(), remote_name) cos_resource_client = self._init_cos_client() if isinstance(data, str): with open(data, "rb") as file_data: cos_resource_client.Object(self.location.bucket, updated_remote_name).upload_fileobj( Fileobj=file_data) elif isinstance(data, DataFrame): # note: we are saving csv in memory as a file and stream it to the COS buffer = io.StringIO() data.to_csv(buffer, index=False) buffer.seek(0) with buffer as f: cos_resource_client.Object(self.location.bucket, updated_remote_name).upload_fileobj( Fileobj=io.BytesIO(bytes(f.read().encode()))) else: raise TypeError("data should be either of type \"str\" or \"pandas.DataFrame\"") # CP4D else: # Workaround for container connection type, we need to fetch COS details from space/project if self.type == DataConnectionTypes.CN: # note: update flight parameters only if `connection_properties` was not set earlier # (e.x. 
by wml/autoi) if not flight_parameters.get('connection_properties'): flight_parameters = self._update_flight_parameters_with_connection_details(flight_parameters) if isinstance(data, str): self._upload_data_via_flight_service(file_path=data, data_location=self, remote_name=remote_name, flight_parameters=flight_parameters, headers=headers) elif isinstance(data, DataFrame): # note: we are saving csv in memory as a file and stream it to the COS self._upload_data_via_flight_service(data=data, data_location=self, remote_name=remote_name, flight_parameters=flight_parameters, headers=headers) else: raise TypeError("data should be either of type \"str\" or \"pandas.DataFrame\"") else: if self._wml_client is not None and not self._wml_client.ICP and not use_flight: # CLOUD raise ConnectionError("Connections other than COS are not supported on a cloud yet.") # CP4D else: if isinstance(data, str): self._upload_data_via_flight_service(file_path=data, data_location=self, remote_name=remote_name, flight_parameters=flight_parameters, headers=headers, binary=kwargs.get('binary', False)) elif isinstance(data, DataFrame): # note: we are saving csv in memory as a file and stream it to the COS self._upload_data_via_flight_service(data=data, data_location=self, remote_name=remote_name, flight_parameters=flight_parameters, headers=headers) else: raise TypeError("data should be either of type \"str\" or \"pandas.DataFrame\"") if getattr(self._wml_client, '_internal', False): pass # don't remove additional params if client is used internally else: # note: remove additional params and inline credentials added by _check_if_connection_asset_is_s3: [delattr(self.connection, attr) for attr in ['secret_access_key', 'access_key_id', 'endpoint_url', 'cos_type'] if hasattr(self.connection, attr)] # end note elif self.type == DataConnectionTypes.DS: if self._wml_client is not None and not self._wml_client.ICP and not use_flight: # CLOUD raise ConnectionError("Write of data for Data Asset is not supported on Cloud.") elif self._wml_client is not None: if isinstance(data, str): self._upload_data_via_flight_service(file_path=data, data_location=self, remote_name=remote_name, flight_parameters=flight_parameters, headers=headers) elif isinstance(data, DataFrame): # note: we are saving csv in memory as a file and stream it to the COS self._upload_data_via_flight_service(data=data, data_location=self, remote_name=remote_name, flight_parameters=flight_parameters, headers=headers) else: raise TypeError("data should be either of type \"str\" or \"pandas.DataFrame\"") else: self._upload_data_via_flight_service(data=data, data_location=self, remote_name=remote_name, flight_parameters=flight_parameters, headers=headers)
    def _init_cos_client(self) -> 'resource':
        """Initiate COS client for further usage."""
        from ibm_botocore.client import Config

        try:
            if hasattr(self.connection, 'auth_endpoint') and hasattr(self.connection, 'api_key'):
                cos_client = resource(
                    service_name='s3',
                    ibm_api_key_id=self.connection.api_key,
                    ibm_auth_endpoint=self.connection.auth_endpoint,
                    config=Config(signature_version="oauth"),
                    endpoint_url=self.connection.endpoint_url
                )

            else:
                cos_client = resource(
                    service_name='s3',
                    endpoint_url=self.connection.endpoint_url,
                    aws_access_key_id=self.connection.access_key_id,
                    aws_secret_access_key=self.connection.secret_access_key
                )
            return cos_client

        except ValueError:
            if not self.connection.endpoint_url.startswith('https://'):
                raise WMLClientError(Messages.get_message(message_id="invalid_endpoint_url"))

    def _validate_cos_resource(self):
        cos_client = self._init_cos_client()

        try:
            files = cos_client.Bucket(self.location.bucket).objects.all()
            next(x for x in files if x.key == self.location.path)

        except Exception as e:
            raise NotExistingCOSResource(self.location.bucket, self.location.path)

    def _update_flight_parameters_with_connection_details(self, flight_parameters):
        with all_logging_disabled():
            self._check_if_connection_asset_is_s3()

        connection_properties = {
            "bucket": self.location.bucket,
            "url": self.connection.endpoint_url
        }

        if hasattr(self.connection, 'auth_endpoint') and hasattr(self.connection, 'api_key'):
            connection_properties["iam_url"] = self.connection.auth_endpoint
            connection_properties["api_key"] = self.connection.api_key
            connection_properties["resource_instance_id"] = self.connection.resource_instance_id

        else:
            connection_properties["secret_key"] = self.connection.secret_access_key
            connection_properties["access_key"] = self.connection.access_key_id

        flight_parameters.update({"connection_properties": connection_properties})
        flight_parameters.update({"datasource_type": {"entity": {"name": self._datasource_type}}})

        return flight_parameters
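
# Editorial usage sketch (not part of the original module): how a DataConnection is typically
# wired to an APIClient and read back as a pandas.DataFrame. The credentials, project ID and
# data asset ID below are placeholders you would replace with your own values.
def _example_read_data_asset():
    from ibm_watson_machine_learning import APIClient

    wml_client = APIClient({'url': 'https://us-south.ml.cloud.ibm.com',
                            'apikey': '<your_api_key>'})           # placeholder credentials
    wml_client.set.default_project('<project_id>')                 # or set.default_space(...)

    connection = DataConnection(data_asset_id='<data_asset_id>')   # placeholder data asset ID
    connection.set_client(wml_client)                              # required outside Watson Studio
    return connection.read()                                       # pandas.DataFrame, batch up to 1 GB
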
# TODO: Remove S3 Implementation for connection
class S3Connection(BaseConnection):
    """Connection class to COS data storage in S3 format.

    :param endpoint_url: S3 data storage url (COS)
    :type endpoint_url: str

    :param access_key_id: access_key_id of the S3 connection (COS)
    :type access_key_id: str, optional

    :param secret_access_key: secret_access_key of the S3 connection (COS)
    :type secret_access_key: str, optional

    :param api_key: API key of the S3 connection (COS)
    :type api_key: str, optional

    :param service_name: service name of the S3 connection (COS)
    :type service_name: str, optional

    :param auth_endpoint: authentication endpoint url of the S3 connection (COS)
    :type auth_endpoint: str, optional
    """

    def __init__(self, endpoint_url: str, access_key_id: str = None, secret_access_key: str = None,
                 api_key: str = None, service_name: str = None, auth_endpoint: str = None,
                 resource_instance_id: str = None, _internal_use=False) -> None:

        if not _internal_use:
            warn(message="S3 DataConnection is deprecated! Please use data_asset_id instead.")

        if (access_key_id is None or secret_access_key is None) and (api_key is None or auth_endpoint is None):
            raise InvalidCOSCredentials(reason='You need to specify (access_key_id and secret_access_key) or '
                                               '(api_key and auth_endpoint)')

        if secret_access_key is not None:
            self.secret_access_key = secret_access_key

        if api_key is not None:
            self.api_key = api_key

        if service_name is not None:
            self.service_name = service_name

        if auth_endpoint is not None:
            self.auth_endpoint = auth_endpoint

        if access_key_id is not None:
            self.access_key_id = access_key_id

        if endpoint_url is not None:
            self.endpoint_url = endpoint_url

        if resource_instance_id is not None:
            self.resource_instance_id = resource_instance_id

class S3Location(BaseLocation):
    """Connection class to COS data storage in S3 format.

    :param bucket: COS bucket name
    :type bucket: str

    :param path: COS data path in the bucket
    :type path: str

    :param excel_sheet: name of the excel sheet, if the pointed dataset is an excel file; used for Batched Deployment scoring
    :type excel_sheet: str, optional

    :param model_location: path to the pipeline model in the COS
    :type model_location: str, optional

    :param training_status: path to the training status json in COS
    :type training_status: str, optional
    """

    def __init__(self, bucket: str, path: str, **kwargs) -> None:
        self.bucket = bucket
        self.path = path

        if kwargs.get('model_location') is not None:
            self._model_location = kwargs['model_location']

        if kwargs.get('training_status') is not None:
            self._training_status = kwargs['training_status']

        if kwargs.get('excel_sheet') is not None:
            self.sheet_name = kwargs['excel_sheet']
            self.file_format = "xls"

    def _get_file_size(self, cos_resource_client: 'resource') -> 'int':
        try:
            size = cos_resource_client.Object(self.bucket, self.path).content_length

        except ClientError:
            size = 0

        return size
    def get_location(self) -> str:
        if hasattr(self, "file_name"):
            return self.file_name

        else:
            return self.path
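
# Editorial usage sketch: pointing a DataConnection at a file in a COS bucket through a
# connection asset. The connection asset ID, bucket and object key are placeholders; the
# connection asset itself is assumed to have been created beforehand (e.g. from the project UI
# or via wml_client.connections.create).
def _example_cos_reference(wml_client):
    connection = DataConnection(
        connection_asset_id='<connection_asset_id>',          # placeholder connection asset ID
        location=S3Location(bucket='<bucket_name>',           # placeholder bucket
                            path='data/train.csv')            # placeholder object key
    )
    connection.set_client(wml_client)
    return connection.read(csv_separator=',', encoding='utf-8')
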
class ContainerLocation(BaseLocation): """Connection class to default COS in user Project/Space.""" def __init__(self, path: Optional[str] = None, **kwargs) -> None: if path is None: self.path = "default_autoai_out" else: self.path = path self.bucket = None if kwargs.get('model_location') is not None: self._model_location = kwargs['model_location'] if kwargs.get('training_status') is not None: self._training_status = kwargs['training_status'] def to_dict(self) -> dict: _dict = super().to_dict() if 'bucket' in _dict and _dict['bucket'] is None: del _dict['bucket'] return _dict @classmethod def _set_path(cls, path: str) -> 'ContainerLocation': location = cls() location.path = path return location def _get_file_size(self): pass class FSLocation(BaseLocation): """Connection class to File Storage in CP4D.""" def __init__(self, path: Optional[str] = None) -> None: if path is None: self.path = "/{option}/{id}" + f"/assets/auto_ml/auto_ml.{uuid.uuid4()}/wml_data" else: self.path = path @classmethod def _set_path(cls, path: str) -> 'FSLocation': location = cls() location.path = path return location def _save_file_as_data_asset(self, workspace: 'WorkSpace') -> 'str': asset_name = self.path.split('/')[-1] if self.path: data_asset_details = workspace.wml_client.data_assets.create(asset_name, self.path) return workspace.wml_client.data_assets.get_uid(data_asset_details) else: raise MissingValue('path', reason="Incorrect initialization of class FSLocation") def _get_file_size(self, workspace: 'WorkSpace') -> 'int': # note if path is not file then returned size is 0 try: # note: try to get file size from remote server url = workspace.wml_client.service_instance._href_definitions.get_wsd_model_attachment_href() \ + f"/{self.path.split('/assets/')[-1]}" path_info_response = requests.head(url, headers=workspace.wml_client._get_headers(), params=workspace.wml_client._params()) if path_info_response.status_code != 200: raise ApiRequestFailure(u"Failure during getting path details", path_info_response) path_info = path_info_response.headers if 'X-Asset-Files-Type' in path_info and path_info['X-Asset-Files-Type'] == 'file': size = path_info['X-Asset-Files-Size'] else: size = 0 # -- end note except (ApiRequestFailure, AttributeError): # note try get size of file from local fs size = os.stat(path=self.path).st_size if os.path.isfile(path=self.path) else 0 # -- end note return size class AssetLocation(BaseLocation): def __init__(self, asset_id: str) -> None: self._wsd = self._is_wsd() self.href = None self._initial_asset_id = asset_id self.__wml_client = None if self._wsd: self._asset_name = None self._asset_id = None self._local_asset_path = None else: self.id = asset_id def _get_bucket(self, client) -> str: """Try to get bucket from data asset.""" connection_id = self._get_connection_id(client) conn_details = client.connections.get_details(connection_id) bucket = conn_details.get('entity', {}).get('properties', {}).get('bucket') if bucket is None: asset_details = client.data_assets.get_details(self.id) connection_path = asset_details['entity'].get('folder_asset', {}).get('connection_path') if connection_path is None: attachment_content = self._get_attachment_details(client) connection_path = attachment_content.get('connection_path') bucket = connection_path.split('/')[1] return bucket def _get_attachment_details(self, client) -> dict: if self.id is None and self.href: items = self.href.split('/') self.id = items[-1].split('?')[0] asset_details = client.data_assets.get_details(self.id) if 'attachment_id' in 
asset_details.get('metadata'): attachment_id = asset_details['metadata']['attachment_id'] else: attachment_id = asset_details['attachments'][0]['id'] attachment_url = client.service_instance._href_definitions.get_data_asset_href(self.id) attachment_url = f"{attachment_url}/attachments/{attachment_id}" if client.ICP: attachment = requests.get(attachment_url, headers=client._get_headers(), params=client._params()) else: attachment = requests.get(attachment_url, headers=client._get_headers(), params=client._params()) if attachment.status_code != 200: raise ApiRequestFailure(u"Failure during getting attachment details", attachment) return attachment.json() def _get_connection_id(self, client) -> str: attachment_content = self._get_attachment_details(client) return attachment_content.get('connection_id') @classmethod def _is_wsd(cls): if os.environ.get('USER_ACCESS_TOKEN') or os.environ.get('RUNTIME_ENV_ACCESS_TOKEN_FILE'): return False try: from project_lib import Project try: with all_logging_disabled(): access = Project.access() return True except RuntimeError: pass except ModuleNotFoundError: pass return False @classmethod def _set_path(cls, href: str) -> 'AssetLocation': items = href.split('/') _id = items[-1].split('?')[0] location = cls(_id) location.href = href return location def _get_file_size(self, workspace: 'WorkSpace', *args) -> 'int': if self._wsd: return self._wsd_get_file_size() else: asset_info_response = requests.get( workspace.wml_client.service_instance._href_definitions.get_data_asset_href(self.id), params=workspace.wml_client._params(), headers=workspace.wml_client._get_headers()) if asset_info_response.status_code != 200: raise ApiRequestFailure(u"Failure during getting asset details", asset_info_response) return asset_info_response.json()['metadata'].get('size') def _wsd_setup_local_asset_details(self) -> None: if not self._wsd: raise NotWSDEnvironment() # note: set local asset file from asset_id project = get_project() project_id = project.get_metadata()["metadata"]["guid"] local_assets = project.get_files() # note: reuse local asset_id when object is reused more times if self._asset_id is None: local_asset_id = self._initial_asset_id else: local_asset_id = self._asset_id # --- end note if local_asset_id not in str(local_assets): raise MissingLocalAsset(local_asset_id, reason="Provided asset_id cannot be found on WS Desktop.") else: for asset in local_assets: if asset['asset_id'] == local_asset_id: asset_name = asset['name'] self._asset_name = asset_name self._asset_id = local_asset_id local_asset_path = f"{os.path.abspath('.')}/{project_id}/assets/data_asset/{asset_name}" self._local_asset_path = local_asset_path def _wsd_move_asset_to_server(self, workspace: 'WorkSpace') -> None: if not self._wsd: raise NotWSDEnvironment() if not self._local_asset_path or self._asset_name or self._asset_id: self._wsd_setup_local_asset_details() remote_asset_details = workspace.wml_client.data_assets.create(self._asset_name, self._local_asset_path) self.href = remote_asset_details['metadata']['href'] def _wsd_get_file_size(self) -> 'int': if not self._wsd: raise NotWSDEnvironment() if not self._local_asset_path or self._asset_name or self._asset_id: self._wsd_setup_local_asset_details() return os.stat(path=self._local_asset_path).st_size if os.path.isfile(path=self._local_asset_path) else 0 @classmethod def list_wsd_assets(cls): if not cls._is_wsd(): raise NotWSDEnvironment project = get_project() return project.get_files() def to_dict(self) -> dict: """Return a json dictionary 
representing this model.""" _dict = vars(self).copy() if _dict.get('id', False) is None and _dict.get('href'): items = self.href.split('/') _dict['id'] = items[-1].split('?')[0] del _dict['_wsd'] del _dict[f"_{self.__class__.__name__}__wml_client"] if self._wsd: del _dict['_asset_name'] del _dict['_asset_id'] del _dict['_local_asset_path'] del _dict['_initial_asset_id'] return _dict @property def wml_client(self): return self.__wml_client @wml_client.setter def wml_client(self, var): self.__wml_client = var if self.__wml_client: self.href = self.__wml_client.service_instance._href_definitions.get_base_asset_href(self._initial_asset_id) else: self.href = f'/v2/assets/{self._initial_asset_id}' if not self._wsd: if self.__wml_client: if self.__wml_client.default_space_id: self.href = f'{self.href}?space_id={self.__wml_client.default_space_id}' else: self.href = f'{self.href}?project_id={self.__wml_client.default_project_id}' class ConnectionAssetLocation(BaseLocation): """Connection class to COS data storage. :param bucket: COS bucket name :type bucket: str :param file_name: COS data path in the bucket :type file_name: str :param model_location: path to the pipeline model in the COS :type model_location: str, optional :param training_status: path to the training status json in COS :type training_status: str, optional """ def __init__(self, bucket: str, file_name: str, **kwargs) -> None: self.bucket = bucket self.file_name = file_name self.path = file_name if kwargs.get('model_location') is not None: self._model_location = kwargs['model_location'] if kwargs.get('training_status') is not None: self._training_status = kwargs['training_status'] def _get_file_size(self, cos_resource_client: 'resource') -> 'int': try: size = cos_resource_client.Object(self.bucket, self.path).content_length except ClientError: size = 0 return size def to_dict(self) -> dict: """Return a json dictionary representing this model.""" return vars(self) class ConnectionAsset(BaseConnection): """Connection class for Connection Asset. :param connection_id: connection asset ID :type connection_id: str """ def __init__(self, connection_id: str): self.id = connection_id class NFSConnection(BaseConnection): """Connection class to file storage in CP4D of NFS format. :param asset_id: asset ID from the project on CP4D :type asset_id: str """ def __init__(self, asset_id: str): self.asset_id = asset_id self.id = asset_id class NFSLocation(BaseLocation): """Location class to file storage in CP4D of NFS format. :param path: data path form the project on CP4D :type path: str """ def __init__(self, path: str): self.path = path self.id = None self.file_name = None def _get_file_size(self, workspace: 'Workspace', *args) -> 'int': params = workspace.wml_client._params().copy() params['path'] = self.path params['detail'] = 'true' href = workspace.wml_client.connections._href_definitions.get_connection_by_id_href(self.id) + '/assets' asset_info_response = requests.get(href, params=params, headers=workspace.wml_client._get_headers(None)) if asset_info_response.status_code != 200: raise Exception(u"Failure during getting asset details", asset_info_response.json()) return asset_info_response.json()['details']['file_size'] def get_location(self) -> str: if hasattr(self, "file_name"): return self.file_name else: return self.path class CP4DAssetLocation(AssetLocation): """Connection class to data assets in CP4D. 
    :param asset_id: asset ID from the project on CP4D
    :type asset_id: str
    """

    def __init__(self, asset_id: str) -> None:
        super().__init__(asset_id)
        warning_msg = ("Deprecation Warning: Class CP4DAssetLocation is no longer supported and will be removed. "
                       "Use AssetLocation instead.")
        print(warning_msg)

    def _get_file_size(self, workspace: 'WorkSpace', *args) -> 'int':
        return super()._get_file_size(workspace)


class WMLSAssetLocation(AssetLocation):
    """Connection class to data assets in WML Server.

    :param asset_id: asset ID of the file loaded on space in WML Server
    :type asset_id: str
    """

    def __init__(self, asset_id: str) -> None:
        super().__init__(asset_id)
        warning_msg = ("Deprecation Warning: Class WMLSAssetLocation is no longer supported and will be removed. "
                       "Use AssetLocation instead.")
        print(warning_msg)

    def _get_file_size(self, workspace: 'WorkSpace', *args) -> 'int':
        return super()._get_file_size(workspace)

class CloudAssetLocation(AssetLocation):
    """Connection class to data assets as input data references to batch deployment job on Cloud.

    :param asset_id: asset ID of the file loaded on space on Cloud
    :type asset_id: str
    """

    def __init__(self, asset_id: str) -> None:
        super().__init__(asset_id)
        self.href = self.href
        warning_msg = ("Deprecation Warning: Class CloudAssetLocation is no longer supported and will be removed. "
                       "Use AssetLocation instead.")
        print(warning_msg)

    def _get_file_size(self, workspace: 'WorkSpace', *args) -> 'int':
        return super()._get_file_size(workspace)
class WSDAssetLocation(BaseLocation): """Connection class to data assets in WS Desktop. :param asset_id: asset ID from the project on WS Desktop :type asset_id: str """ def __init__(self, asset_id: str) -> None: self.href = None self._asset_name = None self._asset_id = None self._local_asset_path = None self._initial_asset_id = asset_id self.__wml_client = None warning_msg = ("Depreciation Warning: Class WSDAssetLocation is no longer supported and will be removed." "Use AssetLocation instead.") print(warning_msg) @classmethod def list_assets(cls): project = get_project() return project.get_files() def _setup_local_asset_details(self) -> None: # note: set local asset file from asset_id project = get_project() project_id = project.get_metadata()["metadata"]["guid"] local_assets = project.get_files() # note: reuse local asset_id when object is reused more times if self._asset_id is None: local_asset_id = self.href.split('/')[3].split('?space_id')[0] else: local_asset_id = self._asset_id # --- end note if local_asset_id not in str(local_assets): raise MissingLocalAsset(local_asset_id, reason="Provided asset_id cannot be found on WS Desktop.") else: for asset in local_assets: if asset['asset_id'] == local_asset_id: asset_name = asset['name'] self._asset_name = asset_name self._asset_id = local_asset_id local_asset_path = f"{os.path.abspath('.')}/{project_id}/assets/data_asset/{asset_name}" self._local_asset_path = local_asset_path def _move_asset_to_server(self, workspace: 'WorkSpace') -> None: if not self._local_asset_path or self._asset_name or self._asset_id: self._setup_local_asset_details() remote_asset_details = workspace.wml_client.data_assets.create(self._asset_name, self._local_asset_path) self.href = remote_asset_details['metadata']['href'] @classmethod def _set_path(cls, href: str) -> 'WSDAssetLocation': location = cls('.') location.href = href return location @property def wml_client(self): return self.__wml_client @wml_client.setter def wml_client(self, var): self.__wml_client = var if self.__wml_client: self.href = self.__wml_client.service_instance._href_definitions.get_base_asset_href(self._initial_asset_id) else: self.href = f'/v2/assets/{self._initial_asset_id}' def to_dict(self) -> dict: """Return a json dictionary representing this model.""" _dict = vars(self).copy() del _dict['_asset_name'] del _dict['_asset_id'] del _dict['_local_asset_path'] del _dict[f"_{self.__class__.__name__}__wml_client"] del _dict['_initial_asset_id'] return _dict def _get_file_size(self) -> 'int': if not self._local_asset_path or self._asset_name or self._asset_id: self._setup_local_asset_details() return os.stat(path=self._local_asset_path).st_size if os.path.isfile(path=self._local_asset_path) else 0

class DeploymentOutputAssetLocation(BaseLocation):
    """Connection class to data assets where the output of a batch deployment will be stored.

    :param name: name of the .csv file which will be saved as a data asset
    :type name: str

    :param description: description of the data asset
    :type description: str, optional
    """

    def __init__(self, name: str, description: str = "") -> None:
        self.name = name
        self.description = description
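
# Editorial usage sketch: DeploymentOutputAssetLocation is typically used as the output data
# reference of a batch deployment job so that scoring results land in a new data asset. The
# deployment ID is a placeholder, and the exact job payload shape (via ScoringMetaNames) is an
# assumption to verify against your client version.
def _example_batch_output_reference(wml_client, input_reference):
    output_reference = DataConnection(
        location=DeploymentOutputAssetLocation(name='scoring_results.csv',
                                               description='AutoAI batch scoring output')
    )
    job_payload = {
        wml_client.deployments.ScoringMetaNames.INPUT_DATA_REFERENCES: [input_reference._to_dict()],
        wml_client.deployments.ScoringMetaNames.OUTPUT_DATA_REFERENCE: output_reference._to_dict(),
    }
    return wml_client.deployments.create_job('<deployment_id>', meta_props=job_payload)
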

class DatabaseLocation(BaseLocation):
    """Location class to Database.

    :param schema_name: database schema name
    :type schema_name: str

    :param table_name: database table name
    :type table_name: str

    :param catalog_name: database catalog name, required only for the Presto data source
    :type catalog_name: str, optional
    """

    def __init__(self, schema_name: str, table_name: str, catalog_name: str = None, **kwargs) -> None:
        self.schema_name = schema_name
        self.table_name = table_name
        self.catalog_name = catalog_name

    def _get_file_size(self) -> None:
        raise NotImplementedError()

    def to_dict(self) -> dict:
        """Get a json dictionary representing DatabaseLocation."""
        return {key: value for key, value in vars(self).items() if value}
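
# Editorial usage sketch: reading a database table through a connection asset. The connection
# asset ID, schema and table names are placeholders; `catalog_name` would be added only for a
# Presto data source.
def _example_database_reference(wml_client):
    connection = DataConnection(
        connection_asset_id='<db_connection_asset_id>',        # placeholder connection asset ID
        location=DatabaseLocation(schema_name='PUBLIC',         # placeholder schema
                                  table_name='CUSTOMER_CHURN')  # placeholder table
    )
    connection.set_client(wml_client)
    return connection.read()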