Working with DataConnection

Before you start an AutoAI experiment, you need to specify where your training dataset is located. AutoAI supports Cloud Object Storage (COS) and data assets on Cloud.

IBM Cloud - DataConnection Initialization

There are three types of connections: Connection Asset, Data Asset, and Container. To upload your experiment dataset, you must initialize DataConnection with your COS credentials.

Connection Asset

from ibm_watsonx_ai.helpers.connections import DataConnection, S3Location

# Create a connection asset that stores the COS credentials, so the
# DataConnection objects below only need to reference it by ID.
connection_details = client.connections.create({
    client.connections.ConfigurationMetaNames.NAME: "Connection to COS",
    client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: client.connections.get_datasource_type_id_by_name('bluemixcloudobjectstorage'),
    client.connections.ConfigurationMetaNames.PROPERTIES: {
        'bucket': 'bucket_name',
        'access_key': 'COS access key id',
        'secret_key': 'COS secret access key',
        'iam_url': 'COS iam url',
        'url': 'COS endpoint url'
    }
})

# note: get_id() is used here, consistent with client.data_assets.get_id() in the Data Asset example
connection_id = client.connections.get_id(connection_details)

# note: this DataConnection will be used as a reference where to find your training dataset
training_data_references = DataConnection(
    connection_asset_id=connection_id,
    location=S3Location(
        bucket='bucket_name',   # note: COS bucket name where training dataset is located
        path='my_path'  # note: path within bucket where your training dataset is located
    )
)

# note: this DataConnection will be used as a reference where to save all of the AutoAI experiment results
results_connection = DataConnection(
    connection_asset_id=connection_id,
    # note: bucket name and path could be different or the same as specified in the training_data_references
    location=S3Location(
        bucket='bucket_name',
        path='my_path'
    )
)

Data Asset

from ibm_watsonx_ai.helpers.connections import DataConnection

# Local path of the dataset file to upload as a data asset.
data_location = './your_dataset.csv'

# The asset is named after the file itself (last component of the path).
asset_details = client.data_assets.create(
    name=data_location.split('/')[-1],
    file_path=data_location
)

# Reference the newly created data asset by its ID.
asset_id = client.data_assets.get_id(asset_details)
training_data_references = DataConnection(data_asset_id=asset_id)

Container

from ibm_watsonx_ai.helpers.connections import DataConnection, ContainerLocation

# note: the dataset is referenced by its path inside the container
training_data_references = DataConnection(
    location=ContainerLocation(path="your_dataset.csv"),
)

IBM watsonx.ai software - DataConnection Initialization

There are three types of connections: Connection Asset, Data Asset, and FS. FS is only for saving result references. To upload your experiment dataset, you must initialize DataConnection with your service credentials.

Connection Asset - DatabaseLocation

from ibm_watsonx_ai.helpers.connections import DataConnection, DatabaseLocation

# Create a connection asset for the database.
# note: your_database_name, schema_name and table_name are placeholders that you must define beforehand
connection_details = client.connections.create({
    client.connections.ConfigurationMetaNames.NAME: f"Connection to Database - {your_database_name} ",
    client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: client.connections.get_datasource_type_id_by_name('your_database_name'),
    client.connections.ConfigurationMetaNames.PROPERTIES: {
        "database": "database_name",
        "password": "database_password",
        "port": "port_number",
        "host": "host_name",
        # NOTE(review): the placeholder value reads "database_type" although the key is the user name - confirm
        "username": "database_type"  # e.g. "postgres"
    }
})

# note: get_id() is used here, consistent with client.data_assets.get_id() in the Data Asset example
connection_id = client.connections.get_id(connection_details)

# note: this DataConnection points at a single table, identified by schema and table name
training_data_references = DataConnection(
    connection_asset_id=connection_id,
    location=DatabaseLocation(
        schema_name=schema_name,
        table_name=table_name,
    )
)

Connection Asset - S3Location

For a Connection Asset with S3Location, connection_id to the S3 storage is required.

from ibm_watsonx_ai.helpers.connections import DataConnection, S3Location

# note: connection_id must be the ID of an existing connection asset to the S3 storage
s3_location = S3Location(
    bucket='bucket_name',  # note: COS bucket name where training dataset is located
    path='my_path',  # note: path within bucket where your training dataset is located
)

training_data_references = DataConnection(
    connection_asset_id=connection_id,
    location=s3_location,
)

Connection Asset - NFSLocation

Before establishing a connection, you need to create and start a volume where the dataset will be stored.

from ibm_watsonx_ai.helpers.connections import DataConnection, NFSLocation

# Create a connection asset for the NFS volume.
# note: volume_id, existing_pvc_volume_name, volume_name and filename are placeholders that you must define beforehand
# note: the details returned by create() are kept, because the connection ID
#       is read from them (not from the metadata passed in)
connection_details = client.connections.create({
    client.connections.ConfigurationMetaNames.NAME: "Client NFS Volume Connection from SDK",
    client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: client.connections.get_datasource_type_id_by_name('volumes'),
    client.connections.ConfigurationMetaNames.DESCRIPTION: "NFS volume connection from python client",
    client.connections.ConfigurationMetaNames.PROPERTIES: {
        "instance_id": volume_id,
        "pvc": existing_pvc_volume_name,
        "volume": volume_name,
        'inherit_access_token': "true"
    },
    'flags': ['personal_credentials'],
})

# note: get_id() is used here, consistent with client.data_assets.get_id() in the Data Asset example
connection_id = client.connections.get_id(connection_details)

# note: path of the dataset file within the volume
training_data_references = DataConnection(
    connection_asset_id=connection_id,
    location=NFSLocation(path=f'/{filename}')
)

Data Asset

from ibm_watsonx_ai.helpers.connections import DataConnection

# Local path of the dataset file to upload as a data asset.
data_location = './your_dataset.csv'

# note: the asset is named after the file itself (last component of the path)
dataset_file_name = data_location.split('/')[-1]
asset_details = client.data_assets.create(
    name=dataset_file_name,
    file_path=data_location,
)

# Reference the created data asset by its ID.
asset_id = client.data_assets.get_id(asset_details)
training_data_references = DataConnection(data_asset_id=asset_id)

FSLocation

After running fit(), you can read your results from a dedicated place in the cluster’s filesystem using FSLocation.

from ibm_watsonx_ai.helpers.connections import DataConnection, FSLocation

# note: path to the directory in the filesystem that holds the results
results_location = FSLocation(path="path_to_directory")
training_result_reference = DataConnection(location=results_location)

Batch DataConnection

If you use a Batch type of deployment, you can store the output of the Batch deployment using DataConnection. For more information and usage instructions, see Batch.

from ibm_watsonx_ai.helpers.connections import DataConnection, DeploymentOutputAssetLocation
from ibm_watsonx_ai.deployment import Batch

# note: wml_credentials, space_id and training_data_references are expected to be defined beforehand
service_batch = Batch(wml_credentials, source_space_id=space_id)

# Create the Batch deployment for a pipeline from an experiment run.
service_batch.create(
    experiment_run_id="id_of_your_experiment_run",
    model="choosen_pipeline",
    deployment_name='Batch deployment',
)

# note: input data for the batch job
payload_reference = DataConnection(location=training_data_references)

# note: the output of the batch job is stored under this file name
output_location = DeploymentOutputAssetLocation(name="batch_output_file_name.csv")
results_reference = DataConnection(location=output_location)

# note: background_mode=False is passed so that run_job() is not executed in background mode
scoring_params = service_batch.run_job(
    payload=[payload_reference],
    output_data_reference=results_reference,
    background_mode=False,
)

Upload your training dataset

An AutoAI experiment should have access to your training data. If you don’t have a training dataset stored already, you can store it by invoking the write() method of the DataConnection object.

# note: the client is attached to the DataConnection before writing
training_data_references.set_client(client)

# note: data is the local dataset path, remote_name is the name it is stored under
training_data_references.write(
    data='local_path_to_the_dataset',
    remote_name='training_dataset.csv',
)

Download your training dataset

To download a stored dataset, use the read() method of the DataConnection object.

# note: the client is attached to the DataConnection before reading
training_data_references.set_client(client)

# note: returns a pandas DataFrame
dataset = training_data_references.read()