Working with DataConnection¶
Before you start an AutoAI experiment, you need to specify where your training dataset is located. AutoAI supports Cloud Object Storage (COS) and data assets on Cloud.
IBM Cloud - DataConnection Initialization¶
There are three types of connections: Connection Asset, Data Asset, and Container. To upload your experiment dataset, you must initialize DataConnection
with your COS credentials.
Connection Asset¶
from ibm_watsonx_ai.helpers.connections import DataConnection, S3Location
# Create a connection asset that stores the COS credentials.
connection_details = client.connections.create({
    client.connections.ConfigurationMetaNames.NAME: "Connection to COS",
    client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: client.connections.get_datasource_type_id_by_name('bluemixcloudobjectstorage'),
    client.connections.ConfigurationMetaNames.PROPERTIES: {
        'bucket': 'bucket_name',
        'access_key': 'COS access key id',
        'secret_key': 'COS secret access key',
        'iam_url': 'COS iam url',
        'url': 'COS endpoint url'
    }
})

# The connection id is what every DataConnection below refers to.
connection_id = client.connections.get_uid(connection_details)

# note: this DataConnection will be used as a reference where to find your training dataset
training_data_references = DataConnection(
    connection_asset_id=connection_id,
    location=S3Location(
        bucket='bucket_name',  # note: COS bucket name where training dataset is located
        path='my_path'  # note: path within bucket where your training dataset is located
    )
)

# note: this DataConnection will be used as a reference where to save all of the AutoAI experiment results
results_connection = DataConnection(
    connection_asset_id=connection_id,
    # note: bucket name and path could be different or the same as specified in the training_data_references
    location=S3Location(
        bucket='bucket_name',
        path='my_path'
    )
)
Data Asset¶
from ibm_watsonx_ai.helpers.connections import DataConnection
# Local path of the dataset file to upload as a data asset.
data_location = './your_dataset.csv'

# Create the data asset; its name is the file name taken from the path.
asset_details = client.data_assets.create(
    name=data_location.split('/')[-1],
    file_path=data_location
)

# Point the DataConnection at the newly created asset.
asset_id = client.data_assets.get_id(asset_details)
training_data_references = DataConnection(data_asset_id=asset_id)
Container¶
from ibm_watsonx_ai.helpers.connections import DataConnection, ContainerLocation
# Reference a dataset by its path using a ContainerLocation.
training_data_references = DataConnection(
    location=ContainerLocation(path="your_dataset.csv"),
)
IBM watsonx.ai software - DataConnection Initialization¶
There are three types of connections: Connection Asset, Data Asset, and
FS. FS is only for saving result references. To upload your experiment dataset,
you must initialize DataConnection
with your service credentials.
Connection Asset - DatabaseLocation¶
from ibm_watsonx_ai.helpers.connections import DataConnection, DatabaseLocation
# Short alias for the metadata-key namespace used to describe the connection.
meta_names = client.connections.ConfigurationMetaNames

# Create a connection asset for the database.
connection_details = client.connections.create({
    meta_names.NAME: f"Connection to Database - {your_database_name} ",
    meta_names.DATASOURCE_TYPE: client.connections.get_datasource_type_id_by_name('your_database_name'),
    meta_names.PROPERTIES: {
        "database": "database_name",
        "password": "database_password",
        "port": "port_number",
        "host": "host_name",
        # NOTE(review): the placeholder value reads "database_type" although the
        # key is "username" - presumably the database user name goes here; confirm.
        "username": "database_type"  # e.g. "postgres"
    },
})
connection_id = client.connections.get_uid(connection_details)

# Describe which schema and table hold the training data.
table_location = DatabaseLocation(
    schema_name=schema_name,
    table_name=table_name,
)
training_data_references = DataConnection(
    connection_asset_id=connection_id,
    location=table_location,
)
Connection Asset - S3Location¶
For a Connection Asset with S3Location, you need the connection_id of an existing connection to the S3 storage.
from ibm_watsonx_ai.helpers.connections import DataConnection, S3Location
# Where the training dataset lives inside the S3 storage.
dataset_location = S3Location(
    bucket='bucket_name',  # note: COS bucket name where training dataset is located
    path='my_path',  # note: path within bucket where your training dataset is located
)

# Combine the existing connection id with that location.
training_data_references = DataConnection(
    connection_asset_id=connection_id,
    location=dataset_location,
)
Connection Asset - NFSLocation¶
Before establishing a connection, you need to create and start a volume where the dataset will be stored.
from ibm_watsonx_ai.helpers.connections import DataConnection, NFSLocation
# Metadata describing the NFS volume connection to create.
connection_props = {
    client.connections.ConfigurationMetaNames.NAME: "Client NFS Volume Connection from SDK",
    client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: client.connections.get_datasource_type_id_by_name('volumes'),
    client.connections.ConfigurationMetaNames.DESCRIPTION: "NFS volume connection from python client",
    client.connections.ConfigurationMetaNames.PROPERTIES: {
        "instance_id": volume_id,
        "pvc": existing_pvc_volume_name,
        "volume": volume_name,
        'inherit_access_token': "true"
    },
    'flags': ['personal_credentials'],
}

# Keep the details returned by create(): the connection id must be read from
# the created connection, not from the input metadata dictionary.
connection_details = client.connections.create(connection_props)
connection_id = client.connections.get_uid(connection_details)

# Reference the dataset file by its path on the volume.
training_data_references = DataConnection(
    connection_asset_id=connection_id,
    location=NFSLocation(path=f'/{filename}')
)
Data Asset¶
from ibm_watsonx_ai.helpers.connections import DataConnection
# Local path of the dataset file to upload as a data asset.
data_location = './your_dataset.csv'

# The asset is named after the last path component (the file name).
asset_name = data_location.split('/')[-1]
asset_details = client.data_assets.create(name=asset_name, file_path=data_location)

# Build the DataConnection from the id of the created asset.
asset_id = client.data_assets.get_id(asset_details)
training_data_references = DataConnection(data_asset_id=asset_id)
FSLocation¶
After running fit(), you can read your results from a dedicated location in the cluster’s filesystem using FSLocation.
from ibm_watsonx_ai.helpers.connections import DataConnection, FSLocation
# Directory in the cluster's filesystem where the results are stored.
results_location = FSLocation(path="path_to_directory")
training_result_reference = DataConnection(location=results_location)
Batch DataConnection¶
If you use a Batch type of deployment, you can store the output of the Batch deployment using DataConnection.
For more information and usage instructions, see Batch.
from ibm_watsonx_ai.helpers.connections import DataConnection, DeploymentOutputAssetLocation
from ibm_watsonx_ai.deployment import Batch
# Create the Batch deployment service for the given space and deploy the model.
service_batch = Batch(wml_credentials, source_space_id=space_id)
service_batch.create(
    experiment_run_id="id_of_your_experiment_run",
    model="choosen_pipeline",
    deployment_name='Batch deployment',
)

# Input data for the batch job.
# NOTE(review): training_data_references is passed as `location` here; if it is
# already a DataConnection (as in the sections above), confirm this is intended.
payload_reference = DataConnection(location=training_data_references)

# Output: the job result is stored under this file name.
output_location = DeploymentOutputAssetLocation(name="batch_output_file_name.csv")
results_reference = DataConnection(location=output_location)

# Run the job with background mode disabled.
scoring_params = service_batch.run_job(
    payload=[payload_reference],
    output_data_reference=results_reference,
    background_mode=False,
)
Upload your training dataset¶
An AutoAI experiment should have access to your training data.
If you don’t have a training dataset stored already,
you can store it by invoking the write()
method of the DataConnection
object.
# Attach the API client to the DataConnection before calling write().
training_data_references.set_client(client)

# Store the local dataset under the given remote name.
training_data_references.write(
    data='local_path_to_the_dataset',
    remote_name='training_dataset.csv',
)
Download your training dataset¶
To download a stored dataset, use the read()
method of the DataConnection
object.
# Attach the API client to the DataConnection before calling read().
training_data_references.set_client(client)

# note: returns a pandas DataFrame
dataset = training_data_references.read()