# Source code for dse_do_utils.multiscenariomanager

# Copyright IBM All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

# -----------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------
# MultiScenarioManager
# VT 20230304: DEPRECATED. Requires review
# -----------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------

import pandas as pd
import os
from typing import Sequence, List, Dict, Tuple, Optional

#  Typing aliases
Inputs = Dict[str, pd.DataFrame]
Outputs = Dict[str, pd.DataFrame]
InputsOutputs = Tuple[Inputs, Outputs]

try:
    # Import as part of package
    from .scenariomanager import ScenarioManager
except:
    # import as part of DO Model Builder
    from scenariomanager import ScenarioManager

# from dse_do_utils import ScenarioManager


class MultiScenarioManager(object):
    """Manages multiple scenarios from the same DO Model/Experiment.

    Can export all scenarios in one Excel spreadsheet, where it adds the scenario_name as an additional column.
    Also adds an additional 'Scenario' table. (This looks relevant for usage (filtering) in Cognos.)
    By default, writes an Excel file in datasets named "model_name + '_multi_output'.xlsx"

    Usage 1 - All scenarios from Model::

        model_name = 'My Model'
        msm = MultiScenarioManager(model_name=model_name)
        msm.get_multi_scenario_data()
        msm.write_data_to_excel()


    Usage 2 - Selected scenarios from Model::

        model_name = 'My Model'
        scenario_names = ['Scenario 1', 'Scenario 2']
        msm = MultiScenarioManager(model_name=model_name, scenario_names=scenario_names)
        msm.get_multi_scenario_data()
        msm.write_data_to_excel()

    """

    def __init__(self, model_name: Optional[str] = None, scenario_names: Optional[List[str]] = None,
                 local_root: Optional[str] = None, project_id: Optional[str] = None,
                 project_access_token: Optional[str] = None, project=None):
        """Create a MultiScenarioManager.

        Args:
            model_name (str): name of the DO model whose scenarios are managed
            scenario_names (List[str]): list of names of scenarios to export. If not specified (None) or empty
                then it will select all scenarios in the model
            local_root (str): Path of root when running on a local computer
            project_id (str): Project-id, when running in WS Cloud, also requires a project_access_token
            project_access_token (str): When running in WS Cloud, also requires a project_id
            project (project_lib.Project): alternative for project_id and project_access_token for WS Cloud

        Raises:
            ValueError: if no DO model named `model_name` exists (raised while listing the scenarios).
        """
        self.model_name = model_name
        self.local_root = local_root
        self.project_id = project_id
        self.project_access_token = project_access_token
        self.project = project
        # Both None and an empty list mean "all scenarios in the model". Copy the list so that
        # later changes to self.scenario_names never leak back into the caller's list.
        requested_names = list(scenario_names) if scenario_names else None
        self.scenarios_df = self.get_scenarios_df(requested_names)
        if requested_names is None:
            self.scenario_names = list(self.scenarios_df['scenario_name'])
        else:
            self.scenario_names = requested_names
        self.inputs_by_scenario: Dict[str, Inputs] = {}
        self.outputs_by_scenario: Dict[str, Outputs] = {}
        self.inputs = None
        self.outputs = None

    def get_dd_client(self):
        """Return the Client managing the DO scenario.

        Uses the project context of `self.project` when available, otherwise builds a Project from
        `project_id` + `project_access_token`, otherwise creates a Client without a project context.

        Returns: new dd_scenario.Client
        """
        from dd_scenario import Client
        if self.project is not None:
            return Client(pc=self.project.project_context)
        if (self.project_id is not None) and (self.project_access_token is not None):
            # When in WS Cloud:
            from project_lib import Project
            # The do_optimization project token is an authorization token used to access project resources
            # like data sources, connections, and used by platform APIs.
            project = Project(project_id=self.project_id,
                              project_access_token=self.project_access_token)
            return Client(pc=project.project_context)
        #  In WSL/CPD:
        return Client()

    def load_data_from_scenario(self, scenario_name) -> Tuple[Dict[str, pd.DataFrame], Dict[str, pd.DataFrame]]:
        """Load the inputs and outputs of one scenario through a new ScenarioManager.

        TODO: see if by re-using a Client, this can be done faster

        Args:
            scenario_name (str): name of the scenario to load

        Returns:
            (inputs, outputs) as returned by `ScenarioManager.load_data_from_scenario`
        """
        sm = ScenarioManager(self.model_name, scenario_name, self.local_root, self.project_id,
                             self.project_access_token, self.project)
        inputs, outputs = sm.load_data_from_scenario()
        return inputs, outputs

    def env_is_wscloud(self) -> bool:
        """Return true if environment is WS Cloud (detected from the PWD environment variable)."""
        return os.environ.get('PWD') == '/home/dsxuser/work'

    def get_data_directory(self) -> str:
        """Returns the path to the datasets folder.

        :return: path to the datasets folder
        """
        if ScenarioManager.env_is_cpd40():
            from ibm_watson_studio_lib import access_project_or_space
            wslib = access_project_or_space()
            return wslib.mount.get_base_dir()
        if self.env_is_wscloud():
            return '/home/dsxuser/work'  # or use os.environ['PWD'] ?
        if ScenarioManager.env_is_cpd25():
            # Note that the data dir in CPD25 is not an actual real directory and is NOT in the hierarchy
            # of the JupyterLab folder. Do NOT use the os.path.join!
            return '/project_data/data_asset'
        # DSX and the local file system both keep the data in <root>/datasets
        return os.path.join(self.get_root_directory(), 'datasets')

    def get_root_directory(self) -> str:
        """Return the root directory of the file system.

        If system is WS, it will return the DSX root, otherwise the directory specified in the local_root.

        Raises:
            ValueError: if local_root is needed but not specified, or if the root directory doesn't exist.
        """
        if ScenarioManager.env_is_cpd25():
            root_dir = '.'
        elif ScenarioManager.env_is_dsx():  # Note that this is False in DO! So don't run in DO
            root_dir = os.environ['DSX_PROJECT_DIR']
        else:
            if self.local_root is None:
                raise ValueError('The local_root should be specified if loading from a file from outside of WS')
            root_dir = self.local_root
        # Assert that root_dir actually exists
        if not os.path.isdir(root_dir):
            raise ValueError("Root directory `{}` does not exist.".format(root_dir))
        return root_dir

    def add_data_file_to_project(self, file_path: str, file_name: Optional[str] = None) -> None:
        """Add a data file to the Watson Studio project.

        Applies to CP4Dv2.5 and WS Cloud
        Needs to be called after the file has been saved regularly in the file system in
        `/project_data/data_asset/` (for CPD2.5) or `/home/dsxuser/work/` in WS Cloud.
        Ensures the file is visible in the Data Assets of the Watson Studio UI.

        Args:
            file_path (str): full file path, including the file name and extension
            file_name (str): name of data asset. Default is None. If None, the file-name will be extracted from the file_path.
        """
        # Add to Project; lazily obtain (and remember) the project when none was supplied
        if self.project is None:
            from project_lib import Project
            self.project = Project.access()
        if file_name is None:
            file_name = os.path.basename(file_path)
        with open(file_path, 'rb') as f:
            self.project.save_data(file_name=file_name, data=f, overwrite=True)

    def get_multi_scenario_data(self, scenario_names: Optional[List[str]] = None):
        """Load the data of the scenarios and merge them into `self.inputs` and `self.outputs`.

        The per-scenario data is kept in `self.inputs_by_scenario` / `self.outputs_by_scenario`
        (entries of earlier calls are retained). The merged inputs get an extra 'Scenario' table.

        Args:
            scenario_names (List[str]): scenarios to load. If None, uses `self.scenario_names`.
        """
        if scenario_names is None:
            scenario_names = self.scenario_names
        for scenario_name in scenario_names:
            inputs, outputs = self.load_data_from_scenario(scenario_name)
            self.inputs_by_scenario[scenario_name] = inputs
            self.outputs_by_scenario[scenario_name] = outputs
        self.inputs = MultiScenarioManager.merge_scenario_data(self.inputs_by_scenario)
        self.inputs['Scenario'] = self.scenarios_df  # Adding the Scenarios as a reference table
        self.outputs = MultiScenarioManager.merge_scenario_data(self.outputs_by_scenario)

    def write_data_to_excel(self, excel_file_name: Optional[str] = None) -> str:
        """Write inputs and/or outputs to an Excel file in datasets.

        The inputs and outputs as in the attributes `self.inputs` and `self.outputs` of this manager.

        If the excel_file_name is None, it will be generated from the model_name: MODEL_NAME + "_multi_output"

        Args:
            excel_file_name (str): The file name for the Excel file, without the '.xlsx' extension.

        Returns:
            The full path of the Excel file that was written.

        Raises:
            ValueError: if neither excel_file_name nor model_name has been specified.
        """
        if excel_file_name is None:
            if self.model_name is None:
                raise ValueError(
                    "The argument excel_file_name can only be 'None' if the model_name '{}' has been specified.".format(
                        self.model_name))
            excel_file_name = f"{self.model_name}_multi_output"

        # Save the regular Excel file:
        data_dir = self.get_data_directory()
        excel_file_path = os.path.join(data_dir, excel_file_name + '.xlsx')
        # The context manager closes (and thereby saves) the workbook, also when writing fails.
        # ExcelWriter.save() no longer exists in pandas >= 2.0.
        with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as writer:
            ScenarioManager.write_data_to_excel_s(writer, inputs=self.inputs, outputs=self.outputs)
        if ScenarioManager.env_is_cpd25():
            self.add_data_file_to_project(excel_file_path, excel_file_name + '.xlsx')
        return excel_file_path

    @staticmethod
    def merge_scenario_data(data_by_scenario: Dict[str, Dict[str, pd.DataFrame]]) -> Dict[str, pd.DataFrame]:
        """Add scenario_name as column and merge the tables of all scenarios by table name.

        The DataFrames passed in are not modified. If tables with the same name have different columns,
        the merged table has the union of the columns, with NaN for the missing values.

        Args:
            data_by_scenario: for each scenario name, a dict of table name -> DataFrame

        Returns:
            dict of table name -> DataFrame with the rows of all scenarios and a `scenario_name` column
        """
        frames_by_table: Dict[str, List[pd.DataFrame]] = {}
        for scenario, data_dict in data_by_scenario.items():
            for table_name, df in data_dict.items():
                # assign() returns a copy, so the caller's DataFrame does not get the extra column
                frames_by_table.setdefault(table_name, []).append(df.assign(scenario_name=scenario))
        # Concatenate once per table; a table that occurs in only one scenario keeps its index as is.
        return {
            table_name: frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True, sort=False)
            for table_name, frames in frames_by_table.items()
        }

    def _get_scenarios_dict(self) -> dict:
        """Return the scenarios of the model as a dict keyed by scenario name.

        Raises:
            ValueError: if no DO model named `self.model_name` exists.
        """
        client = self.get_dd_client()
        model_builder = client.get_model_builder(name=self.model_name)
        if model_builder is None:
            raise ValueError('No DO model with name `{}` exists'.format(self.model_name))
        return model_builder.get_scenarios(as_dict=True)

    def get_all_scenario_names(self):
        """Deprecated. Replaced by get_scenarios_df

        Returns:
            list with the names of all scenarios in the model
        """
        return list(self._get_scenarios_dict().keys())

    def get_scenarios_df(self, scenario_names: Optional[List[str]] = None) -> pd.DataFrame:
        """Return scenarios as Dataframe.

        If scenario_names is None, will get all scenarios in Model.
        Else, just the ones matching the names.
        For now, the only column in the df is the scenario_name.
        More can be added later.

        Raises:
            ValueError: if no DO model named `self.model_name` exists.
        """
        df = pd.DataFrame({'scenario_name': list(self._get_scenarios_dict().keys())})
        if scenario_names is not None:
            df = df[df['scenario_name'].isin(scenario_names)]
        return df