# Source code for dse_do_utils.utilities

# Copyright IBM All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
from collections import namedtuple

import pandas as pd


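# General utilities module.
# Contains stand-alone helper functions: sys.path management, printing a folder tree,
# formatting file sizes, and iterating over DataFrame rows with named index fields.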


def add_sys_path(new_path):
    """Add a directory to Python's ``sys.path``.

    The path is not added if it does not exist or if it is already on
    ``sys.path``. Paths are compared in absolute, case-normalized form, so a
    relative and an absolute spelling of the same location count as equal.
    Based on: https://www.oreilly.com/library/view/python-cookbook/0596001673/ch04s23.html

    Challenge: in order to use this function, we need to import the dse_do_utils package
    and thus we need to add its location to sys.path first!
    This will work better once we can do a pip install dse-do_utils.

    Args:
        new_path: Path to append to ``sys.path``.

    Returns:
        int: 1 if the path was appended, -1 if ``new_path`` does not exist,
        0 if it was already on ``sys.path``.
    """
    import os
    import sys

    # Avoid adding nonexistent paths
    if not os.path.exists(new_path):
        return -1

    # Standardize the path. os.path.normcase lower-cases on Windows (where
    # paths are case-insensitive) and leaves the path untouched elsewhere.
    new_path = os.path.normcase(os.path.abspath(new_path))

    # Check against all currently available paths
    for existing in sys.path:
        existing = os.path.normcase(os.path.abspath(existing))
        if new_path in (existing, existing + os.sep):
            return 0

    # Note: the standardized (absolute) form is what ends up on sys.path.
    sys.path.append(new_path)
    return 1


def list_file_hierarchy(startpath: str) -> None:
    """Hierarchically print the contents of the folder tree, starting with the `startpath`.

    Each directory is printed as ``name/`` and each file by its name, indented
    by four spaces per level below `startpath`. The depth is derived from the
    path of each directory relative to `startpath`, so a trailing path
    separator on `startpath` does not change the output.

    Usage::

        current_dir = os.getcwd()
        parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
        parent_dir_2 = os.path.abspath(os.path.join(parent_dir, os.pardir))
        list_file_hierarchy(parent_dir_2) #List tree starting at the grand-parent of the current directory


    Args:
        startpath (str): Root of the tree

    Returns:
        None
    """
    import os
    for root, _dirs, files in os.walk(startpath):
        # Depth below startpath: '.' is the root itself; otherwise the number
        # of path components in the relative path.
        relative = os.path.relpath(root, startpath)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath drops a trailing separator, which would otherwise make
        # basename return an empty string for the root directory.
        print('{}{}/'.format(indent, os.path.basename(os.path.normpath(root))))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))


def convert_size(size_bytes: int) -> str:
    """Returns string describing file size.

    The size is expressed in the largest binary unit (powers of 1024) that
    keeps the value at 1 or above, rounded to two decimals, e.g. ``"1.5 KB"``.
    Sizes beyond the largest known unit are expressed in that unit ("YB").

    Args:
        size_bytes (int): size of file in bytes

    Returns:
        str: ``"0B"`` for a size of 0, otherwise ``"<value> <unit>"``.

    Raises:
        ValueError: if `size_bytes` is negative.

    From https://stackoverflow.com/questions/5194057/better-way-to-convert-file-sizes-in-python
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative, got %r" % (size_bytes,))
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Find the unit with exact comparisons instead of a floating-point
    # logarithm, and stop at the last unit so the index never leaves the tuple.
    i = 0
    p = 1
    while i < len(size_name) - 1 and size_bytes >= p * 1024:
        p *= 1024
        i += 1
    s = round(size_bytes / p, 2)
    return "%s %s" % (s, size_name[i])


def df_itertuples_with_index_names(df: pd.DataFrame):
    """Alternative for df.itertuples() where we add the index as named attributes to the tuple.
    This allows access to the index column in the same way as a regular column.
    This will make it much easier to access the values of the named index.

    Normally with df.itertuples() one must access the values of the Index by position, e.g.::

        for row in df.itertuples():
            (index_a, index_b) = row.Index
            print(index_a)

    One would have to ensure to extract all index columns and know the order in the Index.
    However, with this function we can do::

        for row in df_itertuples_with_index_names(df):
            print(row.index_a)

    Test::

        # Create a sample df
        index = pd.MultiIndex.from_product([range(2), range(3)], names=['index_a', 'index_b'])
        df = pd.DataFrame({'my_column': range(len(index))}, index=index)
        # Loop over itertuples alternative:
        for row in df_itertuples_with_index_names(df):
            print(row.index_a)

    Works for both a MultiIndex and a regular single-level index. Field names that
    are not valid or are duplicated (e.g. an unnamed index level, or an index name
    equal to a column name) are replaced by positional names such as ``_3``.

    Index columns are added at the tail of the tuple, so to be compatible with code that uses the position of the fields in the tuple.
    Inspired by https://stackoverflow.com/questions/46151666/iterate-over-pandas-dataframe-with-multiindex-by-index-names.

    Args:
        df (pd.DataFrame): DataFrame to iterate over.

    Yields:
        Row: namedtuple with fields ``Index``, then the columns, then the index levels.
    """
    # rename=True replaces invalid/duplicate field names by positional ones instead of raising.
    Row = namedtuple("Row", ['Index', *df.columns, *df.index.names], rename=True)
    has_multi_index = isinstance(df.index, pd.MultiIndex)
    for row in df.itertuples():
        # The first element is the index value: a tuple for a MultiIndex, a scalar otherwise.
        index_values = row[0] if has_multi_index else (row[0],)
        yield Row(*row, *index_values)