# Source code for dse_do_utils.utilities
# Copyright IBM All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
from collections import namedtuple
import pandas as pd
# General utilities module
# Contains functions
def add_sys_path(new_path):
    """Append a directory to Python's ``sys.path`` unless it is already there.

    The path is made absolute before it is compared and appended. On Windows
    (``sys.platform == 'win32'``) it is also lower-cased, because paths are
    case-insensitive there.

    Based on: https://www.oreilly.com/library/view/python-cookbook/0596001673/ch04s23.html

    Challenge: in order to use this function, the dse_do_utils package must be
    imported first, which means its location must already be on sys.path.
    This will work better once a pip install of dse-do-utils is possible.

    Args:
        new_path: Path to add.

    Returns:
        1 if the path was appended, 0 if it was already on ``sys.path``,
        -1 if ``new_path`` does not exist.
    """
    import os
    import sys

    def _canonical(path):
        # Absolute form, lower-cased on Windows only, so that two spellings
        # of the same location compare equal.
        absolute = os.path.abspath(path)
        return absolute.lower() if sys.platform == 'win32' else absolute

    # Never put a nonexistent location on sys.path.
    if not os.path.exists(new_path):
        return -1

    candidate = _canonical(new_path)

    # Compare against every current entry, with and without a trailing separator.
    already_listed = any(
        candidate in (known, known + os.sep)
        for known in map(_canonical, sys.path)
    )
    if already_listed:
        return 0

    sys.path.append(candidate)
    return 1
def list_file_hierarchy(startpath: str) -> None:
    """Hierarchically print the contents of the folder tree, starting with the `startpath`.

    Each folder is printed as ``name/`` and each file on its own line,
    indented by four spaces per level below `startpath`.

    Usage::

        current_dir = os.getcwd()
        parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
        parent_dir_2 = os.path.abspath(os.path.join(parent_dir, os.pardir))
        list_file_hierarchy(parent_dir_2)  # List tree starting at the grand-parent of the current directory

    Args:
        startpath (str): Root of the tree

    Returns:
        None
    """
    import os
    for root, _dirs, files in os.walk(startpath):
        # Depth of `root` below `startpath`. relpath() is used instead of a
        # string replace so that a trailing separator on `startpath`, or the
        # text of `startpath` recurring deeper in the tree, cannot skew the
        # level.
        relative = os.path.relpath(root, startpath)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1
        # basename() is empty when the path ends with a separator; fall back
        # to the last real component so the root folder still shows its name.
        name = os.path.basename(root) or os.path.basename(os.path.dirname(root))
        indent = ' ' * 4 * level
        print('{}{}/'.format(indent, name))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))
def convert_size(size_bytes: int):
    """Returns string describing file size.

    The size is scaled by powers of 1024 to the largest unit that keeps the
    value at or above 1, rounded to two decimals, and formatted as
    ``"<value> <unit>"`` (e.g. ``"1.5 KB"``). Sizes beyond the largest known
    unit are expressed in that unit ("YB"), and values below 1 are expressed
    in bytes.

    Args:
        size_bytes (int): size of file in bytes

    Returns:
        str: Human-readable size. A size of exactly 0 yields ``"0B"`` (without
        a space), kept as-is for backward compatibility.

    Raises:
        ValueError: If `size_bytes` is negative.

    From https://stackoverflow.com/questions/5194057/better-way-to-convert-file-sizes-in-python
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative, got {}".format(size_bytes))
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Repeated division by 1024 is exact for floats (power of two), so unit
    # boundaries are not subject to the rounding error of a float logarithm.
    # The index is bounded on both sides, so it always addresses a valid unit.
    value = float(size_bytes)
    i = 0
    while value >= 1024 and i < len(size_name) - 1:
        value /= 1024
        i += 1
    s = round(value, 2)
    return "%s %s" % (s, size_name[i])
def df_itertuples_with_index_names(df: pd.DataFrame):
    """Alternative for df.itertuples() where we add the index as named attributes to the tuple.

    This allows access to the index column in the same way as a regular column.
    This will make it much easier to access the values of the named index.

    Normally with df.itertuples() one must access the values of the Index by position, e.g.::

        for row in df.itertuples():
            (index_a, index_b) = row.Index
            print(index_a)

    One would have to ensure to extract all index columns and know the order in the Index.
    However, with this function we can do::

        for row in df_itertuples_with_index_names(df):
            print(row.index_a)

    Test::

        # Create a sample df
        index = pd.MultiIndex.from_product([range(2), range(3)], names=['index_a', 'index_b'])
        df = pd.DataFrame({'my_column': range(len(index))}, index=index)
        # Loop over itertuples alternative:
        for row in df_itertuples_with_index_names(df):
            print(row.index_a)

    Index columns are added at the tail of the tuple, so to be compatible with code that uses the position of the fields in the tuple.
    Works for both a MultiIndex and a single-level index. Field names that are
    not valid or unique identifiers (e.g. an unnamed index level, or a column
    label that is not an identifier) are replaced by positional names
    (``_1``, ``_2``, ...) via ``namedtuple(..., rename=True)``.

    Inspired by https://stackoverflow.com/questions/46151666/iterate-over-pandas-dataframe-with-multiindex-by-index-names.

    Args:
        df (pd.DataFrame): DataFrame to iterate over.

    Yields:
        Row: namedtuple with fields ``Index``, the columns of `df`, then the
        index level names.
    """
    # rename=True: invalid or duplicate field names get positional names
    # instead of making namedtuple() raise.
    Row = namedtuple("Row", ['Index', *df.columns, *df.index.names], rename=True)
    # Only a MultiIndex yields its index entry as a tuple of level values; a
    # single-level index yields a scalar that must be wrapped before it can be
    # appended to the row.
    has_multi_index = isinstance(df.index, pd.MultiIndex)
    # name=None gives plain tuples; position 0 holds the index entry.
    for row in df.itertuples(name=None):
        index_values = row[0] if has_multi_index else (row[0],)
        yield Row(*row, *index_values)