Source code for BaseITS.pre_processing

import pandas as pd
import numpy as np
from datetime import datetime


def align_prophet_naming_convection(
    df: pd.DataFrame, date_col_name: str, y_col_name: str, verbose=False
):
    """Rename the date and outcome columns to the names prophet expects.

    The date column is renamed to ``ds`` and the outcome column to ``y``.
    Columns that already carry the expected name are left alone.

    Args:
        df (pd.DataFrame): dataframe holding both columns
        date_col_name (str): current name of the date column
        y_col_name (str): current name of the outcome column
        verbose (bool): when True, print which columns are being renamed

    Raises:
        ValueError: if ``date_col_name`` or ``y_col_name`` is not a column of ``df``

    Returns:
        pd.DataFrame: ``df`` itself when both names already match, otherwise a
        renamed copy. The input dataframe is never modified in place.
    """
    # Raise (rather than return) so a bad column name cannot be mistaken
    # for a successfully converted dataframe by the caller.
    if date_col_name not in df.columns:
        raise ValueError(
            "Make sure that the provided date column name are in the dataframe provided."
        )
    if y_col_name not in df.columns:
        raise ValueError(
            "Make sure that the provided y column name are in the dataframe provided."
        )

    date_is_aligned = date_col_name == "ds"
    y_is_aligned = y_col_name == "y"

    if date_is_aligned and y_is_aligned:
        if verbose:
            print("Column names already in the required convection")
        return df

    if verbose:
        if y_is_aligned:
            print(
                "Only y column name in the required convection. Converting date column to required convection"
            )
        elif date_is_aligned:
            print(
                "Only date column name in the required convection. Converting y column to required convection"
            )
        else:
            print(
                "None of the column names in the required convection. Converting y column to required convection"
            )

    # Build a single mapping containing only the columns that need renaming.
    renames = {}
    if not date_is_aligned:
        renames[date_col_name] = "ds"
    if not y_is_aligned:
        renames[y_col_name] = "y"

    # rename() without inplace returns a new frame, so every branch treats
    # the caller's dataframe the same way.
    return df.rename(columns=renames)


def str_date_validate(date_text: str):
    """Parse a ``YYYY-MM-DD`` string into a datetime, rejecting other formats.

    Args:
        date_text (str): String with the date

    Raises:
        ValueError: if ``date_text`` does not match the ``%Y-%m-%d`` format

    Returns:
        datetime: the parsed value

    """
    try:
        # ``datetime`` here is the class imported via
        # ``from datetime import datetime``, so strptime is called on it directly.
        return datetime.strptime(date_text, "%Y-%m-%d")
    except ValueError as err:
        # Chain the original error so the offending value stays in the traceback.
        raise ValueError("Incorrect data format, should be YYYY-MM-DD") from err


def dates_validation(df: pd.DataFrame, date_col_name: str):
    """Ensure the date column of ``df`` holds datetime values.

    A column that is not already ``datetime64[ns]`` is converted with
    ``pd.to_datetime`` and written back into ``df``. A column that already
    has that dtype is left as is.

    Args:
        df (pd.DataFrame): Dataframe with the data
        date_col_name (str): column with the dates

    Raises:
        ValueError: if the first converted value falls in 1970, which is
            treated as the sign of a failed conversion. ``df`` is left
            unmodified in that case.

    Returns:
        pd.DataFrame: the same ``df`` object, with ``date_col_name`` as datetime
    """
    if df[date_col_name].dtype == "datetime64[ns]":
        print("Date column already in the correct format")
        return df

    # Convert into a local first so a rejected conversion does not leave the
    # caller's dataframe half-modified.
    converted = pd.to_datetime(df[date_col_name])

    # Use positional access: the index may not contain the label 0 (filtered
    # frames) and the column may be empty.
    # NOTE(review): presumably the 1970 check targets numeric input being read
    # as an offset from the Unix epoch -- confirm the intent.
    if not converted.empty and str(converted.iloc[0]).startswith("1970"):
        raise ValueError(
            "Error converting, make sure the datetime column is in the YYYY-MM-DD format first"
        )

    df[date_col_name] = converted
    return df


def aggregation_wide_df_type(
    df: pd.DataFrame,
    location_col_name: str,
    date_col_name: str,
    outcome_cols: list,
):
    """Sum the outcome columns of a wide dataframe per date and location.

    Args:
        df (pd.DataFrame): wide dataframe, one column per outcome
        location_col_name (str): name of the location column
        date_col_name (str): name of the date column
        outcome_cols (list): names of the outcome columns to sum

    Returns:
        pd.DataFrame: one row per (date, location) pair with the summed
        outcome columns; the grouping keys are returned as regular columns
    """
    # Date first, then location: this order fixes the output column order.
    grouping_keys = [date_col_name, location_col_name]
    per_group = df.groupby(grouping_keys)
    summed = per_group[outcome_cols].sum()
    return summed.reset_index()


def aggregation_long_df_type(
    df: pd.DataFrame,
    location_col_name: str,
    date_col_name: str,
    outcome_col_name: str,
    outcome_value_col_name: str,
):
    """Sum the outcome values of a long dataframe per location, date and outcome.

    Args:
        df (pd.DataFrame): long dataframe, one row per observation
        location_col_name (str): name of the location column
        date_col_name (str): name of the date column
        outcome_col_name (str): name of the column identifying the outcome
        outcome_value_col_name (str): name of the column holding the values to sum

    Returns:
        pd.DataFrame: one row per (location, date, outcome) combination with
        the summed value; the grouping keys are returned as regular columns
    """
    # Location, date, outcome: this order fixes the output column order.
    grouping_keys = [location_col_name, date_col_name, outcome_col_name]
    value_groups = df.groupby(grouping_keys)[outcome_value_col_name]
    summed = value_groups.sum()
    return summed.reset_index()


def create_log_offset(df: pd.DataFrame, ofset_column: str):
    """Compute the natural-log offset used by the poisson regression forecast model.

    Args:
        df (pd.DataFrame): dataframe containing the offset column
        ofset_column (str): name of the column to take the logarithm of

    Returns:
        pd.Series: element-wise natural logarithm of the column
    """
    raw_values = df[ofset_column]
    return np.log(raw_values)