Source code for teemi.learn.auto_ml

#!/usr/bin/env python
# MIT License
# Copyright (c) 2024, Technical University of Denmark (DTU)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

""" This module is focused on Machine Learning purposes. And simplifying these workflows"""

import h2o
from h2o.automl import H2OAutoML
import pandas as pd


def _partition_end_rows(n_rows: int, partitions: int) -> list:
    """Return the exclusive end row of every cumulative training subset.

    The rows are cut into steps of ``int(n_rows / partitions) + 1`` rows; each
    subset starts at row 0 and ends at the next step boundary, and the final
    subset always covers all ``n_rows`` rows.
    """
    step = int(n_rows / partitions) + 1
    return [*range(step, n_rows, step), n_rows]


def autoML_on_partitioned_data(
    feature_cols: list,
    training_column: str,
    df_input_for_ml,
    path="",
    partitions=5,
    training_time=5,
    nfold=10,
) -> None:
    """Train H2O AutoML on growing subsets of a dataframe and save the MAEs.

    The dataframe is cut into cumulative subsets (rows ``0..k``, ``0..2k``, ...,
    all rows). One AutoML run is trained per subset, and the MAE of the best
    model of each run, together with its cross-validation mean/sd MAE and the
    model id, is written to a CSV file.

    Parameters
    ----------
    feature_cols : list
        The columns you want to train on, e.g. ``feature_cols = ['0', '1', '2', '3']``.
    training_column : str
        The column you want the models to be able to predict.
    df_input_for_ml : pd.DataFrame
        A pandas dataframe with your data. Every column except
        ``training_column`` is converted to a categorical (factor) column.
    path : str
        Prefix prepended verbatim to the output file name (no separator is
        added, so include a trailing slash for a directory).
    partitions : int
        Divisor used to derive the step size between subsets. The number of
        subsets actually produced can be lower than this value.
    training_time : int
        Passed to H2OAutoML as ``max_runtime_secs``, i.e. the time budget per
        AutoML run. If you set it to 0, there is no time limit.
    nfold : int
        Number of folds for k-fold cross-validation. The cross-validation
        summary is read unconditionally, so cross-validation must be enabled.

    Returns
    -------
    None
        A CSV file named ``<path><timestamp>_ml_models_running_over_partioned_data.csv``
        is written as a side effect.

    Raises
    ------
    ValueError
        If ``partitions`` is not positive or ``df_input_for_ml`` has no rows.
    """
    # Validate before any H2O work so bad input fails fast and clearly.
    if partitions <= 0:
        raise ValueError(f"partitions must be a positive number, got {partitions!r}")
    n_rows = len(df_input_for_ml)
    if n_rows == 0:
        raise ValueError("df_input_for_ml must contain at least one row")

    # End row of each cumulative subset; also used as the index of the output.
    end_rows = _partition_end_rows(n_rows, partitions)

    # Build one H2O frame per cumulative subset.
    training_frames = []
    for end_row in end_rows:
        frame = h2o.H2OFrame(df_input_for_ml.iloc[:end_row])

        # Make every column categorical except the training column.
        # Snapshot the names because the frame is modified inside the loop.
        for column in list(frame.columns):
            if column != training_column:
                frame[column] = frame[column].asfactor()
        training_frames.append(frame)

    # One independent AutoML run per subset.
    automl_runs = [
        H2OAutoML(
            max_runtime_secs=training_time,  # 0 means no time limit
            max_models=None,  # None = no limit
            nfolds=nfold,  # nfolds=0 disables cross-validation
            seed=1,  # reproducibility
            sort_metric="MAE",
            keep_cross_validation_predictions=True,
        )
        for _ in training_frames
    ]

    # Train each run on its own subset.
    for automl, frame in zip(automl_runs, training_frames):
        automl.train(x=feature_cols, y=training_column, training_frame=frame)
        print(
            "len of dataframes that are being trained on :",
            len(frame),
        )

    # Collect the metrics of the best model of every run.
    model_name = []
    cv_sd_mae = []
    cv_mean_mae = []
    best_models_mae = []
    for automl in automl_runs:
        best_model = automl.get_best_model()
        best_models_mae.append(best_model.mae())

        # The first row of the CV summary is used as the MAE row.
        # Scalar access avoids the deprecated float(<one-element Series>).
        cv_first_row = best_model.cross_validation_metrics_summary().as_data_frame().iloc[0]
        cv_mean_mae.append(float(cv_first_row["mean"]))
        cv_sd_mae.append(float(cv_first_row["sd"]))

        model_name.append(best_model.model_id)

    # One row per subset, indexed by the subset's end row.
    results = pd.DataFrame([best_models_mae], columns=end_rows, dtype=float).T
    results["CV_mean_MAE"] = cv_mean_mae
    results["CV_SD_MAE"] = cv_sd_mae
    results["Model_name"] = model_name

    # Timestamp gives each run a distinct file name.
    from datetime import datetime

    # NOTE(review): the ':' in this format is not a valid file-name character
    # on Windows; kept unchanged so existing output names do not change.
    timestamp = datetime.now().strftime("%Y_%m_%d_%H:%M")

    results.to_csv(path + timestamp + "_ml_models_running_over_partioned_data.csv")