Source code for teemi.learn.auto_ml

#!/usr/bin/env python
# MIT License
# Copyright (c) 2024, Technical University of Denmark (DTU)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

""" This module is focused on Machine Learning purposes. And simplifying these workflows"""

import h2o
from h2o.automl import H2OAutoML
import pandas as pd


[docs]def autoML_on_partitioned_data( feature_cols: list, training_column: str, df_input_for_ml, path="", partitions=5, training_time=5, nfold=10, ) -> None: """Runs over a pandas dataframe and trains MLs according to specified partition lenght. Parameters ---------- feature_cols: list the column you want to train on fx : feature_cols = ['0', '1', '2', '3'] training_column : str the column you want the models to be able to predict df_input_for_ml : pd.DataFrame A pandas dataframe with your data training_time : int the amount of time you want to train the models. If you set it to 0, it will not have a time limit and will train untill it reaches saturation i.e. best models Returns ------- csv files in the specified path """ all_mae = [] # partitioning step = int(len(df_input_for_ml) / partitions) + 1 partitions = [i for i in range(0, len(df_input_for_ml), step)] # Partion columns - used for getting the right output partitions_col = partitions[1:] partitions_col.append(len(df_input_for_ml)) # INCREASING THE SIZE OF THE DATASET partitions_list = [ df_input_for_ml[partitions[0] : partitions[i]] for i in range(1, len(partitions)) ] # add the last_full partition partitions_list.append(df_input_for_ml[partitions[0] :]) ### Making the dataframes into h2o dfs list_of_df_test_frames = [] for df in partitions_list: # initialize a h20 dataframe df_test = h2o.H2OFrame(pd.concat([df], axis="columns")) # changing columns to strings for col in df_input_for_ml.columns: if col != training_column: col = str(col) # making the dataframes categorical except the training column for column in df_test.columns: if col != training_column: df_test[column] = df_test[column].asfactor() list_of_df_test_frames.append(df_test) ##### setting up ML autoML_dataclasses_list = [] # Initialize 5 - H2O autoML class for i in range(len(list_of_df_test_frames)): AutoML = H2OAutoML( max_runtime_secs=training_time, # 1 hour =int(3600 * 1) , if unlimited time is wanted then set this to zero = 0 max_models=None, # None = no limit nfolds=nfold, # number of folds for k-fold cross-validation (nfolds=0 disables cross-validation) seed=1, # Reproducibility sort_metric="MAE", keep_cross_validation_predictions=True, ) autoML_dataclasses_list.append(AutoML) ##### Training the models on partitioned data for i in range(len(autoML_dataclasses_list)): autoML_dataclasses_list[i].train( x=feature_cols, y=training_column, training_frame=list_of_df_test_frames[i] ) print( "len of dataframes that are being trained on :", len(list_of_df_test_frames[i]), ) ### getting the mae for each model model_name = [] cv_sd_mae = [] cv_mean_mae = [] best_models_mae = [] for model in autoML_dataclasses_list: # Mae for each model train best_model = model.get_best_model() best_models_mae.append(best_model.mae()) # CV metrics best_model_cv_summary = ( best_model.cross_validation_metrics_summary().as_data_frame() ) mean = float(best_model_cv_summary.iloc[0:1, 0:3]["mean"]) sd = float(best_model_cv_summary.iloc[0:1, 0:3]["sd"]) # save ot cv_mean_mae.append(mean) cv_sd_mae.append(sd) ## save names model_name.append(best_model.model_id) # saving ALL maes all_mae.append(best_models_mae) df = pd.DataFrame(all_mae, columns=partitions_col, dtype=float) df = df.T # add cv mean mae and sd df["CV_mean_MAE"] = cv_mean_mae df["CV_SD_MAE"] = cv_sd_mae df["Model_name"] = model_name # getting a unique name from datetime import datetime now = datetime.now() # current date and time time = now.strftime("%Y_%m_%d_%H:%M") df.to_csv(path + time + "_ml_models_running_over_partioned_data.csv")