From 9a8944cb5dc94da4e90f0c41d1a54009bcc926cf Mon Sep 17 00:00:00 2001
From: coolneng
Date: Fri, 1 Jan 2021 23:37:41 +0100
Subject: [PATCH] Replace SMOTETomek with SMOTEENN

---
 src/preprocessing.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/preprocessing.py b/src/preprocessing.py
index 6849acf..32709a4 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -1,7 +1,7 @@
 from pandas import DataFrame, read_csv
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import KFold
-from imblearn.combine import SMOTETomek
+from imblearn.combine import SMOTEENN
 
 
 def construct_dataframes(train, test):
@@ -20,10 +20,9 @@ def rename_columns(df_list) -> DataFrame:
 
 
 def drop_null_values(df_list):
+    drop_columns = ["tipo_marchas", "descuento", "ciudad"]
     for df in df_list:
-        df.drop(columns="tipo_marchas", inplace=True)
-        df.drop(columns="descuento", inplace=True)
-        df.drop(columns="ciudad", inplace=True)
+        df.drop(columns=drop_columns, inplace=True)
         df.dropna(inplace=True)
     return df_list
 
@@ -63,9 +62,9 @@ def split_data_target(df, dataset):
 
 
 def balance_training_data(df):
-    smote_tomek = SMOTETomek(random_state=42)
+    smote_enn = SMOTEENN(random_state=42)
     data, target = split_data_target(df=df, dataset="data")
-    balanced_data, balanced_target = smote_tomek.fit_resample(data, target)
+    balanced_data, balanced_target = smote_enn.fit_resample(data, target)
     balanced_data_df = DataFrame(
         balanced_data, columns=df.columns.difference(["precio_cat"])
     )