diff --git a/src/preprocessing.py b/src/preprocessing.py
index c0babbf..33d3ac6 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -1,6 +1,7 @@
 from pandas import DataFrame, read_csv
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import KFold
+from imblearn.combine import SMOTETomek
 
 
 def construct_dataframes(train, test):
@@ -12,19 +13,27 @@ def construct_dataframes(train, test):
     return df_list
 
 
-def drop_null_values(df_list):
-    for df in df_list:
-        df.dropna(inplace=True)
-        df.drop(columns="Tipo_marchas", inplace=True)
-    return df_list
-
-
 def rename_columns(df_list) -> DataFrame:
     for df in df_list:
         df.columns = df.columns.str.lower()
     return df_list
 
 
+def drop_null_values(df_list):
+    """Clean every frame in ``df_list`` in place and return the list.
+
+    Missing ``descuento`` values become 0, rows that still contain a null
+    are dropped, and the ``tipo_marchas`` column is then removed.
+    """
+    for df in df_list:
+        # Fill before dropna(): otherwise the rows are already gone, and
+        # assign the result because fillna() returns a new Series.
+        df["descuento"] = df["descuento"].fillna(0)
+        df.dropna(inplace=True)
+        df.drop(columns="tipo_marchas", inplace=True)
+    return df_list
+
+
 def trim_column_names(df_list) -> DataFrame:
     columns = ["consumo", "motor_CC", "potencia"]
     for df in df_list:
@@ -55,6 +64,38 @@ def encode_columns(df_list):
     return df_list
 
 
+def split_data_target(df, dataset):
+    """Split ``df`` into a feature frame and a target column.
+
+    With ``dataset == "data"`` the features are every column except ``id``
+    and ``precio_cat`` and the target is ``precio_cat``; for any other value
+    the features are every column except ``id`` and the target is ``id``.
+    ``df`` itself is left unmodified.
+    """
+    if dataset == "data":
+        # Non-mutating drop: the caller's frame keeps its ``id`` column.
+        data = df.drop(columns=["id", "precio_cat"])
+        target = df["precio_cat"]
+    else:
+        data = df.drop(columns=["id"])
+        target = df["id"]
+    return data, target
+
+
+def balance_training_data(df):
+    """Resample the training frame with SMOTETomek (random_state=42).
+
+    Returns a ``(features, target)`` pair of DataFrames.
+    """
+    smote_tomek = SMOTETomek(random_state=42)
+    data, target = split_data_target(df=df, dataset="data")
+    balanced_data, balanced_target = smote_tomek.fit_resample(data, target)
+    balanced_data_df = DataFrame(balanced_data, columns=data.columns)
+    # ``target`` is a Series, which has ``.name`` but no ``.columns``.
+    balanced_target_df = DataFrame(balanced_target, columns=[target.name])
+    return balanced_data_df, balanced_target_df
+
+
 def split_k_sets(df):
     k_fold = KFold(shuffle=True, random_state=42)
     return k_fold.split(df)
@@ -66,4 +107,6 @@ def parse_data(train, test):
     processed_df_list = drop_null_values(renamed_df_list)
     numeric_df_list = trim_column_names(processed_df_list)
     encoded_df_list = encode_columns(numeric_df_list)
-    return encoded_df_list
+    train_data, train_target = balance_training_data(encoded_df_list[0])
+    test_data, test_ids = split_data_target(encoded_df_list[1], dataset="test")
+    return train_data, train_target, test_data, test_ids