def process_null_values(df_list):
    """Impute selected columns, drop unused ones, then drop rows with nulls.

    Replaces the former ``drop_null_values``; ``parse_data`` calls it on the
    frames returned by ``rename_columns``.

    For every frame in ``df_list`` (each frame is modified in place):

    1. Missing ``asientos`` values are filled with the column median.
    2. Missing ``motor_cc`` and ``potencia`` values are filled with the mean
       of the first number found in each non-missing entry of that column.
    3. The ``tipo_marchas``, ``descuento`` and ``ciudad`` columns are dropped.
    4. Any row that still contains a null is dropped.

    Args:
        df_list: Iterable of pandas DataFrames containing all of the columns
            named above.

    Returns:
        The same ``df_list`` object, holding the same (mutated) frames.

    Raises:
        KeyError: If a frame lacks one of the fill or drop columns.
    """
    drop_columns = ["tipo_marchas", "descuento", "ciudad"]
    median_fill_columns = ["asientos"]
    mean_fill_columns = ["motor_cc", "potencia"]
    # Raw string so ``\d`` is a regex escape, not a (deprecated) Python one.
    # The optional fractional part keeps decimal values from being truncated
    # to their integer part before averaging.
    number_pattern = r"(\d+(?:\.\d+)?)"

    for df in df_list:
        for column in median_fill_columns:
            # Assign the result back instead of ``fillna(inplace=True)`` on a
            # column selection: the chained form is not guaranteed to write
            # through to ``df`` (and never does under pandas copy-on-write).
            df[column] = df[column].fillna(df[column].median())

        for column in mean_fill_columns:
            # ``str.extract`` yields text, so it must be converted to float
            # before averaging. ``astype(str)`` lets the same code handle a
            # column that is already numeric; missing or number-less entries
            # do not match the pattern and are ignored by ``mean``.
            numbers = (
                df[column]
                .astype(str)
                .str.extract(number_pattern, expand=False)
                .astype(float)
            )
            # NOTE(review): only the missing entries are replaced (with a
            # float); existing entries keep their original form, so a text
            # column ends up mixed. Confirm downstream encoding accepts this.
            df[column] = df[column].fillna(numbers.mean())

        df.drop(columns=drop_columns, inplace=True)
        df.dropna(inplace=True)
    return df_list