From e05ccdabb94ebf00c6c5c5a3ea0bbc5e4d0cc0b4 Mon Sep 17 00:00:00 2001 From: coolneng Date: Fri, 1 Jan 2021 21:54:05 +0100 Subject: [PATCH] Remove string trimming function --- src/preprocessing.py | 47 ++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/src/preprocessing.py b/src/preprocessing.py index 33d3ac6..37aa537 100644 --- a/src/preprocessing.py +++ b/src/preprocessing.py @@ -21,39 +21,33 @@ def rename_columns(df_list) -> DataFrame: def drop_null_values(df_list): for df in df_list: - df.dropna(inplace=True) df.drop(columns="tipo_marchas", inplace=True) - df["descuento"].fillna(0) - return df_list - - -def trim_column_names(df_list) -> DataFrame: - columns = ["consumo", "motor_CC", "potencia"] - for df in df_list: - for col in columns: - df[col] = df[col].str.replace(pat="[^.0-9]", repl="").astype(float) + df.drop(columns="descuento", inplace=True) + df.dropna(inplace=True) return df_list def encode_columns(df_list): label_encoder = LabelEncoder() files = [ - "ao" - "asientos" - "ciudad" - "combustible" - "consumo" - "descuento" - "kilometros" - "mano" - "motor_cc" - "nombre" - "potencia" + "ao", + "asientos", + "ciudad", + "combustible", + "consumo", + "kilometros", + "mano", + "motor_cc", + "nombre", + "potencia", ] for data in files: for df in df_list: label = label_encoder.fit(read_csv("data/" + data + ".csv", squeeze=True)) - df[data] = label.transform(df[data]) + if data == "ao": + df["año"] = label.transform(df["año"]) + else: + df[data] = label.transform(df[data]) return df_list @@ -72,8 +66,10 @@ def balance_training_data(df): smote_tomek = SMOTETomek(random_state=42) data, target = split_data_target(df=df, dataset="data") balanced_data, balanced_target = smote_tomek.fit_resample(data, target) - balanced_data_df = DataFrame(balanced_data, columns=data.columns) - balanced_target_df = DataFrame(balanced_target, columns=target.columns) + balanced_data_df = DataFrame( + balanced_data, columns=df.columns.difference(["precio_cat"]) + ) + balanced_target_df = DataFrame(balanced_target, columns=["precio_cat"]) return balanced_data_df, balanced_target_df @@ -86,8 +82,7 @@ def parse_data(train, test): df_list = construct_dataframes(train=train, test=test) renamed_df_list = rename_columns(df_list) processed_df_list = drop_null_values(renamed_df_list) - numeric_df_list = trim_column_names(processed_df_list) - encoded_df_list = encode_columns(numeric_df_list) + encoded_df_list = encode_columns(processed_df_list) train_data, train_target = balance_training_data(encoded_df_list[0]) test_data, test_ids = split_data_target(encoded_df_list[1], dataset="test") return train_data, train_target, test_data, test_ids