diff --git a/src/preprocessing.py b/src/preprocessing.py
index 297aece..c0babbf 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -1,5 +1,5 @@
 from pandas import DataFrame, read_csv
-from sklearn.preprocessing import LabelEncoder, normalize
+from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import KFold
 
 
@@ -12,22 +12,33 @@ def construct_dataframes(train, test):
     return df_list
 
 
-def drop_null_values(df_list) -> DataFrame:
+def drop_null_values(df_list):
     for df in df_list:
         df.dropna(inplace=True)
-        df.drop(columns="Tipo_marchas", inplace=True)
+        df.drop(columns="tipo_marchas", inplace=True)
     return df_list
 
 
+def rename_columns(df_list):
+    """Lower-case the column names of every frame in place; return the list."""
+    for df in df_list:
+        df.columns = df.columns.str.lower()
+    return df_list
+
+
-def trim_column_names(df_list) -> DataFrame:
-    columns = ["Consumo", "Motor_CC", "Potencia"]
+def trim_column_names(df_list):
+    columns = ["consumo", "motor_cc", "potencia"]
     for df in df_list:
         for col in columns:
-            df[col] = df[col].str.replace(pat="[^.0-9]", repl="").astype(float)
+            cleaned = df[col].str.replace(pat="[^.0-9]", repl="", regex=True)
+            df[col] = cleaned.astype(float)
     return df_list
 
 
-def encode_fields(df_list):
+def encode_columns(df_list):
+    """Label-encode each listed column, fitting on data/<column>.csv."""
+    label_encoder = LabelEncoder()
+    # FIXME: no commas below, so Python joins these literals into one string.
     files = [
         "ao"
         "asientos"
@@ -40,10 +51,13 @@ def encode_fields(df_list):
         "motor_cc"
         "nombre"
         "potencia"
-        "potencia"
     ]
     for data in files:
-        pass
+        values = read_csv("data/" + data + ".csv").squeeze("columns")
+        label = label_encoder.fit(values)
+        for df in df_list:
+            df[data] = label.transform(df[data])
+    return df_list
 
 
 def split_k_sets(df):
@@ -53,6 +67,8 @@ def split_k_sets(df):
 
 def parse_data(train, test):
     df_list = construct_dataframes(train=train, test=test)
-    processed_df_list = drop_null_values(df_list)
+    renamed_df_list = rename_columns(df_list)
+    processed_df_list = drop_null_values(renamed_df_list)
     numeric_df_list = trim_column_names(processed_df_list)
-    return numeric_df_list
+    encoded_df_list = encode_columns(numeric_df_list)
+    return encoded_df_list