diff --git a/src/preprocessing.py b/src/preprocessing.py
index b40305b..297aece 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -1,20 +1,77 @@
 from pandas import DataFrame, read_csv
-from sklearn.preprocessing import normalize
+from sklearn.preprocessing import LabelEncoder, normalize
 from sklearn.model_selection import KFold
 
 
-def replace_values(df) -> DataFrame:
-    for column in df.columns:
-        df[column].fillna(value=df[column].mean(), inplace=True)
-    return df
+def construct_dataframes(train, test) -> list:
+    """Read the train and test CSV files into DataFrames.
+
+    Returns a two-element list ``[train_df, test_df]``, in that order.
+    """
+    # One frame per argument: keying a dict by path would collapse the two
+    # entries into one whenever both arguments name the same file.
+    # NOTE(review): the loader this replaces passed na_values="?" - confirm
+    # the data no longer marks missing values that way.
+    return [read_csv(filepath_or_buffer=path) for path in (train, test)]
 
 
-def remove_letters(df) -> DataFrame:
+def drop_null_values(df_list) -> list:
+    """Drop incomplete rows and the ``Tipo_marchas`` column from each frame.
+
+    Every DataFrame in ``df_list`` is modified in place; the same list is
+    returned for chaining.
+    """
+    for df in df_list:
+        # NOTE(review): rows are dropped before the column is removed, so a
+        # null in "Tipo_marchas" alone still discards the row - confirm intent.
+        df.dropna(inplace=True)
+        df.drop(columns="Tipo_marchas", inplace=True)
+    return df_list
+
+
+def trim_column_names(df_list) -> list:
+    """Convert the listed text columns of each frame to floats.
+
+    Despite the name, this cleans cell values: every character other than
+    digits and ``.`` is removed from the listed columns before the cast.
+    Frames are modified in place and the same list is returned.
+    """
     columns = ["Consumo", "Motor_CC", "Potencia"]
-    processed_df = df.copy()
-    for col in columns:
-        processed_df[col] = df[col].str.replace(pat="[^.0-9]", repl="").astype(float)
-    return processed_df
+    for df in df_list:
+        for col in columns:
+            # regex=True is required: pandas >= 2.0 treats ``pat`` as a
+            # literal string by default, which would leave the text in place.
+            cleaned = df[col].str.replace(pat="[^.0-9]", repl="", regex=True)
+            df[col] = cleaned.astype(float)
+    return df_list
+
+
+def encode_fields(df_list):
+    """Placeholder that walks a list of field names; currently a no-op.
+
+    NOTE(review): presumably meant to label-encode these fields - confirm.
+    Returns ``None`` and leaves ``df_list`` untouched.
+    """
+    # Each name needs its own trailing comma: Python concatenates adjacent
+    # string literals, so without commas the list holds one merged string.
+    # NOTE(review): "potencia" appears twice and "ao" looks truncated -
+    # confirm the intended field names.
+    files = [
+        "ao",
+        "asientos",
+        "ciudad",
+        "combustible",
+        "consumo",
+        "descuento",
+        "kilometros",
+        "mano",
+        "motor_cc",
+        "nombre",
+        "potencia",
+        "potencia",
+    ]
+    for data in files:
+        pass
 
 
 def split_k_sets(df):
@@ -22,8 +79,14 @@ def split_k_sets(df):
     return k_fold.split(df)
 
 
-def parse_data(source) -> DataFrame:
-    df = read_csv(filepath_or_buffer=source, na_values="?")
-    processed_df = df.dropna()
-    numeric_df = remove_letters(processed_df)
-    return numeric_df
+def parse_data(train, test):
+    """Load the train and test CSV files and clean them.
+
+    Reads both files, drops incomplete rows and the ``Tipo_marchas``
+    column, and converts the listed text columns to floats. Returns the
+    cleaned frames as ``[train_df, test_df]``.
+    """
+    df_list = construct_dataframes(train=train, test=test)
+    processed_df_list = drop_null_values(df_list)
+    numeric_df_list = trim_column_names(processed_df_list)
+    return numeric_df_list