From b6b7ffc40991b8b13d03118fca86bb74c951fa5f Mon Sep 17 00:00:00 2001
From: coolneng
Date: Wed, 9 Dec 2020 17:54:42 +0100
Subject: [PATCH] Implement preprocessing module for P2

---
 src/P2/preprocessing.py | 50 ++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/src/P2/preprocessing.py b/src/P2/preprocessing.py
index e5c6031..21a83ea 100644
--- a/src/P2/preprocessing.py
+++ b/src/P2/preprocessing.py
@@ -1,16 +1,13 @@
-from pandas import read_csv
-from sklearn.model_selection import KFold
-from sklearn.preprocessing import LabelEncoder
+from pandas import DataFrame, read_csv
 
 
-def replace_values(df):
-    columns = ["BI-RADS", "Margin", "Density", "Age"]
-    for column in columns:
+def replace_values(df) -> DataFrame:
+    for column in df.columns:
         df[column].fillna(value=df[column].mean(), inplace=True)
     return df
 
 
-def process_na(df, action):
+def process_na(df, action) -> DataFrame:
     if action == "drop":
         return df.dropna()
     elif action == "fill":
@@ -22,28 +19,25 @@ def process_na(df, action):
         exit()
 
 
-def encode_columns(df):
-    label_encoder = LabelEncoder()
-    encoded_df = df.copy()
-    encoded_df["Shape"] = label_encoder.fit_transform(df["Shape"])
-    encoded_df["Severity"] = label_encoder.fit_transform(df["Severity"])
-    return encoded_df
+def filter_dataframe(df) -> DataFrame:
+    relevant_columns = [
+        "HORA",
+        "DIASEMANA",
+        "COMUNIDAD_AUTONOMA",
+        "ISLA",
+        "TOT_HERIDOS_LEVES",
+        "TOT_HERIDOS_GRAVES",
+        "TOT_VEHICULOS_IMPLICADOS",
+        "TOT_MUERTOS",
+        "TIPO_VIA",
+        "LUMINOSIDAD",
+        "FACTORES_ATMOSFERICOS",
+    ]
+    filtered_df = df.filter(items=relevant_columns)
+    return filtered_df
 
 
-def split_train_target(df):
-    train_data = df.drop(columns=["Severity"])
-    target_data = df["Severity"]
-    return train_data, target_data
-
-
-def split_k_sets(df):
-    k_fold = KFold(shuffle=True, random_state=42)
-    return k_fold.split(df)
-
-
-def parse_data(source, action):
+def parse_data(source, action) -> DataFrame:
     df = read_csv(filepath_or_buffer=source, na_values="?")
     processed_df = process_na(df=df, action=action)
-    encoded_df = encode_columns(df=processed_df)
-    test_data, target_data = split_train_target(df=encoded_df)
-    return test_data, target_data
+    return processed_df