diff --git a/src/preprocessing.py b/src/preprocessing.py
index c179e71..0dcc9bf 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -1,5 +1,6 @@
 from pandas import read_csv
 from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import KFold
 
 
 def replace_values(df):
@@ -12,7 +13,19 @@ def replace_values(df):
 def process_na(df, action):
+    """Drop NA rows ("drop") or apply replace_values ("fill") to df.
+
+    Any other action terminates the program with an error message.
+    """
     if action == "drop":
         return df.dropna()
-    return replace_values(df)
+    if action == "fill":
+        return replace_values(df)
+    # Raise SystemExit with a message rather than calling the site-provided
+    # exit(): the text goes to stderr and the exit status is non-zero.
+    raise SystemExit(
+        "Unknown action selected. The choices are: \n"
+        "fill: fills the na values with the mean\n"
+        "drop: drops the na values"
+    )
 
 
 def encode_columns(df):
@@ -23,8 +36,28 @@ def encode_columns(df):
     return encoded_df
 
 
+def split_train_target(df):
+    """Split df into features and the "Severity" target column.
+
+    Returns a (features, target) tuple.
+    """
+    # columns= is required: drop(labels=...) defaults to axis=0 and would
+    # look for a row labelled "Severity" instead of the column.
+    train_data = df.drop(columns="Severity")
+    target_data = df["Severity"]
+    return train_data, target_data
+
+
+def split_k_sets(df):
+    """Return the shuffled (seed 42) KFold split iterator over df."""
+    k_fold = KFold(shuffle=True, random_state=42)
+    return k_fold.split(df)
+
+
 def parse_data(source, action):
+    """Load source, handle NAs, encode, and return (features, target)."""
     df = read_csv(filepath_or_buffer=source, na_values="?")
     processed_df = process_na(df=df, action=action)
     encoded_df = encode_columns(df=processed_df)
-    return encoded_df
+    train_data, target_data = split_train_target(df=encoded_df)
+    return train_data, target_data