diff --git a/docs/assets/attribute's_correlation.png b/docs/assets/attribute's_correlation.png
new file mode 100644
index 0000000..ba5d426
Binary files /dev/null and b/docs/assets/attribute's_correlation.png differ
diff --git a/docs/assets/confusion_matrix.png b/docs/assets/confusion_matrix.png
new file mode 100644
index 0000000..e27f6e7
Binary files /dev/null and b/docs/assets/confusion_matrix.png differ
diff --git a/docs/assets/roc_auc_curve.png b/docs/assets/roc_auc_curve.png
index 81cc2f6..1c18a02 100644
Binary files a/docs/assets/roc_auc_curve.png and b/docs/assets/roc_auc_curve.png differ
diff --git a/src/preprocessing.py b/src/P1/preprocessing.py
similarity index 100%
rename from src/preprocessing.py
rename to src/P1/preprocessing.py
diff --git a/src/processing.py b/src/P1/processing.py
similarity index 100%
rename from src/processing.py
rename to src/P1/processing.py
diff --git a/src/P2/preprocessing.py b/src/P2/preprocessing.py
new file mode 100644
index 0000000..e5c6031
--- /dev/null
+++ b/src/P2/preprocessing.py
@@ -0,0 +1,88 @@
+"""Load the CSV, handle missing values, and encode labels for training."""
+
+from pandas import read_csv
+from sklearn.model_selection import KFold
+from sklearn.preprocessing import LabelEncoder
+
+# Columns whose missing values are replaced by the column mean.
+MEAN_FILLED_COLUMNS = ["BI-RADS", "Margin", "Density", "Age"]
+
+
+def replace_values(df):
+    """Fill missing values in the numeric columns with each column's mean.
+
+    The frame is updated in place and also returned, as before.
+    """
+    # Assign the filled columns back rather than calling
+    # ``df[column].fillna(..., inplace=True)``: that form works on a temporary
+    # Series, is deprecated, and does nothing under pandas Copy-on-Write.
+    columns = MEAN_FILLED_COLUMNS
+    df[columns] = df[columns].fillna(df[columns].mean())
+    return df
+
+
+def process_na(df, action):
+    """Handle missing values according to ``action``.
+
+    ``"drop"`` returns a new frame without the rows containing missing values;
+    ``"fill"`` mean-fills the numeric columns via ``replace_values``.
+
+    Raises:
+        SystemExit: if ``action`` is neither ``"drop"`` nor ``"fill"``.
+    """
+    if action == "drop":
+        return df.dropna()
+    if action == "fill":
+        return replace_values(df)
+    # Raise SystemExit directly instead of calling ``exit()``: that helper is
+    # injected by ``site`` and may be absent, and it ended the run with
+    # status 0. A string argument is printed to stderr and gives status 1.
+    raise SystemExit(
+        "Unknown action selected. \n"
+        "The choices are: \n"
+        "fill: fills the na values with the mean\n"
+        "drop: drops the na values"
+    )
+
+
+def encode_columns(df):
+    """Return a copy of ``df`` with "Shape" and "Severity" label-encoded."""
+    encoded_df = df.copy()
+    for column in ("Shape", "Severity"):
+        # A fresh encoder per column keeps the fitted classes independent.
+        # NOTE(review): "Shape" is not mean-filled by ``replace_values``, so
+        # with action "fill" its missing values are still present here --
+        # confirm how they should be encoded.
+        encoded_df[column] = LabelEncoder().fit_transform(df[column])
+    return encoded_df
+
+
+def split_train_target(df):
+    """Split ``df`` into the feature columns and the "Severity" target."""
+    train_data = df.drop(columns=["Severity"])
+    target_data = df["Severity"]
+    return train_data, target_data
+
+
+def split_k_sets(df):
+    """Return shuffled K-fold (train, test) index splits for ``df``.
+
+    The fixed ``random_state`` makes the shuffling reproducible.
+    """
+    k_fold = KFold(shuffle=True, random_state=42)
+    return k_fold.split(df)
+
+
+def parse_data(source, action):
+    """Load ``source`` and return ``(features, target)`` for training.
+
+    ``"?"`` entries are read as missing values and then handled according to
+    ``action`` (see ``process_na``).
+    """
+    df = read_csv(filepath_or_buffer=source, na_values="?")
+    processed_df = process_na(df=df, action=action)
+    encoded_df = encode_columns(df=processed_df)
+    # The first element holds the feature columns; it was misleadingly named
+    # ``test_data`` before.
+    train_data, target_data = split_train_target(df=encoded_df)
+    return train_data, target_data