def replace_values(df):
    """Fill missing values in selected columns with each column's mean.

    The frame is modified in place and is also returned, so callers may use
    either the return value or the object they passed in.

    Args:
        df: DataFrame containing the "BI-RADS", "Margin", "Density" and
            "Age" columns.

    Returns:
        The same DataFrame object, with missing entries in those four
        columns replaced by the column mean. Other columns are untouched.

    Raises:
        KeyError: If one of the four columns is absent from ``df``.
    """
    columns = ["BI-RADS", "Margin", "Density", "Age"]
    for column in columns:
        # Assign the filled column back instead of calling
        # ``fillna(inplace=True)`` on ``df[column]``: that form mutates an
        # intermediate object and leaves ``df`` unchanged when pandas
        # Copy-on-Write is active (and warns on recent pandas 2.x).
        df[column] = df[column].fillna(df[column].mean())
    return df


def process_na(df, action):
    """Handle missing values either by dropping rows or by imputing means.

    Args:
        df: DataFrame to clean.
        action: ``"drop"`` removes every row that contains a missing value.
            Any other value (for example ``"replace"``) falls through to
            ``replace_values``.

    Returns:
        A new DataFrame without incomplete rows when ``action`` is
        ``"drop"``; otherwise ``df`` itself after in-place imputation.
    """
    if action == "drop":
        return df.dropna()
    return replace_values(df)


def encode_columns(df):
    """Fit a ``LabelEncoder`` on the "Shape" column.

    ``df`` is not modified. The fitted encoder is returned so that the
    fit is not discarded and callers can apply it themselves.

    Args:
        df: DataFrame containing a "Shape" column.

    Returns:
        The ``LabelEncoder`` fitted on ``df["Shape"]``.
    """
    encoder = LabelEncoder()
    encoder.fit(df["Shape"])
    return encoder


def parse_data(source, action):
    """Read a CSV source and clean its missing values.

    Args:
        source: Path or file-like object handed to ``pandas.read_csv``.
        action: Missing-value strategy forwarded to ``process_na``.

    Returns:
        The DataFrame produced by ``process_na``.
    """
    # "?" is the marker this reader treats as a missing value.
    df = read_csv(filepath_or_buffer=source, na_values="?")
    return process_na(df, action)
def replace_values(df):
    """Fill missing values in selected columns with each column's mean.

    The frame is modified in place and is also returned, so callers may use
    either the return value or the object they passed in.

    Args:
        df: DataFrame containing the "BI-RADS", "Margin", "Density" and
            "Age" columns.

    Returns:
        The same DataFrame object, with missing entries in those four
        columns replaced by the column mean. Other columns are untouched.

    Raises:
        KeyError: If one of the four columns is absent from ``df``.
    """
    columns = ["BI-RADS", "Margin", "Density", "Age"]
    for column in columns:
        # Assign the filled column back instead of calling
        # ``fillna(inplace=True)`` on ``df[column]``: that form mutates an
        # intermediate object and leaves ``df`` unchanged when pandas
        # Copy-on-Write is active (and warns on recent pandas 2.x).
        df[column] = df[column].fillna(df[column].mean())
    return df


def process_na(df, action):
    """Handle missing values either by dropping rows or by imputing means.

    Args:
        df: DataFrame to clean.
        action: ``"drop"`` removes every row that contains a missing value.
            Any other value (for example ``"replace"``) falls through to
            ``replace_values``.

    Returns:
        A new DataFrame without incomplete rows when ``action`` is
        ``"drop"``; otherwise ``df`` itself after in-place imputation.
    """
    if action == "drop":
        return df.dropna()
    return replace_values(df)


def encode_columns(df):
    """Fit a ``LabelEncoder`` on the "Shape" column.

    ``df`` is not modified. The fitted encoder is returned so that the
    fit is not discarded and callers can apply it themselves.

    Args:
        df: DataFrame containing a "Shape" column.

    Returns:
        The ``LabelEncoder`` fitted on ``df["Shape"]``.
    """
    encoder = LabelEncoder()
    encoder.fit(df["Shape"])
    return encoder


def parse_data(source, action):
    """Read a CSV source and clean its missing values.

    Args:
        source: Path or file-like object handed to ``pandas.read_csv``.
        action: Missing-value strategy forwarded to ``process_na``.

    Returns:
        The DataFrame produced by ``process_na``.
    """
    # "?" is the marker this reader treats as a missing value.
    df = read_csv(filepath_or_buffer=source, na_values="?")
    return process_na(df, action)
@@ +from preprocessing import parse_data