From 127e13d3702d2bcd3e347b51075b5df15a8225ba Mon Sep 17 00:00:00 2001
From: coolneng <akasroua@gmail.com>
Date: Wed, 28 Oct 2020 19:29:50 +0100
Subject: [PATCH] Add preprocessing module

---
 docs/Experiments.org | 71 ++++++++++++++++++++++++++++++++++++++++++++
 shell.nix            |  5 +++-
 src/preprocessing.py | 26 ++++++++++++++++
 src/processing.py    |  1 +
 4 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 docs/Experiments.org
 create mode 100644 src/preprocessing.py
 create mode 100644 src/processing.py

diff --git a/docs/Experiments.org b/docs/Experiments.org
new file mode 100644
index 0000000..1a32207
--- /dev/null
+++ b/docs/Experiments.org
@@ -0,0 +1,71 @@
+* Experiments
+
+We will first gather information about our dataset by evaluating the summary statistics of its attributes.
+
+#+BEGIN_SRC python
+from pandas import read_csv
+from sklearn.preprocessing import LabelEncoder
+
+
+def replace_values(df):
+    columns = ["BI-RADS", "Margin", "Density", "Age"]
+    for column in columns:
+        df[column].fillna(value=df[column].mean(), inplace=True)
+    return df
+
+
+def process_na(df, action):
+    if action == "drop":
+        return df.dropna()
+    return replace_values(df)
+
+
+def encode_columns(df):
+    encoder = LabelEncoder()
+    encoder.fit(df["Shape"])
+
+
+def parse_data(source, action):
+    df = read_csv(filepath_or_buffer=source, na_values="?")
+    processed_df = process_na(df, action)
+    return processed_df
+#+END_SRC
+
+#+RESULTS:
+
+
+#+BEGIN_SRC python
+df = parse_data("../data/mamografia.csv", "drop")
+print(df.describe())
+#+END_SRC
+
+#+RESULTS:
+:           BI-RADS         Age      Margin     Density
+: count  847.000000  847.000000  847.000000  847.000000
+: mean     4.322314   55.842975    2.833530    2.909091
+: std      0.703762   14.603754    1.564049    0.370292
+: min      0.000000   18.000000    1.000000    1.000000
+: 25%      4.000000   46.000000    1.000000    3.000000
+: 50%      4.000000   57.000000    3.000000    3.000000
+: 75%      5.000000   66.000000    4.000000    3.000000
+: max      6.000000   96.000000    5.000000    4.000000
+
+We observe that *margin* and *density* are the columns with the most unknown values. The age group of our cohort is middle-aged, the BI-RADS score is mostly in the suspicious category, the density is mostly low, and the margin mostly belongs to the microlobulated/obscured categories.
+
+Instead of dropping the rows that contain missing values, we'll now try imputing those values with the mean of their column.
+
+#+BEGIN_SRC python
+df = parse_data("../data/mamografia.csv", "replace")
+print(df.describe())
+#+END_SRC
+
+#+RESULTS:
+:           BI-RADS         Age      Margin     Density
+: count  961.000000  961.000000  961.000000  961.000000
+: mean     4.296142   55.487448    2.796276    2.910734
+: std      0.705555   14.442373    1.526880    0.365074
+: min      0.000000   18.000000    1.000000    1.000000
+: 25%      4.000000   45.000000    1.000000    3.000000
+: 50%      4.000000   57.000000    3.000000    3.000000
+: 75%      5.000000   66.000000    4.000000    3.000000
+: max      6.000000   96.000000    5.000000    4.000000
diff --git a/shell.nix b/shell.nix
index 80d2a0c..3b017ae 100644
--- a/shell.nix
+++ b/shell.nix
@@ -2,4 +2,7 @@
 
 with pkgs;
 
-mkShell { buildInputs = [ python38Packages.scikitlearn ]; }
+mkShell {
+  buildInputs =
+    [ python38 python38Packages.pandas python38Packages.scikitlearn ];
+}
diff --git a/src/preprocessing.py b/src/preprocessing.py
new file mode 100644
index 0000000..b73b8e2
--- /dev/null
+++ b/src/preprocessing.py
@@ -0,0 +1,26 @@
+from pandas import read_csv
+from sklearn.preprocessing import LabelEncoder
+
+
+def replace_values(df):
+    columns = ["BI-RADS", "Margin", "Density", "Age"]
+    for column in columns:
+        df[column].fillna(value=df[column].mean(), inplace=True)
+    return df
+
+
+def process_na(df, action):
+    if action == "drop":
+        return df.dropna()
+    return replace_values(df)
+
+
+def encode_columns(df):
+    encoder = LabelEncoder()
+    encoder.fit(df["Shape"])
+
+
+def parse_data(source, action):
+    df = read_csv(filepath_or_buffer=source, na_values="?")
+    processed_df = process_na(df, action)
+    return processed_df
diff --git a/src/processing.py b/src/processing.py
new file mode 100644
index 0000000..8eba2ec
--- /dev/null
+++ b/src/processing.py
@@ -0,0 +1 @@
+from preprocessing import parse_data