Add preprocessing module

2020-10-28 19:29:50 +01:00 · 2020-10-28 19:29:50 +01:00 · 127e13d370
parent f919985414
commit 127e13d370
4 changed files with 102 additions and 1 deletions
--- a/docs/Experiments.org
+++ b/docs/Experiments.org
@ -0,0 +1,71 @@
+* Experiments
+
+We first gather information about our dataset by computing summary statistics for its attributes.
+
+#+BEGIN_SRC python
+from pandas import read_csv
+from sklearn.preprocessing import LabelEncoder
+
+
+def replace_values(df):
+    """Fill missing values in the numeric columns with the column mean.
+
+    The columns "BI-RADS", "Margin", "Density" and "Age" are processed; any
+    other column is left untouched.
+
+    :param df: DataFrame containing the four columns listed above.
+    :return: the same DataFrame object, modified in place.
+    :raises KeyError: if one of the expected columns is absent.
+    """
+    columns = ["BI-RADS", "Margin", "Density", "Age"]
+    for column in columns:
+        # Assign the filled column back instead of calling
+        # fillna(inplace=True) on df[column]: that form works on an
+        # intermediate Series (chained assignment) and may leave df
+        # unchanged, notably with pandas Copy-on-Write enabled.
+        df[column] = df[column].fillna(df[column].mean())
+    return df
+
+
+def process_na(df, action):
+    """Handle missing values in *df* according to *action*.
+
+    :param df: DataFrame to process.
+    :param action: "drop" removes every row that has a missing value; any
+        other value delegates to replace_values.
+    :return: the processed DataFrame.
+    """
+    if action != "drop":
+        return replace_values(df)
+    return df.dropna()
+
+
+def encode_columns(df):
+    """Replace the "Shape" column with integer labels from a LabelEncoder.
+
+    Previously the encoder was fitted and then discarded, so the call had no
+    effect; the encoded labels are now written back to the frame.
+
+    :param df: DataFrame containing a "Shape" column.
+    :return: the same DataFrame object, with "Shape" encoded in place.
+    :raises KeyError: if the "Shape" column is absent.
+    """
+    encoder = LabelEncoder()
+    # NOTE(review): missing values in "Shape" are not handled here; confirm
+    # that callers drop or impute them before encoding.
+    df["Shape"] = encoder.fit_transform(df["Shape"])
+    return df
+
+
+def parse_data(source, action):
+    """Read the CSV at *source* and handle its missing values.
+
+    Cells equal to "?" are read as missing values before *action* is applied.
+
+    :param source: path or buffer handed to pandas' read_csv.
+    :param action: missing-value strategy forwarded to process_na.
+    :return: the DataFrame returned by process_na.
+    """
+    raw = read_csv(source, na_values="?")
+    return process_na(raw, action)
+#+END_SRC
+
+#+RESULTS:
+
+
+#+BEGIN_SRC python
+# Load the dataset, dropping every row that has a missing value, and print summary statistics.
+df = parse_data("../data/mamografia.csv", "drop")
+print(df.describe())
+#+END_SRC
+
+#+RESULTS:
+:           BI-RADS         Age      Margin     Density
+: count  847.000000  847.000000  847.000000  847.000000
+: mean     4.322314   55.842975    2.833530    2.909091
+: std      0.703762   14.603754    1.564049    0.370292
+: min      0.000000   18.000000    1.000000    1.000000
+: 25%      4.000000   46.000000    1.000000    3.000000
+: 50%      4.000000   57.000000    3.000000    3.000000
+: 75%      5.000000   66.000000    4.000000    3.000000
+: max      6.000000   96.000000    5.000000    4.000000
+
+We observe that *margin* and *density* are the columns with the most unknown values. The cohort is mostly middle-aged; the BI-RADS score falls mostly in the suspicious category; the density is mostly low; and the margin mostly belongs to the microlobulated/obscured category.
+
+Next, instead of dropping the rows that contain unknown values, we'll impute those values with the mean of their column.
+
+#+BEGIN_SRC python
+# Load the dataset again, this time keeping all rows and filling missing values via replace_values.
+df = parse_data("../data/mamografia.csv", "replace")
+print(df.describe())
+#+END_SRC
+
+#+RESULTS:
+:           BI-RADS         Age      Margin     Density
+: count  961.000000  961.000000  961.000000  961.000000
+: mean     4.296142   55.487448    2.796276    2.910734
+: std      0.705555   14.442373    1.526880    0.365074
+: min      0.000000   18.000000    1.000000    1.000000
+: 25%      4.000000   45.000000    1.000000    3.000000
+: 50%      4.000000   57.000000    3.000000    3.000000
+: 75%      5.000000   66.000000    4.000000    3.000000
+: max      6.000000   96.000000    5.000000    4.000000
--- a/shell.nix
+++ b/shell.nix
@ -2,4 +2,7 @@

 with pkgs;

-mkShell { buildInputs = [ python38Packages.scikitlearn ]; }
+# Development shell: Python 3.8 interpreter plus the pandas and scikit-learn packages.
+mkShell {
+  buildInputs =
+    [ python38 python38Packages.pandas python38Packages.scikitlearn ];
+}
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@ -0,0 +1,26 @@
+from pandas import read_csv
+from sklearn.preprocessing import LabelEncoder
+
+
+def replace_values(df):
+    """Fill missing values in the numeric columns with the column mean.
+
+    The columns "BI-RADS", "Margin", "Density" and "Age" are processed; any
+    other column is left untouched.
+
+    :param df: DataFrame containing the four columns listed above.
+    :return: the same DataFrame object, modified in place.
+    :raises KeyError: if one of the expected columns is absent.
+    """
+    columns = ["BI-RADS", "Margin", "Density", "Age"]
+    for column in columns:
+        # Assign the filled column back instead of calling
+        # fillna(inplace=True) on df[column]: that form works on an
+        # intermediate Series (chained assignment) and may leave df
+        # unchanged, notably with pandas Copy-on-Write enabled.
+        df[column] = df[column].fillna(df[column].mean())
+    return df
+
+
+def process_na(df, action):
+    """Handle missing values in *df* according to *action*.
+
+    :param df: DataFrame to process.
+    :param action: "drop" removes every row that has a missing value; any
+        other value delegates to replace_values.
+    :return: the processed DataFrame.
+    """
+    if action != "drop":
+        return replace_values(df)
+    return df.dropna()
+
+
+def encode_columns(df):
+    """Replace the "Shape" column with integer labels from a LabelEncoder.
+
+    Previously the encoder was fitted and then discarded, so the call had no
+    effect; the encoded labels are now written back to the frame.
+
+    :param df: DataFrame containing a "Shape" column.
+    :return: the same DataFrame object, with "Shape" encoded in place.
+    :raises KeyError: if the "Shape" column is absent.
+    """
+    encoder = LabelEncoder()
+    # NOTE(review): missing values in "Shape" are not handled here; confirm
+    # that callers drop or impute them before encoding.
+    df["Shape"] = encoder.fit_transform(df["Shape"])
+    return df
+
+
+def parse_data(source, action):
+    """Read the CSV at *source* and handle its missing values.
+
+    Cells equal to "?" are read as missing values before *action* is applied.
+
+    :param source: path or buffer handed to pandas' read_csv.
+    :param action: missing-value strategy forwarded to process_na.
+    :return: the DataFrame returned by process_na.
+    """
+    raw = read_csv(source, na_values="?")
+    return process_na(raw, action)
--- a/src/processing.py
+++ b/src/processing.py
@ -0,0 +1 @@
+from preprocessing import parse_data