Add preprocessing module

2020-10-28 19:29:50 +01:00 · 2020-10-28 19:29:50 +01:00 · 127e13d370
parent f919985414
commit 127e13d370
4 changed files with 102 additions and 1 deletions
--- a/docs/Experiments.org
+++ b/docs/Experiments.org
@ -0,0 +1,71 @@
 * Experiments
 We first gather information about our dataset by computing summary statistics for its attributes.
 #+BEGIN_SRC python
 from pandas import read_csv
 from sklearn.preprocessing import LabelEncoder
 def replace_values(df):
    """Fill missing numeric attributes with their column mean.

    Only the "BI-RADS", "Margin", "Density" and "Age" columns are imputed;
    every other column is left as it is. The frame is updated in place and
    also returned.

    Args:
        df: DataFrame holding at least the four columns listed above.

    Returns:
        The same DataFrame object, with the gaps in those columns filled.

    Raises:
        KeyError: If one of the four columns is absent from ``df``.
    """
    columns = ["BI-RADS", "Margin", "Density", "Age"]
    for column in columns:
        # Assign the filled column back instead of calling
        # ``fillna(inplace=True)`` on the ``df[column]`` selection: that
        # chained in-place form is deprecated and, with pandas
        # Copy-on-Write, only changes a temporary and leaves ``df`` untouched.
        df[column] = df[column].fillna(df[column].mean())
    return df
 def process_na(df, action):
    """Handle missing values by dropping rows or imputing them.

    Args:
        df: DataFrame that may contain missing values.
        action: ``"drop"`` discards every row with a missing value; any
            other value hands the frame to ``replace_values``.

    Returns:
        The result of ``df.dropna()`` for ``"drop"``, otherwise whatever
        ``replace_values(df)`` returns.
    """
    drop_rows = action == "drop"
    return df.dropna() if drop_rows else replace_values(df)
 def encode_columns(df):
    """Return a copy of ``df`` with the "Shape" column label-encoded.

    The input frame is not modified, so callers that ignore the return
    value observe no change.

    Args:
        df: DataFrame containing a "Shape" column.

    Returns:
        A new DataFrame equal to ``df`` except that "Shape" holds the
        integer codes produced by ``LabelEncoder``.
    """
    encoder = LabelEncoder()
    encoded_df = df.copy()
    # fit_transform both learns the labels and maps them to codes; fitting
    # alone and discarding the encoder would leave the data unencoded.
    encoded_df["Shape"] = encoder.fit_transform(df["Shape"])
    return encoded_df
 def parse_data(source, action):
    """Load a CSV file and handle its missing values.

    Cells containing "?" are read as missing values.

    Args:
        source: Path or buffer handed to ``pandas.read_csv``.
        action: Missing-value strategy, forwarded to ``process_na``.

    Returns:
        Whatever ``process_na`` returns for the loaded frame.
    """
    raw_df = read_csv(source, na_values="?")
    return process_na(raw_df, action)
 #+END_SRC
 #+RESULTS:
 #+BEGIN_SRC python
 # Load the dataset with the "drop" action (rows with missing values are
 # removed) and print summary statistics.
 df = parse_data("../data/mamografia.csv", "drop")
 print(df.describe())
 #+END_SRC
 #+RESULTS:
 :           BI-RADS         Age      Margin     Density
 : count  847.000000  847.000000  847.000000  847.000000
 : mean     4.322314   55.842975    2.833530    2.909091
 : std      0.703762   14.603754    1.564049    0.370292
 : min      0.000000   18.000000    1.000000    1.000000
 : 25%      4.000000   46.000000    1.000000    3.000000
 : 50%      4.000000   57.000000    3.000000    3.000000
 : 75%      5.000000   66.000000    4.000000    3.000000
 : max      6.000000   96.000000    5.000000    4.000000
 We observe that *margin* and *density* are the columns with the most unknown values. The cohort is mostly middle-aged, the BI-RADS score falls mostly in the suspicious category, the density is mostly low, and the margin falls mostly in the microlobulated/obscured categories.
 Next, instead of dropping the rows that contain missing values, we impute the missing numeric values with the mean of their column.
 #+BEGIN_SRC python
 # Load the dataset with the "replace" action (missing numeric values are
 # filled with their column mean) and print summary statistics.
 df = parse_data("../data/mamografia.csv", "replace")
 print(df.describe())
 #+END_SRC
 #+RESULTS:
 :           BI-RADS         Age      Margin     Density
 : count  961.000000  961.000000  961.000000  961.000000
 : mean     4.296142   55.487448    2.796276    2.910734
 : std      0.705555   14.442373    1.526880    0.365074
 : min      0.000000   18.000000    1.000000    1.000000
 : 25%      4.000000   45.000000    1.000000    3.000000
 : 50%      4.000000   57.000000    3.000000    3.000000
 : 75%      5.000000   66.000000    4.000000    3.000000
 : max      6.000000   96.000000    5.000000    4.000000
--- a/shell.nix
+++ b/shell.nix
@ -2,4 +2,7 @@
 with pkgs;
-mkShell { buildInputs = [ python38Packages.scikitlearn ]; }
+mkShell {
  # Python 3.8 interpreter plus the pandas and scikit-learn packages.
  buildInputs =
    [ python38 python38Packages.pandas python38Packages.scikitlearn ];
 }
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@ -0,0 +1,26 @@
 from pandas import read_csv
 from sklearn.preprocessing import LabelEncoder
 def replace_values(df):
    """Fill missing numeric attributes with their column mean.

    Only the "BI-RADS", "Margin", "Density" and "Age" columns are imputed;
    every other column is left as it is. The frame is updated in place and
    also returned.

    Args:
        df: DataFrame holding at least the four columns listed above.

    Returns:
        The same DataFrame object, with the gaps in those columns filled.

    Raises:
        KeyError: If one of the four columns is absent from ``df``.
    """
    columns = ["BI-RADS", "Margin", "Density", "Age"]
    for column in columns:
        # Assign the filled column back instead of calling
        # ``fillna(inplace=True)`` on the ``df[column]`` selection: that
        # chained in-place form is deprecated and, with pandas
        # Copy-on-Write, only changes a temporary and leaves ``df`` untouched.
        df[column] = df[column].fillna(df[column].mean())
    return df
 def process_na(df, action):
    """Handle missing values by dropping rows or imputing them.

    Args:
        df: DataFrame that may contain missing values.
        action: ``"drop"`` discards every row with a missing value; any
            other value hands the frame to ``replace_values``.

    Returns:
        The result of ``df.dropna()`` for ``"drop"``, otherwise whatever
        ``replace_values(df)`` returns.
    """
    drop_rows = action == "drop"
    return df.dropna() if drop_rows else replace_values(df)
 def encode_columns(df):
    """Return a copy of ``df`` with the "Shape" column label-encoded.

    The input frame is not modified, so callers that ignore the return
    value observe no change.

    Args:
        df: DataFrame containing a "Shape" column.

    Returns:
        A new DataFrame equal to ``df`` except that "Shape" holds the
        integer codes produced by ``LabelEncoder``.
    """
    encoder = LabelEncoder()
    encoded_df = df.copy()
    # fit_transform both learns the labels and maps them to codes; fitting
    # alone and discarding the encoder would leave the data unencoded.
    encoded_df["Shape"] = encoder.fit_transform(df["Shape"])
    return encoded_df
 def parse_data(source, action):
    """Load a CSV file and handle its missing values.

    Cells containing "?" are read as missing values.

    Args:
        source: Path or buffer handed to ``pandas.read_csv``.
        action: Missing-value strategy, forwarded to ``process_na``.

    Returns:
        Whatever ``process_na`` returns for the loaded frame.
    """
    raw_df = read_csv(source, na_values="?")
    return process_na(raw_df, action)
--- a/src/processing.py
+++ b/src/processing.py
@ -0,0 +1 @@
 from preprocessing import parse_data