Plot roc_auc_curve and add plumbing for plotting

2020-12-08 14:08:47 +01:00 · 2020-12-08 14:08:47 +01:00 · c5a147e5df
parent 3dd13a6fb5
commit c5a147e5df
2 changed files with 115 additions and 36 deletions
--- a/docs/assets/roc_auc_curve.png
+++ b/docs/assets/roc_auc_curve.png
--- a/src/processing.py
+++ b/src/processing.py
@ -1,12 +1,15 @@
-from numpy import mean
-from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
-from sklearn.model_selection import cross_val_score
+from numpy import mean, arange
+from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
+from sklearn.model_selection import cross_val_predict
 from sklearn.naive_bayes import GaussianNB
 from sklearn.neural_network import MLPClassifier
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.preprocessing import scale
 from sklearn.svm import LinearSVC
 from sklearn.tree import DecisionTreeClassifier
+from seaborn import set_theme
+from matplotlib.pyplot import *
+from pandas import DataFrame

 from sys import argv

@ -24,65 +27,141 @@ def choose_model(model):
        return DecisionTreeClassifier(random_state=42)
    elif model == "neuralnet":
        return MLPClassifier(hidden_layer_sizes=10)
-    else:
-        print("Unknown model selected. The choices are: ")
-        print("gnb: Gaussian Naive Bayes")
-        print("svc: Linear Support Vector Classification")
-        print("knn: K-neighbors")
-        print("tree: Decision tree")
-        print("neuralnet: MLP Classifier")
-        exit()


-def predict_data(data, target, model):
+def predict_data(data, target, model, results):
+    """Cross-validate one model and append its averaged metrics to ``results``.
+
+    Args:
+        data: Feature table; assumed to be a pandas DataFrame because it is
+            indexed with ``.iloc`` -- TODO confirm against parse_data.
+        target: Labels aligned with ``data``, also indexed with ``.iloc``.
+        model: Short model name understood by ``choose_model``.
+        results: Table the new row is appended to via ``populate_results``.
+
+    Returns:
+        Whatever ``populate_results`` returns for the fold-averaged fpr, tpr,
+        AUC and confusion matrix.
+    """
+    # Keep the short name: ``model`` is rebound to the object returned by
+    # choose_model on the next line, so comparing it with "knn" afterwards
+    # would test the estimator rather than the requested name.
+    model_name = model
     model = choose_model(model)
-    if model == "knn":
-        data = scale(data)
-    accuracy_scores = []
-    confusion_matrices = []
-    auc = []
+    if model_name == "knn":
+        # scale() hands back a plain array; re-wrap it so the .iloc lookups
+        # in the fold loop keep working.
+        data = DataFrame(scale(data), index=data.index, columns=data.columns)
+    confusion_matrices, auc, fpr, tpr = [], [], [], []
    for train_index, test_index in split_k_sets(data):
        model.fit(data.iloc[train_index], target.iloc[train_index])
        prediction = model.predict(data.iloc[test_index])
-        accuracy_scores.append(accuracy_score(target.iloc[test_index], prediction))
        confusion_matrices.append(confusion_matrix(target.iloc[test_index], prediction))
        auc.append(roc_auc_score(target.iloc[test_index], prediction))
-    cv_score = cross_val_score(model, data, target, cv=10)
-    evaluate_performance(
-        confusion_matrix=mean(confusion_matrices, axis=0),
-        accuracy=mean(accuracy_scores),
-        cv_score=mean(cv_score),
+        fpr_item, tpr_item, _ = roc_curve(target.iloc[test_index], prediction)
+        fpr.append(fpr_item)
+        tpr.append(tpr_item)
+    # NOTE(review): averaging with axis=0 presumes every fold produced the
+    # same number of ROC points -- confirm this holds for all folds.
+    populated_results = populate_results(
+        df=results,
+        model=model,
+        fpr=mean(fpr, axis=0),
+        tpr=mean(tpr, axis=0),
        auc=mean(auc),
+        confusion_matrix=mean(confusion_matrices, axis=0),
+    )
+    return populated_results
+
+
+def plot_roc_auc_curve(model, results):
+    """Draw one model's ROC curve on the current axes.
+
+    Args:
+        model: Index label of the row to draw.
+        results: Table with "fpr", "tpr" and "auc" columns, indexed by model.
+    """
+    row = results.loc[model]
+    rounded_auc = round(row["auc"], 3)
+    plot(
+        row["fpr"],
+        row["tpr"],
+        label=f"{model} , AUC={rounded_auc}",
+    )
+    # arange leaves out its stop value, so build 0.0 .. 1.0 from integers to
+    # make sure the 1.0 tick is present on both axes.
+    ticks = arange(11) / 10
+    xticks(ticks)
+    yticks(ticks)
+    legend(loc="lower right")
+
+
+def plot_confusion_matrix(model, results):
+    """Write one model's confusion-matrix values onto the current axes.
+
+    Each value is placed at (column, row) of its cell instead of stacking
+    every row at a single point.
+
+    Args:
+        model: Index label of the row to draw.
+        results: Table with a "confusion_matrix" column, indexed by model.
+    """
+    matrix = results.loc[model]["confusion_matrix"]
+    classes = ["Negative", "Positive"]
+    for row_index, row in enumerate(matrix):
+        for column_index, value in enumerate(row):
+            text(
+                x=column_index,
+                y=row_index,
+                s=f"{value:.1f}",
+                ha="center",
+                va="center",
+            )
+    xticks(ticks=arange(len(classes)), labels=classes)
+    yticks(ticks=arange(len(classes)), labels=classes)
+    # Centre the cells on their ticks; the y range is flipped so that the
+    # first matrix row ends up at the top.
+    xlim(-0.5, len(classes) - 0.5)
+    ylim(len(classes) - 0.5, -0.5)
+    # NOTE(review): plot_individual_figure calls this once per model on the
+    # same figure, so annotations of different models land on top of each
+    # other -- consider one figure (or subplot) per model.
+
+
+def choose_plot_type(type, model, results):
+    """Call the plotting routine registered for ``type``.
+
+    Args:
+        type: "roc" or "confusion_matrix"; any other value draws nothing.
+        model: Index label forwarded to the plotting routine.
+        results: Results table forwarded to the plotting routine.
+    """
+    handlers = (
+        ("roc", plot_roc_auc_curve),
+        ("confusion_matrix", plot_confusion_matrix),
+    )
+    for kind, handler in handlers:
+        if type == kind:
+            handler(model, results)
+            return
+
+
+def plot_individual_figure(results, type, x_axis, y_axis, fig_title):
+    """Draw every model in ``results`` on one figure, save it, then show it.
+
+    Args:
+        results: Table whose index holds the model names to draw.
+        type: Plot kind forwarded to ``choose_plot_type``.
+        x_axis: Label for the x axis.
+        y_axis: Label for the y axis.
+        fig_title: Figure title; also the source of the output file name
+            (lower-cased, spaces replaced by underscores).
+    """
+    fig = figure(figsize=(8, 6))
+    for model in results.index:
+        choose_plot_type(type, model, results)
+    xlabel(x_axis)
+    ylabel(y_axis)
+    title(fig_title)
+    # Save before show(): the written file then does not depend on what
+    # happens to the interactive window (resizing, closing, interruption).
+    fig.savefig(f"docs/assets/{fig_title.replace(' ', '_').lower()}.png")
+    show()
+
+
+# TODO Add cross_val_score
+def plot_all_figures(results):
+    """Produce the ROC figure and the confusion-matrix figure.
+
+    Args:
+        results: Results table indexed by model name, passed unchanged to
+            ``plot_individual_figure``.
+    """
+    set_theme()
+    plot_individual_figure(
+        results,
+        type="roc",
+        x_axis="fpr",
+        y_axis="tpr",
+        fig_title="ROC AUC curve",
+    )
+    # scikit-learn confusion matrices put true classes on rows and predicted
+    # classes on columns, so these axes are not fpr/tpr.
+    plot_individual_figure(
+        results,
+        type="confusion_matrix",
+        x_axis="Predicted label",
+        y_axis="True label",
+        fig_title="Confusion Matrix",
    )


-def evaluate_performance(confusion_matrix, accuracy, cv_score, auc):
-    print("Accuracy Score: " + str(accuracy))
-    print("Confusion matrix: ")
-    print(str(confusion_matrix))
-    print("Cross validation score: " + str(cv_score))
-    print("AUC: " + str(auc))
+def create_result_dataframes():
+    """Build the empty results table and return it twice.
+
+    Returns:
+        A pair holding the same empty DataFrame, indexed by "model" and with
+        the columns "fpr", "tpr", "auc" and "confusion_matrix".
+    """
+    column_names = ["model", "fpr", "tpr", "auc", "confusion_matrix"]
+    empty_results = DataFrame(columns=column_names).set_index("model")
+    return empty_results, empty_results
+
+
+def populate_results(df, model, fpr, tpr, auc, confusion_matrix):
+    """Return ``df`` extended by one row describing ``model``.
+
+    Args:
+        df: Existing results table; it is not modified.
+        model: Estimator (or its text form); formatted to a string and
+            translated with ``rename_model``.
+        fpr: False-positive rates stored in the "fpr" cell.
+        tpr: True-positive rates stored in the "tpr" cell.
+        auc: Value stored in the "auc" cell.
+        confusion_matrix: Matrix stored in the "confusion_matrix" cell.
+
+    Returns:
+        A new DataFrame with a fresh 0..n-1 index.
+    """
+    # Function-scope import: the visible file-level imports bring in only
+    # DataFrame from pandas.
+    from pandas import concat
+
+    row = {
+        "model": rename_model(model=f"{model}"),
+        "fpr": fpr,
+        "tpr": tpr,
+        "auc": auc,
+        "confusion_matrix": confusion_matrix,
+    }
+    # DataFrame.append no longer exists in pandas 2.x; concat works on old
+    # and new releases alike.
+    return concat([df, DataFrame([row])], ignore_index=True)
+
+
+def rename_model(model):
+    """Translate an estimator's text form into its short command-line name.
+
+    Only the class name in front of the first "(" is used, so the result
+    does not depend on which constructor arguments the text form lists.
+
+    Args:
+        model: Text such as "LinearSVC(random_state=42)".
+
+    Returns:
+        One of "gnb", "svc", "knn", "tree" or "neuralnet".
+
+    Raises:
+        KeyError: If the class name is not one of the five known models.
+    """
+    short_names = {
+        "GaussianNB": "gnb",
+        "LinearSVC": "svc",
+        "KNeighborsClassifier": "knn",
+        "DecisionTreeClassifier": "tree",
+        "MLPClassifier": "neuralnet",
+    }
+    class_name = model.split("(", 1)[0].strip()
+    try:
+        return short_names[class_name]
+    except KeyError:
+        # Report the full text that was passed in, not just the class name.
+        raise KeyError(model) from None


 def usage():
-    print("Usage: " + argv[0] + "<preprocessing action> <model>")
+    """Print the command-line help and terminate the program."""
+    # The leading space keeps the program name and the argument apart.
+    print("Usage: " + argv[0] + " <preprocessing action>")
    print("preprocessing actions:")
    print("fill: fills the na values with the mean")
    print("drop: drops the na values")
-    print("models:")
-    print("gnb: Gaussian Naive Bayes")
-    print("svc: Linear Support Vector Classification")
-    print("knn: K-neighbors")
-    print("tree: Decision tree")
-    print("neuralnet: MLP Classifier")
    exit()


 def main():
-    if len(argv) != 3:
+    """Evaluate every supported model on the data set and plot the results.
+
+    Expects exactly one command-line argument, the preprocessing action
+    handed to ``parse_data``; otherwise the usage text is shown.
+    """
+    # Function-scope import: the visible file-level imports bring in only
+    # DataFrame from pandas.
+    from pandas import concat
+
+    models = ["gnb", "svc", "knn", "tree", "neuralnet"]
+    if len(argv) != 2:
        usage()
    data, target = parse_data(source="data/mamografia.csv", action=str(argv[1]))
-    predict_data(data=data, target=target, model=str(argv[2]))
+    individual_result, complete_results = create_result_dataframes()
+    per_model_results = [
+        predict_data(data=data, target=target, model=model, results=individual_result)
+        for model in models
+    ]
+    # One concat replaces the nested DataFrame.append calls, which no longer
+    # exist in pandas 2.x.
+    complete_results = concat([complete_results, *per_model_results])
+    indexed_results = complete_results.set_index("model")
+    plot_all_figures(results=indexed_results)


 if __name__ == "__main__":