diff --git a/docs/assets/roc_auc_curve.png b/docs/assets/roc_auc_curve.png new file mode 100644 index 0000000..bc2f71f Binary files /dev/null and b/docs/assets/roc_auc_curve.png differ diff --git a/src/processing.py b/src/processing.py index 8e6c704..91a5b0d 100644 --- a/src/processing.py +++ b/src/processing.py @@ -1,12 +1,15 @@ -from numpy import mean -from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score -from sklearn.model_selection import cross_val_score +from numpy import mean, arange +from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve +from sklearn.model_selection import cross_val_predict from sklearn.naive_bayes import GaussianNB from sklearn.neural_network import MLPClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import scale from sklearn.svm import LinearSVC from sklearn.tree import DecisionTreeClassifier +from seaborn import set_theme +from matplotlib.pyplot import * +from pandas import DataFrame from sys import argv @@ -24,65 +27,141 @@ def choose_model(model): return DecisionTreeClassifier(random_state=42) elif model == "neuralnet": return MLPClassifier(hidden_layer_sizes=10) - else: - print("Unknown model selected. 
def _positive_class_scores(estimator, features):
    """Return the most informative per-sample score the estimator offers.

    ROC analysis needs a ranking, not hard labels, so prefer
    ``predict_proba`` (second column, i.e. ``classes_[1]``), then
    ``decision_function``, and only fall back to ``predict``.
    """
    if hasattr(estimator, "predict_proba"):
        return estimator.predict_proba(features)[:, 1]
    if hasattr(estimator, "decision_function"):
        return estimator.decision_function(features)
    return estimator.predict(features)


def predict_data(data, target, model, results):
    """Cross-validate one model and append its averaged metrics to ``results``.

    Args:
        data: feature table; indexed positionally with ``.iloc``.
        target: labels aligned with ``data``; indexed with ``.iloc``.
        model: short model name understood by ``choose_model``.
        results: frame handed to ``populate_results`` to receive the new row.

    Returns:
        Whatever ``populate_results`` returns for the averaged fold metrics.

    Raises:
        ValueError: if ``choose_model`` does not recognise ``model``.
    """
    from numpy import interp, linspace

    # Keep the short name in ``model``: the scaling check below compares
    # against it, which silently never matched once it was overwritten.
    estimator = choose_model(model)
    if estimator is None:
        raise ValueError(f"Unknown model selected: {model!r}")
    if model == "knn":
        # scale() returns a bare ndarray; rewrap it so .iloc keeps working.
        data = DataFrame(scale(data), index=data.index, columns=data.columns)

    # Folds yield ROC curves with different numbers of points, so every
    # curve is resampled onto one shared FPR grid before averaging.
    fpr_grid = linspace(0.0, 1.0, 101)
    confusion_matrices, auc, tpr = [], [], []
    for train_index, test_index in split_k_sets(data):
        estimator.fit(data.iloc[train_index], target.iloc[train_index])
        test_data = data.iloc[test_index]
        test_target = target.iloc[test_index]
        prediction = estimator.predict(test_data)
        confusion_matrices.append(confusion_matrix(test_target, prediction))
        scores = _positive_class_scores(estimator, test_data)
        auc.append(roc_auc_score(test_target, scores))
        fpr_item, tpr_item, _ = roc_curve(test_target, scores)
        fold_tpr = interp(fpr_grid, fpr_item, tpr_item)
        fold_tpr[0] = 0.0  # every ROC curve starts at the origin
        tpr.append(fold_tpr)
    mean_tpr = mean(tpr, axis=0)
    mean_tpr[-1] = 1.0  # ...and ends at (1, 1)
    populated_results = populate_results(
        df=results,
        model=estimator,
        fpr=fpr_grid,
        tpr=mean_tpr,
        auc=mean(auc),
        confusion_matrix=mean(confusion_matrices, axis=0),
    )
    return populated_results


def plot_roc_auc_curve(model, results):
    """Add one model's mean ROC curve, labelled with its AUC, to the figure.

    Args:
        model: index label of the row to plot.
        results: frame indexed by model with ``fpr``, ``tpr`` and ``auc``.
    """
    row = results.loc[model]
    rounded_auc = round(row["auc"], 3)
    plot(
        row["fpr"],
        row["tpr"],
        label=f"{model} , AUC={rounded_auc}",
    )
    # arange(0.0, 1.0, 0.1) stops at 0.9; build the ticks from integers so
    # 1.0 is included without floating-point end-point surprises.
    ticks = arange(11) / 10
    xticks(ticks)
    yticks(ticks)
    legend(loc="lower right")


def plot_confusion_matrix(model, results):
    """Write one model's mean confusion matrix into the shared 2x2 grid.

    Each cell of the grid lists one line per model, so calling this once per
    model on the same axes does not stack the numbers on top of each other.

    Args:
        model: index label of the row to plot.
        results: frame indexed by model with a ``confusion_matrix`` column.
    """
    matrix = results.loc[model]["confusion_matrix"]
    classes = ["Negative", "Positive"]
    models = list(results.index)
    # Vertical slot of this model inside every cell, centred on the cell.
    line_height = 0.8 / len(models)
    offset = (models.index(model) - (len(models) - 1) / 2) * line_height
    for row_index, row in enumerate(matrix):
        for column_index, value in enumerate(row):
            text(
                x=column_index,
                y=row_index + offset,
                s=f"{model}: {value:.1f}",
                ha="center",
                va="center",
            )
    xticks(ticks=arange(len(classes)), labels=classes)
    yticks(ticks=arange(len(classes)), labels=classes)
    xlim(-0.5, len(classes) - 0.5)
    # Inverted so matrix row 0 is drawn at the top, as when printed.
    ylim(len(classes) - 0.5, -0.5)
def choose_plot_type(type, model, results):
    """Dispatch to the plotting routine for ``type``.

    Args:
        type: ``"roc"`` or ``"confusion_matrix"``.
        model: index label of the row in ``results`` to draw.
        results: frame indexed by model name.

    Raises:
        ValueError: if ``type`` is not a known plot type (previously this
            was silently ignored, producing an empty figure).
    """
    if type == "roc":
        plot_roc_auc_curve(model, results)
    elif type == "confusion_matrix":
        plot_confusion_matrix(model, results)
    else:
        raise ValueError(f"Unknown plot type: {type!r}")


def plot_individual_figure(results, type, x_axis, y_axis, fig_title):
    """Draw every model onto one figure, save it, then display it.

    The image is written to ``docs/assets/`` under the lower-cased title
    with spaces replaced by underscores.

    Args:
        results: frame indexed by model name.
        type: plot type forwarded to ``choose_plot_type``.
        x_axis: x-axis label.
        y_axis: y-axis label.
        fig_title: figure title, also used to derive the file name.
    """
    import os

    fig = figure(figsize=(8, 6))
    for model in results.index:
        choose_plot_type(type, model, results)
    xlabel(x_axis)
    ylabel(y_axis)
    title(fig_title)
    output_path = f"docs/assets/{fig_title.replace(' ', '_').lower()}.png"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # Save before show(): once an interactive window is closed the figure
    # may already be torn down and the saved image would be blank.
    fig.savefig(output_path)
    show()


# TODO Add cross_val_score
def plot_all_figures(results):
    """Render and save the ROC figure and the confusion-matrix figure."""
    set_theme()
    plot_individual_figure(
        results,
        type="roc",
        x_axis="fpr",
        y_axis="tpr",
        fig_title="ROC AUC curve",
    )
    # Confusion matrices put predictions on the columns and truth on the
    # rows; the ROC axis names do not apply to this figure.
    plot_individual_figure(
        results,
        type="confusion_matrix",
        x_axis="Predicted label",
        y_axis="True label",
        fig_title="Confusion Matrix",
    )


def create_result_dataframes():
    """Return two independent empty result frames indexed by ``model``.

    Returns:
        A pair of distinct frames with columns ``fpr``, ``tpr``, ``auc`` and
        ``confusion_matrix`` (the same object used to be returned twice).
    """
    columns = ["model", "fpr", "tpr", "auc", "confusion_matrix"]

    def empty_frame():
        return DataFrame(columns=columns).set_index("model")

    return empty_frame(), empty_frame()


def populate_results(df, model, fpr, tpr, auc, confusion_matrix):
    """Return a copy of ``df`` with one extra row describing ``model``.

    Args:
        df: existing results; left unmodified.
        model: estimator (or its repr) translated through ``rename_model``.
        fpr: false-positive rates of the ROC curve.
        tpr: true-positive rates of the ROC curve.
        auc: area under the ROC curve.
        confusion_matrix: confusion matrix for the model.

    Returns:
        A new frame with a fresh 0..n-1 index; the short model name is
        stored in a ``model`` column.
    """
    from pandas import concat

    record = {
        "model": rename_model(model=f"{model}"),
        "fpr": fpr,
        "tpr": tpr,
        "auc": auc,
        "confusion_matrix": confusion_matrix,
    }
    # A list holding one dict makes one row; array-like values stay whole
    # inside their cell instead of being spread over several rows.
    row = DataFrame([record])
    if df.empty:
        # Same column order DataFrame.append produced: existing columns
        # first, new ones after.
        ordered = list(df.columns)
        ordered += [name for name in row.columns if name not in ordered]
        return row.reindex(columns=ordered)
    # DataFrame.append was removed in pandas 2.0; concat is its replacement.
    return concat([df, row], ignore_index=True)


def rename_model(model):
    """Translate an estimator repr into the short model name.

    Only the class name before the first ``(`` is used, so the lookup does
    not depend on which constructor arguments the repr happens to show.

    Args:
        model: string such as ``"LinearSVC(random_state=42)"``.

    Returns:
        One of ``gnb``, ``svc``, ``knn``, ``tree`` or ``neuralnet``.

    Raises:
        KeyError: if the class has no registered short name.
    """
    short_names = {
        "GaussianNB": "gnb",
        "LinearSVC": "svc",
        "KNeighborsClassifier": "knn",
        "DecisionTreeClassifier": "tree",
        "MLPClassifier": "neuralnet",
    }
    class_name = str(model).split("(", 1)[0].strip()
    try:
        return short_names[class_name]
    except KeyError:
        raise KeyError(f"no short name registered for model {model!r}") from None
usage(): - print("Usage: " + argv[0] + " ") + print("Usage: " + argv[0] + "") print("preprocessing actions:") print("fill: fills the na values with the mean") print("drop: drops the na values") - print("models:") - print("gnb: Gaussian Naive Bayes") - print("svc: Linear Support Vector Classification") - print("knn: K-neighbors") - print("tree: Decision tree") - print("neuralnet: MLP Classifier") exit() def main(): - if len(argv) != 3: + models = ["gnb", "svc", "knn", "tree", "neuralnet"] + if len(argv) != 2: usage() data, target = parse_data(source="data/mamografia.csv", action=str(argv[1])) - predict_data(data=data, target=target, model=str(argv[2])) + individual_result, complete_results = create_result_dataframes() + for model in models: + model_results = predict_data( + data=data, target=target, model=model, results=individual_result + ) + complete_results = complete_results.append( + individual_result.append(model_results) + ) + indexed_results = complete_results.set_index("model") + plot_all_figures(results=indexed_results) if __name__ == "__main__":