Loading 01_übung/README.md +3 −6 Original line number Diff line number Diff line Loading @@ -4,13 +4,10 @@ <!-- @import "[TOC]" {cmd="toc" depthFrom=1 depthTo=6 orderedList=false} --> <!-- code_chunk_output --> - [About this folder 🤓](#about-this-folder-) - [expml-2 01_übung <!-- omit in toc -->](#expml-2-01_übung-omit-in-toc-) - [Table of contents <!-- omit in toc -->](#table-of-contents-omit-in-toc-) - [About this folder 🤓](#about-this-folder) - [Structure](#structure) - [Goals 🏆](#goals) - [What is this special task about? 🤭](#what-is-this-special-task-about) - [It did't work. Why? 🤯](#it-didt-work-why) - [Goals 🏆](#goals-) - [What is this special task about? 🤭](#what-is-this-special-task-about-) - [It did't work. Why? 🤯](#it-didt-work-why-) <!-- /code_chunk_output --> Loading 02_übung/run_time_analysis.py +41 −6 Original line number Diff line number Diff line # LOAD PACKAGES import statistics from scipy.stats import ttest_ind as t_test, levene import matplotlib.pyplot as plt sample_katha = [3.21, 3.24, 3.4, 3.48, 3.16, 3.54, 3.51, 3.25] sample_stella = [2.29] standard_deviation = statistics.stdev(sample) print(statistics.mean(sample)) print(standard_deviation) # save run time data samples = dict() users = "Katha", "Stella" samples["Katha"] = dict() samples["Stella"] = dict() samples["Katha"]["sample"] = [4.034196138381958, 3.9400477409362793, 3.3960583209991455, 3.52065372467041, 3.604139804840088, 3.4799697399139404, 3.393357515335083, 3.4734957218170166, 3.4474666118621826, 3.430788040161133, 3.2149417650819933, 3.249572697397466, 3.4181864746693829, 3.4801234789720487, 3.163874927349273] samples["Stella"]["sample"] = [2.289949417114258, 3.029839515686035, 2.94633150100708, 3.0053203105926514, 4.141581773757935, 2.8266165256500244, 2.954447746276855, 2.8661489486694336, 2.7565417289733887, 2.7601661682128906, 2.3141674995422363, 2.191812515258789, 3.725212335586548, 2.219719648361206, 2.2193312644958496] # standard derivation for time data is 0.15 # calculate 
discriptive statistics for user in users: sample = samples[user]["sample"] samples[user]["mean"] = statistics.mean(sample) samples[user]["sd"] = statistics.stdev(sample) samples[user]["range"] = max(sample)-min(sample) print("-"*10, "\n", user) print("Mean: ", samples[user]["mean"]) print("Standard Derivation: ", samples[user]["sd"]) print("Range: ", samples[user]["range"]) # check conditions for t-test print(levene(samples["Katha"]["sample"], samples["Stella"]["sample"])) # compare the two run time samples (assuming Katha's laptop takes longer) print(t_test(samples["Katha"]["sample"], samples["Stella"]["sample"])) # plot run times plt.plot(samples["Katha"]["sample"],'g*', samples["Stella"]["sample"], 'ro') # individual points plt.hlines(y=[samples["Katha"]["mean"], samples["Stella"]["mean"]], # means of the samples xmin=0, xmax=len(samples["Stella"]["sample"] ), colors=['g', 'r'], lw=0.2) plt.ylabel("Run time \n(in sec)") plt.legend(['Katha', 'Stella'], loc="upper right", borderaxespad=0).set_title("Sample") plt.title("Run Times") plt.savefig("run_times.png", bbox_inches='tight') plt.show() plt.clf() 02_übung/übung2_code.py +10 −8 Original line number Diff line number Diff line Loading @@ -48,6 +48,7 @@ datasets = [make_moons(noise=0.3, random_state=0), linearly_separable ] # plot data and decision boundaries figure = plt.figure(figsize=(27, 11)) i = 1 # iterate over datasets Loading @@ -69,10 +70,10 @@ for ds_cnt, ds in enumerate(datasets): ax = plt.subplot(len(datasets), len(classifiers) + 1, i) if ds_cnt == 0: ax.set_title("Input data") # Plot the training points # plot the training points ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k') # Plot the testing points # plot the testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors='k') ax.set_xlim(xx.min(), xx.max()) Loading @@ -87,21 +88,21 @@ for ds_cnt, ds in enumerate(datasets): clf.fit(X_train, y_train) score = clf.score(X_test, 
y_test) # Plot the decision boundary. For that, we will assign a color to each # plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. if hasattr(clf, "decision_function"): Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) else: Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] # Put the result into a color plot # put the result into a color plot Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) # Plot the training points # plot the training points ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k') # Plot the testing points # plot the testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, edgecolors='k', alpha=0.6) Loading @@ -119,6 +120,7 @@ end = time.time() print(end - start) plt.tight_layout() plt.savefig("plots_datasets_and_classifiers.png", bbox_inches='tight') # not needed in interactive Jupyter session plt.show() plt.savefig("plots.png") # not needed in interactive Jupyter session plt.clf() 04_übung/README.md +5 −7 Original line number Diff line number Diff line Loading @@ -5,14 +5,12 @@ <!-- code_chunk_output --> - [expml-2 04_übung <!-- omit in toc -->](#expml-2-04_übung-omit-in-toc-) - [Table of contents <!-- omit in toc -->](#table-of-contents-omit-in-toc-) - [About this folder 🤓](#about-this-folder) - [About this folder 🤓](#about-this-folder-) - [Structure](#structure) - [Goals 🏆](#goals) - [Multiprocessing 📶](#multiprocessing) - [Correcting the mistakes in the given code like pros 😎](#correcting-the-mistakes-in-the-given-code-like-pros) - [How can we add functionality? 👀](#how-can-we-add-functionality) - [Goals 🏆](#goals-) - [Multiprocessing 📶](#multiprocessing-) - [Correcting the mistakes in the given code like pros 😎](#correcting-the-mistakes-in-the-given-code-like-pros-) - [How can we add functionality? 
👀](#how-can-we-add-functionality-) <!-- /code_chunk_output --> Loading project/evaluation.pydeleted 100644 → 0 +0 −82 Original line number Diff line number Diff line # LOAD PACKAGES from sklearn.dummy import DummyClassifier import pandas as pd import numpy as np #from sklearn.metrics import classification_report from sklearn.metrics import confusion_matrix, \ f1_score, balanced_accuracy_score, \ precision_score, recall_score, \ roc_curve, roc_auc_score from sklearn.preprocessing import LabelBinarizer # for all baseline models for bl_index, bl_label in zip(range(len(baseline_labels)), baseline_labels): # train baseline model bl_models.append(DummyClassifier(strategy= bl_label, random_state=123)) bl_models[bl_index].fit(X_train, y_train) # calculate prediction prediction = bl_models[bl_index].predict(X_test) # # transform multilabel to binary multiclass # mlb = LabelBinarizer() # prediction_binary = mlb.fit_transform(prediction) # y_test_binary = mlb.fit_transform(y_test) # calculate accuracy if acc in eva_criteria: bl_eva[bl_label][acc] = bl_models[bl_index].score(X_test,y_test) # calculate balanced accuracy if acc_bal in eva_criteria: bl_eva[bl_label][acc_bal] = balanced_accuracy_score(prediction,y_test) # create confusion matrix bl_confusion_matrices.append(confusion_matrix(prediction,y_test)) # calculate F1 measure(s) for average in [criterion for criterion in eva_criteria if "F1" in criterion]: bl_eva[bl_label][average] = f1_score(prediction,y_test, average=average.lstrip("F1_")) # calculate recall for average in [criterion for criterion in eva_criteria if "Recall" in criterion]: bl_eva[bl_label][average] = recall_score(prediction,y_test, average=average.lstrip("Recall_")) # calculate precision for average in [criterion for criterion in eva_criteria if "Precision" in criterion]: bl_eva[bl_label][average] = precision_score(prediction,y_test, average=average.lstrip("Precision_")) # # ROC # roc_auc_score(prediction_binary, y_test_binary, multi_class='ovr') # print 
results not included in the eva data frame if print_results: print("\n\n" + bl_label.upper() + ":\n\nPrediction:") print(prediction) # if acc in eva_criteria: # print("\nAccuracy: ") # print(bl_eva[bl_label][acc]) print("\nConfusion matrix: ") print(bl_confusion_matrices[bl_index]) # round evaluation values bl_eva_rounded = pd.DataFrame(index=eva_criteria, columns=baseline_labels) for c in baseline_labels: for i in eva_criteria: bl_eva_rounded[c][i] = round(bl_eva[c][i], 2) # print evaluation results if print_results: print("\nEvaluation results (not rounded):\n") print(bl_eva) print("\n\nEvaluation results (rounded):\n") print(bl_eva_rounded) # SAVE DATA TO CSV # evaluation results (not rounded) bl_eva.to_csv(path_or_buf="evaluation_baselines.csv") # evaluation results (rounded) bl_eva_rounded.to_csv(path_or_buf="evaluation_baselines_rounded.csv") No newline at end of file Loading
01_übung/README.md +3 −6 Original line number Diff line number Diff line Loading @@ -4,13 +4,10 @@ <!-- @import "[TOC]" {cmd="toc" depthFrom=1 depthTo=6 orderedList=false} --> <!-- code_chunk_output --> - [About this folder 🤓](#about-this-folder-) - [expml-2 01_übung <!-- omit in toc -->](#expml-2-01_übung-omit-in-toc-) - [Table of contents <!-- omit in toc -->](#table-of-contents-omit-in-toc-) - [About this folder 🤓](#about-this-folder) - [Structure](#structure) - [Goals 🏆](#goals) - [What is this special task about? 🤭](#what-is-this-special-task-about) - [It did't work. Why? 🤯](#it-didt-work-why) - [Goals 🏆](#goals-) - [What is this special task about? 🤭](#what-is-this-special-task-about-) - [It did't work. Why? 🤯](#it-didt-work-why-) <!-- /code_chunk_output --> Loading
# 02_übung/run_time_analysis.py
"""Compare the measured run times of the same program on two laptops.

For each user the script prints mean, standard deviation and range of the
run-time sample, runs Levene's test and an independent two-sample t-test on
the two samples, and saves/shows a scatter plot with the sample means.
"""

# LOAD PACKAGES
import statistics
from scipy.stats import ttest_ind as t_test, levene
import matplotlib.pyplot as plt


# save run time data (in seconds), one sample per user
users = "Katha", "Stella"
samples = {
    "Katha": {
        "sample": [
            4.034196138381958, 3.9400477409362793, 3.3960583209991455,
            3.52065372467041, 3.604139804840088, 3.4799697399139404,
            3.393357515335083, 3.4734957218170166, 3.4474666118621826,
            3.430788040161133, 3.2149417650819933, 3.249572697397466,
            3.4181864746693829, 3.4801234789720487, 3.163874927349273,
        ],
    },
    "Stella": {
        "sample": [
            2.289949417114258, 3.029839515686035, 2.94633150100708,
            3.0053203105926514, 4.141581773757935, 2.8266165256500244,
            2.954447746276855, 2.8661489486694336, 2.7565417289733887,
            2.7601661682128906, 2.3141674995422363, 2.191812515258789,
            3.725212335586548, 2.219719648361206, 2.2193312644958496,
        ],
    },
}
# standard deviation for time data is 0.15

# calculate descriptive statistics and store them next to each sample
for user in users:
    sample = samples[user]["sample"]
    samples[user]["mean"] = statistics.mean(sample)
    samples[user]["sd"] = statistics.stdev(sample)
    samples[user]["range"] = max(sample) - min(sample)
    print("-"*10, "\n", user)
    print("Mean: ", samples[user]["mean"])
    print("Standard Deviation: ", samples[user]["sd"])
    print("Range: ", samples[user]["range"])

# check conditions for t-test (Levene's test for equal variances)
print(levene(samples["Katha"]["sample"], samples["Stella"]["sample"]))

# compare the two run time samples (assuming Katha's laptop takes longer)
# NOTE(review): ttest_ind is called with its defaults here, i.e. the Levene
# result above is only printed and not passed on via `equal_var`, and no
# one-sided `alternative` is requested -- confirm that this is intended.
print(t_test(samples["Katha"]["sample"], samples["Stella"]["sample"]))

# plot run times
plt.plot(samples["Katha"]["sample"], 'g*',
         samples["Stella"]["sample"], 'ro')  # individual points
plt.hlines(y=[samples["Katha"]["mean"], samples["Stella"]["mean"]],  # means of the samples
           xmin=0,
           xmax=len(samples["Stella"]["sample"]),
           colors=['g', 'r'],
           lw=0.2)
plt.ylabel("Run time \n(in sec)")
plt.legend(['Katha', 'Stella'],
           loc="upper right",
           borderaxespad=0).set_title("Sample")
plt.title("Run Times")
# save before show(): once the window is closed the figure may be empty
plt.savefig("run_times.png", bbox_inches='tight')
plt.show()
plt.clf()
02_übung/übung2_code.py +10 −8 Original line number Diff line number Diff line Loading @@ -48,6 +48,7 @@ datasets = [make_moons(noise=0.3, random_state=0), linearly_separable ] # plot data and decision boundaries figure = plt.figure(figsize=(27, 11)) i = 1 # iterate over datasets Loading @@ -69,10 +70,10 @@ for ds_cnt, ds in enumerate(datasets): ax = plt.subplot(len(datasets), len(classifiers) + 1, i) if ds_cnt == 0: ax.set_title("Input data") # Plot the training points # plot the training points ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k') # Plot the testing points # plot the testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors='k') ax.set_xlim(xx.min(), xx.max()) Loading @@ -87,21 +88,21 @@ for ds_cnt, ds in enumerate(datasets): clf.fit(X_train, y_train) score = clf.score(X_test, y_test) # Plot the decision boundary. For that, we will assign a color to each # plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. if hasattr(clf, "decision_function"): Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) else: Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] # Put the result into a color plot # put the result into a color plot Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) # Plot the training points # plot the training points ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k') # Plot the testing points # plot the testing points ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, edgecolors='k', alpha=0.6) Loading @@ -119,6 +120,7 @@ end = time.time() print(end - start) plt.tight_layout() plt.savefig("plots_datasets_and_classifiers.png", bbox_inches='tight') # not needed in interactive Jupyter session plt.show() plt.savefig("plots.png") # not needed in interactive Jupyter session plt.clf()
04_übung/README.md +5 −7 Original line number Diff line number Diff line Loading @@ -5,14 +5,12 @@ <!-- code_chunk_output --> - [expml-2 04_übung <!-- omit in toc -->](#expml-2-04_übung-omit-in-toc-) - [Table of contents <!-- omit in toc -->](#table-of-contents-omit-in-toc-) - [About this folder 🤓](#about-this-folder) - [About this folder 🤓](#about-this-folder-) - [Structure](#structure) - [Goals 🏆](#goals) - [Multiprocessing 📶](#multiprocessing) - [Correcting the mistakes in the given code like pros 😎](#correcting-the-mistakes-in-the-given-code-like-pros) - [How can we add functionality? 👀](#how-can-we-add-functionality) - [Goals 🏆](#goals-) - [Multiprocessing 📶](#multiprocessing-) - [Correcting the mistakes in the given code like pros 😎](#correcting-the-mistakes-in-the-given-code-like-pros-) - [How can we add functionality? 👀](#how-can-we-add-functionality-) <!-- /code_chunk_output --> Loading
# project/evaluation.py
# Train one DummyClassifier baseline per strategy, evaluate it on the test
# split and store the results as CSV.
#
# NOTE(review): this script reads names that are not defined in the visible
# code and are presumably provided by the surrounding project before it runs:
# baseline_labels, bl_models, bl_confusion_matrices, bl_eva, eva_criteria,
# acc, acc_bal, print_results, X_train, y_train, X_test, y_test -- confirm.
# bl_eva is assumed to be a DataFrame indexed by criterion with one column
# per baseline (it mirrors bl_eva_rounded below) -- TODO confirm.

# LOAD PACKAGES
from sklearn.dummy import DummyClassifier
import pandas as pd
import numpy as np
#from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, \
    f1_score, balanced_accuracy_score, \
    precision_score, recall_score, \
    roc_curve, roc_auc_score
from sklearn.preprocessing import LabelBinarizer


def _average_from_criterion(criterion, prefix):
    """Return the averaging mode that follows ``prefix`` in ``criterion``.

    ``str.lstrip(prefix)`` must not be used for this: it strips a *set of
    characters*, so ``"Precision_samples".lstrip("Precision_")`` would yield
    ``"amples"``. A criterion that does not start with ``prefix`` is returned
    unchanged.
    """
    if criterion.startswith(prefix):
        return criterion[len(prefix):]
    return criterion


# for all baseline models
for bl_index, bl_label in enumerate(baseline_labels):

    # train baseline model
    bl_model = DummyClassifier(strategy=bl_label, random_state=123)
    bl_model.fit(X_train, y_train)
    bl_models.append(bl_model)

    # calculate prediction
    prediction = bl_model.predict(X_test)

    # TODO: ROC / AUC is not evaluated yet; it would need the labels
    # binarized first (LabelBinarizer) and
    # roc_auc_score(y_test_binary, prediction_binary, multi_class='ovr').

    # calculate accuracy
    if acc in eva_criteria:
        bl_eva.loc[acc, bl_label] = bl_model.score(X_test, y_test)

    # All sklearn metrics below take (y_true, y_pred) in this order; swapping
    # them exchanges precision with recall and transposes the confusion matrix.

    # calculate balanced accuracy
    if acc_bal in eva_criteria:
        bl_eva.loc[acc_bal, bl_label] = balanced_accuracy_score(y_test, prediction)

    # create confusion matrix (rows: true labels, columns: predicted labels)
    bl_confusion_matrix = confusion_matrix(y_test, prediction)
    bl_confusion_matrices.append(bl_confusion_matrix)

    # calculate F1 measure(s), recall and precision for every requested
    # averaging mode; the mode is encoded in the criterion name, e.g. "F1_macro"
    for keyword, prefix, metric in (("F1", "F1_", f1_score),
                                    ("Recall", "Recall_", recall_score),
                                    ("Precision", "Precision_", precision_score)):
        for criterion in eva_criteria:
            if keyword in criterion:
                bl_eva.loc[criterion, bl_label] = metric(
                    y_test, prediction,
                    average=_average_from_criterion(criterion, prefix))

    # print results not included in the eva data frame
    if print_results:
        print("\n\n" + bl_label.upper() + ":\n\nPrediction:")
        print(prediction)
        print("\nConfusion matrix: ")
        print(bl_confusion_matrix)

# round evaluation values
bl_eva_rounded = pd.DataFrame(index=eva_criteria, columns=baseline_labels)
for c in baseline_labels:
    for i in eva_criteria:
        # .loc instead of chained indexing (df[c][i] = ...), which may write
        # to a temporary copy and leave the data frame unchanged
        bl_eva_rounded.loc[i, c] = round(bl_eva.loc[i, c], 2)

# print evaluation results
if print_results:
    print("\nEvaluation results (not rounded):\n")
    print(bl_eva)
    print("\n\nEvaluation results (rounded):\n")
    print(bl_eva_rounded)


# SAVE DATA TO CSV

# evaluation results (not rounded)
bl_eva.to_csv(path_or_buf="evaluation_baselines.csv")

# evaluation results (rounded)
bl_eva_rounded.to_csv(path_or_buf="evaluation_baselines_rounded.csv")