project/classifier.py  +36 −50

@@ -18,6 +18,7 @@ from evaluation_functions import calculate_statistical_characteristics, plot_roc...
     plot_roc_curves_svm, as_title, plot_accuracies, visualize_confusion_matrix_of_best_model, \
     collect_accuracies_in_data_frame
 import matplotlib.pyplot as plt
+from learning_curve import save_lc_plot
 #from baselines import eva_criteria

@@ -139,7 +140,7 @@ eva_criteria = [
     prec_w
 ]

-# parameter optimization
+# parameter optimization (takes very long for some of them)
 opt_par = False
 if opt_par:
     # define training data

@@ -153,8 +154,12 @@ if opt_par:
     parameter_rndfor_opt(X_train, y_train)
     #{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 120}

     # # Optimal Parameters for Naive Bayes
     # parameter_naibay_opt(X_train, y_train)

+    # Optimal Parameters for Support Vector Machines
+    parameter_SVM_opt(X_train, y_train)
+
+    # Optimal Parameters for Multilayer Perceptron
+    parameter_MLP_opt(X_train, y_train)

@@ -243,18 +248,33 @@ if __name__ == "__main__":
     if print_results:
         plot_tree(dec_tree, features)

-    # if feature_variations_name=="features_most_important":
-    #     if name_classifier == "SVC":
-    #         plot_SVM(svc, X_train, y_train, "SVC")
-    #     elif classifier != naive_bayes and classifier != naive_bayes:
-    #         plot_dectree_boundary(X_train, y_train, classifier, name_classifier)
+    # PLOT DECISION BOUNDARIES AND LEARNING CURVE
+    if feature_variations_name == "features_most_important":
+        # Plot decision boundaries
+        if name_classifier == "SVC":
+            plot_SVM(svc, X_train, y_train, "SVC")
+        elif name_classifier != "Naive Bayes":
+            plot_dec_boundary(X_train, y_train, classifier, name_classifier)

-    # if features == features_all and classifier == dec_tree:
-    #     feature_importance_dectree(classifier, features)
-    # elif features == features_all and classifier == rnd_forr:
-    #     feature_importance_rndfor(classifier, features, X_dev, y_dev)
-    # elif features == features_all and classifier == naive_bayes:
-    #     feature_importance_NB(classifier, X_dev, y_dev, features)
+        # Plot Learning Curves for classifiers
+        if name_classifier == "Multilayer Perceptron":
+            save_lc_plot(dec_tree, rnd_forr, "DC", "RF", X_train, y_train)
+            save_lc_plot(svc, mlp, "SVK", "MLP", X_train, y_train)
+
+    # SEARCH FOR MOST IMPORTANT FEATURES AND VALIDATION CURVES
+    if features == features_all and classifier == dec_tree:
+        feature_importance_dectree(classifier, features)
+        # Validation Curve for maxdepth
+        check_fitting_dectree(X_train, y_train, dec_tree)
+    elif features == features_all and classifier == rnd_forr:
+        feature_importance_rndfor(classifier, features, X_dev, y_dev)
+        # Validation Curve for n_estimators
+        check_fitting_rndfor(X_train, y_train, rnd_forr)
+    elif features == features_all and classifier == naive_bayes:
+        feature_importance_NB(classifier, X_dev, y_dev, features)

     if print_results:
         print(cl_confusion_matrices) #TODO delete

@@ -273,40 +293,6 @@ if __name__ == "__main__":
     # ROC curve TODO
     #plot_roc_curves_avarage(classifiers, X_train, y_train, X_test, y_test, save_as= feature_variations_name + '_roc_curve.png')

-    # save confusions matrices to dict with all confusion matrices
-    # Decision Tree Classification
-    # Optimal Parameters
-    # paramter_dectree_opt(X_train, y_train)
-    # plot_tree(dec_tree, features)
-    # Validation Curve for maxdepth
-    # check_fitting_dectree(X_train, y_train, dec_tree)
-    #feature importance
-    # if features == features_all and classifier == dec_tree:
-    #     feature_importance_dectree(dec_tree, features)
-    # Random Forest Classifier
-    # Optimal Parameters
-    # parameter_rndfor_opt(X_train, y_train) {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 120}
-    # # Classification with optimized parameters
-    # rnd_forr = RandomForestClassifier(criterion='entropy', max_depth= 9, min_samples_leaf= 1, min_samples_split= 2, n_estimators= 120)
-    # rnd_forr.fit(X_train, y_train)
-    # y_pred = rnd_forr.predict(X_dev)
-    # print(y_pred)
-    # print("Accuracy:", metrics.accuracy_score(y_dev, y_pred))
-    # feature importance
-    # if features == features_all and classifier == rnd_forr:
-    #     feature_importance_rndfor(rnd_forr, features, X_dev, y_dev)
-    # Validation Curve for n_estimators
-    # check_fitting_rndfor(X_train, y_train, rnd_forr)

     # create dataframe with all accuracies
     all_accuracies, all_accuracies_sorted = collect_accuracies_in_data_frame(feature_variations_names, classifiers_names, ft_cl_eva, print_results = print_results)
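The helpers parameter_SVM_opt and parameter_MLP_opt are called above but their bodies are not part of this diff. For orientation, here is a minimal sketch of what such a helper typically looks like, assuming a GridSearchCV-based search like the one whose random-forest result is recorded in the comment above; the grid below is illustrative, not the project's actual search space.

# Hypothetical sketch of a parameter_SVM_opt-style helper (not taken from the diff).
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def parameter_SVM_opt(X_train, y_train):
    # Illustrative grid: 3 * 3 * 2 = 18 candidates, each refit cv=5 times,
    # which is why an exhaustive search "takes very long" on larger grids.
    param_grid = {
        "C": [0.1, 1, 10],
        "gamma": [0.1, 1, 5],
        "kernel": ["rbf", "linear"],
    }
    search = GridSearchCV(SVC(decision_function_shape="ovo"), param_grid, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)
    # Prints a dict in the same shape as the random-forest result noted above.
    print(search.best_params_)
    return search.best_params_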
project/feature_importance.py  +9 −0

@@ -4,6 +4,9 @@ import matplotlib.pyplot as plt
 from sklearn.inspection import permutation_importance

 def feature_importance_dectree(dec_tree, features):
+    """
+    Find most important features for classification with decision tree classifier.
+    """
     imp = dec_tree.feature_importances_
     forest_importances = pd.Series(imp, index=features)

@@ -16,6 +19,9 @@ def feature_importance_dectree(dec_tree, features):
     plt.show()

 def feature_importance_rndfor(rnd_forr, features, X_dev, y_dev):
+    """
+    Find most important features for classification with random forest classifier.
+    """
     imp = rnd_forr.feature_importances_
     print(imp)
     forest_importances = pd.Series(imp, index=features) #.sort_values(ascending=False)

@@ -42,6 +48,9 @@ def feature_importance_rndfor(rnd_forr, features, X_dev, y_dev):
     plt.show()

 def feature_importance_NB(cnb, X_test, y_test, features):
+    """
+    Find most important features for classification with naive bayes classifier.
+    """
     imps = permutation_importance(cnb, X_test, y_test)
     importances = imps.importances_mean
     std = imps.importances_std

project/graphics/DC_learning_curve.png  +1.06 KiB (134 KiB)  [binary image]
project/graphics/SVK_learning_curve.png  +1.38 KiB (139 KiB)  [binary image]
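One note on the functions these new docstrings describe: unlike the tree models, CategoricalNB exposes no feature_importances_ attribute, so feature_importance_NB measures importance by permutation on held-out data instead. A self-contained sketch of that pattern on synthetic data (the feature names f0–f2 and the data are made up; the project's plotting code is omitted):

import numpy as np
from sklearn.inspection import permutation_importance
from sklearn.naive_bayes import CategoricalNB

rng = np.random.default_rng(0)
# Synthetic categorical data: 200 samples, 3 features with 4 categories each.
X = rng.integers(0, 4, size=(200, 3))
y = (X[:, 0] >= 2).astype(int)  # only feature 0 carries signal

cnb = CategoricalNB().fit(X, y)

# Shuffle one feature column at a time and measure the drop in score;
# a large mean drop means the model relied on that feature.
imps = permutation_importance(cnb, X, y, n_repeats=10, random_state=0)
for name, mean, std in zip(["f0", "f1", "f2"], imps.importances_mean, imps.importances_std):
    print(f"{name}: {mean:.3f} +/- {std:.3f}")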
project/learning_curve.py  +12 −71

@@ -2,36 +2,6 @@
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.model_selection import learning_curve
 from sklearn.model_selection import ShuffleSplit
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.naive_bayes import CategoricalNB
-from sklearn.svm import SVC
-from sklearn.neural_network import MLPClassifier
-
-# # read data
-# data_folder = "./data"
-# train_data = pd.read_csv(data_folder + "/data_train.csv")
-# features_most_important = ["sex","age", "address", "Fedu","studytime","famsup","famrel","freetime",\
-#                            "goout","health"]
-# X_train = train_data[features_most_important]
-# y_train = train_data.Walc
-
-# # DEFINE CLASSIFIERS
-# # Decision Tree Classifier
-# dec_tree = DecisionTreeClassifier(criterion= "gini", max_depth=7, min_samples_leaf=1, \
-#                                   min_samples_split=3,splitter="random", random_state=23)
-# # Random Forest Classifier
-# rnd_forr = RandomForestClassifier(criterion='entropy', max_depth= 9, min_samples_leaf= 1, \
-#                                   min_samples_split= 2, n_estimators= 120, random_state=13)
-# # Naive Bayes Classifier
-# naive_bayes = CategoricalNB() # smoothing with alpha = 1.0
-# naive_bayes_ns = CategoricalNB(alpha=1.0) # no smoothing
-# # SVM Kernel
-# svc = SVC(kernel="rbf", gamma=5, decision_function_shape='ovo', C=1)
-# # Neuronal Network
-# mlp = MLPClassifier(alpha=1, max_iter=1000)

 def plot_learning_curve(
     estimator,

@@ -50,23 +20,15 @@ def plot_learning_curve(
     Parameters
     ----------
-    estimator : estimator instance
-        An estimator instance implementing `fit` and `predict` methods which
-        will be cloned for each validation.
+    estimator : the classifier

-    title : str
-        Title for the chart.
+    title : title for plot

-    X : array-like of shape (n_samples, n_features)
-        Training vector, where ``n_samples`` is the number of samples and
-        ``n_features`` is the number of features.
+    X : training data

-    y : array-like of shape (n_samples) or (n_samples, n_features)
-        Target relative to ``X`` for classification or regression;
-        None for unsupervised learning.
+    y : labels

-    axes : array-like of shape (3,), default=None
-        Axes to use for plotting the curves.
+    axes : axes for plotting, default (3,)

     ylim : tuple of shape (2,), default=None
         Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).

@@ -80,28 +42,15 @@ def plot_learning_curve(
     - :term:`CV splitter`,
     - An iterable yielding (train, test) splits as arrays of indices.

-    For integer/None inputs, if ``y`` is binary or multiclass,
-    :class:`StratifiedKFold` used. If the estimator is not a classifier
-    or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
-    Refer :ref:`User Guide <cross_validation>` for the various
-    cross-validators that can be used here.
-
-    n_jobs : int or None, default=None
-        Number of jobs to run in parallel. ``None`` means 1 unless in a
-        :obj:`joblib.parallel_backend` context. ``-1`` means using all
-        processors. See :term:`Glossary <n_jobs>` for more details.
-
     train_sizes : array-like of shape (n_ticks,)
-        Relative or absolute numbers of training examples that will be used to
-        generate the learning curve. If the ``dtype`` is float, it is regarded
-        as a fraction of the maximum size of the training set (that is
-        determined by the selected validation method), i.e. it has to be within
-        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
-        sets. Note that for classification the number of samples usually have
-        to be big enough to contain at least one sample from each class.
+        (default: np.linspace(0.1, 1.0, 5)) generate the learning curve.
     """
     if axes is None:
         _, axes = plt.subplots(2, 3, figsize=(20, 5))
@@ -192,24 +141,16 @@ def save_lc_plot(classifier1, classifier2, classifier_name1, classifier_name2, X_train, y_train):
     fig, axes = plt.subplots(3, 2, figsize=(10, 15))

     # NOTE: this condition parses as `classifier_name1 or (classifier_name2 in ["DC", "RF"])`,
     # so it is truthy whenever classifier_name1 is a non-empty string and the else branch is
     # effectively dead; both branches build the same splitter anyway.
     if classifier_name1 or classifier_name2 in ["DC", "RF"]:
         # Cross validation with 50 iterations to get smoother mean test and train
         # score curves, each time with 20% data randomly selected as a validation set.
         cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)
     else:
         cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)

     # Plot Learning Curve Classifier 1
     title = "Learning Curves " + classifier_name1
     plot_learning_curve(classifier1, title, X_train, y_train, axes=axes[:, 0], ylim=(0.1, 1.01), cv=cv, n_jobs=4)

     # Plot Learning Curve for Classifier 2
     title = "Learning Curves " + classifier_name2
     plot_learning_curve(classifier2, title, X_train, y_train, axes=axes[:, 1], ylim=(0.1, 1.01), cv=cv, n_jobs=4)

     plt.savefig("graphics/" + classifier_name1 + "_learning_curve.png")
\ No newline at end of file

-# save_lc_plot(dec_tree, rnd_forr, "DC", "RF", X_train, y_train)
-# save_lc_plot(svc, mlp, "SVK", "MLP", X_train, y_train)
-# save_lc_plot(naive_bayes, dec_tree, "NB", "DT", X_train, y_train)
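For reference, the plotting helpers above are wrappers around sklearn's learning_curve. A minimal sketch of the underlying call with the same CV scheme and the train_sizes default, using a stand-in classifier and synthetic data in place of the project's estimators and student data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=400, n_features=10, random_state=0)

# Same scheme as save_lc_plot: 50 random splits, 20% held out each time,
# so the averaged curves come out smooth.
cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)

train_sizes, train_scores, test_scores = learning_curve(
    DecisionTreeClassifier(max_depth=7, random_state=23),
    X, y, cv=cv, n_jobs=4, train_sizes=np.linspace(0.1, 1.0, 5),
)
# Scores have one row per training size and one column per CV split;
# average across splits before plotting.
print(train_sizes)               # absolute training-set sizes
print(train_scores.mean(axis=1)) # mean training accuracy per size
print(test_scores.mean(axis=1))  # mean validation accuracy per size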