Commit 061cd6a7 authored by wernicke
parents 69c85f0a 966d228b
+36 −50
@@ -18,6 +18,7 @@ from evaluation_functions import calculate_statistical_characteristics, plot_roc
    plot_roc_curves_svm, as_title, plot_accuracies, visualize_confusion_matrix_of_best_model, \
        collect_accuracies_in_data_frame
import matplotlib.pyplot as plt
+from learning_curve import save_lc_plot
#from baselines import eva_criteria


@@ -139,7 +140,7 @@ eva_criteria = [
    prec_w
]

-# parameter optimization
+# parameter optimization (takes very long for some of them)
opt_par = False
if opt_par:
    # define training data
@@ -153,8 +154,12 @@ if opt_par:
    parameter_rndfor_opt(X_train, y_train) 
    #{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 120}

-    # # Optimal Parameters for Naive Bayes
-    # parameter_naibay_opt(X_train, y_train)
+    # Optimal Parameters for Support Vector Machines
+    parameter_SVM_opt(X_train, y_train)
+
+    # Optimal Parameters for Multilayer Perceptron
+    parameter_MLP_opt(X_train, y_train)
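
Note: the parameter_*_opt helpers are defined elsewhere in the repository and are not part of this diff. A rough, hypothetical sketch of what such a helper typically does (the grid values below are assumptions, not the project's actual search space):

```python
# Hypothetical sketch of a parameter_SVM_opt-style grid search; the real
# helper is not shown in this diff and may differ.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def parameter_SVM_opt_sketch(X_train, y_train):
    param_grid = {                      # assumed grid, not the project's
        "kernel": ["rbf", "linear"],
        "C": [0.1, 1, 10],
        "gamma": [0.1, 1, 5],
    }
    # Exhaustive cross-validated search; slow for large grids, which is
    # why the comment above warns that optimization "takes very long".
    search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)
    print(search.best_params_)
    return search.best_params_
```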

    


@@ -243,18 +248,33 @@ if __name__ == "__main__":
            if print_results:
                plot_tree(dec_tree, features)

-            # if feature_variations_name=="features_most_important":
-            #     if name_classifier == "SVC":
-            #         plot_SVM(svc, X_train, y_train, "SVC")
-            #     elif classifier != naive_bayes and classifier != naive_bayes:
-            #         plot_dectree_boundary(X_train, y_train, classifier, name_classifier)
+            # PLOT DECISION BOUNDARIES AND LEARNING CURVE
+            if feature_variations_name=="features_most_important":
+
+                # Plot decision boundaries
+                if name_classifier == "SVC":
+                    plot_SVM(svc, X_train, y_train, "SVC")
+                elif name_classifier != "Naive Bayes":
+                    plot_dec_boundary(X_train, y_train, classifier, name_classifier)
+
-            # if features == features_all and classifier == dec_tree:
-            #     feature_importance_dectree(classifier, features)
-            # elif features == features_all and classifier == rnd_forr:
-            #     feature_importance_rndfor(classifier, features, X_dev, y_dev)
-            # elif features == features_all and classifier == naive_bayes:
-            #     feature_importance_NB(classifier, X_dev, y_dev, features)
+                # Plot Learning Curves for classifiers
+                if name_classifier== "Multilayer Perceptron":
+                    save_lc_plot(dec_tree, rnd_forr, "DC", "RF", X_train, y_train)
+                    save_lc_plot(svc, mlp, "SVK", "MLP", X_train, y_train)
+
+            # SEARCH FOR MOST IMPORTANT FEATURES AND VALIDATION CURVES
+            if features == features_all and classifier == dec_tree:
+                feature_importance_dectree(classifier, features)
+                # Validation Curve for maxdepth
+                check_fitting_dectree(X_train, y_train, dec_tree)
+
+            elif features == features_all and classifier == rnd_forr:
+                feature_importance_rndfor(classifier, features, X_dev, y_dev)
+                # Validation Curve for n_estimators
+                check_fitting_rndfor(X_train, y_train, rnd_forr)
+
+            elif features == features_all and classifier == naive_bayes:
+                feature_importance_NB(classifier, X_dev, y_dev, features)
        
        if print_results:
            print(cl_confusion_matrices) #TODO delete
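
Note: check_fitting_dectree and check_fitting_rndfor are defined outside this diff; per the comments above they draw validation curves. A hedged sketch of the max_depth case, under the assumption that the helpers wrap sklearn's validation_curve:

```python
# Hedged sketch of a check_fitting_dectree-style validation curve over
# max_depth; the project's actual helper may differ.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve

def check_fitting_dectree_sketch(X_train, y_train, dec_tree):
    depths = np.arange(1, 15)
    train_scores, test_scores = validation_curve(
        dec_tree, X_train, y_train,
        param_name="max_depth", param_range=depths, cv=5,
    )
    # A widening gap between the two mean curves indicates overfitting.
    plt.plot(depths, train_scores.mean(axis=1), label="training score")
    plt.plot(depths, test_scores.mean(axis=1), label="cross-validation score")
    plt.xlabel("max_depth")
    plt.ylabel("accuracy")
    plt.legend()
    plt.show()
```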
@@ -273,40 +293,6 @@ if __name__ == "__main__":
        # ROC curve TODO
        #plot_roc_curves_avarage(classifiers, X_train, y_train, X_test, y_test, save_as= feature_variations_name + '_roc_curve.png')

-        # save confusions matrices to dict with all confusion matrices
-        # Decision Tree Classification
-        # Optimal Parameters
-        # paramter_dectree_opt(X_train, y_train)
-        # plot_tree(dec_tree, features)
-
-        # Validation Curve for maxdepth
-        # check_fitting_dectree(X_train, y_train, dec_tree)
-
-        #feature importance
-
-        # if features == features_all and classifier == dec_tree:
-        #     feature_importance_dectree(dec_tree, features)
-
-        # Random Forest Classifier
-        # Optimal Parameters
-        # parameter_rndfor_opt(X_train, y_train) {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 120}
-
-        # # Classification with optimized parameters
-        # rnd_forr = RandomForestClassifier(criterion='entropy', max_depth= 9, min_samples_leaf= 1, min_samples_split= 2, n_estimators= 120)
-        # rnd_forr.fit(X_train, y_train)
-        # y_pred = rnd_forr.predict(X_dev)
-        # print(y_pred)
-        # print("Accuracy:", metrics.accuracy_score(y_dev, y_pred))
-
-
-        # feature importance
-        # if features == features_all and classifier == rnd_forr:
-        #     feature_importance_rndfor(rnd_forr, features, X_dev, y_dev)
-
-        # Validation Curve for n_estimators
-        # check_fitting_rndfor(X_train, y_train, rnd_forr)
-
-
# create dataframe with all accuracies
all_accuracies, all_accuracies_sorted = collect_accuracies_in_data_frame(feature_variations_names, classifiers_names, ft_cl_eva, print_results = print_results) 
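
Note: collect_accuracies_in_data_frame lives in evaluation_functions and is not shown here. A minimal hypothetical sketch of such a collector, assuming ft_cl_eva maps (feature-set name, classifier name) pairs to accuracies; the real layout may differ:

```python
# Hypothetical sketch only; the real collect_accuracies_in_data_frame is
# in evaluation_functions and its ft_cl_eva structure is an assumption.
import pandas as pd

def collect_accuracies_sketch(feature_names, classifier_names, ft_cl_eva):
    table = {
        ft: [ft_cl_eva[(ft, cl)] for cl in classifier_names]
        for ft in feature_names
    }
    all_accuracies = pd.DataFrame.from_dict(
        table, orient="index", columns=classifier_names
    )
    # Rank feature variations by their best classifier's accuracy.
    order = all_accuracies.max(axis=1).sort_values(ascending=False).index
    all_accuracies_sorted = all_accuracies.loc[order]
    return all_accuracies, all_accuracies_sorted
```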

+9 −0
@@ -4,6 +4,9 @@ import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance

def feature_importance_dectree(dec_tree, features):
    """
    Find most important features for classification with decision tree classifier.
    """
    imp = dec_tree.feature_importances_
    forest_importances = pd.Series(imp, index=features)

@@ -16,6 +19,9 @@ def feature_importance_dectree(dec_tree, features):
    plt.show()

def feature_importance_rndfor(rnd_forr, features, X_dev, y_dev):
    """
    Find most important features for classification with random forest classifier.
    """
    imp = rnd_forr.feature_importances_
    print(imp)
    forest_importances = pd.Series(imp, index=features) #.sort_values(ascending=False)
@@ -42,6 +48,9 @@ def feature_importance_rndfor(rnd_forr, features, X_dev, y_dev):
    plt.show()

def feature_importance_NB(cnb, X_test, y_test, features):
    """
    Find most important features for classification with naive bayes classifier.
    """
    imps = permutation_importance(cnb, X_test, y_test)
    importances = imps.importances_mean
    std = imps.importances_std
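
Note: permutation_importance repeatedly shuffles one feature at a time and records the resulting score drop. A self-contained toy example (public iris data, not the project's dataset) mirroring the logic above:

```python
# Toy permutation-importance example; mirrors feature_importance_NB but
# uses iris data and GaussianNB, which are stand-ins, not the project's.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.inspection import permutation_importance

X, y = load_iris(return_X_y=True, as_frame=True)
model = GaussianNB().fit(X, y)

imps = permutation_importance(model, X, y, n_repeats=5, random_state=0)
ranked = pd.Series(imps.importances_mean, index=X.columns).sort_values(ascending=False)
print(ranked)  # largest mean score drop = most important feature
```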
+1.06 KiB (134 KiB) — image diff not shown
+1.38 KiB (139 KiB) — image diff not shown
learning_curve.py: +12 −71
@@ -2,36 +2,6 @@ import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.naive_bayes import CategoricalNB
-from sklearn.svm import SVC
-from sklearn.neural_network import MLPClassifier

-# # read data
-# data_folder = "./data"
-# train_data = pd.read_csv(data_folder + "/data_train.csv")
-
-# features_most_important = ["sex","age", "address", "Fedu","studytime","famsup","famrel","freetime",\
-#     "goout","health"]
-
-# X_train = train_data[features_most_important]
-# y_train = train_data.Walc
-
-# # DEFINE CLASSIFIERS
-# # Decision Tree Classifier
-# dec_tree = DecisionTreeClassifier(criterion= "gini", max_depth=7, min_samples_leaf=1, \
-#     min_samples_split=3,splitter="random", random_state=23)
-# # Random Forest Classifier
-# rnd_forr = RandomForestClassifier(criterion='entropy', max_depth= 9, min_samples_leaf= 1, \
-#     min_samples_split= 2, n_estimators= 120, random_state=13)
-# # Naive Bayes Classifier
-# naive_bayes = CategoricalNB() # smoothing with alpha = 1.0
-# naive_bayes_ns = CategoricalNB(alpha=1.0) # no smoothing
-# # SVM Kernel
-# svc = SVC(kernel="rbf", gamma=5, decision_function_shape='ovo', C=1)
-# # Neuronal Network
-# mlp =  MLPClassifier(alpha=1, max_iter=1000)

def plot_learning_curve(
    estimator,
@@ -50,23 +20,15 @@ def plot_learning_curve(

    Parameters
    ----------
-    estimator : estimator instance
-        An estimator instance implementing `fit` and `predict` methods which
-        will be cloned for each validation.
+    estimator : the classifier

-    title : str
-        Title for the chart.
+    title : title for plot

-    X : array-like of shape (n_samples, n_features)
-        Training vector, where ``n_samples`` is the number of samples and
-        ``n_features`` is the number of features.
+    X : training data

-    y : array-like of shape (n_samples) or (n_samples, n_features)
-        Target relative to ``X`` for classification or regression;
-        None for unsupervised learning.
+    y : labels

-    axes : array-like of shape (3,), default=None
-        Axes to use for plotting the curves.
+    axes : axes for plotting, default (3,)

    ylim : tuple of shape (2,), default=None
        Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).
@@ -80,28 +42,15 @@ def plot_learning_curve(
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

-        For integer/None inputs, if ``y`` is binary or multiclass,
-        :class:`StratifiedKFold` used. If the estimator is not a classifier
-        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
-
-        Refer :ref:`User Guide <cross_validation>` for the various
-        cross-validators that can be used here.

-    n_jobs : int or None, default=None
-        Number of jobs to run in parallel.
-        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
-        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
-        for more details.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute numbers of training examples that will be used to
-        generate the learning curve. If the ``dtype`` is float, it is regarded
-        as a fraction of the maximum size of the training set (that is
-        determined by the selected validation method), i.e. it has to be within
-        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
-        sets. Note that for classification the number of samples usually have
-        to be big enough to contain at least one sample from each class.
-        (default: np.linspace(0.1, 1.0, 5))
+        generate the learning curve.
    """
    if axes is None:
        _, axes = plt.subplots(2, 3, figsize=(20, 5))
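
Note: the underlying sklearn calls this wrapper builds on can be exercised standalone. A sketch with synthetic data and assumed parameters, not the project's:

```python
# Standalone sketch of the sklearn learning_curve call that
# plot_learning_curve wraps; data and hyperparameters are assumptions.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import learning_curve, ShuffleSplit

X, y = make_classification(n_samples=400, random_state=0)
cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)
sizes, train_scores, test_scores = learning_curve(
    DecisionTreeClassifier(max_depth=7), X, y,
    cv=cv, n_jobs=4, train_sizes=np.linspace(0.1, 1.0, 5),
)
# train_scores/test_scores: one row per training size, one column per split.
print(sizes)
print(train_scores.mean(axis=1), test_scores.mean(axis=1))
```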
@@ -192,24 +141,16 @@ def save_lc_plot(classifier1, classifier2, classifier_name1, classifier_name2, X

    fig, axes = plt.subplots(3, 2, figsize=(10, 15))

-    if classifier_name1 or classifier_name2 in ["DC", "RF"]:
-        cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)
-    else:
-        cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)
+    # Cross validation with 50 iterations to get smoother mean test and train
+    # score curves, each time with 20% data randomly selected as a validation set.
+    cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)
    # Plot Learning Curve Classifier 1
    title = "Learning Curves " + classifier_name1
    plot_learning_curve(classifier1, title, X_train, y_train, axes=axes[:, 0], ylim=(0.1, 1.01), cv=cv, n_jobs=4)

    # Plot Learning Curve for Classifier 2
    title = "Learning Curves " + classifier_name2
    plot_learning_curve(classifier2, title, X_train, y_train, axes=axes[:, 1], ylim=(0.1, 1.01), cv=cv, n_jobs=4)

    plt.savefig("graphics/" + classifier_name1 + "_learning_curve.png")
\ No newline at end of file


-# save_lc_plot(dec_tree, rnd_forr, "DC", "RF", X_train, y_train)
-
-# save_lc_plot(svc, mlp, "SVK", "MLP", X_train, y_train)
-
-# save_lc_plot(naive_bayes, dec_tree, "NB", "DT", X_train, y_train)
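
Note on the branch removed above: in Python, `or` binds more loosely than `in`, so the old condition parsed as `classifier_name1 or (classifier_name2 in ["DC", "RF"])` and was truthy for any non-empty name; both branches also built the identical ShuffleSplit. A quick illustration:

```python
# Why the removed condition was effectively a no-op:
classifier_name1, classifier_name2 = "SVK", "MLP"
cond = classifier_name1 or classifier_name2 in ["DC", "RF"]
# Parsed as: classifier_name1 or (classifier_name2 in ["DC", "RF"]).
# A non-empty string is truthy, so cond evaluates to "SVK" here.
print(bool(cond))  # True for any non-empty classifier_name1
```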