Commit 44120686 authored by F1nnH
Browse files

Add docstrings

parent f2623316
Loading
Loading
Loading
Loading
+58 −4
Original line number Diff line number Diff line
@@ -12,12 +12,20 @@ import seaborn as sns
from sklearn.model_selection import train_test_split
import joblib

# Function to rescale images
def rescale(image):
    """
    Divide the given image data by 255.0.

    Parameters:
    image: Any value supporting true division by a float (a number or an
        array-like object). Presumably 8-bit pixel data in the 0-255 range,
        which this maps to 0.0-1.0 -- confirm against the callers.

    Returns:
    The result of ``image / 255.0``.
    """

    scaled = image / 255.0
    return scaled

# Function to load images and labels for CNN
def load_images_and_labels_cnn(base_path, size=(50, 50)):
    """
    Load images and their labels for CNN processing from a given base path.
    Images are resized to a specified size.

    Parameters:
    base_path (str): The path of the directory containing image folders.
    size (tuple): The desired size for resizing the images (width, height).

    Returns:
    tuple: A tuple containing arrays of images and labels.
    """

    images = []
    labels = []
    for folder in os.listdir(base_path):
@@ -35,6 +43,18 @@ def load_images_and_labels_cnn(base_path, size=(50, 50)):

# Function to create the CNN model
def get_model(lr=0.001, dropout_rate=0.3, activation='relu'):
    """
    Create and compile a Convolutional Neural Network (CNN) model.

    Parameters:
    lr (float): Learning rate for the optimizer. Default is 0.001.
    dropout_rate (float): Dropout rate for regularization. Default is 0.3.
    activation (str): Activation function for the layers. Default is 'relu'.

    Returns:
    tensorflow.keras.models.Sequential: The compiled CNN model.
    """

    input_shape = (50, 50, 3)
    outputs_number = 30
    model = models.Sequential([
@@ -56,6 +76,16 @@ def get_model(lr=0.001, dropout_rate=0.3, activation='relu'):

# Learning rate schedule function
def lr_schedule(epoch):
    """
    Learning rate schedule function to adjust the learning rate based on the epoch.

    Parameters:
    epoch (int): The current epoch number during training.

    Returns:
    float: The adjusted learning rate.
    """

    lr = 0.001
    if epoch > 50:
        lr *= 0.5e-3
@@ -69,6 +99,18 @@ def lr_schedule(epoch):

# Function to create stratified subsets of data
def stratified_subset(X, y, subset_ratio):
    """
    Create a stratified subset of data based on the given subset ratio.

    Parameters:
    X (numpy.ndarray): The input features.
    y (numpy.ndarray): The target labels.
    subset_ratio (float): The ratio of the subset size to the original data size.

    Returns:
    tuple: A tuple containing the subset of features and labels.
    """

    unique_classes = np.unique(y)
    X_subset = []
    y_subset = []
@@ -85,6 +127,18 @@ def stratified_subset(X, y, subset_ratio):

# Function to train the model
def train_model(X_train, y_train, X_dev, y_dev, subset_description, label_encoder):
    """
    Train the CNN model with the given training and validation data.

    Parameters:
    X_train (numpy.ndarray): Training data features.
    y_train (numpy.ndarray): Training data labels.
    X_dev (numpy.ndarray): Validation data features.
    y_dev (numpy.ndarray): Validation data labels.
    subset_description (str): Description of the training data subset.
    label_encoder (sklearn.preprocessing.LabelEncoder): The label encoder used for transforming class labels.
    """
    
    model = get_model()
    history = model.fit(
        X_train, y_train, 
+99 −29
Original line number Diff line number Diff line
@@ -10,15 +10,43 @@ from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

def create_directory(directory_name):
    """
    Create a directory (including missing parents) if it doesn't already exist.

    Calling this on a directory that already exists is a no-op.

    Parameters:
    directory_name (str): The name or path of the directory to be created.

    Raises:
    FileExistsError: If the path already exists but is not a directory.
    """

    # exist_ok=True replaces the former "check, then create" sequence, which
    # could fail if another process created the directory between the two
    # calls, and which silently ignored a regular file occupying the path.
    os.makedirs(directory_name, exist_ok=True)

def save_plot(figure, directory, file_name):
    """
    Save a matplotlib plot to a specified directory with a given file name,
    then close the figure.

    Parameters:
    figure (matplotlib.figure.Figure): The matplotlib figure object to be saved.
    directory (str): The directory where the figure will be saved.
    file_name (str): The name of the file in which the figure will be saved.

    Raises:
    Any exception raised by ``figure.savefig`` is propagated after the figure
    has been closed.
    """

    file_path = os.path.join(directory, file_name)
    try:
        figure.savefig(file_path)
    finally:
        # Close the figure even when saving fails, so a failed save does not
        # leave the figure open.
        plt.close(figure)

def load_images_and_labels_cnn(base_path, size=(50, 50)):
    """
    Load images and their labels for CNN processing from a given base path.
    Images are resized to a specified size.

    Parameters:
    base_path (str): The path of the directory containing image folders.
    size (tuple): The desired size for resizing the images (width, height).

    Returns:
    tuple: A tuple containing arrays of images, labels, and file paths.
    """

    images = []
    labels = []
    file_paths = []  # Store file paths
@@ -37,6 +65,19 @@ def load_images_and_labels_cnn(base_path, size=(50, 50)):
    return np.array(images), np.array(labels), file_paths

def plot_metrics(y_true, y_pred, metric_name, title, directory, file_name, is_accuracy=False):
    """
    Plot and save a metric evaluation bar chart.

    Parameters:
    y_true (array): True labels.
    y_pred (array): Predicted labels by the model.
    metric_name (function): The metric function to evaluate (e.g., accuracy_score).
    title (str): Title of the plot.
    directory (str): Directory to save the plot.
    file_name (str): Name of the file to save the plot.
    is_accuracy (bool): Indicator whether the metric is accuracy or not. Default is False.
    """

    if is_accuracy:
        metric_value = metric_name(y_true, y_pred)
    else:
@@ -51,6 +92,20 @@ def plot_metrics(y_true, y_pred, metric_name, title, directory, file_name, is_ac
    save_plot(figure, directory, f'{file_name}.png')

def display_misclassified_images(original_paths, X, y_true, y_pred, label_encoder, num_images=7, directory='', model_name=''):
    """
    Display and save a specified number of misclassified images.

    Parameters:
    original_paths (list): List of paths to the original images.
    X (array): The array of image data.
    y_true (array): True labels.
    y_pred (array): Predicted labels.
    label_encoder (LabelEncoder): Encoder used to transform labels.
    num_images (int): Number of misclassified images to display. Default is 7.
    directory (str): Directory to save the images.
    model_name (str): Name of the model used for prefixing saved images.
    """
    
    misclassified_indices = np.where(y_true != y_pred)[0]
    if len(misclassified_indices) < num_images:
        num_images = len(misclassified_indices)
@@ -70,6 +125,15 @@ def display_misclassified_images(original_paths, X, y_true, y_pred, label_encode


def evaluate_model_with_metrics(model_path, datasets, label_encoder):
    """
    Evaluate a model with various metrics and save the results.

    Parameters:
    model_path (str): Path to the trained model.
    datasets (dict): A dictionary containing datasets for evaluation.
    label_encoder (LabelEncoder): Encoder used to transform labels.
    """

    model = tf.keras.models.load_model(model_path)
    model_name = os.path.basename(model_path).split('.')[0]
    results_directory = f'../figures/cnn/{model_name}'
@@ -115,7 +179,16 @@ def evaluate_model_with_metrics(model_path, datasets, label_encoder):
            display_misclassified_images(paths, X, y_true, y_pred, label_encoder, directory=results_directory, model_name=split_name)


def main(subset):
    """
    Main function to evaluate a CNN model on a specific dataset subset.

    Parameters:
    subset (str): Subset of the training data used for the model (e.g., "10_percent").
    """
    
    # Construct the model file path
    model_path = f'../trained_classifiers/fruit_classifier_{subset}.keras'

    # Load and preprocess data (as in the training script)
    X_train, y_train, train_paths = load_images_and_labels_cnn('../data/train')
@@ -134,15 +207,12 @@ datasets = {
        'Test': (X_test, y_test_encoded, test_paths)
    }

    evaluate_model_with_metrics(model_path, datasets, label_encoder)

# Evaluate the full model
# evaluate_model_with_metrics('fruit_classifier_full.keras', datasets, label_encoder)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Evaluate a CNN model on a specific dataset subset.')
    parser.add_argument('subset', type=str, help='Subset of the training data used for the model (e.g., "10_percent").')
    args = parser.parse_args()

# For models trained on subsets
evaluate_model_with_metrics('../trained_classifiers/fruit_classifier_10_percent.keras', datasets, label_encoder)
# evaluate_model_with_metrics('fruit_classifier_50_percent.keras', datasets, label_encoder)
# evaluate_model_with_metrics('fruit_classifier_05_percent.keras', datasets, label_encoder)
# evaluate_model_with_metrics('fruit_classifier_30_percent.keras', datasets, label_encoder)
# evaluate_model_with_metrics('fruit_classifier_70_percent.keras', datasets, label_encoder)
# evaluate_model_with_metrics('fruit_classifier_90_percent.keras', datasets, label_encoder)
    main(args.subset)