Add docstrings (44120686) · Commits · igraf / exp-ml-2-hillengass-graf

project/src/classify_with_cnn.py

+58 −4

Original line number	Diff line number	Diff line
		@@ -12,12 +12,20 @@ import seaborn as sns
		from sklearn.model_selection import train_test_split
		import joblib

		# Function to rescale images
		def rescale(image):
		    """
		    Divide an image by 255.0.

		    Presumably maps 8-bit pixel intensities (0-255) into the [0, 1] range
		    before the data is fed to the CNN -- confirm against the caller.

		    Parameters:
		    image: A value supporting true division by a float (e.g. a number or a numpy.ndarray).

		    Returns:
		    The result of ``image / 255.0``.
		    """

		    max_intensity = 255.0
		    return image / max_intensity

		# Function to load images and labels for CNN
		def load_images_and_labels_cnn(base_path, size=(50, 50)):
		"""
		Load images and their labels for CNN processing from a given base path.
		Images are resized to a specified size.

		Parameters:
		base_path (str): The path of the directory containing image folders.
		size (tuple): The desired size for resizing the images (width, height).

		Returns:
		tuple: A tuple containing arrays of images and labels.
		"""

		images = []
		labels = []
		for folder in os.listdir(base_path):
		@@ -35,6 +43,18 @@ def load_images_and_labels_cnn(base_path, size=(50, 50)):

		# Function to create the CNN model
		def get_model(lr=0.001, dropout_rate=0.3, activation='relu'):
		"""
		Create and compile a Convolutional Neural Network (CNN) model.

		Parameters:
		lr (float): Learning rate for the optimizer. Default is 0.001.
		dropout_rate (float): Dropout rate for regularization. Default is 0.3.
		activation (str): Activation function for the layers. Default is 'relu'.

		Returns:
		tensorflow.keras.models.Sequential: The compiled CNN model.
		"""

		input_shape = (50, 50, 3)
		outputs_number = 30
		model = models.Sequential([
		@@ -56,6 +76,16 @@ def get_model(lr=0.001, dropout_rate=0.3, activation='relu'):

		# Learning rate schedule function
		def lr_schedule(epoch):
		"""
		Learning rate schedule function to adjust the learning rate based on the epoch.

		Parameters:
		epoch (int): The current epoch number during training.

		Returns:
		float: The adjusted learning rate.
		"""

		lr = 0.001
		if epoch > 50:
		lr *= 0.5e-3
		@@ -69,6 +99,18 @@ def lr_schedule(epoch):

		# Function to create stratified subsets of data
		def stratified_subset(X, y, subset_ratio):
		"""
		Create a stratified subset of data based on the given subset ratio.

		Parameters:
		X (numpy.ndarray): The input features.
		y (numpy.ndarray): The target labels.
		subset_ratio (float): The ratio of the subset size to the original data size.

		Returns:
		tuple: A tuple containing the subset of features and labels.
		"""

		unique_classes = np.unique(y)
		X_subset = []
		y_subset = []
		@@ -85,6 +127,18 @@ def stratified_subset(X, y, subset_ratio):

		# Function to train the model
		def train_model(X_train, y_train, X_dev, y_dev, subset_description, label_encoder):
		"""
		Train the CNN model with the given training and validation data.

		Parameters:
		X_train (numpy.ndarray): Training data features.
		y_train (numpy.ndarray): Training data labels.
		X_dev (numpy.ndarray): Validation data features.
		y_dev (numpy.ndarray): Validation data labels.
		subset_description (str): Description of the training data subset.
		label_encoder (sklearn.preprocessing.LabelEncoder): The label encoder used for transforming class labels.
		"""

		model = get_model()
		history = model.fit(
		X_train, y_train,

project/src/evaluate_cnn.py

+99 −29

Original line number	Diff line number	Diff line
		@@ -10,15 +10,43 @@ from tensorflow.keras.utils import to_categorical
		from sklearn.preprocessing import LabelEncoder

		def create_directory(directory_name):
		    """
		    Create a directory (including missing parent directories) if it doesn't already exist.

		    Parameters:
		    directory_name (str): The name or path of the directory to be created.

		    Raises:
		    FileExistsError: If the path already exists but is not a directory.
		    """

		    # exist_ok=True makes the call idempotent and removes the race between a
		    # separate existence check and the creation itself.
		    os.makedirs(directory_name, exist_ok=True)

		def save_plot(figure, directory, file_name):
		    """
		    Write a matplotlib figure to ``directory/file_name`` and close it.

		    Parameters:
		    figure (matplotlib.figure.Figure): The figure object to be saved.
		    directory (str): The directory in which the figure is saved.
		    file_name (str): The file name (including extension) used for the saved figure.
		    """

		    destination = os.path.join(directory, file_name)
		    figure.savefig(destination)
		    # Close the figure through pyplot once it has been written out.
		    plt.close(figure)

		def load_images_and_labels_cnn(base_path, size=(50, 50)):
		"""
		Load images and their labels for CNN processing from a given base path.
		Images are resized to a specified size.

		Parameters:
		base_path (str): The path of the directory containing image folders.
		size (tuple): The desired size for resizing the images (width, height).

		Returns:
		tuple: A tuple containing arrays of images, labels, and file paths.
		"""

		images = []
		labels = []
		file_paths = [] # Store file paths
		@@ -37,6 +65,19 @@ def load_images_and_labels_cnn(base_path, size=(50, 50)):
		return np.array(images), np.array(labels), file_paths

		def plot_metrics(y_true, y_pred, metric_name, title, directory, file_name, is_accuracy=False):
		"""
		Plot and save a metric evaluation bar chart.

		Parameters:
		y_true (array): True labels.
		y_pred (array): Predicted labels by the model.
		metric_name (function): The metric function to evaluate (e.g., accuracy_score).
		title (str): Title of the plot.
		directory (str): Directory to save the plot.
		file_name (str): Name of the file to save the plot.
		is_accuracy (bool): Indicator whether the metric is accuracy or not. Default is False.
		"""

		if is_accuracy:
		metric_value = metric_name(y_true, y_pred)
		else:
		@@ -51,6 +92,20 @@ def plot_metrics(y_true, y_pred, metric_name, title, directory, file_name, is_ac
		save_plot(figure, directory, f'{file_name}.png')

		def display_misclassified_images(original_paths, X, y_true, y_pred, label_encoder, num_images=7, directory='', model_name=''):
		"""
		Display and save a specified number of misclassified images.

		Parameters:
		original_paths (list): List of paths to the original images.
		X (array): The array of image data.
		y_true (array): True labels.
		y_pred (array): Predicted labels.
		label_encoder (LabelEncoder): Encoder used to transform labels.
		num_images (int): Number of misclassified images to display. Default is 7.
		directory (str): Directory to save the images.
		model_name (str): Name of the model used for prefixing saved images.
		"""

		misclassified_indices = np.where(y_true != y_pred)[0]
		if len(misclassified_indices) < num_images:
		num_images = len(misclassified_indices)
		@@ -70,6 +125,15 @@ def display_misclassified_images(original_paths, X, y_true, y_pred, label_encode


		def evaluate_model_with_metrics(model_path, datasets, label_encoder):
		"""
		Evaluate a model with various metrics and save the results.

		Parameters:
		model_path (str): Path to the trained model.
		datasets (dict): A dictionary containing datasets for evaluation.
		label_encoder (LabelEncoder): Encoder used to transform labels.
		"""

		model = tf.keras.models.load_model(model_path)
		model_name = os.path.basename(model_path).split('.')[0]
		results_directory = f'../figures/cnn/{model_name}'
		@@ -115,7 +179,16 @@ def evaluate_model_with_metrics(model_path, datasets, label_encoder):
		display_misclassified_images(paths, X, y_true, y_pred, label_encoder, directory=results_directory, model_name=split_name)


		def main(subset):
		"""
		Main function to evaluate a CNN model on a specific dataset subset.

		Parameters:
		subset (str): Subset of the training data used for the model (e.g., "10_percent").
		"""

		# Construct the model file path
		model_path = f'../trained_classifiers/fruit_classifier_{subset}.keras'

		# Load and preprocess data (as in the training script)
		X_train, y_train, train_paths = load_images_and_labels_cnn('../data/train')
		@@ -134,15 +207,12 @@ datasets = {
		'Test': (X_test, y_test_encoded, test_paths)
		}

		evaluate_model_with_metrics(model_path, datasets, label_encoder)

		# Evaluate the full model
		# evaluate_model_with_metrics('fruit_classifier_full.keras', datasets, label_encoder)
		if __name__ == "__main__":
		parser = argparse.ArgumentParser(description='Evaluate a CNN model on a specific dataset subset.')
		parser.add_argument('subset', type=str, help='Subset of the training data used for the model (e.g., "10_percent").')
		args = parser.parse_args()

		# For models trained on subsets
		evaluate_model_with_metrics('../trained_classifiers/fruit_classifier_10_percent.keras', datasets, label_encoder)
		# evaluate_model_with_metrics('fruit_classifier_50_percent.keras', datasets, label_encoder)
		# evaluate_model_with_metrics('fruit_classifier_05_percent.keras', datasets, label_encoder)
		# evaluate_model_with_metrics('fruit_classifier_30_percent.keras', datasets, label_encoder)
		# evaluate_model_with_metrics('fruit_classifier_70_percent.keras', datasets, label_encoder)
		# evaluate_model_with_metrics('fruit_classifier_90_percent.keras', datasets, label_encoder)
		main(args.subset)