#!/usr/bin/env python3

# Assignment no. 5, given in lesson 11 on 26/11/19.
# Author: Leonardo Tamiano

import numpy as np

from sklearn import datasets
from scipy import stats
from scipy.stats import norm
from sklearn.neighbors import KernelDensity

# import previous work
from assignment_3.assignment_3 import estimate_pmf
from assignment_3.assignment_3 import kernel_pdf_estimation

# ----------------------------------------------
# GENERAL FUNCTIONS
# ----------------------------------------------

# This function splits the given dataset into sub-datasets according
# to the class label C_j.
def split_dataset_by_class(dataset, class_vector, number_of_distinct_classes):
    sub_datasets = [[] for x in range(0, number_of_distinct_classes)]

    for i in range(0, len(class_vector)):
        sub_datasets[class_vector[i]].append(dataset[i])

    sub_datasets = [np.array(x) for x in sub_datasets]

    return sub_datasets

# ----------------------------------------------
# 1 - FIRST POINT
# ----------------------------------------------

# Build a Bayes classifier function which takes a training dataset, a
# class label vector for the training dataset, and a test dataset, and
# returns a class label vector for the test dataset.
#
# Assume that the features are continuous random variables.
def bayes_classifier(train_data, train_class_vector, test_data):
    number_of_classes = len(np.nonzero(np.bincount(train_class_vector))[0])
    _, number_of_features = train_data.shape

    sub_datasets = split_dataset_by_class(train_data, train_class_vector, number_of_classes)

    # Compute, for each sub-dataset, the multi-variate pdf of the
    # features. Do this by using the kernel method for the
    # multi-variate case.
    multi_variate_pdfs = np.full(number_of_classes, None, dtype=object)
    for i in range(0, number_of_classes):
        # Use the kernel method for the multi-variate case.
        multi_variate_pdfs[i] = KernelDensity(kernel='gaussian').fit(sub_datasets[i])

    # Compute the prior probabilities and the likelihoods using the
    # previously estimated parameters and classify the test_data.
    classes_list, prior_probs = estimate_pmf(train_class_vector)
    posterior_probs = np.full(len(prior_probs), 0.0, dtype=float)
    test_class_vector = np.full(len(test_data), -1, dtype=int)

    for u in range(0, len(test_data)):
        test_instance = test_data[u]

        likelihoods = np.full(number_of_classes, 0.0, dtype=float)
        for i in range(0, number_of_classes):
            # Compute the likelihood for the given combination of
            # (instance, class). That is, compute
            #
            #     P(\underline{x} | c_i)
            #
            # using the previously estimated multi-variate pdf
            # associated with this class value.
            likelihoods[i] = np.exp(multi_variate_pdfs[i].score_samples([test_instance]))

        # Compute the posterior probabilities using the prior
        # probabilities and the likelihoods.
        for i in range(0, number_of_classes):
            posterior_probs[i] = likelihoods[i] * prior_probs[i]

        # Assign to the instance the class with the highest posterior
        # probability.
        test_class_vector[u] = np.argmax(posterior_probs)

    return test_class_vector
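# NOTE (sketch, not part of the original assignment): the KernelDensity
# estimators above are fit with sklearn's default bandwidth of 1.0,
# which is an arbitrary choice for the iris features. One possible
# refinement is to select the bandwidth per class by cross-validation;
# the helper below (its name and bandwidth grid are just illustrative)
# could be used in bayes_classifier in place of the direct
# KernelDensity(kernel='gaussian').fit(...) call.
def fit_kde_with_bandwidth_cv(data):
    # Import kept local so that this optional sketch is self-contained.
    from sklearn.model_selection import GridSearchCV

    # Grid-search the bandwidth of a Gaussian kernel density estimate;
    # GridSearchCV scores each candidate with the KDE log-likelihood.
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': np.logspace(-1, 1, 20)})
    grid.fit(data)

    # best_estimator_ is a KernelDensity object refit on the whole
    # input data with the selected bandwidth.
    return grid.best_estimator_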
# ----------------------------------------------
# 2 - SECOND POINT
# ----------------------------------------------

# This function classifies the test_data instances by using a naive
# Bayes classifier trained with train_data and train_class_vector.
#
# Assume that the features that make up the instances are independent
# continuous random variables.
def naive_bayes_classifier(train_data, train_class_vector, test_data):
    number_of_classes = len(np.nonzero(np.bincount(train_class_vector))[0])
    _, number_of_features = train_data.shape

    sub_datasets = split_dataset_by_class(train_data, train_class_vector, number_of_classes)

    # Compute, for each sub-dataset, the pdf of each feature using
    # the kernel method.
    pdfs = np.full((number_of_classes, number_of_features), None, dtype=object)
    for i in range(0, number_of_classes):
        for j in range(0, number_of_features):
            # print(f"CLASS:{i}, FEATURE:{j}")
            feature_of_interest = sub_datasets[i][:, j:j+1].flatten()
            pdfs[i][j] = kernel_pdf_estimation(feature_of_interest, kernObj=True)

    # Compute the prior probabilities and the likelihoods using the
    # previously estimated parameters and classify the test_data.
    classes_list, prior_probs = estimate_pmf(train_class_vector)
    posterior_probs = np.full(len(prior_probs), 0.0, dtype=float)
    test_class_vector = np.full(len(test_data), -1, dtype=int)

    for u in range(0, len(test_data)):
        test_instance = test_data[u]

        likelihoods = np.full(number_of_classes, 0.0, dtype=float)
        for i in range(0, number_of_classes):
            # Compute the likelihood for the given combination of
            # (instance, class). That is, compute
            #
            #     P(\underline{x} | c_i)
            #
            likelihood = 1
            for j in range(0, number_of_features):
                # Use the previously estimated pdf to compute the
                # probability of having x_j given class c_i, that is,
                # P(x_j | c_i).
                x_j = np.array([test_instance[j]])
                prob = np.exp(pdfs[i][j].score_samples(x_j[:, np.newaxis]))

                # NOTE: here we use the assumption that the features
                # are independent.
                likelihood *= prob

            likelihoods[i] = likelihood

        # Compute the posterior probabilities using the prior
        # probabilities and the likelihoods.
        for i in range(0, number_of_classes):
            posterior_probs[i] = likelihoods[i] * prior_probs[i]

        # Assign to the instance the class with the highest posterior
        # probability.
        test_class_vector[u] = np.argmax(posterior_probs)

    return test_class_vector

# ----------------------------------------------
# 3 - THIRD POINT
# ----------------------------------------------

# Assume that the features are continuous, independent and Gaussian
# distributed random variables.
def gaussian_naive_bayes_classifier(train_data, train_class_vector, test_data):
    number_of_classes = len(np.nonzero(np.bincount(train_class_vector))[0])
    _, number_of_features = train_data.shape

    # Split the dataset into sub-datasets according to class. This is
    # done for the training process.
    sub_datasets = split_dataset_by_class(train_data, train_class_vector, number_of_classes)

    # Compute, for each sub-dataset and for each feature, the mean and
    # the standard deviation.
    mean_vector = np.full((number_of_classes, number_of_features), 0.0, dtype=float)
    std_vector = np.full((number_of_classes, number_of_features), 0.0, dtype=float)
    for i in range(0, number_of_classes):
        for j in range(0, number_of_features):
            # print(f"CLASS:{i}, FEATURE:{j}")
            feature_of_interest = sub_datasets[i][:, j:j+1]
            std_vector[i][j] = np.std(feature_of_interest)
            mean_vector[i][j] = np.mean(feature_of_interest)

    # Compute the prior probabilities and the likelihoods using the
    # previously estimated parameters and classify the test_data.
    #
    # prior_probs is an array containing the prior probabilities of
    # the various classes.
    # That is:
    #
    #     prior_probs[j] = P(c_j)
    #
    classes_list, prior_probs = estimate_pmf(train_class_vector)
    posterior_probs = np.full(len(prior_probs), 0.0, dtype=float)
    test_class_vector = np.full(len(test_data), -1, dtype=int)

    for u in range(0, len(test_data)):
        test_instance = test_data[u]

        likelihoods = np.full(number_of_classes, 0.0, dtype=float)
        for i in range(0, number_of_classes):
            # Compute the likelihood for the given combination of
            # (instance, class). That is, compute
            #
            #     P(\underline{x} | c_i)
            #
            likelihood = 1
            for j in range(0, number_of_features):
                mean = mean_vector[i][j]
                std = std_vector[i][j]

                # NOTE: here we use the assumption that the features
                # are independent of each other as well as normally
                # (Gaussian) distributed.
                likelihood *= norm.pdf(test_instance[j], loc=mean, scale=std)

            likelihoods[i] = likelihood

        # Compute the posterior probabilities using the prior
        # probabilities and the likelihoods.
        for i in range(0, number_of_classes):
            posterior_probs[i] = likelihoods[i] * prior_probs[i]

        # Assign to the instance the class with the highest posterior
        # probability.
        test_class_vector[u] = np.argmax(posterior_probs)

    return test_class_vector

# ----------------------------------------------
# 4 - FOURTH POINT
# ----------------------------------------------

# Add a parameter that specifies the train/test split to be made.
#
# This function takes in input a dataset with a class vector and
# returns two datasets and two class vectors: a training dataset with
# its class vector, and a testing dataset with its class vector. The
# splitting is executed according to the specification: for each class
# label 50% of the instances are taken as train data, and the remaining
# 50% of the instances are taken as test data.
def split_dataset(dataset, class_vector):
    full_dataset = np.column_stack([dataset, class_vector])

    number_of_classes = len(np.nonzero(np.bincount(class_vector))[0])
    _, number_of_features = dataset.shape

    sub_datasets = split_dataset_by_class(full_dataset, class_vector, number_of_classes)

    train_dataset = []
    test_dataset = []

    # For each sub-dataset randomly choose 50% of the rows to insert
    # into the training set and 50% of the rows to insert into the
    # test set.
    for i in range(0, number_of_classes):
        np.random.shuffle(sub_datasets[i])

        rows, _ = sub_datasets[i].shape
        if rows % 2 == 0:
            training, test = sub_datasets[i][:int(rows/2)], sub_datasets[i][int(rows/2):]
        else:
            training, test = sub_datasets[i][:int((rows-1)/2)], sub_datasets[i][int((rows-1)/2):]

        train_dataset.extend(training)
        test_dataset.extend(test)

    train_dataset = np.array(train_dataset)
    test_dataset = np.array(test_dataset)

    train_class_vector = np.array(train_dataset[:, number_of_features:number_of_features+1].astype(int).flatten())
    test_class_vector = np.array(test_dataset[:, number_of_features:number_of_features+1].astype(int).flatten())

    # Delete the class-vector column from the datasets.
    train_dataset = np.delete(train_dataset, number_of_features, 1)
    test_dataset = np.delete(test_dataset, number_of_features, 1)

    return train_dataset, train_class_vector, test_dataset, test_class_vector
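# NOTE (sketch, not part of the original assignment): the per-class
# 50/50 split performed by split_dataset is a stratified split, so an
# equivalent result could also be obtained with sklearn's
# train_test_split. The helper below (its name and default value are
# just illustrative) also exposes the split fraction as a parameter,
# as the fourth point asks.
def split_dataset_with_sklearn(dataset, class_vector, test_size=0.5):
    # Import kept local so that this optional sketch is self-contained.
    from sklearn.model_selection import train_test_split

    # stratify=class_vector keeps the class proportions (roughly) equal
    # in the train and test parts, like the manual per-class split above.
    train_dataset, test_dataset, train_class_vector, test_class_vector = train_test_split(
        dataset, class_vector, test_size=test_size, stratify=class_vector)

    return train_dataset, train_class_vector, test_dataset, test_class_vector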
# This function computes the accuracy of a classifier. In particular,
# computed_class_vector is the class vector obtained by applying the
# classifier to a set of test instances, while correct_class_vector
# contains the correct labels of those test instances.
def compute_classifier_accuracy(correct_class_vector, computed_class_vector):
    accuracy = 0
    for i in range(0, len(correct_class_vector)):
        if correct_class_vector[i] == computed_class_vector[i]:
            accuracy += 1

    # Return the fraction of correctly classified instances.
    return accuracy/len(correct_class_vector)

# Code that gets executed when the script is run directly.
if __name__ == "__main__":
    # Load the iris dataset.
    iris = datasets.load_iris()
    iris_data_matrix, class_vector = iris.data, iris.target

    # Split the iris dataset into a train set and a test set.
    train_dataset, train_class_vector, test_dataset, test_class_vector = split_dataset(iris_data_matrix, class_vector)

    gaussian_computed_class_vector = gaussian_naive_bayes_classifier(train_dataset, train_class_vector, test_dataset)
    naive_computed_class_vector = naive_bayes_classifier(train_dataset, train_class_vector, test_dataset)
    bayes_computed_class_vector = bayes_classifier(train_dataset, train_class_vector, test_dataset)

    gaussian_accuracy = compute_classifier_accuracy(test_class_vector, gaussian_computed_class_vector)
    naive_accuracy = compute_classifier_accuracy(test_class_vector, naive_computed_class_vector)
    bayes_accuracy = compute_classifier_accuracy(test_class_vector, bayes_computed_class_vector)

    print(f"Gaussian: {gaussian_accuracy}")
    print(f"Naive: {naive_accuracy}")
    print(f"Bayes: {bayes_accuracy}")
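# NOTE (sketch, not part of the original assignment): a quick way to
# sanity-check gaussian_naive_bayes_classifier is to compare its output
# against sklearn's own Gaussian naive Bayes implementation. The helper
# below (its name is just illustrative) is only a reference sketch:
# trained and evaluated on the same train/test split, its accuracy
# should be directly comparable with gaussian_accuracy.
def sklearn_gaussian_nb_class_vector(train_data, train_class_vector, test_data):
    # Import kept local so that this optional sketch is self-contained.
    from sklearn.naive_bayes import GaussianNB

    # Fit sklearn's Gaussian naive Bayes on the training data and
    # return the predicted class labels for the test data.
    classifier = GaussianNB()
    classifier.fit(train_data, train_class_vector)
    return classifier.predict(test_data)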