#!/usr/bin/env python3

# Assignment no. 5, given in lesson 11 on 26/11/19.
# Author: Leonardo Tamiano

import numpy as np

from sklearn import datasets
from scipy import stats
from scipy.stats import norm
from sklearn.neighbors import KernelDensity

# import previous work
from assignment_3.assignment_3 import estimate_pmf
from assignment_3.assignment_3 import kernel_pdf_estimation

# ----------------------------------------------
# GENERAL FUNCTIONS
# ----------------------------------------------

# This function splits the given dataset into sub-datasets according
# to the class label C_j.
def split_dataset_by_class(dataset, class_vector, number_of_distinct_classes):
    sub_datasets = [[] for x in range(0, number_of_distinct_classes)]

    for i in range(0, len(class_vector)):
        sub_datasets[class_vector[i]].append(dataset[i])

    sub_datasets = [np.array(x) for x in sub_datasets]

    return sub_datasets

# ----------------------------------------------
# 1 - FIRST POINT
# ----------------------------------------------

# Build a Bayes classifier function which takes a training dataset, a
# class label vector for the training dataset, and a test dataset, and
# returns a class label vector for the test dataset.
#
# Assume that the features are continuous random variables.
def bayes_classifier(train_data, train_class_vector, test_data):
    number_of_classes = len(np.nonzero(np.bincount(train_class_vector))[0])
    _, number_of_features = train_data.shape

    sub_datasets = split_dataset_by_class(train_data, train_class_vector, number_of_classes)

    # Compute, for each sub-dataset, the multi-variate pdf of the
    # features. Do this by using the kernel method for the
    # multi-variate case.
    multi_variate_pdfs = np.full(number_of_classes, None, dtype=object)
    for i in range(0, number_of_classes):
        # Use the kernel method for the multi-variate case.
        multi_variate_pdfs[i] = KernelDensity(kernel='gaussian').fit(sub_datasets[i])

    # Compute the prior probabilities and the likelihoods using the
    # previously estimated parameters and classify the test_data.
    classes_list, prior_probs = estimate_pmf(train_class_vector)
    posterior_probs = np.full(len(prior_probs), 0.0, dtype=float)
    test_class_vector = np.full(len(test_data), -1, dtype=int)

    for u in range(0, len(test_data)):
        test_instance = test_data[u]

        likelihoods = np.full(number_of_classes, 0.0, dtype=float)
        for i in range(0, number_of_classes):
            # Compute the likelihood for the given combination of
            # (instance, class). That is, compute
            #
            #     P(\underline{x} | c_i)
            #
            # using the previously estimated multi-variate pdf
            # associated with this class value.
            likelihoods[i] = np.exp(multi_variate_pdfs[i].score_samples([test_instance]))

        # Compute the posterior probabilities using the prior
        # probabilities and the likelihoods.
        for i in range(0, number_of_classes):
            posterior_probs[i] = likelihoods[i] * prior_probs[i]

        # Assign to the instance the class with the highest posterior
        # probability.
        test_class_vector[u] = np.argmax(posterior_probs)

    return test_class_vector
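# NOTE (sketch, not part of the original assignment): the KernelDensity
# estimators above are fit with sklearn's default bandwidth of 1.0,
# which is an arbitrary choice for the iris features. One possible
# refinement is to select the bandwidth per class by cross-validation;
# the helper below (its name and bandwidth grid are just illustrative)
# could be used in bayes_classifier in place of the direct
# KernelDensity(kernel='gaussian').fit(...) call.
def fit_kde_with_bandwidth_cv(data):
    # Import kept local so that this optional sketch is self-contained.
    from sklearn.model_selection import GridSearchCV

    # Grid-search the bandwidth of a Gaussian kernel density estimate;
    # GridSearchCV scores each candidate with the KDE log-likelihood.
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': np.logspace(-1, 1, 20)})
    grid.fit(data)

    # best_estimator_ is a KernelDensity object refit on the whole
    # input data with the selected bandwidth.
    return grid.best_estimator_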
# ----------------------------------------------
# 2 - SECOND POINT
# ----------------------------------------------

# This function classifies the test_data instances by using a naive
# Bayes classifier trained with train_data and train_class_vector.
#
# Assume that the features that make up the instances are independent
# continuous random variables.
def naive_bayes_classifier(train_data, train_class_vector, test_data):
    number_of_classes = len(np.nonzero(np.bincount(train_class_vector))[0])
    _, number_of_features = train_data.shape

    sub_datasets = split_dataset_by_class(train_data, train_class_vector, number_of_classes)

    # Compute, for each sub-dataset, the pdf of each feature using
    # the kernel method.
    pdfs = np.full((number_of_classes, number_of_features), None, dtype=object)
    for i in range(0, number_of_classes):
        for j in range(0, number_of_features):
            # print(f"CLASS:{i}, FEATURE:{j}")
            feature_of_interest = sub_datasets[i][:, j:j+1].flatten()
            pdfs[i][j] = kernel_pdf_estimation(feature_of_interest, kernObj=True)

    # Compute the prior probabilities and the likelihoods using the
    # previously estimated parameters and classify the test_data.
    classes_list, prior_probs = estimate_pmf(train_class_vector)
    posterior_probs = np.full(len(prior_probs), 0.0, dtype=float)
    test_class_vector = np.full(len(test_data), -1, dtype=int)

    for u in range(0, len(test_data)):
        test_instance = test_data[u]

        likelihoods = np.full(number_of_classes, 0.0, dtype=float)
        for i in range(0, number_of_classes):
            # Compute the likelihood for the given combination of
            # (instance, class). That is, compute
            #
            #     P(\underline{x} | c_i)
            #
            likelihood = 1
            for j in range(0, number_of_features):
                # Use the previously estimated pdf to compute the
                # probability of having x_j given class c_i, that is,
                # P(x_j | c_i).
                x_j = np.array([test_instance[j]])
                prob = np.exp(pdfs[i][j].score_samples(x_j[:, np.newaxis]))

                # NOTE: here we use the assumption that the features
                # are independent.
                likelihood *= prob

            likelihoods[i] = likelihood

        # Compute the posterior probabilities using the prior
        # probabilities and the likelihoods.
        for i in range(0, number_of_classes):
            posterior_probs[i] = likelihoods[i] * prior_probs[i]

        # Assign to the instance the class with the highest posterior
        # probability.
        test_class_vector[u] = np.argmax(posterior_probs)

    return test_class_vector

# ----------------------------------------------
# 3 - THIRD POINT
# ----------------------------------------------

# Assume that the features are continuous, independent and Gaussian
# distributed random variables.
def gaussian_naive_bayes_classifier(train_data, train_class_vector, test_data):
    number_of_classes = len(np.nonzero(np.bincount(train_class_vector))[0])
    _, number_of_features = train_data.shape

    # Split the dataset into sub-datasets according to class. This is
    # done for the training process.
    sub_datasets = split_dataset_by_class(train_data, train_class_vector, number_of_classes)

    # Compute, for each sub-dataset and for each feature, the mean and
    # the standard deviation.
    mean_vector = np.full((number_of_classes, number_of_features), 0.0, dtype=float)
    std_vector = np.full((number_of_classes, number_of_features), 0.0, dtype=float)
    for i in range(0, number_of_classes):
        for j in range(0, number_of_features):
            # print(f"CLASS:{i}, FEATURE:{j}")
            feature_of_interest = sub_datasets[i][:, j:j+1]
            std_vector[i][j] = np.std(feature_of_interest)
            mean_vector[i][j] = np.mean(feature_of_interest)

    # Compute the prior probabilities and the likelihoods using the
    # previously estimated parameters and classify the test_data.
    #
    # prior_probs is an array containing the prior probabilities of
    # the various classes.
    # That is:
    #
    #     prior_probs[j] = P(c_j)
    #
    classes_list, prior_probs = estimate_pmf(train_class_vector)
    posterior_probs = np.full(len(prior_probs), 0.0, dtype=float)
    test_class_vector = np.full(len(test_data), -1, dtype=int)

    for u in range(0, len(test_data)):
        test_instance = test_data[u]

        likelihoods = np.full(number_of_classes, 0.0, dtype=float)
        for i in range(0, number_of_classes):
            # Compute the likelihood for the given combination of
            # (instance, class). That is, compute
            #
            #     P(\underline{x} | c_i)
            #
            likelihood = 1
            for j in range(0, number_of_features):
                mean = mean_vector[i][j]
                std = std_vector[i][j]

                # NOTE: here we use the assumption that the features
                # are independent of each other as well as normally
                # (Gaussian) distributed.
                likelihood *= norm.pdf(test_instance[j], loc=mean, scale=std)

            likelihoods[i] = likelihood

        # Compute the posterior probabilities using the prior
        # probabilities and the likelihoods.
        for i in range(0, number_of_classes):
            posterior_probs[i] = likelihoods[i] * prior_probs[i]

        # Assign to the instance the class with the highest posterior
        # probability.
        test_class_vector[u] = np.argmax(posterior_probs)

    return test_class_vector

# ----------------------------------------------
# 4 - FOURTH POINT
# ----------------------------------------------

# Add a parameter that specifies the train/test split to be made.
#
# This function takes in input a dataset with a class vector and
# returns two datasets and two class vectors: a training dataset with
# its class vector, and a testing dataset with its class vector. The
# splitting is executed according to the specification: for each class
# label 50% of the instances are taken as train data, and the remaining
# 50% of the instances are taken as test data.
def split_dataset(dataset, class_vector):
    full_dataset = np.column_stack([dataset, class_vector])

    number_of_classes = len(np.nonzero(np.bincount(class_vector))[0])
    _, number_of_features = dataset.shape

    sub_datasets = split_dataset_by_class(full_dataset, class_vector, number_of_classes)

    train_dataset = []
    test_dataset = []

    # For each sub-dataset randomly choose 50% of the rows to insert
    # into the training set and 50% of the rows to insert into the
    # test set.
    for i in range(0, number_of_classes):
        np.random.shuffle(sub_datasets[i])

        rows, _ = sub_datasets[i].shape
        if rows % 2 == 0:
            training, test = sub_datasets[i][:int(rows/2)], sub_datasets[i][int(rows/2):]
        else:
            training, test = sub_datasets[i][:int((rows-1)/2)], sub_datasets[i][int((rows-1)/2):]

        train_dataset.extend(training)
        test_dataset.extend(test)

    train_dataset = np.array(train_dataset)
    test_dataset = np.array(test_dataset)

    train_class_vector = np.array(train_dataset[:, number_of_features:number_of_features+1].astype(int).flatten())
    test_class_vector = np.array(test_dataset[:, number_of_features:number_of_features+1].astype(int).flatten())

    # Delete the class-vector column from the datasets.
    train_dataset = np.delete(train_dataset, number_of_features, 1)
    test_dataset = np.delete(test_dataset, number_of_features, 1)

    return train_dataset, train_class_vector, test_dataset, test_class_vector
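# NOTE (sketch, not part of the original assignment): the per-class
# 50/50 split performed by split_dataset is a stratified split, so an
# equivalent result could also be obtained with sklearn's
# train_test_split. The helper below (its name and default value are
# just illustrative) also exposes the split fraction as a parameter,
# as the fourth point asks.
def split_dataset_with_sklearn(dataset, class_vector, test_size=0.5):
    # Import kept local so that this optional sketch is self-contained.
    from sklearn.model_selection import train_test_split

    # stratify=class_vector keeps the class proportions (roughly) equal
    # in the train and test parts, like the manual per-class split above.
    train_dataset, test_dataset, train_class_vector, test_class_vector = train_test_split(
        dataset, class_vector, test_size=test_size, stratify=class_vector)

    return train_dataset, train_class_vector, test_dataset, test_class_vector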
# This function computes the accuracy of a classifier. In particular,
# computed_class_vector is the class vector obtained by applying the
# classifier to a set of test instances, while correct_class_vector
# contains the correct labels of those test instances.
def compute_classifier_accuracy(correct_class_vector, computed_class_vector):
    accuracy = 0
    for i in range(0, len(correct_class_vector)):
        if correct_class_vector[i] == computed_class_vector[i]:
            accuracy += 1

    # Return the fraction of correctly classified instances.
    return accuracy/len(correct_class_vector)

# Code that gets executed when the script is run directly.
if __name__ == "__main__":
    # Load the iris dataset.
    iris = datasets.load_iris()
    iris_data_matrix, class_vector = iris.data, iris.target

    # Split the iris dataset into a train set and a test set.
    train_dataset, train_class_vector, test_dataset, test_class_vector = split_dataset(iris_data_matrix, class_vector)

    gaussian_computed_class_vector = gaussian_naive_bayes_classifier(train_dataset, train_class_vector, test_dataset)
    naive_computed_class_vector = naive_bayes_classifier(train_dataset, train_class_vector, test_dataset)
    bayes_computed_class_vector = bayes_classifier(train_dataset, train_class_vector, test_dataset)

    gaussian_accuracy = compute_classifier_accuracy(test_class_vector, gaussian_computed_class_vector)
    naive_accuracy = compute_classifier_accuracy(test_class_vector, naive_computed_class_vector)
    bayes_accuracy = compute_classifier_accuracy(test_class_vector, bayes_computed_class_vector)

    print(f"Gaussian: {gaussian_accuracy}")
    print(f"Naive: {naive_accuracy}")
    print(f"Bayes: {bayes_accuracy}")
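# NOTE (sketch, not part of the original assignment): a quick way to
# sanity-check gaussian_naive_bayes_classifier is to compare its output
# against sklearn's own Gaussian naive Bayes implementation. The helper
# below (its name is just illustrative) is only a reference sketch:
# trained and evaluated on the same train/test split, its accuracy
# should be directly comparable with gaussian_accuracy.
def sklearn_gaussian_nb_class_vector(train_data, train_class_vector, test_data):
    # Import kept local so that this optional sketch is self-contained.
    from sklearn.naive_bayes import GaussianNB

    # Fit sklearn's Gaussian naive Bayes on the training data and
    # return the predicted class labels for the test data.
    classifier = GaussianNB()
    classifier.fit(train_data, train_class_vector)
    return classifier.predict(test_data)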