#!/usr/bin/env python3
# Assignment n 4, given in lesson 08 on 05/11/19.
# Author: Leonardo Tamiano

import numpy as np
from sklearn import datasets
from itertools import combinations

# import previous work
from assignment_1.entropy import entropy
from assignment_2.assignment_2 import mutual_information
from assignment_3.assignment_3 import estimate_pmf

# Load iris dataset
iris = datasets.load_iris()
iris_data_matrix, class_vector = iris.data, iris.target

# ----------------------------------------------
# 1 - FIRST POINT
# ----------------------------------------------

# Discretize the samples as integers. Multiplying by 10 before the
# integer cast keeps one decimal digit of the original measurement.
# A single loop replaces the four copy-pasted per-column statements.
discretized_features = [
    (10 * iris_data_matrix[:, k]).astype(int) for k in range(4)
]

# Compute the probability mass function of the features of the Iris
# dataset. For each feature we want to estimate a single pmf.
# `values[k]` holds the distinct discretized values of feature k+1 and
# `pmfs[k]` their relative frequencies (as returned by estimate_pmf).
values = []
pmfs = []
for feature in discretized_features:
    feature_values, feature_pmf = estimate_pmf(feature)
    values.append(feature_values)
    pmfs.append(feature_pmf)

# Keep the original per-feature names so existing code that imports
# them keeps working.
discretized_feature_1, discretized_feature_2, \
    discretized_feature_3, discretized_feature_4 = discretized_features
values_1, values_2, values_3, values_4 = values
pmf_1, pmf_2, pmf_3, pmf_4 = pmfs

# ----------------------------------------------
# 2 - SECOND POINT
# ----------------------------------------------
# Compute the (discrete) entropy of the features of the Iris datasets.
# Entropy of each marginal pmf, using the entropy() implementation of
# assignment 1. These stay at module level so importers can reuse them.
entropy_1 = entropy(np.array(pmf_1))
entropy_2 = entropy(np.array(pmf_2))
entropy_3 = entropy(np.array(pmf_3))
entropy_4 = entropy(np.array(pmf_4))

if __name__ == "__main__":
    # Guarded like the third point below so that importing this module
    # does not print (the original printed these at import time).
    print("/------------------------------------/")
    print("Second point:\n")
    print(f"entropy_1 = {entropy_1}")
    print(f"entropy_2 = {entropy_2}")
    print(f"entropy_3 = {entropy_3}")
    print(f"entropy_4 = {entropy_4}")

# ----------------------------------------------
# 3 - THIRD POINT
# ----------------------------------------------

# NOTE: To compute the mutual informations we also have to compute the
# joint distribution for each couples of features.

def estimate_pmf_multivariate(data_matrix):
    """Estimate the multivariate pmf of the rows of data_matrix.

    data_matrix has one row per sample and one column per feature.
    Returns (unique_rows_array, pmf) where pmf[k] is the relative
    frequency of the row unique_rows_array[k]. Only observed rows are
    represented (a sparse joint pmf).
    """
    rows, _ = data_matrix.shape
    # axis=0 makes np.unique operate on whole rows; return_counts
    # gives the absolute frequency of each distinct row.
    unique_rows_array, counts = np.unique(data_matrix, axis=0,
                                          return_counts=True)
    return unique_rows_array, counts / rows

if __name__ == "__main__":
    print("/------------------------------------/")
    print("Third point:\n")

    # Compute the mutual information between all pairs of features in
    # the Iris dataset.
    feature_ids = [1, 2, 3, 4]

    for i, j in combinations(feature_ids, 2):
        # Extract the two discretized features of interest and merge
        # them together to form a new data matrix (one column each).
        discretized_feature_i = (10 * iris_data_matrix[:, i - 1]).astype(int)
        discretized_feature_j = (10 * iris_data_matrix[:, j - 1]).astype(int)
        new_data_matrix = np.column_stack([discretized_feature_i,
                                           discretized_feature_j])

        # Estimate the (sparse) joint pmf of the features of interest.
        unique_rows_array, multivariate_pmf = \
            estimate_pmf_multivariate(new_data_matrix)

        # Construct the dense joint_pmf matrix required by the
        # mutual_information() function written in assignment 2: it
        # needs every (u, v) entry, including pairs whose joint
        # probability P(X = x_u, Y = y_v) is zero.
        #
        # NOTE: the original filled this matrix with a triple nested
        # loop comparing every (value_i, value_j) pair against every
        # unique row — O(U*V*W). A value -> row/column index lookup
        # fills the same matrix in a single O(W) pass.
        index_i = {v: u for u, v in enumerate(values[i - 1])}
        index_j = {v: u for u, v in enumerate(values[j - 1])}
        joint_pmf = np.zeros((len(values[i - 1]), len(values[j - 1])))
        for (vi, vj), prob in zip(unique_rows_array, multivariate_pmf):
            joint_pmf[index_i[vi], index_j[vj]] = prob

        # compute the mutual information from the joint pmf and the
        # two marginal pmfs.
        mutual_info = mutual_information(joint_pmf, pmfs[i - 1], pmfs[j - 1])

        print(f"Mutual information of features ({i}, {j}) is : {mutual_info}")

'''
Data obtained from code Mutual information of features (1, 2) is : 2.0898440905339544 Mutual information of features (1, 3) is : 3.002867101602725 Mutual information of features (1, 4) is : 2.240684758221668 Mutual information of features (2, 3) is : 2.2274285391747157 Mutual information of features (2, 4) is : 1.6759771322899026 Mutual information of features (3, 4) is : 2.6948471229825786 '''