#!/usr/bin/env python3 # Assignment n 3, given in lesson 07 on 29/10/19 # Author: Leonardo Tamiano import math import numpy as np import matplotlib.pyplot as plt import matplotlib.axes as axs from sklearn.neighbors import KernelDensity # import previous work from assignment_1.entropy import entropy # set seed for random number generation np.random.seed(1) # ---------------------------------------------- # 1 - FIRST POINT FUNCTIONS # ---------------------------------------------- # This function takes a list of samples generated from a discrete # random variable and returns the estimated pmf by counting the # frequencies of the values in the sample. def estimate_pmf(samples): n = len(samples) # returns a list containg each unique value (sample_list), and a # list containg the number of occurences for each unique value. sample_list, occur_vector = np.unique(samples, return_counts=True) # estimate the pmf by computing the frequency for each possible # value. pmf_estimation = [occur_vector[i] / float(n) for i in range(0, len(occur_vector))] return sample_list, pmf_estimation # ---------------------------------------------- # 2 - SECOND POINT FUNCTIONS # ---------------------------------------------- # This function computes the differential entropy of a generic # continuous r.v. given its pdf. To do so it approximates numerically # the integral def differential_entropy(pdf): X_plot, Y_plot = pdf y_values = np.array([Y_plot[i] * math.log(Y_plot[i], 2) for i in range(0, len(X_plot))]) return -np.trapz(y_values, X_plot) # ---------------------------------------------- # 3 - THIRD POINT FUNCTIONS # ---------------------------------------------- kernel_method_functions = ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"] # This function estimates the pdf of a continuous variable function # given a sample using the kernel method. def kernel_pdf_estimation(continuous_samples, kernObj=False, bandwidth=0, kernelFunction='gaussian'): # compute useful statistics cont_samp_std = np.std(continuous_samples) cont_samp_len = len(continuous_samples) cont_samp_min = min(continuous_samples) cont_samp_max = max(continuous_samples) margin = cont_samp_std * 2 optimal_bandwidth = 1.06 * cont_samp_std * np.power(cont_samp_len, -1/5) # set bandwidth to optimal value if not passed by user. if bandwidth == 0 or bandwidth < 0: bandwidthKDE = optimal_bandwidth else: bandwidthKDE = bandwidth # check that the passed function can be used with the kernel # method. if kernelFunction not in kernel_method_functions: kernelFunction = 'gaussian' kde_object = KernelDensity(kernel=kernelFunction, bandwidth=bandwidthKDE).fit(continuous_samples.reshape(-1, 1)) # decide wheter to return directly the kernel object or return # directly X_values with the associated pdf value. if kernObj: return kde_object else: X_plot = np.linspace(cont_samp_min - margin, cont_samp_max + margin, 1000)[:, np.newaxis] kde_LogDensity_estimate = kde_object.score_samples(X_plot) kde_estimate = np.exp(kde_LogDensity_estimate) return X_plot.flatten(), kde_estimate # ---------------------------------------------- # 4 - FINAL OBSERVATIONS # ---------------------------------------------- def bandwidth_significant_test(): # Continuous sample generation sample_size = 300 sample_mean = 33 sample_std = 5 continuous_samples = np.random.normal(loc=sample_mean, scale=sample_std, size=sample_size) # specific formula for gaussian r.v. continuous_theoretical_entropy = 0.5 + 0.5 * math.log(2*math.pi*sample_std**2) # estimate the pdf using the same kernel method but changing the # bandwidth optimal_bandwidth = 1.06 * np.std(continuous_samples) * np.power(len(continuous_samples), -1/5) X_plot = np.linspace(0, optimal_bandwidth + 250, 30) Y_plot = [] for bandwidth_value in X_plot: estimated_pdf = kernel_pdf_estimation(continuous_samples, bandwidth=bandwidth_value) continuous_sample_entropy = differential_entropy(estimated_pdf) continuous_difference = continuous_theoretical_entropy - continuous_sample_entropy Y_plot.append(continuous_difference) plt.plot(X_plot, Y_plot) plt.axhline(linewidth=1, color='r', linestyle='--') plt.axvline(x=optimal_bandwidth, color='black', linestyle='--') plt.xlabel("bandwidth") plt.ylabel("difference") plt.text(optimal_bandwidth + 0.3, -0.5, "optimal bandwidth", rotation = 0) plt.show() return if __name__ == "__main__": print("/------------------------------------/") print("First point.") print() # Discrete sample generation sample_size = 200 values = [1, 2, 3, 4] theoretical_pmf = [0.1, 0.3, 0.2, 0.4] discrete_samples = np.random.choice(values, p=theoretical_pmf, size=sample_size) # estimate the pmf using the generated sample _, estimated_pmf = estimate_pmf(discrete_samples) # Compute the entropies and their difference as requested discrete_theoretical_entropy = entropy(np.array(theoretical_pmf)) discrete_sample_entropy = entropy(np.array(estimated_pmf)) discrete_difference = discrete_theoretical_entropy - discrete_sample_entropy print(f"Testing with sample_size={sample_size} and pmf={theoretical_pmf}") print(f"Theoretical entropy: {discrete_theoretical_entropy}") print(f"Sample entropy: {discrete_sample_entropy}") print(f"Difference is: {discrete_difference}") print() print("/------------------------------------/") print("Third point.") print() # Continuous sample generation sample_size = 300 sample_mean = 33 sample_std = 5 continuous_samples = np.random.normal(loc=sample_mean, scale=sample_std, size=sample_size) # estimate pdf using the kernel method estimated_pdf = kernel_pdf_estimation(continuous_samples) # Compute entropies and their difference as requested continuous_theoretical_entropy = 0.5 + 0.5 * math.log(2*math.pi*sample_std**2) # specific formula for gaussian r.v. continuous_sample_entropy = differential_entropy(estimated_pdf) continuous_difference = continuous_theoretical_entropy - continuous_sample_entropy print(f"Testing with sample_size={sample_size} and pdf=N({sample_mean}, {sample_std}^2)") print(f"Theoretical entropy: {continuous_theoretical_entropy}") print(f"Sample entropy: {continuous_sample_entropy}") print(f"Difference is: {continuous_difference}") print() print("/------------------------------------/") print("Fourth point.") print() bandwidth_significant_test()