Source code for frb.dm_kde.pdf_fns

""" Functions to for making PDFs"""

import numpy as np
import scipy as sp
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

# NOTE(review): not referenced by the functions visible in this module;
# presumably a default number of random variates to draw -- confirm with callers.
rv_amount = 1_000_000


[docs]
def make_pdf(distribution, num_of_draws, grid, stepsize):
    """
    Draw samples from a discretised PDF and rescale it to a density.

    The grid points are sampled through ``scipy.stats.rv_discrete``, using
    ``distribution`` as the probability of each grid point. Any validation
    of the probabilities (e.g. that they sum to one) is left to
    ``rv_discrete``.

    Arguments:
        distribution (array-like):
            Probability assigned to each point of ``grid``; must have the
            same length as ``grid``.
        num_of_draws (int):
            Number of samples to draw.
        grid (array-like):
            Grid values on which ``distribution`` is defined.
        stepsize (float):
            Stepsize of the grid, used to convert the per-point
            probabilities into a density.

    Outputs:
        draws (array):
            Samples drawn from the distribution, expressed as grid values.
        distribution_scaled (array):
            ``distribution`` divided by ``stepsize``.

    """
    # Import the submodule explicitly: ``import scipy as sp`` alone does not
    # guarantee that ``sp.stats`` is available on every scipy version.
    from scipy import stats

    # Coerce to arrays so plain lists/tuples work for the fancy indexing and
    # the division below.
    grid_values = np.asarray(grid)
    probabilities = np.asarray(distribution)

    # rv_discrete only accepts integer support values, so sample grid
    # indices and map them back onto the (float) grid afterwards.
    indices = np.arange(len(grid_values))
    sampler = stats.rv_discrete(values=(indices, probabilities))
    drawn_indices = sampler.rvs(size=num_of_draws)
    draws = grid_values[drawn_indices]

    distribution_scaled = probabilities / stepsize
    return draws, distribution_scaled



[docs]
def make_kde_funtion(grid, draws, min_bandwidth, max_bandwidth, bandwidth_stepsize, cv, kernel):
    """
    Returns KDE distribution

    The bandwidth is chosen by a cross-validated grid search over
    ``np.arange(min_bandwidth, max_bandwidth, bandwidth_stepsize)``; the
    estimator with the best bandwidth is then evaluated on ``grid``.

    Arguments:
        grid (array-like):
            Grid for PDF.
        draws (array-like):
            Sample from which to approximate PDF
        min_bandwidth (float):
            Start of bandwidth search range.
        max_bandwidth (float):
            End of bandwidth search range (excluded, as in ``np.arange``).
        bandwidth_stepsize (float):
            Stepsize for bandwidth search.
        cv (int):
            Number of folds for cross-validation
        kernel (str):
            Kernel to use. Valid kernels are 'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'.
    Outputs:
        kde (array):
            PDF approximated by KDE
    """
    # scikit-learn expects 2-D (n_samples, n_features) input; coerce first so
    # that plain sequences are accepted for both arguments.
    samples = np.asarray(draws).reshape(-1, 1)
    eval_points = np.asarray(grid).reshape(-1, 1)

    bandwidths = np.arange(min_bandwidth, max_bandwidth, bandwidth_stepsize)
    search = GridSearchCV(KernelDensity(kernel=kernel), {'bandwidth': bandwidths}, cv=cv)
    search.fit(samples)

    # With the default ``refit=True`` GridSearchCV has already refit the best
    # estimator on the full sample, so a second explicit fit is unnecessary.
    log_kde = search.best_estimator_.score_samples(eval_points)
    return np.exp(log_kde)