Source code for epipack.distributions

"""
Heuristics to fit duration distributions to sums of
exponentially distributed waiting times.
This module has to be considered work in progress.
It works, but can be much improved, e.g. by
putting conditions on the ordering of waiting
times such that the symmetry in parameter space
can be utilized for fitting.
"""

import numpy as np
from scipy.sparse import csr_matrix
from scipy.integrate import solve_ivp
from scipy.optimize import minimize
from scipy.stats import entropy

_DEFAULT_METHOD = 'RK23'

[docs]class ExpChain():
    """
    A class that represents a chain of states
    where the waiting time between consecutive
    states is distributed exponentially with
    predetermined mean. The class can be
    used to fit total waiting time distributions,
    i. e. distributions of sums of exponential
    random variables.

    Parameters
    ==========
    durations : numpy.ndarray of float
        mean waiting times between states in the chain

    Example
    =======

    >>> C = ExpChain([0.2,0.4])
    >>> C.get_mean()
    0.6
    """

    def __init__(self,durations):

        self.n = n = len(durations)
        self.tau = durations
        self.n_st = int(n) + 1
        self.dt = min(durations) / 50.

        data = []
        rows = []
        cols = []

        for i in range(self.n):
            src = i
            trg = i+1
            rows.append(trg)
            cols.append(src)
            data.append(1/self.tau[i])
            rows.append(src)
            cols.append(src)
            data.append(-1/self.tau[i])

        self.W = csr_matrix((data, (rows, cols)), shape=2*[self.n_st])

[docs]    def dydt(self,t,y):
        """
        The ODE that performs the transitions between states.
        """
        return self.W.dot(y)

[docs]    def get_cdf(self,t=None,percentile_cutoff=0.9999,method=_DEFAULT_METHOD):
        """
        Obtain the cumulative distribution function of the total waiting
        time this chain represents.

        Parameters
        ==========
        t : numpy.ndarray of float, default = None
            Ordered array of time points for which the CDF
            should be returned. If ``None``,
            ``scipy.optimize.solve_ivp`` will choose
            the points itself.
        percentile_cutoff : float, default = 1 - 1e-4
            maximum value of the CDF
        method : str, default = 'RK23'
            This is going to be passed to
            ``scipy.optimize.solve_ivp``.

        Returns
        =======
        t : numpy.ndarray of float
            Ordered array of time points for which the CDF
            was computed.
        cdf : numpy.ndarray of float
            the corresponding values of the cumulative
            distribution function
        """

        y0 = np.zeros(self.n_st)
        y0[0] = 1.0

        stop_condition = lambda t, y: percentile_cutoff - y[-1]
        stop_condition.terminal = True

        result = solve_ivp(
                            fun=self.dydt,
                            y0=y0,
                            t_eval=t,
                            t_span=(0,np.inf),
                            events=stop_condition,
                            method=method,
                          )


        return result.t, result.y[-1,:]

[docs]    def get_pdf(self,t=None,percentile_cutoff=0.9999,method=_DEFAULT_METHOD):
        """
        Obtain the probability distribution function of the total waiting
        time this chain represents. Uses :func:`ExpChain.get_cdf`.

        Parameters
        ==========
        t : numpy.ndarray of float, default = None
            Ordered array of time points for which the CDF
            should be returned. If ``None``,
            ``scipy.optimize.solve_ivp`` will choose
            the points itself.
        percentile_cutoff : float, default = 1 - 1e-4
            maximum value of the CDF
        method : str, default = 'RK23'
            This is going to be passed to
            ``scipy.optimize.solve_ivp``.

        Returns
        =======
        tmean : numpy.ndarray of float
            Ordered array of bin midpoints for which the pdf
            was computed.
        pdf : numpy.ndarray of float
            the corresponding values of the pdf.
        df : numpy.ndarray of float
            the corresponding bin sizes
        """
        t, P = self.get_cdf(t=t,percentile_cutoff=percentile_cutoff,method=method)
        dt = np.diff(t)
        dP = np.diff(P)
        tmean = 0.5*(t[1:]+t[:-1])
        return tmean, dP/dt, dt

[docs]    def get_cdf_at_percentiles(self,percentiles,method=_DEFAULT_METHOD):
        """
        Obtain the cumulative distribution function of the total waiting
        time this chain represents.

        Parameters
        ==========
        t : numpy.ndarray of float, default = None
            Ordered array of time points for which the CDF
            should be returned. If ``None``,
            ``scipy.optimize.solve_ivp`` will choose
            the points itself.
        percentile_cutoff : float, default = 1 - 1e-4
            maximum value of the CDF
        method : str, default = 'RK23'
            This is going to be passed to
            ``scipy.optimize.solve_ivp``.

        Returns
        =======
        t : numpy.ndarray of float
            Ordered array of time points for which the CDF
            was computed.
        cdf : numpy.ndarray of float
            the corresponding values of the cumulative
            distribution function
        """

        y0 = np.zeros(self.n_st)
        y0[0] = 1.0
        time_values = np.zeros_like(percentiles)
        t = 0

        for i, P in enumerate(percentiles):

            stop_condition = lambda t, y: P - y[-1]
            stop_condition.terminal = True

            result = solve_ivp(
                                fun=self.dydt,
                                y0=y0,
                                t_span=(t,np.inf),
                                events=stop_condition,
                                method=method,
                              )

            y0 = result.y[:,-1]
            t = result.t[-1]
            time_values[i] = t

        return time_values, percentiles

[docs]    def get_median_and_iqr(self,method=_DEFAULT_METHOD):
        """
        Returns the median and inter-quartile range
        of the waiting time distribution this chain
        represents.

        Parameters
        ==========
        method : str, default = 'RK23'
            This is going to be passed to
            ``scipy.optimize.solve_ivp``.

        Returns
        =======
        median : float
            the median of the distribution
        iqr : numpy.ndarray of float
            array of length 2 containing
            the inter-quartile range.
        """

        percentiles = [0.25,0.5,0.75]

        time_values, _ = self.get_cdf_at_percentiles(percentiles,method=method)

        return time_values[1], time_values[[0,2]]

[docs]    def get_mean(self):
        """
        Returns the mean waiting time of this chain.
        """
        return sum(self.tau)

[docs]def fit_chain_by_cdf(n,time_values,cdf,lower=1e-10,upper=1e10,percentile_cutoff=1-1e-15,x0=None):
    """
    Fit a chain of exponentially distributed random variables
    to a distribution where the cdf is known for several time points.

    While there exist statistcally sound measures to quantify
    the distance between
    two distributions, I found that the total mean squared distance
    actually finds decent fits consistently, so this function
    is going to be using that until someone convinces me that
    another distance measure yields better results.

    This whole thing should be considered heuristic
    patch work in any case.

    Parameters
    ==========
    n : int
        number of transitions in the chain
    time_values : numpy.ndarray of float
        Ordered array of time points for which the cdf
        is known.
    cdf : numpy.ndarray of float
        Ordered array of corresponding CDF values.
    lower: float, default = 1e-10
        lower bound of waiting times for each transition
    upper: float, default = 1e10
        upper bound of waiting times for each transition
    percentile_cutoff : float default = 1 - 1e-15
        max value of the CDF that should be integrated to
    x0 : numpy.ndarray of float, default = None
        array of length n that contains initial guesses
        of the chain's waiting times. If ``None``,
        ``x0`` is going to contain the value ``mean/n``
        n times, where ``mean`` is the mean of the
        distribution determined by ``cdf``.

    Returns
    =======
    chain : ExpChain
        The chain that was fit to the given CDF.

    Example
    =======

    >>> median, iqr = 13.184775302968362, ( 7.81098765, 20.86713744)
    >>> fit_C = fit_chain_by_cdf(3,[iqr[0], median, iqr[1]],[0.25,0.5,0.75])
    >>> fit_C.get_median_and_iqr()
    13.183969129892406, array([ 7.8109697 , 20.86699702])
    >>> fit_C.tau
    [9.22794388 0.75881288 5.72462722]

    """

    if n != int(n):
        raise ValueError("`n` must be integer, but is " + str(n))

    dQ = np.diff(cdf)

    cdf = np.array(cdf)

    if x0 is None:
        mean = (np.diff(np.concatenate((np.array([0]),time_values))) * (1 - cdf)).sum()
        x0 = [mean/n] * n

    def cost(x, n, cdf, time_values, percentile_cutoff):
        C  = ExpChain(x)
        _, P = C.get_cdf(time_values,percentile_cutoff=percentile_cutoff)
        if len(P) < len(cdf):
            P = np.concatenate((P,np.ones(len(cdf)-len(P))))
        #return ( ((cdf - P)/cdf)**2).sum()
        return ( ((cdf - P)/cdf)**2).sum()
        #return max(np.abs(cdf - P))
        #return ( ((percentiles - P))**2).sum()
        #dP = np.diff(P)
        #return entropy(np.diff(P), dQ)


    result = minimize(cost, x0, (n, cdf, time_values, percentile_cutoff), bounds=[(lower,upper)]*n)

    return ExpChain(result.x)


[docs]def fit_chain_by_median_and_iqr(n,median,iqr,lower=1e-10,upper=1e10):
    """
    Fit a chain of exponentially distributed random variables
    to a distribution where only median and iqr are known.

    Parameters
    ==========
    n : int
        number of transitions in the chain
    median : float
        the median of the distribution to fit to
    iqr : 2-tuple of float
        the inter-quartile range of the distribution to
        fit to
    lower: float, default = 1e-10
        lower bound of waiting times for each transition
    upper: float, default = 1e10
        upper bound of waiting times for each transition

    Returns
    =======
    chain : ExpChain
        The chain that was fit to the median and iqr.

    Example
    =======

    >>> times = [0.3,6.,9,0.4]
    >>> C = ExpChain(times)
    >>> fit_C = fit_chain_by_median_and_iqr(3,*C.get_median_and_iqr())
    >>> C.get_median_and_iqr()
    13.184775302968362, array([ 7.81098765, 20.86713744])
    >>> fit_C.get_median_and_iqr()
    13.183969129892406, array([ 7.8109697 , 20.86699702])
    >>> C.tau
    [0.3, 6.0, 9, 0.4]
    >>> fit_C.tau
    [9.22794388 0.75881288 5.72462722]

    """
    percentiles = np.array([0.25,0.5,0.75])
    time_values = np.array([iqr[0],median,iqr[1]])
    mean = median
    x0 = [mean/n]*n
    return fit_chain_by_cdf(n,time_values,percentiles,lower,upper,x0=x0)


if __name__ == "__main__":
    import matplotlib.pyplot as pl
    from scipy.stats import erlang

    n = 10
    mean = 10
    C = ExpChain([mean/n]*n)
    E = erlang(a=n,scale=mean/n)

    t, y = C.get_cdf()

    pl.plot(t, y)
    pl.plot(t, E.cdf(t))

    print(C.get_median_and_iqr())
    print(E.median(), E.interval(0.5))
    pl.show()


    # =========
    times = [0.3,6.,9,0.4]
    n = len(times)
    C = ExpChain(times)
    fit_C = fit_chain_by_median_and_iqr(3,*C.get_median_and_iqr())

    print("C med iqr =",C.get_median_and_iqr())
    print("fit med iqr =",fit_C.get_median_and_iqr())
    print("C durs =",C.tau)
    print("fit durs =",fit_C.tau)

    fig, ax = pl.subplots(1,2,figsize=(8,4))
    for ch in [C, fit_C]:
        t, P = ch.get_cdf()
        dP = np.diff(P)
        tmean = 0.5*(t[1:] + t[:-1])
        dt = np.diff(t)
        ax[1].bar(tmean, dP/dt,width=dt,alpha=0.5)
        ax[0].plot(t,P)
        print(sum(dP))

    pl.show()


    # ==================

    data = np.array([
    [0.9782608695652173, 0.6117433930093777],
    [1.9565217391304341, 0.7045325376527423],
    [2.934782608695652, 0.7829425973287867],
    [3.9673913043478257, 0.789456048119731],
    [4.945652173913045, 0.8064281992990434],
    [5.923913043478262, 0.7698055792365256],
    [6.9565217391304355, 0.7266458274130909],
    [7.934782608695654, 0.6691081746708346],
    [8.967391304347826, 0.6115693378800797],
    [9.945652173913045, 0.5474957374254049],
    [10.923913043478263, 0.4781933788007957],
    [11.956521739130435, 0.4245761106374918],
    [12.880434782608697, 0.3539677465188974],
    [13.96739130434783, 0.29773491522212747],
    [14.945652173913045, 0.24411883110732224],
    [15.923913043478263, 0.19703869470493518],
    [17.01086956521739, 0.15518494837548535],
    [17.934782608695652, 0.1342497868712702],
    [18.913043478260867, 0.09893435635123615],
    [19.945652173913043, 0.07799682675002373],
    [20.92391304347826, 0.06098204982476085],
    [21.956521739130434, 0.04658046793596648],
    [22.934782608695652, 0.030872880553187487],
    [24.07608695652174, 0.028233636449748856],
    [25, 0.016448801742919406],
    [25.978260869565215, 0.01119873070000943],
    [26.902173913043477, 0.011178601875532879],
    [27.989130434782606, 0.007233352278109395],
    [28.804347826086957, 0.007215591550630007],
    [29.945652173913047, 0.004576347447191376],
    [31.03260869565218, 0.003245476934735203],
    [31.902173913043477, 0.0032265321587572338],
    [32.93478260869565, 0.003204035237283298],
    [33.96739130434783, -0.000740030311641604],
    [35, -0.0007625272331155397],
    [36.141304347826086, 0.0005197972908970172],
    [37.22826086956522, 0.0004961163209245001],
    [38.15217391304348, 0.00047598749644794935],
    [39.07608695652174, 0.001763048214454832],
    [39.94565217391305, -0.0008702756464904482],
    [40.869565217391305, 0.0004167850715164345],
    [41.79347826086956, 0.0017038457895235393],
    [42.93478260869565, -0.0009353983139148703],
    [43.96739130434783, -0.0009578952353888059],
    [44.94565217391305, 0.0003279814341194953],
    ])
    t = np.round(data[:,0])
    print(t)
    P = data[:,1]
    P[P<0] = 0
    P /= P.sum()
    CDF = np.cumsum(P)
    fig, ax = pl.subplots(1,2,figsize=(8,4))
    ax[0].plot(t, CDF)
    ax[1].bar(t, P,alpha=0.4)
    ch = fit_chain_by_cdf(3,t,CDF,lower=0.1,upper=50,x0=[1.29,3.37,4.01])
    #ch = fit_chain_by_cdf(8,t,CDF,lower=0.1,upper=50)
    t, dPdt, width = ch.get_pdf()
    ax[1].bar(t, dPdt, width=width,alpha=0.4)
    ax[0].plot(*ch.get_cdf(t))

    pl.show()



    # ===========

    med_iqr = [
                ( 5, ( 2, 13)),
                (11, ( 6, 21)),
                (10, ( 4, 19)),
                ( 6, ( 3, 11)),
                ( 9, ( 4, 18)),
            ]

    for med, iqr in med_iqr:
        fit_C = fit_chain_by_median_and_iqr(3,med,iqr,lower=0.1,upper=100)
        print("demanded med and iqr =", med, iqr)
        print("fitted   med and iqr =", fit_C.get_median_and_iqr())
        print("fitted durations =",fit_C.tau)
        print()
        fig, ax = pl.subplots(1,2,figsize=(8,4))
        ax[0].plot(*fit_C.get_cdf())
        t, dPdt, width = fit_C.get_pdf()
        ax[1].bar(t, dPdt, width=width)

    pl.show()