Source code for bayesmark.quantiles

# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute quantiles and confidence intervals.
"""
import numpy as np
import scipy.stats as ss

from bayesmark.np_util import isclose_lte


def ensure_shape(x, y):
    """Util to broadcast on var to another but only when shape is different.

    This way we don't convert scalar into array type unnecessarily.
    """
    shape_y = np.shape(y)
    if np.shape(x) == shape_y:
        return x
    return np.broadcast_to(x, shape_y)


[docs]def order_stats(X):
    """Compute order statistics on sample `X`.

    Follows convention that order statistic 1 is minimum and statistic n is maximum. Therefore, array elements ``0``
    and ``n+1`` are ``-inf`` and ``+inf``.

    Parameters
    ----------
    X : :class:`numpy:numpy.ndarray` of shape (n,)
        Data for order statistics. Can be vectorized. Must be sortable data type (which is almost everything).

    Returns
    -------
    o_stats : :class:`numpy:numpy.ndarray` of shape (n+2,)
        Order statistics on `X`.
    """
    assert np.ndim(X) >= 1
    # NaN is not allowed since it does not have well defined order.
    assert not np.any(np.isnan(X))

    X_shape = np.shape(X)
    inf_pad = np.full(X_shape[:-1] + (1,), np.inf)

    o_stats = np.concatenate((-inf_pad, np.sort(X, axis=-1), inf_pad), axis=-1)
    return o_stats


def _quantile(n, q):
    idx = np.ceil(n * q).astype(int)
    return idx


[docs]def quantile(X, q):
    """Computes `q` th quantile of `X`.

    Similar to :func:`numpy:numpy.percentile` except that it matches the mathematical definition of a quantile *and*
    `q` is scaled in (0,1) rather than (0,100).

    Parameters
    ----------
    X : :class:`numpy:numpy.ndarray` of shape (n,)
        Data for quantile estimation. Can be vectorized. Must be sortable data type (which is almost everything).
    q : float
        Quantile to compute, must be in (0, 1). Can be vectorized.

    Returns
    -------
    estimate : dtype of `X`, scalar
        Empirical `q` quantile from sample `X`.
    """
    assert np.ndim(X) >= 1
    # We could robustify things to allow the edge cases, but maybe later
    assert np.all(0 < q) and np.all(q < 1)
    # Currently don't support broadcasting both at same time
    assert np.ndim(X) == 1 or np.ndim(q) == 0

    n = X.shape[-1]
    idx = _quantile(n, q)

    o_stats = order_stats(X)
    estimate = o_stats[..., idx]
    return estimate


def _quantile_CI(n, q, alpha):
    # Use in case there is -inf case from being at extreme of distn
    idx_lower = np.fmax(0, ss.binom.ppf(alpha / 2.0, n, q)).astype(int)
    assert np.all(isclose_lte(ss.binom.cdf(idx_lower - 1, n, q), alpha / 2.0))
    assert np.all(isclose_lte(alpha / 2.0, ss.binom.cdf(idx_lower, n, q)))
    assert np.all(0 <= idx_lower) and np.all(idx_lower <= n + 1)

    idx_upper = np.fmax(0, ss.binom.isf(alpha / 2.0, n, q)).astype(int) + 1
    assert np.all(isclose_lte(ss.binom.sf(idx_upper - 1, n, q), alpha / 2.0))
    assert np.all(isclose_lte(alpha / 2.0, ss.binom.sf(idx_upper - 2, n, q)))
    assert np.all(isclose_lte(1 - (alpha / 2.0), ss.binom.cdf(idx_upper - 1, n, q)))
    assert np.all(isclose_lte(ss.binom.cdf(idx_upper - 2, n, q), 1 - (alpha / 2.0)))
    assert np.all(0 <= idx_upper) and np.all(idx_upper <= n + 1)

    C = ss.binom.cdf(idx_upper - 1, n, q) - ss.binom.cdf(idx_lower - 1, n, q)
    assert np.all(isclose_lte(1.0 - alpha, C))

    return idx_lower, idx_upper


[docs]def quantile_CI(X, q, alpha=0.05):
    """Calculate CI on `q` quantile from same `X` using nonparametric estimation from order statistics.

    This will have alpha level of at most `alpha` due to the discrete nature of order statistics.

    Parameters
    ----------
    X : :class:`numpy:numpy.ndarray` of shape (n,)
        Data for quantile estimation. Can be vectorized. Must be sortable data type (which is almost everything).
    q : float
        Quantile to compute, must be in (0, 1). Can be vectorized.
    alpha : float
        False positive rate we allow for CI, must be in (0, 1). Can be vectorized.

    Returns
    -------
    LB : dtype of `X`, scalar
        Lower end on CI
    UB : dtype of `X`, scalar
        Upper end on CI
    """
    assert np.ndim(X) >= 1
    # We could robustify things to allow the edge cases, but maybe later
    assert np.all(0 < q) and np.all(q < 1)
    assert np.all(0 < alpha) and np.all(alpha < 1)
    # Currently don't support broadcasting both at same time
    assert np.ndim(X) == 1 or (np.ndim(q) == 0 and np.ndim(alpha) == 0)

    n = X.shape[-1]
    idx_lower, idx_upper = _quantile_CI(n, q, alpha)

    o_stats = order_stats(X)
    LB, UB = o_stats[..., idx_lower], o_stats[..., idx_upper]
    return LB, UB


[docs]def max_quantile_CI(X, q, m, alpha=0.05):
    """Calculate CI on `q` quantile of distribution on max of `m` iid samples using a data set `X`.

    This uses nonparametric estimation from order statistics and will have alpha level of at most `alpha` due to the
    discrete nature of order statistics.

    Parameters
    ----------
    X : :class:`numpy:numpy.ndarray` of shape (n,)
        Data for quantile estimation. Can be vectorized. Must be sortable data type (which is almost everything).
    q : float
        Quantile to compute, must be in (0, 1). Can be vectorized.
    m : int
        Compute statistics for distribution on max over `m` samples. Must be ``>= 1``. Can be vectorized.
    alpha : float
        False positive rate we allow for CI, must be in (0, 1). Can be vectorized.

    Returns
    -------
    estimate : dtype of `X`, scalar
        Best estimate on `q` quantile on max over `m` iid samples.
    LB : dtype of `X`, scalar
        Lower end on CI
    UB : dtype of `X`, scalar
        Upper end on CI
    """
    # X and alpha used/checked below in quantile_CI routine.
    # We could robustify things to allow the edge cases, but maybe later
    assert np.all(0 < q) and np.all(q < 1)
    # Could check int but if someone wants to interpolate, we will let them.
    assert np.all(m >= 1)
    # Currently don't support broadcasting both at same time
    assert np.ndim(X) == 1 or (np.ndim(q) == 0 and np.ndim(q) == 0 and np.ndim(alpha) == 0)

    q = q ** (1.0 / m)
    o_stats = order_stats(X)

    n = X.shape[-1]
    idx = _quantile(n, q)
    idx_lower, idx_upper = _quantile_CI(n, q, alpha=alpha)

    LB, UB = o_stats[..., idx_lower], o_stats[..., idx_upper]
    # Might need to broadcast estimate out if vectorization is in alpha
    estimate = ensure_shape(o_stats[..., idx], LB)
    return estimate, LB, UB


[docs]def min_quantile_CI(X, q, m, alpha=0.05):
    """Calculate confidence interval on `q` quantile of distribution on min of `m` iid samples using a data set `X`.

    This uses nonparametric estimation from order statistics and will have alpha level of at most `alpha` due to the
    discrete nature of order statistics.

    Parameters
    ----------
    X : :class:`numpy:numpy.ndarray` of shape (n,)
        Data for quantile estimation. Can be vectorized. Must be sortable data type (which is almost everything).
    q : float
        Quantile to compute, must be in (0, 1). Can be vectorized.
    m : int
        Compute statistics for distribution on min over `m` samples. Must be ``>= 1``. Can be vectorized.
    alpha : float
        False positive rate we allow for CI, must be in (0, 1). Can be vectorized.

    Returns
    -------
    estimate : dtype of `X`, scalar
        Best estimate on `q` quantile on min over `m` iid samples.
    LB : dtype of `X`, scalar
        Lower end on CI
    UB : dtype of `X`, scalar
        Upper end on CI
    """
    # X and alpha used/checked below in quantile_CI routine.
    # We could robustify things to allow the edge cases, but maybe later
    assert np.all(0 < q) and np.all(q < 1)
    # Could check int but if someone wants to interp, we will let them.
    assert np.all(m >= 1)
    # Currently don't support broadcasting both at same time
    assert np.ndim(X) == 1 or (np.ndim(q) == 0 and np.ndim(q) == 0 and np.ndim(alpha) == 0)

    # This might have numerics issues for small q
    q = 1.0 - (1.0 - q) ** (1.0 / m)
    o_stats = order_stats(X)

    n = X.shape[-1]
    idx = _quantile(n, q)
    idx_lower, idx_upper = _quantile_CI(n, q, alpha=alpha)

    LB, UB = o_stats[..., idx_lower], o_stats[..., idx_upper]
    # Might need to broadcast estimate out if vectorization is in alpha
    estimate = ensure_shape(o_stats[..., idx], LB)
    return estimate, LB, UB


[docs]def quantile_and_CI(X, q, alpha=0.05):
    """Calculate CI on `q` quantile from same `X` using nonparametric estimation from order statistics.

    This will have alpha level of at most `alpha` due to the discrete nature of order statistics.

    Parameters
    ----------
    X : :class:`numpy:numpy.ndarray` of shape (n,)
        Data for quantile estimation. Can be vectorized. Must be sortable data type (which is almost everything).
    q : float
        Quantile to compute, must be in (0, 1). Can be vectorized.
    alpha : float
        False positive rate we allow for CI, must be in (0, 1). Can be vectorized.

    Returns
    -------
    estimate : dtype of `X`, scalar
        Empirical `q` quantile from sample `X`.
    LB : dtype of `X`, scalar
        Lower end on CI
    UB : dtype of `X`, scalar
        Upper end on CI
    """
    # This routine is mostly just a wrapper routine
    estimate, LB, UB = max_quantile_CI(X, q=q, m=1, alpha=alpha)
    return estimate, LB, UB