Source code for pygram11.hist

"""pygram11 Histogram API."""

# MIT License
#
# Copyright (c) 2021 Douglas Davis
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# stdlib
from typing import Sequence, Tuple, Optional, Union

# third party
import numpy as np

# One dimensional backend functions
from pygram11._backend import _f1d, _v1d, _f1dw, _v1dw, _f1dmw, _v1dmw

# Two dimensional backend functions
from pygram11._backend import _f2d, _f2dw, _v2d, _v2dw


def _likely_uniform_bins(edges: np.ndarray) -> bool:
    """Test if bin edges describe a set of fixed width bins."""
    diffs = np.ediff1d(edges)
    return np.all(np.isclose(diffs, diffs[0]))


def _densify_fixed_counts(counts: np.ndarray, width: float) -> np.ndarray:
    """Convert fixed width histogram to unity integral over PDF."""
    return np.array(counts / (width * counts.sum()), dtype=np.float64)


def _densify_fixed_weighted_counts(
    raw: Tuple[np.ndarray, np.ndarray], width: float
) -> Tuple[np.ndarray, np.ndarray]:
    """Convert fixed width weighted histogram to unity integral over PDF."""
    counts = raw[0]
    integral = counts.sum()
    res0 = _densify_fixed_counts(counts, width)
    variances = raw[1] * raw[1]
    f1 = 1.0 / ((width * integral) ** 2)
    f2 = counts / integral
    res1 = f1 * (variances + (f2 * f2 * variances.sum()))
    return res0, np.sqrt(res1)


def _densify_variable_counts(
    counts: np.ndarray,
    edges: np.ndarray,
) -> np.ndarray:
    """Convert variable width histogram to unity integral over PDF."""
    widths = edges[1:] - edges[:-1]
    integral = float(np.sum(counts))
    return np.array(counts / widths / integral, dtype=np.float64)


def _densify_variable_weighted_counts(
    raw: Tuple[np.ndarray, np.ndarray], edges: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """Convert variable width histogram to unity integral over PDF."""
    counts = raw[0]
    variances = raw[1] * raw[1]
    integral = counts.sum()
    widths = edges[1:] - edges[:-1]
    res0 = _densify_variable_counts(counts, edges)
    f1 = 1.0 / ((widths * integral) ** 2)
    f2 = counts / integral
    res1 = f1 * (variances + (f2 * f2 * variances.sum()))
    return res0, res1


def _get_limits_1d(
    x: np.ndarray, range: Optional[Tuple[float, float]] = None
) -> Tuple[float, float]:
    """Get bin limits given an optional range and 1D data."""
    if range is None:
        return (float(np.amin(x)), float(np.amax(x)))
    return float(range[0]), float(range[1])


def _get_limits_2d(
    x: np.ndarray,
    y: np.ndarray,
    range: Optional[Sequence[Tuple[float, float]]] = None,
) -> Tuple[float, float, float, float]:
    """Get bin limits given an optional range and 2D data."""
    if range is None:
        return (
            float(np.amin(x)),
            float(np.amax(x)),
            float(np.amin(y)),
            float(np.amin(y)),
        )
    else:
        return (
            float(range[0][0]),
            float(range[0][1]),
            float(range[1][0]),
            float(range[1][1]),
        )


[docs]def bin_edges(bins: int, range: Tuple[float, float]) -> np.ndarray: """Construct bin edges given number of bins and axis limits. Parameters ---------- bins : int Total number of bins. range : (float, float) Minimum and maximum of the histogram axis. Returns ------- numpy.ndarray Edges defined by the number of bins and axis limits. Examples -------- >>> bin_edges(bins=8, range=(-2, 2)) array([-2. , -1.5, -1. , -0.5, 0. , 0.5, 1. , 1.5, 2. ]) """ return np.linspace(range[0], range[1], bins + 1)
[docs]def bin_centers( bins: Union[int, Sequence[float]], range: Optional[Tuple[float, float]] = None, ) -> np.ndarray: """Construct array of center values for each bin. Parameters ---------- bins : int or array_like Number of bins or bin edges array. range : (float, float), optional The minimum and maximum of the histogram axis. Returns ------- numpy.ndarray Array of bin centers. Raises ------ ValueError If ``bins`` is an integer and range is undefined (``None``). Examples -------- The centers given the number of bins and max/min: >>> bin_centers(10, range=(-3, 3)) array([-2.7, -2.1, -1.5, -0.9, -0.3, 0.3, 0.9, 1.5, 2.1, 2.7]) Or given bin edges: >>> bin_centers([0, 1, 2, 3, 4]) array([0.5, 1.5, 2.5, 3.5]) """ if isinstance(bins, int): if range is None: raise ValueError("Integer bins requires defining range") bins = bin_edges(bins, range=range) bins = np.asarray(bins) return 0.5 * (bins[1:] + bins[:-1])
[docs]def fix1d( x: np.ndarray, bins: int = 10, range: Optional[Tuple[float, float]] = None, weights: Optional[np.ndarray] = None, density: bool = False, flow: bool = False, ) -> Tuple[np.ndarray, Union[np.ndarray, None]]: r"""Histogram data with fixed (uniform) bin widths. Parameters ---------- x : numpy.ndarray Data to histogram. bins : int The number of bins. range : (float, float), optional The minimum and maximum of the histogram axis. If ``None``, min and max of ``x`` will be used. weights : numpy.ndarray, optional The weights for each element of ``x``. If weights are absent, the second return type will be ``None``. density : bool Normalize histogram counts as value of PDF such that the integral over the range is unity. flow : bool Include under/overflow in the first/last bins. Raises ------ ValueError If ``x`` and ``weights`` have incompatible shapes. TypeError If ``x`` or ``weights`` are unsupported types Returns ------- :py:obj:`numpy.ndarray` The resulting histogram bin counts. :py:obj:`numpy.ndarray`, optional The standard error of each bin count, :math:`\sqrt{\sum_i w_i^2}`. The return is ``None`` if weights are not used. Examples -------- A histogram of ``x`` with 20 bins between 0 and 100: >>> h, __ = fix1d(x, bins=20, range=(0, 100)) When weights are absent the second return is ``None``. The same data, now histogrammed with weights and over/underflow included: >>> rng = np.random.default_rng(123) >>> w = rng.uniform(0.1, 0.9, x.shape[0])) >>> h, stderr = fix1d(x, bins=20, range=(0, 100), weights=w, flow=True) """ xmin, xmax = _get_limits_1d(x, range) if weights is None: result = _f1d(x, bins, xmin, xmax, flow) if density: width = (xmax - xmin) / bins result = _densify_fixed_counts(result, width) return result, None if np.shape(x) != np.shape(weights): raise ValueError("x and weights must have the same shape") result = _f1dw(x, weights, int(bins), xmin, xmax, flow) if density: width = (xmax - xmin) / bins result = _densify_fixed_weighted_counts(result, width) return result
[docs]def fix1dmw( x: np.ndarray, weights: np.ndarray, bins: int = 10, range: Optional[Tuple[float, float]] = None, flow: bool = False, ) -> Tuple[np.ndarray, np.ndarray]: r"""Histogram data with multiple weight variations and fixed width bins. The weights array must have a total number of rows equal to the length of the input data. The number of columns in the weights array is equal to the number of weight variations. (The weights array must be an M x N matrix where M is the length of x and N is the number of weight variations). Parameters ---------- x : numpy.ndarray Data to histogram. weights : numpy.ndarray The weight variations for the elements of ``x``, first dimension is the length of ``x``, second dimension is the number of weights variations. bins : int The number of bins. range : (float, float), optional The minimum and maximum of the histogram axis. If ``None``, min and max of ``x`` will be used. flow : bool Include under/overflow in the first/last bins. Raises ------ ValueError If ``x`` and ``weights`` have incompatible shapes (if ``x.shape[0] != weights.shape[0]``). ValueError If ``weights`` is not a two dimensional array. TypeError If ``x`` or ``weights`` are unsupported types Returns ------- :py:obj:`numpy.ndarray` The bin counts. :py:obj:`numpy.ndarray` The standard error of each bin count, :math:`\sqrt{\sum_i w_i^2}`. Examples -------- Multiple histograms of ``x`` using 20 different weight variations: >>> rng = np.random.default_rng(123) >>> x = rng.standard_normal(10000) >>> twenty_weights = np.abs(rng.standard_normal((x.shape[0], 20))) >>> h, err = fix1dmw(x, twenty_weights, bins=50, range=(-3, 3)) ``h`` and ``err`` are now shape ``(50, 20)``. Each column represents the histogram of the data using its respective weight. """ if len(np.shape(weights)) != 2: raise ValueError("weights must be a two dimensional array.") if np.shape(x)[0] != np.shape(weights)[0]: raise ValueError("x and weights have incompatible shapes.") xmin, xmax = _get_limits_1d(x, range) return _f1dmw(x, weights, int(bins), xmin, xmax, flow)
[docs]def var1d( x: np.ndarray, bins: np.ndarray, weights: Optional[np.ndarray] = None, density: bool = False, flow: bool = False, ) -> Tuple[np.ndarray, Optional[np.ndarray]]: r"""Histogram data with variable bin widths. Parameters ---------- x : numpy.ndarray Data to histogram bins : numpy.ndarray Bin edges weights : numpy.ndarray, optional The weights for each element of ``x``. If weights are absent, the second return type will be ``None``. density : bool Normalize histogram counts as value of PDF such that the integral over the range is unity. flow : bool Include under/overflow in the first/last bins. Raises ------ ValueError If the array of bin edges is not monotonically increasing. ValueError If ``x`` and ``weights`` have incompatible shapes. TypeError If ``x`` or ``weights`` are unsupported types Returns ------- :py:obj:`numpy.ndarray` The bin counts. :py:obj:`numpy.ndarray`, optional The standard error of each bin count, :math:`\sqrt{\sum_i w_i^2}`. The return is ``None`` if weights are not used. Examples -------- A simple histogram with variable width bins: >>> rng = np.random.default_rng(123) >>> x = rng.standard_normal(1000) >>> edges = np.array([-3.0, -2.5, -1.5, -0.25, 0.25, 2.0, 3.0]) >>> h, __ = var1d(x, edges) """ if not np.all(bins[1:] >= bins[:-1]): raise ValueError("bins sequence must monotonically increase") if _likely_uniform_bins(bins): nbins = np.shape(bins)[0] - 1 return fix1d( x, bins=nbins, weights=weights, range=(bins[0], bins[-1]), flow=flow, density=density, ) bins = np.array(bins, dtype=np.float64, copy=False) if weights is None: result = _v1d(x, bins, flow) if density: result = _densify_variable_counts(result, bins) return result, None if np.shape(x) != np.shape(weights): raise ValueError("x and weights have incompatible shapes.") result = _v1dw(x, weights, bins, flow) if density: result = _densify_variable_weighted_counts(result, bins) return result
[docs]def var1dmw( x: np.ndarray, weights: np.ndarray, bins: np.ndarray, flow: bool = False, ) -> Tuple[np.ndarray, np.ndarray]: r"""Histogram data with multiple weight variations and variable width bins. The weights array must have a total number of rows equal to the length of the input data. The number of columns in the weights array is equal to the number of weight variations. (The weights array must be an M x N matrix where M is the length of x and N is the number of weight variations). Parameters ---------- x : numpy.ndarray Data to histogram. weights : numpy.ndarray Weight variations for the elements of ``x``, first dimension is the shape of ``x``, second dimension is the number of weights. bins : numpy.ndarray Bin edges. flow : bool Include under/overflow in the first/last bins. Raises ------ ValueError If the array of bin edges is not monotonically increasing. ValueError If ``x`` and ``weights`` have incompatible shapes. ValueError If ``weights`` is not a two dimensional array. TypeError If ``x`` or ``weights`` are unsupported types Returns ------- :py:obj:`numpy.ndarray` The bin counts. :py:obj:`numpy.ndarray` The standard error of each bin count, :math:`\sqrt{\sum_i w_i^2}`. Examples -------- Using three different weight variations: >>> rng = np.random.default_rng(123) >>> x = rng.standard_normal(10000) >>> weights = nb.abs(rng.standard_normal((x.shape[0], 3))) >>> edges = np.array([-3.0, -2.5, -1.5, -0.25, 0.25, 2.0, 3.0]) >>> h, err = var1dmw(x, weights, edges) >>> h.shape (6, 3) >>> err.shape (6, 3) """ if len(np.shape(weights)) != 2: raise ValueError("weights must be a two dimensional array.") if np.shape(x)[0] != np.shape(weights)[0]: raise ValueError("x and weights have incompatible shapes.") if not np.all(bins[1:] >= bins[:-1]): raise ValueError("bins sequence must monotonically increase.") if _likely_uniform_bins(bins): return fix1dmw( x, weights, bins=(len(bins) - 1), range=(bins[0], bins[-1]), flow=flow, ) return _v1dmw(x, weights, bins, flow)
[docs]def histogram(x, bins=10, range=None, weights=None, density=False, flow=False): r"""Histogram data in one dimension. Parameters ---------- x : array_like Data to histogram. bins : int or array_like If int: the number of bins; if array_like: the bin edges. range : (float, float), optional The minimum and maximum of the histogram axis. If ``None`` with integer ``bins``, min and max of ``x`` will be used. If ``bins`` is an array this is expected to be ``None``. weights : array_like, optional Weight variations for the elements of ``x``. For single weight histograms the shape must be the same shape as ``x``. For multiweight histograms the first dimension is the length of ``x``, second dimension is the number of weights variations. density : bool Normalize histogram counts as value of PDF such that the integral over the range is unity. flow : bool Include under/overflow in the first/last bins. Raises ------ ValueError If ``bins`` defines edges while ``range`` is also not ``None``. ValueError If the array of bin edges is not monotonically increasing. ValueError If ``x`` and ``weights`` have incompatible shapes. ValueError If multiweight histogramming is detected and ``weights`` is not a two dimensional array. TypeError If ``x`` or ``weights`` are unsupported types Returns ------- :py:obj:`numpy.ndarray` The bin counts. :py:obj:`numpy.ndarray`, optional The standard error of each bin count, :math:`\sqrt{\sum_i w_i^2}`. The return is ``None`` if weights are not used. See Also -------- fix1d Used for no weight or single weight fixed bin width histograms fix1dmw Used for multiweight fixed bin width histograms. var1d Used for no weight or single weight variable bin width histograms. var1dmw Used for multiweight variable bin width histograms. Examples -------- A simple fixed width histogram: >>> h, __ = histogram(x, bins=20, range=(0, 100)) And with variable width histograms and weights: >>> h, err = histogram(x, bins=[-3, -2, -1.5, 1.5, 3.5], weights=w) """ # make sure x and weight data are NumPy arrays. x = np.array(x) if weights is not None: weights = np.asarray(weights) is_multiweight = np.shape(weights) != np.shape(x) if is_multiweight and len(np.shape(weights)) != 2: raise ValueError("weight must be a 2D array for multiweight histograms.") # fixed bins if isinstance(bins, int): if weights is not None: if is_multiweight: return fix1dmw(x, weights, bins=bins, range=range, flow=flow) return fix1d( x, weights=weights, bins=bins, range=range, density=density, flow=flow ) # variable bins else: bins = np.asarray(bins) if range is not None: raise ValueError("range must be None if bins is non-int") if weights is not None: if is_multiweight: return var1dmw(x, weights, bins=bins, flow=flow) return var1d(x, weights=weights, bins=bins, density=density, flow=flow)
[docs]def fix2d( x: np.ndarray, y: np.ndarray, bins: Union[int, Tuple[int, int]] = 10, range: Optional[Sequence[Tuple[float, float]]] = None, weights: Optional[np.ndarray] = None, flow: bool = False, ) -> Tuple[np.ndarray, Optional[np.ndarray]]: r"""Histogram the ``x``, ``y`` data with fixed (uniform) binning. The two input arrays (``x`` and ``y``) must be the same length (shape). Parameters ---------- x : numpy.ndarray First entries in data pairs to histogram. y : numpy.ndarray Second entries in data pairs to histogram. bins : int or (int, int) If int, both dimensions will have that many bins; if tuple, the number of bins for each dimension range : Sequence[Tuple[float, float]], optional Axis limits in the form ``[(xmin, xmax), (ymin, ymax)]``. If ``None`` the input data min and max will be used. weights : array_like, optional The weights for data element. If weights are absent, the second return type will be ``None``. flow : bool Include over/underflow. Raises ------ ValueError If ``x`` and ``y`` have incompatible shapes. ValueError If the shape of ``weights`` is incompatible with ``x`` and ``y`` TypeError If ``x``, ``y``, or ``weights`` are unsupported types Returns ------- :py:obj:`numpy.ndarray` The bin counts. :py:obj:`numpy.ndarray`, optional The standard error of each bin count, :math:`\sqrt{\sum_i w_i^2}`. Examples -------- A histogram of (``x``, ``y``) with 20 bins between 0 and 100 in the ``x`` dimention and 10 bins between 0 and 50 in the ``y`` dimension: >>> h, __ = fix2d(x, y, bins=(20, 10), range=((0, 100), (0, 50))) The same data, now histogrammed weighted (via ``w``): >>> h, err = fix2d(x, y, bins=(20, 10), range=((0, 100), (0, 50)), weights=w) """ if np.shape(x) != np.shape(y): raise ValueError("x and y must be the same shape.") if weights is not None: if np.shape(weights) != np.shape(x): raise ValueError("data and weights must be the same shape.") if isinstance(bins, int): nx = ny = bins else: nx, ny = bins xmin, xmax, ymin, ymax = _get_limits_2d(x, y, range) if weights is None: result = _f2d(x, y, int(nx), xmin, xmax, int(ny), ymin, ymax, flow) return result, None return _f2dw(x, y, weights, int(nx), xmin, xmax, int(ny), ymin, ymax, flow)
[docs]def var2d( x: np.ndarray, y: np.ndarray, xbins: np.ndarray, ybins: np.ndarray, weights: Optional[np.ndarray] = None, flow: bool = False, ) -> Tuple[np.ndarray, Optional[np.ndarray]]: r"""Histogram the ``x``, ``y`` data with variable width binning. The two input arrays (``x`` and ``y``) must be the same length (shape). Parameters ---------- x : numpy.ndarray First entries in data pairs to histogram. y : numpy.ndarray Second entries in data pairs to histogram. xbins : numpy.ndarray Bin edges for the ``x`` dimension. ybins : np.ndarray Bin edges for the ``y`` dimension. weights : array_like, optional The weights for data element. If weights are absent, the second return type will be ``None``. flow : bool Include under/overflow. Raises ------ ValueError If ``x`` and ``y`` have different shape. ValueError If either bin edge definition is not monotonically increasing. TypeError If ``x``, ``y``, or ``weights`` are unsupported types Returns ------- :py:obj:`numpy.ndarray` The bin counts. :py:obj:`numpy.ndarray`, optional The standard error of each bin count, :math:`\sqrt{\sum_i w_i^2}`. Examples -------- A histogram of (``x``, ``y``) where the edges are defined by a :func:`numpy.logspace` in both dimensions: >>> bins = numpy.logspace(0.1, 1.0, 10, endpoint=True) >>> h, __ = var2d(x, y, bins, bins) """ if np.shape(x) != np.shape(y): raise ValueError("x and y must be the same shape.") if not np.all(xbins[1:] >= xbins[:-1]): raise ValueError("xbins sequence must monotonically increase.") if not np.all(ybins[1:] >= ybins[:-1]): raise ValueError("ybins sequence must monotonically increase.") if weights is not None: weights = np.asarray(weights) if np.shape(weights) != np.shape(x): raise ValueError("data and weights must be the same shape.") if weights is None: result = _v2d(x, y, xbins, ybins, flow) return result, None return _v2dw(x, y, weights, xbins, ybins, flow)
[docs]def histogram2d(x, y, bins=10, range=None, weights=None, flow=False): r"""Histogram data in two dimensions. This function provides an API very simiar to :func:`numpy.histogram2d`. Keep in mind that the returns are different. Parameters ---------- x: array_like Array representing the ``x`` coordinate of the data to histogram. y: array_like Array representing the ``y`` coordinate of the data to histogram. bins: int or array_like or [int, int] or [array, array], optional The bin specification: * If int, the number of bins for the two dimensions (``nx = ny = bins``). * If `array_like`, the bin edges for the two dimensions (``x_edges = y_edges = bins``). * If [int, int], the number of bins in each dimension (``nx, ny = bins``). * If [`array_like`, `array_like`], the bin edges in each dimension (``x_edges, y_edges = bins``). range: array_like, shape(2,2), optional The edges of this histogram along each dimension. If ``bins`` is not integral, then this parameter is ignored. If None, the default is ``[[x.min(), x.max()], [y.min(), y.max()]]``. weights: array_like An array of weights associated to each element :math:`(x_i, y_i)` pair. Each pair of the data will contribute its associated weight to the bin count. flow : bool Include over/underflow. Raises ------ ValueError If ``x`` and ``y`` have different shape or either bin edge definition is not monotonically increasing. ValueError If the shape of ``weights`` is not compatible with ``x`` and ``y``. TypeError If ``x``, ``y``, or ``weights`` are unsupported types See Also -------- fix2d Used for no weight or single weight fixed bin width histograms var2d Used for no weight or single weight variable bin width histograms. Returns ------- :py:obj:`numpy.ndarray` The bin counts. :py:obj:`numpy.ndarray` The standard error of each bin count, :math:`\sqrt{\sum_i w_i^2}`. Examples -------- >>> h, err = histogram2d(x, y, weights=w) """ try: N = len(bins) except TypeError: N = 1 x = np.asarray(x) y = np.asarray(y) if weights is not None: weights = np.asarray(weights) if N != 1 and N != 2: bins = np.asarray(bins) return var2d(x, y, bins, bins, weights=weights, flow=flow) elif N == 1: return fix2d(x, y, bins=bins, range=range, weights=weights, flow=flow) elif N == 2: if isinstance(bins[0], int) and isinstance(bins[1], int): return fix2d(x, y, bins=bins, range=range, weights=weights, flow=flow) else: b1 = np.asarray(bins[0]) b2 = np.asarray(bins[1]) return var2d(x, y, b1, b2, weights=weights, flow=flow) else: raise ValueError("bins argument is not compatible")