# Source code for pylife.utils.histogram

# Copyright (c) 2019-2023 - for information on the respective copyright owner
# see the NOTICE file and/or the repository
# https://github.com/boschresearch/pylife
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__author__ = "Daniel Christopher Kreuter"
__maintainer__ = "Johannes Mueller"

import warnings

import numpy as np
import pandas as pd



# [docs]
def combine_histogram(hist_list, method='sum'):
    """Combine a list of histograms to one.

    Parameters
    ----------
    hist_list: list of :class:`pandas.Series`
        list of histograms with all histograms as interval indexed :class:`pandas.Series`
    method: str or aggregating function
        method used for the aggregation, e.g. 'sum', 'min', 'max', 'mean', 'std'
        or any callable function that would aggregate a :class:`pandas.Series`.
        default is 'sum'

    Returns
    -------
    histogram : :class:`pandas.Series`
        The resulting histogram.  If ``hist_list`` contains no non-empty
        histogram, an empty ``float64`` series with an empty
        :class:`pandas.IntervalIndex` is returned.

    Raises
    ------
    ValueError
        if the index levels of the histograms do not match.

    Notes
    -----
    Identical bins are grouped and then aggregated using ``method``.  Note that
    neither before nor after the aggregation any rebinning takes place.  You
    might consider piping your histograms through
    :func:`~pylife.utils.histogram.rebin_histogram` before or after combining them.

    Empty histograms in ``hist_list`` are ignored.

    The histograms need to have compatible indices. Those can either be a
    simple :class:`pandas.IntervalIndex` for a one dimensional histogram or a
    :class:`pandas.MultiIndex` whose levels are all ``IntervalIndex`` for
    multidimensional histograms.  For multidimensional histograms the names of
    the index levels must match throughout the input histogram list.  If the
    level names are unique and not ``None``, the levels may appear in a
    different order; they are then reordered to the level order of the first
    non-empty histogram before the histograms are combined.

    Examples
    --------
    Two one dimensional histograms:

    >>> h1 = pd.Series([5., 10.], index=pd.interval_range(start=0, end=2))
    >>> h1
    (0, 1]     5.0
    (1, 2]    10.0
    dtype: float64
    >>> h2 = pd.Series([12., 3., 20.], index=pd.interval_range(start=1, periods=3))
    >>> h2
    (1, 2]    12.0
    (2, 3]     3.0
    (3, 4]    20.0
    dtype: float64
    >>> combine_histogram([h1, h2])
    (0, 1]     5.0
    (1, 2]    22.0
    (2, 3]     3.0
    (3, 4]    20.0
    dtype: float64
    >>> combine_histogram([h1, h2], method='min')
    (0, 1]     5.0
    (1, 2]    10.0
    (2, 3]     3.0
    (3, 4]    20.0
    dtype: float64
    >>> combine_histogram([h1, h2], method='max')
    (0, 1]     5.0
    (1, 2]    12.0
    (2, 3]     3.0
    (3, 4]    20.0
    dtype: float64
    >>> combine_histogram([h1, h2], method='mean')
    (0, 1]     5.0
    (1, 2]    11.0
    (2, 3]     3.0
    (3, 4]    20.0
    dtype: float64

    Limitations
    -----------
    At the moment, additional dimensions i.e. index level that are not histogram bins,
    are not supported.  This limitation might fall in the future.
    """
    # Empty histograms contribute no bins, so they are dropped up front.
    hist_list = [h for h in hist_list if len(h) > 0]
    if not hist_list:
        return pd.Series(dtype=np.float64, index=pd.IntervalIndex.from_tuples([]))

    names = hist_list[0].index.names

    def dimensions_are_consistent():
        return all(
            len(h.index.names) == len(names) and set(h.index.names) == set(names)
            for h in hist_list[1:]
        )

    if not dimensions_are_consistent():
        raise ValueError("Histograms must have identical dimensions to be combined.")

    # pd.concat joins MultiIndex levels by position, not by name.  The check
    # above only compares the *set* of names, so histograms whose levels are
    # ordered differently must be brought into a common order first, otherwise
    # the bins of different dimensions get mixed up.  Reordering by name is
    # only well defined for unique, non-None level names.
    can_reorder = (
        isinstance(hist_list[0].index, pd.MultiIndex)
        and None not in names
        and len(set(names)) == len(names)
    )

    def with_reference_level_order(h):
        if (
            can_reorder
            and isinstance(h.index, pd.MultiIndex)
            and list(h.index.names) != list(names)
        ):
            return h.reorder_levels(list(names))
        return h

    concat = pd.concat([with_reference_level_order(h) for h in hist_list])
    combined = concat.groupby(concat.index).agg(method)

    if isinstance(concat.index, pd.MultiIndex):
        # Grouping by a MultiIndex yields tuple keys; rebuild a proper
        # MultiIndex carrying the original level names.
        combined.index = pd.MultiIndex.from_tuples(combined.index, names=names)

    return combined




# [docs]
def rebin_histogram(histogram, binning, nan_default=False):
    """Rebin a histogram to a given binning.

    Parameters
    ----------
    histogram : :class:`pandas.Series` with :class:`pandas.IntervalIndex`
        The histogram data to be rebinned

    binning : :class:`pandas.IntervalIndex` or int
        The given binning or number of bins.  For a histogram with a
        :class:`pandas.MultiIndex` a :class:`pandas.MultiIndex` may be given,
        whose level of the same name is then used as target binning for each
        interval level of the histogram.

    nan_default : bool
        If True non occupied bins will be occupied with ``np.nan``, else 0.0
        Default False

    Returns
    -------
    rebinned : :class:`pandas.Series` with :class:`pandas.IntervalIndex`
        The rebinned histogram

    Raises
    ------
    TypeError
        if the ``histogram`` or the ``binning`` do not have an ``IntervalIndex``.
    ValueError
        if the binning is not monotonic increasing or has gaps.

    Notes
    -----
    The events collected in the bins of the original histogram are distributed
    linearly to the bins in the target bins.

    For a histogram with a :class:`pandas.MultiIndex` every index level that
    is an ``IntervalIndex`` is rebinned in turn, grouped by the remaining
    levels.  Levels that are not interval levels are left untouched.

    Examples
    --------
    >>> h = pd.Series([10.0, 20.0, 30.0, 40.0], index=pd.interval_range(0.0, 4.0, 4))
    >>> h
    (0.0, 1.0]    10.0
    (1.0, 2.0]    20.0
    (2.0, 3.0]    30.0
    (3.0, 4.0]    40.0
    dtype: float64

    Rebin to a finer binning:

    >>> target_binning = pd.interval_range(0.0, 4.0, 8)
    >>> rebin_histogram(h, target_binning)
    (0.0, 0.5]     5.0
    (0.5, 1.0]     5.0
    (1.0, 1.5]    10.0
    (1.5, 2.0]    10.0
    (2.0, 2.5]    15.0
    (2.5, 3.0]    15.0
    (3.0, 3.5]    20.0
    (3.5, 4.0]    20.0
    dtype: float64

    Rebin to a coarser binning:

    >>> target_binning = pd.interval_range(0.0, 4.0, 2)
    >>> rebin_histogram(h, target_binning)
    (0.0, 2.0]    30.0
    (2.0, 4.0]    70.0
    dtype: float64

    Define the target bin just by an int:

    >>> rebin_histogram(h, 5)
    (0.0, 0.8]     8.0
    (0.8, 1.6]    14.0
    (1.6, 2.4]    20.0
    (2.4, 3.2]    26.0
    (3.2, 4.0]    32.0
    dtype: float64

    Limitations
    -----------
    At the moment, additional dimensions i.e. index level that are not histogram bins,
    are not supported.  This limitation might fall in the future.
    """
    if nan_default:
        fill_value = np.nan
    else:
        fill_value = 0.0

    # One dimensional histograms are handed straight to the worker.
    if not isinstance(histogram.index, pd.MultiIndex):
        return _do_rebin_histogram(histogram, binning, fill_value)

    level_names = histogram.index.names
    binning_per_level = isinstance(binning, pd.MultiIndex)

    for level in level_names:
        # Only interval levels are histogram dimensions that can be rebinned.
        if not isinstance(histogram.index.get_level_values(level), pd.IntervalIndex):
            continue

        if binning_per_level:
            # Pick the target binning of the level carrying the same name.
            level_binning = binning.levels[binning.names.index(level)]
        else:
            level_binning = binning

        other_levels = [n for n in level_names if n != level]

        # Rebin this level separately for every combination of the other
        # levels; the rebinned level ends up as the innermost index level.
        grouped = histogram.groupby(other_levels)
        histogram = grouped.apply(
            lambda h: _do_rebin_histogram(h.droplevel(other_levels), level_binning, fill_value)
        )

    # Restore the level order of the input histogram.
    return histogram.reorder_levels(level_names)



def _do_rebin_histogram(histogram, binning, default_value):
    """Rebin a one dimensional histogram to the given binning.

    Parameters
    ----------
    histogram : :class:`pandas.Series` with :class:`pandas.IntervalIndex`
        The histogram to be rebinned.
    binning : :class:`pandas.IntervalIndex` or int
        The target binning, or the number of equally wide bins spanning the
        range of ``histogram``.  Python and NumPy integers are both accepted
        as a number of bins.
    default_value : float
        The value assigned to target bins that no (non-NaN) bin of
        ``histogram`` overlaps with.

    Returns
    -------
    rebinned : :class:`pandas.Series`
        The rebinned histogram, indexed by the target binning and carrying
        the name and the index name of ``histogram``.

    Raises
    ------
    TypeError
        if ``histogram`` is not indexed by an ``IntervalIndex`` or ``binning``
        is neither an integer nor an ``IntervalIndex``.
    ValueError
        if ``binning`` is an ``IntervalIndex`` that is rejected by the
        binning validation.

    Warns
    -----
    RuntimeWarning
        if ``histogram`` extends beyond the range covered by ``binning``.
    """
    def interval_overlap(reference_interval, test_interval):
        # Fraction of the length of ``test_interval`` lying inside
        # ``reference_interval``.
        overlap = min(reference_interval.right, test_interval.right) - max(reference_interval.left, test_interval.left)
        return overlap / test_interval.length

    def aggregate_hist(interval):
        # Source bins overlapping the target bin; NaN valued bins count as
        # not occupied.
        occupied = hist.loc[hist.index.overlaps(interval)].dropna()
        if len(occupied) == 0:
            return default_value

        # Each source bin contributes proportionally to the share of its
        # width that falls into the target bin.
        return occupied.apply(lambda v: v.iloc[0] * interval_overlap(interval, v.name), axis=1).sum()

    def binning_of_n_bins(index, binnum):
        start = index.left.min()
        end = index.right.max()

        # An empty index has no range to span, fall back to an empty binning.
        if np.isnan(start) or np.isnan(end):
            return pd.interval_range(0., 0., 0)

        return pd.interval_range(start, end, binnum)

    def binning_does_not_cover_histogram():
        return (
            histogram.index.right.max() > binning.right.max() or
            histogram.index.left.min() < binning.left.min()
        )

    if not isinstance(histogram.index, pd.IntervalIndex):
        raise TypeError("histogram needs to have an IntervalIndex.")

    # Accept NumPy integers as well, e.g. bin counts computed with NumPy.
    if isinstance(binning, (int, np.integer)):
        binning = binning_of_n_bins(histogram.index, binning)
    else:
        _fail_if_binning_invalid(binning)

    if binning_does_not_cover_histogram():
        warnings.warn("histogram is partly out of binning. This information will be lost!", RuntimeWarning)

    if len(histogram) == 0:
        # No bin of the target binning is occupied, so every bin gets the
        # default value (0.0 or NaN, as chosen by the caller).
        rebinned = pd.Series(default_value, index=binning)
    else:
        hist = histogram.to_frame()
        rebinned = binning.to_series().apply(aggregate_hist)

    rebinned.name = histogram.name
    rebinned.index.name = histogram.index.name
    return rebinned


def _fail_if_binning_invalid(binning):
    """Raise if ``binning`` is not usable as a target binning for rebinning.

    An empty binning and a binning consisting of a single bin are valid.

    Parameters
    ----------
    binning : :class:`pandas.IntervalIndex`
        The binning to validate.

    Raises
    ------
    TypeError
        if ``binning`` is not a :class:`pandas.IntervalIndex`.
    ValueError
        if the bins overlap, are not monotonic increasing or have gaps
        between consecutive bins.
    """
    def binning_is_overlapping_or_non_monotonic_increasing():
        if len(binning) == 0:
            return False
        if not binning.is_non_overlapping_monotonic:
            return True
        # A single bin is trivially monotonic increasing *and* decreasing, so
        # the decreasing check is only meaningful for more than one bin.
        return len(binning) > 1 and binning.is_monotonic_decreasing

    def binning_has_gaps():
        if len(binning) == 0:
            return False
        # Consecutive bins must touch: the left edge of every bin has to equal
        # the right edge of its predecessor.
        left = binning.left[1:].to_numpy()
        right = binning.right[:-1].to_numpy()
        return bool((left != right).any())

    if not isinstance(binning, pd.IntervalIndex):
        raise TypeError("binning argument must be a pandas.IntervalIndex.")

    if binning_is_overlapping_or_non_monotonic_increasing():
        raise ValueError("binning index must be monotonic increasing without overlaps.")

    if binning_has_gaps():
        raise ValueError("binning index must not have gaps.")