"""
This file contains utilities used in the TP1 exercises.
"""

from itertools import product, cycle
import numpy as np
import pandas as pd
import vector as vec
import matplotlib.pyplot as plt


### Class to access particle four vectors
@pd.api.extensions.register_dataframe_accessor("v4")
class FourVecAccessor(object):
    def __init__(self, pandas_obj):
        # to distinguish between multiple particles or single particle
        # we only need to save the column information,
        self._obj_columns = pandas_obj.columns
        # to keep data consistent when appending columns unsing this accessor save indices to use them in returns
        self._obj_indices = pandas_obj.index
        # get the correct index level, 0 for single particle, 1 for multiple
        _vars = self._obj_columns.get_level_values(self._obj_columns.nlevels - 1)

        if 'M' in _vars and "px" in _vars:
            kin_vars = ["M", "px", "py", "pz"]
        elif 'mass' in _vars and 'px' in _vars:
            kin_vars = ["mass", "px", "py", "pz"]
        elif 'mass' in _vars and 'pt' in _vars:
            kin_vars = ["mass", "pt", "phi", "eta"]
        elif 'E' in _vars and "px" in _vars:
            kin_vars = ["E", "px", "py", "pz"]
        elif 'E' in _vars and 'pt' in _vars:
            kin_vars = ["E", "pt", "phi", "eta"]
        else:
            raise KeyError("No matching structure implemented for interpreting the data as a four "
                           "momentum!")

        # the following lines are where the magic happens

        # no multi-index, just on particle
        if self._obj_columns.nlevels == 1:
            # get the dtypes for the kinetic variables
            dtypes = pandas_obj.dtypes
            kin_view = list(map(lambda x: (x, dtypes[x]), kin_vars))

            # get the kinetic variables from the dataframe and convert it to a numpy array.
            # require it to be C_CONTIGUOUS, vector uses C-Style
            # This array can then be viewed as a vector object.
            # Every property not given is calculated on the fly by the vector object.
            # E.g. the mass is not stored but calculated when the energy is given and vice versa.
            self._v4 = np.require(pandas_obj[kin_vars].to_numpy(), requirements='C').view(
                kin_view).view(vec.MomentumNumpy4D)

        # multi-index, e.g. getting the four momentum for multiple particles
        elif self._obj_columns.nlevels == 2:
            # get the dtypes for the kinetic variables
            # assume the same dtypes for the other particles
            dtypes = pandas_obj[self._obj_columns.get_level_values(0).unique()[0]].dtypes
            kin_view = list(map(lambda x: (x, dtypes[x]), kin_vars))
            self._v4 = np.require(pandas_obj.loc[:, (self._obj_columns.get_level_values(0).unique(),
                                                     kin_vars)].to_numpy(),
                                  requirements='C').view(kin_view).view(vec.MomentumNumpy4D)

        else:
            raise IndexError("Expected a dataframe with a maximum of two multi-index levels.")

    def __getattribute__(self, item):
        """
        Attributes of this accessor are forwarded to the four vector.

        Returns either a pandas dataframe, if we have multiple particles
        or a pandas Series for a single particle.
        """
        try:
            return object.__getattribute__(self, item)
        except AttributeError:
            try:
                return pd.DataFrame(self._v4.__getattribute__(item),
                                    columns=pd.MultiIndex.from_product(
                                        [self._obj_columns.unique(0), [item]]),
                                    index=self._obj_indices)
            except ValueError:
                try:
                    return pd.Series(self._v4.__getattribute__(item).flatten(), name=item, index=self._obj_indices)
                except AttributeError as e:
                    if "'function' object has no attribute 'flatten'" in str(e):
                        raise AttributeError(
                            "Functions of the four vectors can NOT be called directly via the "
                            "accessor. Use the vector property instead! "
                            "Usage: 'df['particle'].v4.vector.desired_function()'")
                    raise e

    @property
    def vector(self):
        """The four vector object itself. It's required when using methods like boosting."""
        if self._obj_columns.nlevels == 1:
            return self._v4[:, 0]
        else:
            return self._v4


### Function to sort elements of an event
def get_sorted_df_by_low_level_criteria(df, criteria, axis=1):
    """
    Sorts in ascending order the top levels of a given pd.DataFrame with two levels columns by given criteria
    that has the form of a pandas DataFrame containing low level quantity, i.e. df.leptons.v4.pt

    @param df: pd.DataFrame containing the top level objects that will be sorted
    @param criteria: pd.DataFrame containing the sorting criteria values that are passed to np.argsort
    @param axis: 0 or 1; row wise sort: axis=1; column wise sort: axis=0. Make sure to pass the
                 right sorted DataFrame (axis option)
    @return: pd.DataFrame
    """

    N, n, v = *df.shape, df[df.columns.get_level_values(0)[0]].shape[1]
    idx = np.argsort(criteria, axis=axis)
    sorted_data = np.take_along_axis(df.to_numpy().reshape((N, int(n / v), v)),
                                     idx.to_numpy()[:, :, None], axis).reshape((N, n))
    return pd.DataFrame(data=sorted_data, columns=df.columns)


### Proposal for a plotting function of physics object quantities in a grid plot given one or more MultiIndex pd.DataFrames
### with stack option
def _object_quantity_grid_plot(df, physics_object, object_quantity, label=None, unit=None, hist_range=None, stack=None):
    """
    Plot the distributions of given quantities of given physics objects that are present in given DataFrame(s). For
    two or more DataFrames, the plotting is done object/quantity wise in the same subplot.

    :param df: pd.DataFrame or list of pd.DataFrames
    :param physics_object: str or list of str of physics objects, i. e. "lepton_0" or ["jet_0", "jet_1"]
    :param object_quantity: str or list of str, i. e. "pt" or ["pt", "eta", "phi"]
    :param label: str or list of str with the same length as df if df is a list
    :param unit: str or list of str with the same length as object_quantity for x-axis label
    :param hist_range: None or (float, float) or list of those with same length as object_quantity or physics_object or the same
                       length as all created subplots:
                       a) physics_object = ["jet_0", "jet_1", "jet_2"], object_quantity = ["pt"],
                          hist_range = [(0, 100), (0, 50), (0, 25)]
                            => creates subplots of (jet_0, pt), (jet_1, pt) and (jet_2, pt) in (0, 100), (0, 50) and (0, 25)
                               intervals.
                       b) physics_object = ["jet_0", "jet_1", "jet_2"], object_quantity = ["pt", "eta"],
                          hist_range = [(0, 100), (-4, 4)]
                           => creates subplots where pt of all subplots ranges are set to (0, 100) and eta ranges to (-4, 4)
                       c) physics_object = ["jet_0", "jet_1", "jet_2"], object_quantity = ["pt", "eta"],
                          hist_range = [(0, 100), (-4, 4), None, (-4, 4), None, (-4, 4)]
                           => creates six subplots where all eta ranges are set to (-4, 4) and (jet_0, pt) range is set
                              to (0, 100). All other ranges are set by default.
    :param stack: str; "df" or "physics_object". In case of "physics_object" only one df should be passed
                  a) df=df1, physics_object=["QCD", "LM6", "LM9"], object_quantity=["H_t, N_jets"], stack="physics_object"
                      => creates two subplots where "H_t" and "N_jets" of provided physics_object is stacked
                  b) df=[df1, df2], physics_object=["jet_1", "jet_2"], object_quantity=["pt", "eta"], stack="df"
                      => creates for every physics_object x object_quantity a subplot where df1 and df2 are stacked
    :return: None
    """

    def to_list(it):  # helper for argument conversion
        return it if isinstance(it, list) else [it]

    def get_quantity(dataframe, _physics_object, _object_quantity):  # helper to get pd.Series...
        try:  # of a four vector quantity
            return getattr(dataframe[_physics_object].v4, _object_quantity).to_numpy()
        except AttributeError:  # of other quantity
            return dataframe.loc[:, (_physics_object, _object_quantity)].to_numpy()

    # converting to lists and padding some to the same length with None (for zip/product/cycle)
    df, label, unit, hist_range = to_list(df), to_list(label), to_list(unit), to_list(hist_range)
    physics_object, object_quantity = to_list(physics_object), to_list(object_quantity)
    unit = unit + [None] * (len(object_quantity) - len(unit))  # padding up to object_quantity dim if necessary
    label = label + [None] * (len(df) - len(label))  # padding up to df dim if necessary

    # checking for hist_range length in case to prevent strange plots - might be discarded
    assert (len(hist_range) == 1 or len(hist_range) == len(object_quantity)
            or (len(hist_range) == len(physics_object) and len(object_quantity) == 1)
            or len(hist_range) == len(list(product(physics_object, object_quantity)))), \
        "hist_range have either to be None or a single 2D - tuple or have the same length" \
        " as physics_object if a single object_quantity is given or have the same length" \
        " as object_quantity or have the same length as physics_object * object_quantity"
    hist_range = cycle(hist_range)
    # checking for the specific case - might be discarded
    assert len(df) == 1 and stack == "physics_object" or stack != "physics_object", \
        "Please pass a single DataFrame with pd.MultiIndex columns to stack multiple" \
        " physics objects or change the option to stack='df'"

    # number of created plots and subdivision in columns and rows
    n_plots = len(list(product(physics_object if stack != "physics_object" else df, object_quantity)))
    n_columns = len(object_quantity) if len(object_quantity) > 1 else (3 if n_plots > 3 else n_plots)
    n_rows = int(np.ceil(n_plots / n_columns))
    fig, ax = plt.subplots(n_rows, n_columns, figsize=(25, int(5 * n_rows)))

    if stack == "physics_object":  # stacking provided physics objects
        for i, (_ax, (_quantity, _unit)) in enumerate(zip(ax.flatten(), zip(object_quantity, unit))):
            _hist_range = next(hist_range)  # set a specific range for _quantity or None
            _ax.hist([get_quantity(df[0], _object, _quantity) for _object in physics_object],
                     bins=100, histtype="bar", range=_hist_range, density=False, stacked=True,
                     label=physics_object)
            _ax.set(yscale="log", ylabel="Events",
                    xlabel=f"{_quantity} {'in ' + _unit if _unit else ''}")
            _ax.legend()

    else:  # unstacked (comparing) variant or dataframe wise stacking
        for i, (_ax, (_object, (_quantity, _unit))) in enumerate(zip(ax.flatten(),
                                                                     product(physics_object,
                                                                             zip(object_quantity, unit)))):
            _hist_range = next(hist_range)  # set a specific range for (_object, _quantity) or None
            if stack == "df":  # dataframe wise stacking
                _ax.hist([get_quantity(_df, _object, _quantity) for _df, _ in zip(df, label)],
                         bins=100, histtype="bar", range=_hist_range, density=False, stacked=True,
                         label=[_label if _label else f"Label_{j}" for j, (_, _label) in enumerate(zip(df, label))])

            else:  # for shape comparison without stacking
                edges = None  # for auto setting of first histogram edges
                for j, (_df, _label) in enumerate(zip(df, label)):  # in case multiple dataframes are given
                    _, edges, _ = _ax.hist(get_quantity(_df, _object, _quantity),
                                           bins=edges if edges is not None else 100,  # same edges
                                           histtype="step", range=_hist_range,
                                           label=_label if _label else f"Label_{j}")  # default or custom

            _ax.set(yscale="log", title=f"{_object}: {_quantity}", ylabel="Events",
                    xlabel=f"{_quantity} {'in ' + _unit if _unit else ''}")
            _ax.legend()

    plt.tight_layout()
    plt.show()


### Proposal for a plotting function of physics object quantities in a grid plot given one or more MultiIndex pd.DataFrames
### without stack option
def plot_quantities(df, column, quantity=None,
                    bins=None, hist_range=None, density=False, weights=None,
                    label=None, unit=None, yscale="log", suptitle=None, color=None):
    """
    Plot the distributions of given quantities of given physics objects that are present in given DataFrame(s). For
    two or more DataFrames, the plotting is done object-quantity wise in the same subplot.

    :param df: pd.DataFrame or list of pd.DataFrames
    :param column: str or list of str of physics objects, i. e. "lepton_0" or ["jet_0", "jet_1"]
    :param quantity: str or list of str, i. e. "pt" or ["pt", "eta", "phi"] in case of a two level index, None
                     if only one index level exists
    :param label: str or list of str with the same length as df if df is a list
    :param unit: str or list of str with the same length as quantity for x-axis label or the same length as columns in case
                 quantity is None
    :param density: bool or list of bool with the same length as quantity
    :param hist_range: None or (float, float) or list of those with same length as quantity or column or the same
                       length as all created subplots:
                       a) column = ["jet_0", "jet_1", "jet_2"], quantity = ["pt"],
                          hist_range = [(0, 100), (0, 50), (0, 25)]
                            => creates subplots of (jet_0, pt), (jet_1, pt) and (jet_2, pt) in (0, 100), (0, 50) and (0, 25)
                               intervals.
                       b) column = ["jet_0", "jet_1", "jet_2"], quantity = ["pt", "eta"],
                          hist_range = [(0, 100), (-4, 4)]
                           => creates subplots where pt of all subplots ranges are set to (0, 100) and eta ranges to (-4, 4)
                       c) column = ["jet_0", "jet_1", "jet_2"], quantity = ["pt", "eta"],
                          hist_range = [(0, 100), (-4, 4), None, (-4, 4), None, (-4, 4)]
                           => creates six subplots where all eta ranges are set to (-4, 4) and (jet_0, pt) range is set
                              to (0, 100). All other ranges are set by default.
    :param bins: int or list of int, setting the number of used bins. In case of a list: the same rules applies for bins as for
                 hist_range
    :param yscale: str like as in matplotlib yscale or list of str, setting the used yscale. In case of a list: the same rules
                   applies for bins as for hist_range. Possible options are for example 'log', 'linear', 'logit', 'sumlog', ...
    :param weights: None, 1D np.ndarray or pd.Series  or a list of those with the same length as df
    :param suptitle: None or str containing the figure title
    :return: None
    """

    def to_list(it):  # helper for argument conversion
        return it if isinstance(it, list) else [it]

    with_quantity = True if quantity else False  # in case no quantity is provided
    df, label, unit, hist_range = to_list(df), to_list(label), to_list(unit), to_list(hist_range)
    column, quantity = to_list(column), to_list(quantity)
    bins, density, yscale, weights = to_list(bins), to_list(density), to_list(yscale), to_list(weights)

    # padding up to quantity dim if necessary
    unit = unit + [None] * (len(quantity) - len(unit))
    density = density + [True if len(density) == 1 and density[0] else False] * (len(quantity) - len(density))

    # padding up to df dim if necessary
    label = label + [None] * (len(df) - len(label))
    weights = weights + [None] * (len(df) - len(weights))

    def check_dim_of(it):  # checking for bins, hist_range, yscale length in case to prevent strange plots
        return (len(it) == 1 or len(it) == len(quantity)
                or (len(it) == len(column) and len(quantity) == 1)
                or len(it) == len(list(product(column, quantity))))

    assert check_dim_of(hist_range) and check_dim_of(bins) and check_dim_of(yscale), \
        "(hist_range/bins/yscale) have either to be None or a (two element tuple/int/str) or be a " \
        "list of those types and have the same length as column if single or None quantity is " \
        "given or have the same length as quantity or the length of column * quantity"
    bins, hist_range, yscale = cycle(bins), cycle(hist_range), cycle(yscale)

    # number of created plots and subdivision in columns and rows
    n_plots = len(list(product(column, quantity)))
    n_columns = len(quantity) if len(quantity) > 1 else (3 if n_plots > 3 else n_plots)
    n_rows = int(np.ceil(n_plots / n_columns))

    fig, ax = plt.subplots(n_rows, n_columns, figsize=(25, int(4 * n_rows)))
    ax = ax if isinstance(ax, np.ndarray) else np.array(ax)  # in case only a single column amd a single quantity is given
    fig.suptitle(suptitle)

    def get_quantity(dataframe, _column_name, _quantity_name):  # helper to get pd.Series...
        if _quantity_name is not None:  # in case of a two level index
            try:  # of a four vector quantity
                return getattr(dataframe[_column_name].v4, _quantity_name).to_numpy()
            except AttributeError:  # of other quantity
                return dataframe.loc[:, (_column_name, _quantity_name)].to_numpy()
        else:  # in case of a single level index
            return dataframe.loc[:, _column_name].to_numpy()

    def is_None(it):  # helper: checking for None __and__ avoiding a collision with np.ndarray like objects
        return isinstance(it, type(None))

    for i, (_ax, (_column, (_quantity, _unit, _density))) in enumerate(zip(ax.flatten(),
                                                                           product(column,
                                                                                   zip(quantity, unit, density)))):
        empty_subplot_content = 0  # counter for setting subplots with no content to invisible
        edges = None  # for auto setting of first histogram edges
        _bins, _hist_range, _yscale = next(bins), next(hist_range), next(yscale)  # bins, hist_range, yscale for (_column, _quantity)
        for j, (_df, _label, _weights, _color) in enumerate(zip(df, label, weights, color)):  # in case multiple dataframes are given
            _plot_values = get_quantity(_df, _column, _quantity)
            if np.all(np.isnan(_plot_values)):
                empty_subplot_content += 1
                continue
            _, edges, _ = _ax.hist(_plot_values, histtype="step", range=_hist_range, density=_density,
                                   bins=100 if not _bins and is_None(edges) else _bins if is_None(edges) else edges,  # same edges
                                   label=_label if _label else f"Label_{j}",  # default or custom
                                   weights=_weights,
                                   color=_color if _color else None)
            _ax.set(yscale=_yscale, ylabel="Events",
                    title=f"{_column}{':' if _quantity else ''} {_quantity if _quantity else ''}",
                    xlabel=f"{_quantity if _quantity else _column} "
                           f"{'in ' if (unit[i % len(unit)]) or (_unit and with_quantity) else ''}"
                           f"{_unit if (_unit and with_quantity) else (unit[i % len(unit)] if unit[i % len(unit)] else '')}")
            # _unit if with_quantity else unit[i] if _unit or unit[i] else ''
            _ax.legend()
            if empty_subplot_content == len(df):
                _ax.set_visible(False)  # not showing MET eta or similar not defined or emtpy NaN like quantity lists
            continue
    plt.tight_layout()
    plt.show()