# Source code for solarforecastarbiter.validation.quality_mapping

"""
Define constant mappings between bit-mask values and understandable quality
flags
"""
from functools import wraps


import pandas as pd
import numpy as np


# The quality_flag field in MySQL is currently limited to 1 << 15;
# fields beyond 1 << 15 will require a change in the MySQL datatype
# for the quality_flag column. The mapping from description to bitmask
# is versioned so that future additions or removals are backwards compatible
# without rerunning the validation on all data.
# DO NOT MODIFY THE VALUES OF THE BITMASK_DESCRIPTION_DICT. Instead,
# increment the key and add a new value dict, i.e. add version 2 like
# 2: {'OK': 0, 'USER FLAGGED': 1 << 0, ...} . The VERSION
# IDENTIFIER 0 - 2 must remain in their current positions. Versions 7
# and up will require another identifier bit to be determined at that
# time.  The version identifier also serves to mark data in the
# database as validated. Each value dict maps description -> bit mask.
BITMASK_DESCRIPTION_DICT = {
    # Versions start at 1 so that validated data can be distinguished
    # from data that has not been validated in the DB.
    1: {
        # no bit set
        'OK': 0,
        'USER FLAGGED': 1 << 0,
        # version identifier bits
        'VERSION IDENTIFIER 0': 1 << 1,
        'VERSION IDENTIFIER 1': 1 << 2,
        'VERSION IDENTIFIER 2': 1 << 3,
        # individual check flags
        'NIGHTTIME': 1 << 4,
        'CLEARSKY': 1 << 5,
        'SHADED': 1 << 6,
        'UNEVEN FREQUENCY': 1 << 7,
        'LIMITS EXCEEDED': 1 << 8,
        'CLEARSKY EXCEEDED': 1 << 9,
        'STALE VALUES': 1 << 10,
        'INTERPOLATED VALUES': 1 << 11,
        'CLIPPED VALUES': 1 << 12,
        'INCONSISTENT IRRADIANCE COMPONENTS': 1 << 13,
        'DAILY VALIDATION APPLIED': 1 << 14,
        # available for a new flag
        'RESERVED 1': 1 << 15,
    },
}

# Masks derived from logical combinations of the masks defined above.
# Each value is a tuple of (function, *names of the masks passed to it).
# A derived mask may reference a mask defined in an earlier key, so the
# insertion order of this dict matters.
# Add a version layer for compatibility if needed in the future.
DERIVED_MASKS = {
    'DAYTIME': (
        np.logical_not, 'NIGHTTIME'),
    'DAYTIME STALE VALUES': (
        np.logical_and, 'DAYTIME', 'STALE VALUES'),
    'DAYTIME INTERPOLATED VALUES': (
        np.logical_and, 'DAYTIME', 'INTERPOLATED VALUES'),
}

# Flags that represent truly bad data; values carrying any of these should
# typically be discarded before resampling.
DISCARD_BEFORE_RESAMPLE = [
    # fundamental flags
    'USER FLAGGED',
    'LIMITS EXCEEDED',
    'INCONSISTENT IRRADIANCE COMPONENTS',
    'STALE VALUES',
    'INTERPOLATED VALUES',
    # derived flags
    'DAYTIME STALE VALUES',
    'DAYTIME INTERPOLATED VALUES',
]

# Bit mask selecting the three VERSION IDENTIFIER bits (1 << 1 .. 1 << 3).
# Should never change unless another VERSION IDENTIFIER is required.
VERSION_MASK = 0b1110
# Newest mapping version and its description -> bit mask dict.
LATEST_VERSION = max(BITMASK_DESCRIPTION_DICT)
DESCRIPTION_MASK_MAPPING = BITMASK_DESCRIPTION_DICT[LATEST_VERSION]
# Scaling the lowest version identifier bit by the version number places
# the version number in the version identifier bits.
LATEST_VERSION_FLAG = (
    DESCRIPTION_MASK_MAPPING['VERSION IDENTIFIER 0'] * LATEST_VERSION)
DAILY_VALIDATION_FLAG = DESCRIPTION_MASK_MAPPING['DAILY VALIDATION APPLIED']


def convert_bool_flags_to_flag_mask(flags, flag_description, invert):
    """
    Convert boolean flags into integer, bitmasked quality flags.

    Parameters
    ----------
    flags : object with an ``astype`` method or None
        Flags that will be cast to bool, e.g. a pandas Series.
    flag_description : str
        Description of the flag to set. Must be a key of the
        DESCRIPTION_MASK_MAPPING dict.
    invert : boolean
        Whether to invert the boolean flags before conversion.

    Returns
    -------
    None or same kind of object as `flags`
        None if `flags` is None. Otherwise, the bit mask for
        `flag_description` where the (possibly inverted) flag is True,
        combined with LATEST_VERSION_FLAG for every element.

    Raises
    ------
    KeyError
        If `flag_description` is not a key of DESCRIPTION_MASK_MAPPING.
    """
    if flags is None:
        return None
    bool_flags = flags.astype(bool)
    if invert:
        bool_flags = ~bool_flags
    mask = DESCRIPTION_MASK_MAPPING[flag_description]
    # True * mask == mask and False * mask == 0; the version flag is OR-ed
    # into every element so the result is marked as validated.
    return (bool_flags * mask) | LATEST_VERSION_FLAG


def mask_flags(flag_description, invert=True):
    """
    Decorator that will convert a boolean pandas object into an integer,
    bitmasked object when `_return_mask=True`. This decorator adds the
    `_return_mask` kwarg to the decorated function. Using this decorator
    to mask values ensures the description and decorated function are
    clearly linked.

    Parameters
    ----------
    flag_description : str
        Description of the flag to convert from a boolean to integer. Must be
        a key of the DESCRIPTION_MASK_MAPPING dict.
    invert : boolean
        Whether to invert the boolean object before conversion e.g. if
        flag_description = 'LIMITS EXCEEDED' and a True value indicates
        that a parameter is within the limits, invert=True is required
        for the proper mapping.

    Returns
    -------
    flags : pandas Object
        Returns the output of the decorated function (which must be a pandas
        Object) as the original output or an object of type int with value
        determined by the truthiness of the original output and
        flag_description. If the decorated function returns a tuple, each
        element is converted separately and a tuple is returned.
    """
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            # _return_mask is consumed here and never passed on to f
            return_mask = kwargs.pop('_return_mask', False)
            flags = f(*args, **kwargs)
            if not return_mask:
                return flags
            if isinstance(flags, tuple):
                # use a distinct loop name so the decorated function `f`
                # is not shadowed
                return tuple(
                    convert_bool_flags_to_flag_mask(
                        flag, flag_description, invert)
                    for flag in flags)
            return convert_bool_flags_to_flag_mask(
                flags, flag_description, invert)
        return wrapper
    return decorator


def has_data_been_validated(flags):
    """
    Return True (or a boolean series) if flags has been validated

    Parameters
    ----------
    flags : integer or pandas.Series
        Quality flag(s) to test.

    Returns
    -------
    boolean or pandas.Series
        True where the flag is greater than 1. The values 0 and 1 (only
        the lowest bit set) are treated as not validated.
    """
    return flags > 1


def get_version(flag):
    """
    Extract the version from flag

    Parameters
    ----------
    flag : integer or array-like of integers
        Quality flag(s)

    Returns
    -------
    integer or array-like of integers
        The bits of `flag` selected by VERSION_MASK, shifted right by one
        bit.
    """
    # will be more complicated if another version identifier must be added
    version_bits = flag & VERSION_MASK
    return np.right_shift(version_bits, 1)


def _flag_description_checks(flag_description):
    if isinstance(flag_description, str):
        return
    else:
        if len(flag_description) == 0:
            raise TypeError('flag_description must have len > 0')
        for k in iter(flag_description):
            if not isinstance(k, str):
                raise TypeError(
                    'Elements of flag_description must have type str')


def check_if_single_value_flagged(flag, flag_description,
                                  _perform_checks=True):
    """Check if the single integer flag has been flagged for flag_description

    Parameters
    ----------
    flag : integer
        Integer flag
    flag_description : string or iterable of strings
        Checks to compare against flag
    _perform_checks : boolean
        If True (default), verify that `flag` has been validated and that
        `flag_description` has a valid type before comparing. Callers that
        have already performed these checks may pass False to skip them.

    Returns
    -------
    Boolean
        Whether any of `flag_description` checks are represented by `flag`

    Raises
    ------
    ValueError
        If flag has not been validated
    TypeError
        If flag_description is not a string or iterable of strings
    KeyError
        If flag_description is not a possible check for the flag version
    """
    if _perform_checks:
        if not has_data_been_validated(flag):
            raise ValueError('Data has not been validated')
        _flag_description_checks(flag_description)
    mask_dict = BITMASK_DESCRIPTION_DICT[get_version(flag)]
    if isinstance(flag_description, str):
        descriptions = (flag_description,)
    else:
        descriptions = flag_description

    mask = 0
    # a description with a zero mask ('OK' in version 1) cannot be detected
    # with a bitwise AND, so it is handled separately below
    ok_requested = False
    for description in descriptions:
        bit = mask_dict[description]
        if bit == 0:
            ok_requested = True
        mask |= bit

    flagged = bool(flag & mask)
    if ok_requested and not flagged:
        # coerce to a python bool so a numpy integer flag does not leak a
        # numpy.bool_ into the return value
        flagged = bool(which_data_is_ok(flag))
    return flagged


def which_data_is_ok(flags):
    """
    Return True for flags that have been validated and are OK

    Parameters
    ----------
    flags : integer or pandas.Series
        Quality flag(s)

    Returns
    -------
    boolean or pandas.Series
        True where no bit outside of VERSION_MASK is set and the flag has
        been validated.
    """
    # & binds tighter than ==; the parentheses only make that explicit
    no_check_bits_set = (flags & ~VERSION_MASK) == 0
    return no_check_bits_set & has_data_been_validated(flags)


def _make_mask_series(version):
    """
    Build a Series of bit masks indexed by description for `version`.

    The 'OK' entry and any description starting with 'VERSION' or
    'RESERVED' are left out.

    Raises
    ------
    KeyError
        If `version` is not a key of BITMASK_DESCRIPTION_DICT
    """
    version_masks = BITMASK_DESCRIPTION_DICT[version]
    skipped_prefixes = ('VERSION', 'RESERVED')
    descriptions = []
    masks = []
    for description, mask in version_masks.items():
        if description == 'OK' or description.startswith(skipped_prefixes):
            continue
        descriptions.append(description)
        masks.append(mask)
    return pd.Series(masks, index=descriptions)


def check_for_all_descriptions(flag, _check_if_validated=True):
    """
    Return a boolean Series indicating the checks a flag represents

    Parameters
    ----------
    flag : integer
        Integer flag
    _check_if_validated : boolean
        If True (default), raise an error when `flag` has not been
        validated.

    Returns
    -------
    pandas.Series
        Boolean Series indexed by check description that is True where the
        corresponding bit is set in `flag`. The OK, VERSION and RESERVED
        descriptions are not included.

    Raises
    ------
    ValueError
        If `_check_if_validated` is True and flag has not been validated
    KeyError
        If the version of `flag` is not a key of BITMASK_DESCRIPTION_DICT
    """
    if _check_if_validated and not has_data_been_validated(flag):
        raise ValueError('Data has not been validated')
    mask_series = _make_mask_series(get_version(flag))
    # a nonzero result of the bitwise AND means that bit is set in flag
    return (mask_series & flag).astype(bool)


def _convert_version_mask(ser):
    version = ser.name
    if version == 0:
        return pd.DataFrame({'NOT VALIDATED': [True] * len(ser.index)},
                            index=ser.index)
    mask_series = _make_mask_series(version)
    out = pd.DataFrame(
        np.bitwise_and(mask_series.values[None, :],
                       ser.values[:, None]),
        columns=mask_series.index,
        index=ser.index,
        dtype=bool)
    out['NOT VALIDATED'] = False
    return out


def _add_derived_masks(masks):
    """Copies input DataFrame and then adds new masks derived from
    input masks"""
    unvalidated = masks['NOT VALIDATED']
    if unvalidated.all():
        return masks
    out = masks.copy()[~unvalidated]
    for flag, operations in DERIVED_MASKS.items():
        func = operations[0]
        cols = operations[1:]
        args = [out[col] for col in cols]
        out[flag] = func(*args)
    return pd.concat([out, masks[unvalidated]], sort=False).fillna(False)


def convert_mask_into_dataframe(flag_series):
    """
    Convert `flag_series` into a boolean DataFrame indicating which checks
    the flags represent.

    Parameters
    ----------
    flag_series : pandas.Series
        Integer series of validated quality flags

    Returns
    -------
    pandas.DataFrame
       Columns are the check descriptions of BITMASK_DESCRIPTION_DICT
       (the OK, VERSION and RESERVED entries are not included) and values
       are booleans indicating if the input flag corresponds to the given
       check. An additional column, NOT VALIDATED, indicates if the data
       has not been validated. Additional columns defined by DERIVED_MASKS
       are computed based on the results of the fundamental flags.
       Columns may vary depending the version of the quality
       flags in the series.
    """
    versions = get_version(flag_series)
    # flags are converted one version at a time because each version may
    # define a different set of masks; columns missing for a version are
    # filled with False
    by_version = flag_series.groupby(versions, sort=False)
    fundamental_masks = by_version.apply(_convert_version_mask).fillna(False)
    return _add_derived_masks(fundamental_masks)


def convert_flag_frame_to_strings(flag_frame, sep=', ', empty='OK'):
    """
    Convert the `flag_frame` output of :py:func:`~convert_mask_into_dataframe`
    into a pandas.Series of strings which are the active flag names separated
    by `sep`. Any row where all columns are false will have a value of `empty`.

    Parameters
    ----------
    flag_frame : pandas.DataFrame
        Boolean DataFrame with descriptive column names
    sep : str
        String to separate column names by
    empty : str
        String to replace rows where no columns are True

    Returns
    -------
    pandas.Series
        Of joined column names from `flag_frame` separated by `sep` if True.
        Has the same index as `flag_frame`.
    """
    names = flag_frame.columns.to_numpy(dtype=object)
    active = flag_frame.to_numpy(dtype=bool)
    # Join the selected names explicitly instead of appending `sep` to
    # every name and stripping it afterwards: str.rstrip removes a set of
    # characters, which would also eat trailing separator characters that
    # belong to a column name.
    joined = [sep.join(names[row]) or empty for row in active]
    return pd.Series(joined, index=flag_frame.index, dtype=object)


def check_if_series_flagged(flag_series, flag_description):
    """
    Check if `flag_series` has been flagged for the checks given by
    flag_description

    Parameters
    ----------
    flag_series : pandas.Series
        Series of integer quality flags
    flag_description : string or iterable of strings
        Checks to compare `flag_series` to. If this is an iterable, the result
        will be a boolean indicating if the flag represents *ANY* of the
        checks.

    Returns
    -------
    pandas.Series
        Boolean Series indicating if *ANY* of `flag_description` checks are
        represented by each flag

    Raises
    ------
    ValueError
        If any of `flag_series` has not been validated.
    TypeError
        If flag_description is not a string or iterable of strings
    KeyError
        If flag_description is not a possible check for the flag version
    """
    if not has_data_been_validated(flag_series).all():
        raise ValueError('Data has not been validated')
    _flag_description_checks(flag_description)
    # the checks above cover the whole series, so they are skipped for the
    # individual values
    return flag_series.apply(
        check_if_single_value_flagged,
        flag_description=flag_description,
        _perform_checks=False)