Source code for solarforecastarbiter.validation.quality_mapping

Define constant mappings between bit-mask values and understandable quality flags.
from functools import wraps

import pandas as pd
import numpy as np

# The quality_flag field in MySQL is currently limited to 1 << 15;
# fields beyond 1 << 15 will require a change in the MySQL datatype
# for the quality_flag column. The mapping from description to bitmask
# is versioned so that future additions or removals are backwards compatible
# without rerunning the validation on all data.
# To add a version, increment the key and add a new value mapping, i.e. add
# version 2 like 2: {'OK': 0, 'USER FLAGGED': 1 << 0, ...} . The VERSION
# IDENTIFIER 0 - 2 must remain in their current positions. Versions 7
# and up will require another identifier bit to be determined at that
# time.  The version identifier also serves to mark data in the
# database as validated. The tuples are (description, bit mask)
    # start with 1 to distinguish validated vs not in DB
    'OK': 0,
    'USER FLAGGED': 1 << 0,
    'VERSION IDENTIFIER 0': 1 << 1,
    'VERSION IDENTIFIER 1': 1 << 2,
    'VERSION IDENTIFIER 2': 1 << 3,
    'NIGHTTIME': 1 << 4,
    'CLEARSKY': 1 << 5,
    'SHADED': 1 << 6,
    'UNEVEN FREQUENCY': 1 << 7,
    'LIMITS EXCEEDED': 1 << 8,
    'CLEARSKY EXCEEDED': 1 << 9,
    'STALE VALUES': 1 << 10,
    'CLIPPED VALUES': 1 << 12,
    'RESERVED 1': 1 << 15  # available for new flag

# logical combinations of the masks defined above.
# add a version layer for compatibility if needed in the future.
# derived masks may reference masks defined in an earlier key.
    'DAYTIME': (np.logical_not, 'NIGHTTIME'),
    'DAYTIME STALE VALUES': (np.logical_and, 'DAYTIME', 'STALE VALUES'),
        np.logical_and, 'DAYTIME', 'INTERPOLATED VALUES'),

# flags that should typically be discarded before resampling because they
# represent truly bad data

# should never change unless another VERSION IDENTIFIER is required

def convert_bool_flags_to_flag_mask(flags, flag_description, invert):
    """
    Convert a boolean flags object into an integer bitmask that also
    carries the latest version identifier.

    Parameters
    ----------
    flags : object with an ``astype`` method, or None
        Boolean-like values to convert.
    flag_description : str
        Key of DESCRIPTION_MASK_MAPPING selecting the bit to set.
    invert : boolean
        If True, the boolean values are negated before conversion.

    Returns
    -------
    None or same kind of object as `flags`
        None when `flags` is None. Otherwise the (possibly inverted)
        booleans multiplied by the bit for `flag_description` and
        OR-ed with LATEST_VERSION_FLAG.
    """
    if flags is None:
        return None
    as_bool = flags.astype(bool)
    if invert:
        as_bool = ~as_bool
    bit = DESCRIPTION_MASK_MAPPING[flag_description]
    return (as_bool * bit) | LATEST_VERSION_FLAG
def mask_flags(flag_description, invert=True):
    """
    Decorator factory that lets a function returning boolean pandas
    objects optionally return integer, bitmasked objects instead.

    The decorated function gains a `_return_mask` keyword argument.
    It is removed from the keyword arguments before the wrapped function
    is called. When it is truthy, the output is passed through
    :py:func:`convert_bool_flags_to_flag_mask`; a tuple output is
    converted element by element.

    Parameters
    ----------
    flag_description : str
        Description of the flag to convert from a boolean to integer.
        Must be a key of the DESCRIPTION_MASK_MAPPING dict.
    invert : boolean
        Whether to invert the boolean object before conversion, e.g.
        if flag_description = 'LIMITS EXCEEDED' and a True value means
        the parameter is within the limits, invert=True is required
        for the proper mapping.

    Returns
    -------
    callable
        A decorator. The function it produces returns the original
        output of the decorated function, or the integer-masked
        version of it when called with ``_return_mask=True``.
    """
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            want_mask = kwargs.pop('_return_mask', False)
            result = f(*args, **kwargs)
            if not want_mask:
                return result
            if isinstance(result, tuple):
                return tuple(
                    convert_bool_flags_to_flag_mask(
                        item, flag_description, invert)
                    for item in result)
            return convert_bool_flags_to_flag_mask(
                result, flag_description, invert)
        return wrapper
    return decorator
def has_data_been_validated(flags):
    """
    Report whether `flags` has been validated.

    Returns the result of ``flags > 1``: a single boolean for a scalar
    input, or an elementwise boolean object for a series-like input.
    """
    validated = flags > 1
    return validated
def get_version(flag):
    """
    Extract the version from `flag`.

    The bits selected by VERSION_MASK are isolated and shifted right by
    one position.
    """
    # will be more complicated if another version identifier must be added
    version_bits = flag & VERSION_MASK
    return np.right_shift(version_bits, 1)
def _flag_description_checks(flag_description):
    """
    Validate the type of `flag_description`.

    A single str is accepted as is. Anything else must have a non-zero
    length and contain only str elements.

    Raises
    ------
    TypeError
        If `flag_description` has length 0 or any element is not a str.
    """
    if isinstance(flag_description, str):
        return
    if len(flag_description) == 0:
        raise TypeError('flag_description must have len > 0')
    if not all(isinstance(item, str) for item in flag_description):
        raise TypeError(
            'Elements of flag_description must have type str')
def check_if_single_value_flagged(flag, flag_description,
                                  _perform_checks=True):
    """
    Check if the single integer flag has been flagged for flag_description

    Parameters
    ----------
    flag : integer
        Integer flag
    flag_description : string or iterable of strings
        Checks to compare against flag
    _perform_checks : boolean
        When True, verify that `flag` has been validated and that
        `flag_description` has a valid type before the lookup.

    Returns
    -------
    Boolean
        Whether any of `flag_description` checks are represented by `flag`

    Raises
    ------
    ValueError
        If flag has not been validated
    TypeError
        If flag_description is not a string or iterable of strings
    KeyError
        If flag_description is not a possible check for the flag version
    """
    if _perform_checks:
        if not has_data_been_validated(flag):
            raise ValueError('Data has not been validated')
        _flag_description_checks(flag_description)

    mask_dict = BITMASK_DESCRIPTION_DICT[get_version(flag)]
    # treat a lone string as a one-element collection of descriptions
    if isinstance(flag_description, str):
        descriptions = (flag_description,)
    else:
        descriptions = flag_description

    combined = 0
    wants_ok = False
    for description in descriptions:
        bit = mask_dict[description]
        # a zero-valued mask cannot be found with a bitwise AND, so it is
        # resolved through which_data_is_ok below
        wants_ok = wants_ok or bit == 0
        combined |= bit

    flagged = bool(flag & combined)
    if wants_ok:
        flagged |= which_data_is_ok(flag)
    return flagged
def which_data_is_ok(flags):
    """
    Return True for flags that have been validated and are OK.

    A flag is OK when no bit outside VERSION_MASK is set and
    :py:func:`has_data_been_validated` is true for it.
    """
    no_check_bits_set = (flags & ~VERSION_MASK) == 0
    return no_check_bits_set & has_data_been_validated(flags)
def _make_mask_series(version):
    """
    Build a Series of bitmasks indexed by description for `version`.

    Descriptions starting with 'VERSION' or 'RESERVED', and the 'OK'
    description, are left out.
    """
    skipped_prefixes = ('VERSION', 'RESERVED')
    version_masks = BITMASK_DESCRIPTION_DICT[version]
    selected = {
        desc: mask for desc, mask in version_masks.items()
        if desc != 'OK' and not desc.startswith(skipped_prefixes)
    }
    return pd.Series(list(selected.values()), index=list(selected))
def check_for_all_descriptions(flag, _check_if_validated=True):
    """
    Return a boolean Series indicating the checks a flag represents.

    Parameters
    ----------
    flag : integer
        Quality flag to decode.
    _check_if_validated : boolean
        When True, raise if `flag` has not been validated.

    Returns
    -------
    pandas.Series
        Indexed by check description; True where `flag` has the
        corresponding bit set.

    Raises
    ------
    ValueError
        If `_check_if_validated` is True and `flag` has not been validated.
    """
    if _check_if_validated:
        if not has_data_been_validated(flag):
            raise ValueError('Data has not been validated')
    masks = _make_mask_series(get_version(flag))
    return (masks & flag).astype(bool)
def _convert_version_mask(ser):
    """
    Convert one group of same-version flags into a boolean DataFrame.

    Intended to be used with ``groupby(...).apply`` where the groups are
    keyed by flag version.

    Parameters
    ----------
    ser : pandas.Series
        Integer flags that share one version. The version is read from
        ``ser.name``.

    Returns
    -------
    pandas.DataFrame
        For version 0, a single 'NOT VALIDATED' column that is all True.
        Otherwise one boolean column per description returned by
        ``_make_mask_series(version)`` plus an all False 'NOT VALIDATED'
        column. The index is that of `ser`.
    """
    # groupby().apply() labels each group with its key via the name
    # attribute; here that key is the flag version
    version = ser.name
    if version == 0:
        return pd.DataFrame({'NOT VALIDATED': [True] * len(ser.index)},
                            index=ser.index)
    mask_series = _make_mask_series(version)
    # broadcast (1, n_masks) against (n_flags, 1) to test every mask
    # against every flag in one operation
    out = pd.DataFrame(
        np.bitwise_and(mask_series.values[None, :], ser.values[:, None]),
        columns=mask_series.index, index=ser.index, dtype=bool)
    out['NOT VALIDATED'] = False
    return out


def _add_derived_masks(masks):
    """
    Add the masks defined in DERIVED_MASKS to the fundamental masks.

    Parameters
    ----------
    masks : pandas.DataFrame
        Boolean DataFrame that includes a 'NOT VALIDATED' column.

    Returns
    -------
    pandas.DataFrame
        `masks` itself when every row is unvalidated. Otherwise a new
        DataFrame with the validated rows (including derived columns)
        first, followed by the unvalidated rows, with missing values
        filled with False.
    """
    unvalidated = masks['NOT VALIDATED']
    if unvalidated.all():
        return masks
    # filter first, then copy, so the column assignments below write to
    # an independent frame rather than to a slice
    out = masks[~unvalidated].copy()
    for flag, operations in DERIVED_MASKS.items():
        func, *cols = operations
        out[flag] = func(*(out[col] for col in cols))
    return pd.concat([out, masks[unvalidated]], sort=False).fillna(False)
def convert_mask_into_dataframe(flag_series):
    """
    Convert `flag_series` into a boolean DataFrame indicating which
    checks the flags represent.

    Parameters
    ----------
    flag_series : pandas.Series
        Integer series of validated quality flags

    Returns
    -------
    pandas.DataFrame
        Columns are keys of BITMASK_DESCRIPTION_DICT and values are
        booleans indicating if the input flag corresponds to the given
        check. An additional column, NOT VALIDATED, indicates if the
        data has not been validated. Additional columns defined by
        DERIVED_MASKS are computed based on the results of the
        fundamental flags. Columns may vary depending on the version of
        the quality flags in the series.
    """
    vers = get_version(flag_series)
    # group_keys=False keeps the result indexed like flag_series; without
    # it, pandas >= 2.0 prepends the version as an extra index level
    fundamental_masks = flag_series.groupby(
        vers, sort=False, group_keys=False).apply(
            _convert_version_mask).fillna(False)
    return _add_derived_masks(fundamental_masks)
def convert_flag_frame_to_strings(flag_frame, sep=', ', empty='OK'):
    """
    Convert the `flag_frame` output of
    :py:func:`~convert_mask_into_dataframe` into a pandas.Series of
    strings which are the active flag names separated by `sep`. Any row
    where all columns are false will have a value of `empty`.

    Parameters
    ----------
    flag_frame : pandas.DataFrame
        Boolean DataFrame with descriptive column names
    sep : str
        String to separate column names by
    empty : str
        String to replace rows where no columns are True

    Returns
    -------
    pandas.Series
        Of joined column names from `flag_frame` separated by `sep` if
        True. Has the same index as `flag_frame`.
    """
    names = np.asarray(flag_frame.columns, dtype=object)
    active = flag_frame.values.astype(bool)
    # join only the selected names so that nothing has to be stripped
    # afterwards; stripping by character set would also eat trailing
    # characters that belong to a column name
    labels = [sep.join(names[row]) or empty for row in active]
    return pd.Series(labels, index=flag_frame.index, dtype=object)
def check_if_series_flagged(flag_series, flag_description):
    """
    Check if `flag_series` has been flagged for the checks given by
    flag_description

    Parameters
    ----------
    flag_series : pandas.Series
        Series of integer quality flags
    flag_description : string or iterable of strings
        Checks to compare `flag_series` to. If this is an iterable, the
        result will be a boolean indicating if the flag represents
        *ANY* of the checks.

    Returns
    -------
    pandas.Series
        Boolean Series indicating if *ANY* of `flag_description` checks
        are represented by each flag

    Raises
    ------
    ValueError
        If any of `flag_series` has not been validated.
    TypeError
        If flag_description is not a string or iterable of strings
    KeyError
        If flag_description is not a possible check for the flag version
    """
    all_validated = has_data_been_validated(flag_series).all()
    if not all_validated:
        raise ValueError('Data has not been validated')
    _flag_description_checks(flag_description)

    # validation was done once for the whole series above, so the
    # per-value checks are switched off
    def _is_flagged(single_flag):
        return check_if_single_value_flagged(
            single_flag, flag_description=flag_description,
            _perform_checks=False)

    return flag_series.apply(_is_flagged)