Source code for solarforecastarbiter.io.fetch.bsrn

"""Parse data from the BSRN, fetch data from NASA LARC.

Modified from pvlib python pvlib/iotools/bsrn.py.
See LICENSES/PVLIB-PYTHON_LICENSE
"""
from io import StringIO
import logging

import pandas as pd
import requests

# Module-level logger, registered under the fixed name 'bsrn'.
logger = logging.getLogger('bsrn')

# Character ranges of the 13 fixed-width fields found on one line of an
# LR0100 record, given as half-open (start, stop) pairs for pandas.read_fwf.
COL_SPECS = [
    (0, 3), (4, 9), (10, 16), (16, 22), (22, 27), (27, 32), (32, 39),
    (39, 45), (45, 50), (50, 55), (55, 64), (64, 70), (70, 75),
]

# Names for the 26 values of one observation. parse_bsrn joins two
# consecutive lines (13 fields each) into a single row, so the first 13
# names belong to the first line and the last 13 to the second line.
# Fields named 'empty' carry no data and are dropped by parse_bsrn.
BSRN_COLUMNS = (
    # first line of the pair
    ['day', 'minute']
    + ['ghi', 'ghi_std', 'ghi_min', 'ghi_max']
    + ['dni', 'dni_std', 'dni_min', 'dni_max']
    + ['empty'] * 3
    # second line of the pair
    + ['empty'] * 2
    + ['dhi', 'dhi_std', 'dhi_min', 'dhi_max']
    + ['lwd', 'lwd_std', 'lwd_min', 'lwd_max']
    + ['temp_air', 'relative_humidity', 'pressure']
)


def parse_bsrn(fbuf):
    """
    Parse a buffered BSRN station-to-archive file into a DataFrame.

    The BSRN (Baseline Surface Radiation Network) is a world wide network
    of high-quality solar radiation monitoring stations as described in [1]_.
    The function only parses the basic measurements (LR0100), which include
    global, diffuse, direct and downwelling long-wave radiation [2]_. Future
    updates may include parsing of additional data and meta-data.

    BSRN files are freely available and can be accessed via FTP [3]_. Required
    username and password are easily obtainable as described in the BSRN's
    Data Release Guidelines [4]_.

    Parameters
    ----------
    fbuf: io.StringIO
        A buffer containing the data to be parsed. It is read from its
        current position and rewound with ``seek(0)``, so it should be
        positioned at the start of the file.

    Returns
    -------
    data: DataFrame
        A DataFrame with the columns as described below, indexed by UTC
        timestamps built from the file's year/month and each row's day and
        minute. For more extensive description of the variables,
        consult [2]_.

    Raises
    ------
    KeyError
        If the file does not contain the logical record LR0100.

    Notes
    -----
    The data DataFrame includes the following fields:

    =======================  ======  ==========================================
    Key                      Format  Description
    =======================  ======  ==========================================
    day                      int     Day of the month 1-31
    minute                   int     Minute of the day 0-1439
    ghi                      float   Mean global horizontal irradiance [W/m^2]
    ghi_std                  float   Std. global horizontal irradiance [W/m^2]
    ghi_min                  float   Min. global horizontal irradiance [W/m^2]
    ghi_max                  float   Max. global horizontal irradiance [W/m^2]
    dni                      float   Mean direct normal irradiance [W/m^2]
    dni_std                  float   Std. direct normal irradiance [W/m^2]
    dni_min                  float   Min. direct normal irradiance [W/m^2]
    dni_max                  float   Max. direct normal irradiance [W/m^2]
    dhi                      float   Mean diffuse horizontal irradiance [W/m^2]
    dhi_std                  float   Std. diffuse horizontal irradiance [W/m^2]
    dhi_min                  float   Min. diffuse horizontal irradiance [W/m^2]
    dhi_max                  float   Max. diffuse horizontal irradiance [W/m^2]
    lwd                      float   Mean downward long-wave radiation [W/m^2]
    lwd_std                  float   Std. downward long-wave radiation [W/m^2]
    lwd_min                  float   Min. downward long-wave radiation [W/m^2]
    lwd_max                  float   Max. downward long-wave radiation [W/m^2]
    temp_air                 float   Air temperature [°C]
    relative_humidity        float   Relative humidity [%]
    pressure                 float   Atmospheric pressure [hPa]
    =======================  ======  ==========================================

    References
    ----------
    .. [1] `World Radiation Monitoring Center - Baseline Surface Radiation
        Network (BSRN)
        <https://bsrn.awi.de/>`_
    .. [2] `Update of the Technical Plan for BSRN Data Management, 2013,
       Global Climate Observing System (GCOS) GCOS-174.
       <https://bsrn.awi.de/fileadmin/user_upload/bsrn.awi.de/Publications/gcos-174.pdf>`_
    .. [3] `BSRN Data Retrieval via FTP
       <https://bsrn.awi.de/data/data-retrieval-via-ftp/>`_
    .. [4] `BSRN Data Release Guidelines
       <https://bsrn.awi.de/data/conditions-of-data-release/>`_
    """

    # Map each logical record (LR) number to the zero-based line number of
    # its marker line. Marker lines start with '*' and carry the 4 digit
    # record number in characters 2-5.
    record_starts = {}

    fbuf.readline()  # first line should be *U0001, so read it and discard
    record_starts['0001'] = 0
    date_line = fbuf.readline()  # second line contains the year and month
    start_date = pd.Timestamp(year=int(date_line[7:11]),
                              month=int(date_line[3:6]), day=1,
                              tz='UTC')  # BSRN timestamps are UTC

    # Two lines are already consumed, so numbering continues at 2. After the
    # loop last_line_no is the line number of the final line in the buffer.
    last_line_no = 1
    for last_line_no, line in enumerate(fbuf, start=2):
        if line.startswith('*'):
            record_starts[line[2:6]] = last_line_no

    fbuf.seek(0)  # reset buffer to start of data

    try:
        lr0100_line = record_starts['0100']
    except KeyError:
        raise KeyError(
            'logical record LR0100 not found in BSRN file') from None

    # Data begins on the line after the LR0100 marker and runs to the line
    # before the next marker, or to the end of the file if LR0100 is the
    # last record. Comparing against the marker line (not the first data
    # line) also catches a marker that follows LR0100 immediately.
    start_row = lr0100_line + 1
    later_records = [i for i in record_starts.values() if i > lr0100_line]
    if later_records:
        end_row = min(later_records) - 1
    else:
        end_row = last_line_no
    nrows = end_row - start_row + 1

    # Read file as a fixed width file (fwf)
    data = pd.read_fwf(fbuf, skiprows=start_row, nrows=nrows, header=None,
                       colspecs=COL_SPECS, na_values=[-999.0, -99.9],
                       compression='infer')

    # Each observation spans two consecutive lines. Index rows by
    # (observation number, line within observation) and unstack so every
    # observation becomes one row holding the fields of both lines.
    data = data.set_index([data.index // 2, data.index % 2])
    data = data.unstack(level=1).swaplevel(i=0, j=1, axis='columns')

    # Sort columns to match original order and assign column names
    data = data.reindex(sorted(data.columns), axis='columns')
    data.columns = BSRN_COLUMNS
    # Drop empty columns
    data = data.drop('empty', axis='columns')

    # Change day and minute type to (nullable) integer
    data['day'] = data['day'].astype('Int64')
    data['minute'] = data['minute'].astype('Int64')

    # Set datetime index. 'min' is used because the 'T' alias for minutes
    # is deprecated in recent pandas releases.
    data.index = (start_date
                  + pd.to_timedelta(data['day']-1, unit='d')
                  + pd.to_timedelta(data['minute'], unit='min'))

    return data


def read_bsrn_from_nasa_larc(start, end):
    """Read a range of BSRN monthly data from the NASA LARC.

    One file is requested for every calendar month between ``start`` and
    ``end``, never going past the last completed month. A month that cannot
    be fetched or parsed is logged as a warning and skipped, so the result
    may have gaps.

    Parameters
    ----------
    start: pandas.Timestamp
        Beginning of the requested period.
    end: pandas.Timestamp
        End of the requested period. It is compared with a UTC timestamp,
        so it is expected to be timezone-aware.

    Returns
    -------
    bsrn_data: pd.DataFrame
        The monthly data concatenated and restricted to ``start``..``end``,
        or an empty DataFrame if no month was read.
    """
    # data not available until month is complete, so avoid requesting file
    # that does not exist. assumes file is available as soon as month is
    # complete.
    # Anchor on day 1 of the current month rather than subtracting a
    # MonthBegin offset: on the first day of a month that offset steps back
    # one whole extra month and would skip the month that just finished.
    start_of_this_month = (
        pd.Timestamp.now(tz='UTC').normalize().replace(day=1))
    end_of_last_month = start_of_this_month - pd.Timedelta('1s')
    range_end = min(end, end_of_last_month)
    # use period_range to avoid this funky date_range behavior:
    # > pd.date_range(start='2020-01-01', end='2020-01-30 23:59:59', freq='M')
    # DatetimeIndex([], dtype='datetime64[ns]', freq='M')
    # > pd.date_range(start='2020-01-01', end='2020-01-31 00:00:00', freq='M')
    # DatetimeIndex(['2020-01-31'], dtype='datetime64[ns]', freq='M')
    months = pd.period_range(start=start, end=range_end, freq='M')
    # a better programmer would use asyncio
    month_data = []
    for month in months:
        try:
            d = read_bsrn_month_from_nasa_larc(month.year, month.month)
        except Exception as e:
            # Best effort: one bad month must not discard the others.
            logger.warning(
                'could not get bsrn data from nasa larc for %s, %s. %s',
                month.year, month.month, e)
        else:
            month_data.append(d)
    # concat raises exception on empty list, so return early. This happens
    # when the range holds no completed month or every month failed.
    if not month_data:
        return pd.DataFrame()  # pragma: no cover
    bsrn_data = pd.concat(month_data)
    return bsrn_data.loc[start:end]


def read_bsrn_month_from_nasa_larc(year, month, timeout=60):
    """Read one month of BSRN data from the NASA LARC.

    Parameters
    ----------
    year: int, str
        The year of the data, given with four digits.
    month: int, str
        The month of the data (1 - 12).
    timeout: float or tuple, optional
        Timeout in seconds passed to ``requests.get`` so that a stalled
        server cannot block the caller indefinitely.

    Returns
    -------
    bsrn_data: pd.DataFrame
        The parsed contents of the monthly file, as returned by
        ``parse_bsrn``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, e.g. because the file
        for the requested month does not exist.
    requests.Timeout
        If the server does not respond within ``timeout``.

    Notes
    -----
    Data starts in December, 2014.
    """
    base_url = 'https://cove.larc.nasa.gov/BSRN/LRC49/'
    year = str(year)
    # Files are named lrcMMYY.dat and stored in a directory per 4 digit year.
    url = f'{base_url}{year}/lrc{int(month):02}{year[2:]}.dat'
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    with StringIO(r.text) as buf:
        return parse_bsrn(buf)