# Source code for idaes.core.dmf.tables

#################################################################################
# The Institute for the Design of Advanced Energy Systems Integrated Platform
# Framework (IDAES IP) was produced under the DOE Institute for the
# Design of Advanced Energy Systems (IDAES).
#
# Copyright (c) 2018-2023 by the software owners: The Regents of the
# University of California, through Lawrence Berkeley National Laboratory,
# National Technology & Engineering Solutions of Sandia, LLC, Carnegie Mellon
# University, West Virginia University Research Corporation, et al.
# All rights reserved.  Please see the files COPYRIGHT.md and LICENSE.md
# for full copyright and license information.
#################################################################################
"""
Table handling for DMF.

The main class defined here is :class:`Table`. It provides constructor methods
for reading from Excel and CSV files. There is a convention defined for
indicating units in column headers so that this code can split the unit from
the column name. Other methods are defined for adding and extracting tables
from DMF :class:`idaes.core.dmf.resource.Resource` objects.

In the simplest case, you would create a new DMF resource for a CSV table like this::

    from idaes.core.dmf.resource import Resource
    resource = Resource()
    resource.add_table("my_file.csv")
    # you can now save this resource in the DMF

Then you could retrieve and use that table like this::

    # retrieve resource from the DMF
    table = resource.tables["my_file.csv"]
    dataframe = table.data    # Pandas dataframe
    units = table.units       # Units extracted from header row (strings)

See also, on the DMF Resource class:

    * :meth:`idaes.core.dmf.resource.Resource.add_table`
    * :attr:`idaes.core.dmf.resource.Resource.tables`

"""
# TODO: Missing docstrings
# pylint: disable=missing-class-docstring
# pylint: disable=missing-function-docstring

# stdlib
from typing import List, Tuple, Dict
import re
from io import BufferedIOBase, RawIOBase
import os

# ext
import pandas as pd

# Local
from idaes.core.dmf.resource import Resource

# Module authorship metadata: `__authors__` is the full list of authors and
# `__author__` is its first entry.
__authors__ = ["Dan Gunter (LBNL)"]
__author__ = __authors__[0]


class DataFormatError(Exception):
    """Error raised when table input (e.g. a column header) is malformed.

    Args:
        source: Where the problem was found (e.g. the offending column name).
        problem: Human-readable description of what is wrong.
    """

    def __init__(self, source, problem):
        message = f"in {source}: {problem}"
        # Pass only the message: `super().__init__` is already bound to this
        # instance, so also passing `self` would store the exception itself
        # as its first argument and garble `str(exc)`.
        super().__init__(message)


class Table:
    """Represent a table stored in the DMF.

    Tables are expected to have a header row with optional units, which if present
    are encoded in [square brackets]. Whitespace is ignored between the column
    name and the units. For example::

            T [C], P [bar], G0/RT H2O, G0/RT NaCl [-], A phi [(kg/mol)^0.5]
            0, 1, -23.4638, -13.836, 0.3767
    """

    def __init__(self):
        """Create new, empty, table.

        Use :meth:`read_csv` or :meth:`read_excel` to populate the table with data.
        """
        self._data = pd.DataFrame({})
        self._units = {}
        self._filepath = None
        self._desc = ""

    @property
    def data(self) -> pd.DataFrame:
        """Pandas dataframe for data."""
        return self._data

    @property
    def units_dict(self) -> Dict[str, str]:
        """Units as a dict keyed by table column name (a copy)."""
        return self._units.copy()

    @property
    def units_list(self) -> List[str]:
        """Units in order of table columns."""
        return [self._units[c] for c in self._data.columns]

    #: Shorthand for getting list of units
    units = units_list

    @property
    def description(self) -> str:
        """Free-form description of the table."""
        return self._desc

    @description.setter
    def description(self, value: str) -> None:
        self._desc = value

    @staticmethod
    def read_table(filepath, inline: bool, file_format: str) -> "Table":
        """Determine the input file type, then construct a new Table object
        by calling one of :meth:`Table.read_csv` or :meth:`Table.read_excel`.

        Args:
            filepath: Any valid first argument to pandas `read_csv`
            inline: If True, read the whole table in; otherwise just get the
                    column names and units from the header row.
            file_format: One of 'infer', 'csv', or 'excel'. For 'infer',
                         use the file extension (and only the extension) to
                         determine if it's a CSV or Excel file. The extension
                         is matched case-insensitively.

        Returns:
            Constructed Table object

        Raises:
            ValueError: If the format is unknown or cannot be inferred
            IOError: If the input cannot be read or parsed
        """
        fmt = file_format.lower()
        if fmt == "infer":
            # Path-like and most file-like objects expose `.name`; plain string
            # paths (and nameless buffers) do not, so fall back to the object.
            name = str(getattr(filepath, "name", filepath))
            lowered = name.lower()
            if lowered.endswith(".csv"):
                fmt = "csv"
            elif lowered.endswith((".xls", ".xlsx")):
                fmt = "excel"
            else:
                raise ValueError(f"Cannot infer file format for '{name}'")
        elif fmt not in ("csv", "excel"):
            raise ValueError(f"Unknown file format '{fmt}'; must be csv or excel")

        # create a new table to work with
        table = Table()

        # read only the header row if we are not including data inline
        kwargs = {} if inline else {"nrows": 0}

        # read the table (or at least its header)
        try:
            if fmt == "csv":
                table.read_csv(filepath, **kwargs)
            else:
                table.read_excel(filepath, **kwargs)
        except Exception as err:
            # Normalize all read/parse failures to IOError, keeping the cause
            raise IOError(f"Cannot read '{filepath}': {err}") from err

        return table

    def read_csv(self, filepath, **kwargs) -> None:
        """Read the table from a CSV file using pandas' `read_csv()`.
        See `Pandas read_csv docs
        <https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html>`_
        for details.

        Existing table will be replaced.

        Args:
            filepath: Any valid first argument to pandas `read_csv`
            **kwargs: Keyword arguments passed to pandas `read_csv`

        Returns:
            None

        Raises:
            DataFormatError: if a column header is invalid
        """
        self._data = pd.read_csv(filepath, **kwargs)
        self._extract_units()
        self._filepath = filepath

    def read_excel(self, filepath, **kwargs) -> None:
        """Read the table from an Excel file using pandas' `read_excel()`.
        See `Pandas read_excel docs
        <https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html>`_
        for details.

        Existing table will be replaced.

        Args:
            filepath: Any valid first argument to pandas `read_excel`
            **kwargs: Keyword arguments passed to pandas `read_excel`

        Returns:
            None

        Raises:
            ValueError: if more than one Excel sheet is returned
            DataFormatError: if the input data or header is invalid
        """
        # Workaround for older versions of Python/Pandas (python 3.6):
        # set engine explicitly to openpyxl for *.xlsx files.
        # Only the leading "major.minor" is parsed so that suffixes such as
        # "rc1" or ".dev0+g1234" in the version string cannot break parsing.
        version = re.match(r"(\d+)\.(\d+)", pd.__version__)
        is_old_pandas = version is not None and (
            int(version.group(1)),
            int(version.group(2)),
        ) < (1, 2)
        if is_old_pandas:
            # if it's a file and has xlsx extension, set engine
            if not isinstance(filepath, (BufferedIOBase, RawIOBase)):
                ext = os.path.splitext(str(filepath))[-1]
                if ext == ".xlsx":
                    kwargs["engine"] = "openpyxl"

        data = pd.read_excel(filepath, **kwargs)
        # pandas returns a dict of dataframes when several sheets are selected
        if isinstance(data, dict):
            raise ValueError(
                f"Read from excel file must return a single sheet, "
                f"but sheet_name='{kwargs.get('sheet_name', '?')}' "
                f"returned {len(data)} sheets: {list(data.keys())}"
            )
        self._data = data
        self._extract_units()
        self._filepath = filepath

    def _extract_units(self):
        """Split "name [units]" column headers of the current dataframe.

        Renames the dataframe columns to the bare names and stores the units,
        keyed by bare column name, in ``self._units``.
        """
        new_names, units_dict = {}, {}

        for name in self._data.columns:
            base_name, units = self._split_units(name)
            new_names[name] = base_name
            units_dict[base_name] = units

        self._data.rename(columns=new_names, inplace=True)
        self._units = units_dict

    #: Regular expression for extracting units from column names.
    #: In plain English, the following forms are expected for a
    #: column name: "Name", "Name[Units]", "Longer Name With $% Chars [ Units ]"
    #: For both the Name and the Units, any sequence of characters valid
    #: in the current encoding are acceptable (except, of course, a "["
    #: in the name, which means start-of-units)
    UNITS_REGEX = r"""
        (?P<name>[^[]+) # column name
        (?:\s*\[        # start of [units] section
        (?P<units>.*?)  # column units
        \])?            # end of [units] section, which is optional
        """

    @classmethod
    def _split_units(cls, name) -> Tuple[str, str]:
        """Split a column header into its name and units.

        Args:
            name: Column header, e.g. "T [C]" or "Name"

        Returns:
            Tuple of (name, units), both stripped of surrounding whitespace.
            Missing units, empty units, and the placeholder "-" are all
            returned as the empty string.

        Raises:
            DataFormatError: if no column name can be recognized
        """
        m = re.match(cls.UNITS_REGEX, name, flags=re.X)
        if m is None:
            raise DataFormatError(
                name,
                "No recognized column name. Expected syntax is "
                "'name' or 'name [units]'",
            )
        new_name = m.group("name").strip()
        # Strip before comparing so that "[ - ]" is treated the same as "[-]"
        unit = (m.group("units") or "").strip()
        if unit == "-":
            unit = ""  # normalize "dimensionless" marker to empty string
        return new_name, unit

    def add_to_resource(self, rsrc: "Resource"):
        """Add the current table, inline, to the given resource.

        Args:
            rsrc: A DMF :class:`Resource` instance

        Returns:
            None
        """
        rsrc.data[Resource.TABLE_FIELD] = self.as_dict()

    @classmethod
    def from_resource(cls, rsrc: "Resource") -> Dict[str, "Table"]:
        """Get an instance of this class from data in the given resource.

        Args:
            rsrc: A DMF :class:`Resource` instance

        Returns:
            Dictionary of tables in resource. If there is only one inline
            table, the dictionary is of length one with only key "" (empty string).
            If there are multiple tables referenced by file the dictionary
            keys are the (relative) file names.

        Raises:
            KeyError: if there are no tables in this resource
        """
        data = rsrc.v["data"]
        if Resource.TABLE_FIELD in data:
            # Single inline resource
            table_ = cls.from_dict(data[Resource.TABLE_FIELD])
            return {"": table_}
        if Resource.TABLE_INFO_FIELD in data:
            # One or more files; descriptions are looked up by datafile index
            tables = {}
            for idx, path in enumerate(rsrc.get_datafiles()):
                table_ = cls.read_table(path, True, "infer")
                table_.description = rsrc.v[Resource.DATAFILES_FIELD][idx].get(
                    "desc", ""
                )
                tables[path.name] = table_
            return tables
        raise KeyError("No table in resource")

    def as_dict(self, values=True) -> Dict:
        """Get the representation of this table as a dict.

        Args:
            values: If True, include the values in the dict. Otherwise only
                    include the units for each column.

        Returns:
            Dictionary with the structure accepted by :meth:`from_dict`.
            If the "values" argument is False, that key will be missing from
            the dict for each column.
        """
        d = {}
        for column in self._data.columns:
            d[column] = {"units": self._units[column]}
            if values:
                d[column]["values"] = list(self._data[column])
        return d

    @classmethod
    def from_dict(cls, data: Dict) -> "Table":  # unquote in Py3.7+ see PEP563
        """Create a new Table object from a dictionary of data and units.

        Args:
            data: Dictionary with the following structure::

                {
                    'column-name-1': {
                        'units': 'unit',
                        'values': [ value, value, .. ]
                    },
                    'column-name-2': {
                        'units': 'unit',
                        'values': [ value, value, .. ]
                    },
                    ...etc...
                }

                Missing 'values' default to an empty list and missing
                'units' default to the empty string.

        Returns:
            :class:`Table` object (an instance of the class this method is
            called on)
        """
        tbl = cls()  # use cls so subclasses get instances of their own type
        dataframe_dict = {}
        for column, info in data.items():
            dataframe_dict[column] = info.get("values", [])
            tbl._units[column] = info.get("units", "")
        tbl._data = pd.DataFrame(dataframe_dict)
        return tbl