# Source code for idaes.surrogate.pysmo.polynomial_regression

#################################################################################
# The Institute for the Design of Advanced Energy Systems Integrated Platform
# Framework (IDAES IP) was produced under the DOE Institute for the
# Design of Advanced Energy Systems (IDAES), and is copyright (c) 2018-2021
# by the software owners: The Regents of the University of California, through
# Lawrence Berkeley National Laboratory,  National Technology & Engineering
# Solutions of Sandia, LLC, Carnegie Mellon University, West Virginia University
#
#################################################################################

# Imports from the python standard library
from __future__ import division

# from builtins import int, str
import os.path
import pprint
import random
import warnings

# Imports from third parties
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pickle
from pyomo.environ import *
from pyomo.core.expr.visitor import replace_expressions
import scipy.optimize as opt
from scipy.special import comb as comb
from six import string_types

# Imports from IDAES namespace
from idaes.surrogate.pysmo.utils import NumpyEvaluator

__author__ = "Oluwamayowa Amusat"

"""
The purpose of this file is to perform polynomial regression in Pyomo.
This will be done in two stages. First, a sampling plan will
be used to select samples for generating a surrogate model.
In the second stage, the surrogate model is constructed by fitting to
different order polynomials. Long term, an iterative adaptive sampling
approach will be incorporated for model improvement.
Cross-validation is used to select the best model.

FeatureScaling:
Simple script for scaling and un-scaling the input data

Polynomial Regression:
Three approaches are implemented for evaluating polynomial coefficients -
a. Moore-Penrose maximum likelihood method (Forrester et al.)
b. Optimization using the BFGS algorithm.
c. Optimization with Pyomo
The Pyomo optimization approach is enabled as the default at this time.
"""

[docs]class FeatureScaling:
"""

A class for scaling and unscaling input and output data. The class contains two main methods: data_scaling and data_unscaling
"""

def __init__(self):
pass

[docs]    @staticmethod
def data_scaling(data):
"""
data_scaling performs column-wise minimax scaling on the input dataset.

Args:
data : The input data set to be scaled. Must be a numpy array or dataframe.

Returns:
(tuple): tuple containing:
- **scaled_data**  : A 2-D Numpy Array containing the scaled data. All array values will be between [0, 1].
- **data_minimum** : A 2-D row vector containing the column-wise minimums of the input data.
- **data_maximum** : A 2-D row vector containing the column-wise maximums of the input data.

Raises:
TypeError:
Raised when the input data is not a numpy array or dataframe

"""
# Confirm that data type is an array or DataFrame
if isinstance(data, np.ndarray):
input_data = data
elif isinstance(data, pd.DataFrame):
input_data = data.values
else:
raise TypeError(
"original_data_input: Pandas dataframe or numpy array required."
)

if input_data.ndim == 1:
input_data = input_data.reshape(len(input_data), 1)
data_minimum = np.min(input_data, axis=0)
data_maximum = np.max(input_data, axis=0)
scale = data_maximum - data_minimum
scale[scale == 0.0] = 1.0
scaled_data = (input_data - data_minimum) / scale
# scaled_data = (input_data - data_minimum)/(data_maximum - data_minimum)
data_minimum = data_minimum.reshape(1, data_minimum.shape)
data_maximum = data_maximum.reshape(1, data_maximum.shape)

return scaled_data, data_minimum, data_maximum

[docs]    @staticmethod
def data_unscaling(x_scaled, x_min, x_max):
"""

data_unscaling performs column-wise un-scaling on the a minmax-scaled input dataset.

Args:
x_scaled (NumPy Array)  : Data to be un-scaled. Data values should be between 0 and 1.
x_min (NumPy vector)    : :math:n \\times 1 vector containing the actual minimum value for each column. Must contain same number of elements as the number of columns in x_scaled.
x_max (NumPy vector)    : :math:n \\times 1 vector vector containing the actual minimum value for each column. Must contain same number of elements as the number of columns in x_scaled.

Returns:
NumPy Array : A 2-D numpy array containing the scaled data, :math:x_{min} + x_{scaled} * (x_{max} - x_{min})

Raises:
IndexError: Raised when the dimensions of the arrays are inconsistent.

"""
# Check if it can be evaluated. Will return index error if dimensions are wrong
if x_scaled.ndim == 1:  # Check if 1D, and convert to 2D if required.
x_scaled = x_scaled.reshape(len(x_scaled), 1)
if (x_scaled.shape != x_min.size) or (x_scaled.shape != x_max.size):
raise IndexError("Dimensionality problems with data for un-scaling.")
unscaled_data = x_min + x_scaled * (x_max - x_min)
return unscaled_data

[docs]class PolynomialRegression:
"""

The PolynomialRegression class performs polynomial regression on a training data set.

The class must first be initialized by calling PolynomialRegression. Regression is then carried out by calling training.

For a given dataset with :math:n features :math:x_{1},x_{2},\ldots,x_{n}, Polyregression is able to consider three types of basis functions:
(a) Mononomial terms (:math:x_{i}^{p},p \leq 10) for all individual features. The maximum degree to be considered can be set by the user (**maximum_polynomial_order**)
(b) All first order interaction terms :math:x_{1}x_{2}, :math:x_{1}x_{3} etc. This can be turned on or off by the user (set **multinomials**)
(c) User defined input features, e.g. :math:\sin(x_{1}). These must be Pyomo functions and should be provided as a list by the user calling set_additional_terms method before the polynomial training is done.

**Example:**

.. code-block:: python

# Initialize the class and set additional terms
>>> d = PolynomialRegression(full_data, training_data, maximum_polynomial_order=2, max_iter=20, multinomials=1, solution_method='pyomo')
>>> p = d.get_feature_vector()

# Train polynomial model and predict output for an test data x_test
>>> d.training()
>>> predictions = d.predict_output(x_test)

Args:
regression_data_input(NumPy Array of Pandas Dataframe) : The dataset for regression training. It is expected to contain the features (X) and output (Y) data, with the output values (Y) in the last column.
original_data_input(NumPy Array of Pandas Dataframe) : If **regression_data_input** was drawn from a larger dataset by some sampling approach, the larger dataset may be provided here.
When additional data is not available, the same data supplied for training_data can be supplied - this tells the algorithm not to carry out adaptive sampling.
maximum_polynomial_order(int): The maximum polynomial order to be considered.

Further details about the optional inputs may be found under the __init__ method.

"""

[docs]    def __init__(
self,
original_data_input,
regression_data_input,
maximum_polynomial_order,
number_of_crossvalidations=None,
training_split=None,
max_fraction_training_samples=None,
max_iter=None,
solution_method=None,
multinomials=None,
fname=None,
overwrite=False,
):
"""
Initialization of PolynomialRegression class.

Args:
regression_data_input(NumPy Array of Pandas Dataframe) : The dataset for regression training. It is expected to contain features and output data, with the output values (Y) in the last column.
original_data_input(NumPy Array of Pandas Dataframe) : If **regression_data_input** was drawn from a larger dataset by some sampling approach, the larger dataset may be provided here.
maximum_polynomial_order(int) : The maximum polynomial order to be considered.

Keyword Args:
number_of_crossvalidations(int) : The number of polynomial fittings and cross-validations to be carried out for each polynomial function/expression. Must be a positive, non-zero integer. Default=3.

training_split(float): The training/test split to be used for regression_data_input. Must be between 0 and 1. Default = 0.75

solution_method(str): The method to be used for solving the least squares optimization problem for polynomial regression. Three options are available:

(a) "MLE"  : The mle (maximum likelihood estimate) method solves the least squares problem using linear algebra. Details of the method may be found in Forrester et al.
(b) "BFGS" : This approach solves the least squares problem using scipy's BFGS algorithm.
(c) "pyomo": This option solves the optimization problem in pyomo with IPOPT as solver. This is the default option.

multinomials(bool):  This option determines whether or not multinomial terms are considered during polynomial fitting. Takes 0 for No and 1 for Yes. Default = 1.

Returns:
**self** object containing all the input information.

Raises:
ValueError:
- The input datasets (**original_data_input** or **regression_data_input**) are of the wrong type (not Numpy arrays or Pandas Dataframes)

Exception:
* **maximum_polynomial_order** is not a positive, non-zero integer or **maximum_polynomial_order** is higher than the number of training samples available
Exception:
* **solution_method** is not 'mle', 'pyomo' or 'bfgs
Exception:
- **multinomials** is not binary (0 or 1)
Exception:
- **training_split** is not between 0 and 1
Exception:
- **number_of_crossvalidations** is not a positive, non-zero integer
Exception:
- **max_fraction_training_samples** is not between 0 and 1
Exception:
- **no_adaptive_samples** is not a positive, non-zero integer
Exception:
- **max_iter** is not a positive, non-zero integer

warnings.warn:
- When the number of cross-validations is too high, i.e. number_of_crossvalidations > 10
"""

print(
"\n===========================Polynomial Regression===============================================\n"
)
# Checks if fname is provided or exists in the path
if not isinstance(overwrite, bool):
raise Exception("overwrite must be boolean.")
self.overwrite = overwrite
if fname is None:
fname = "solution.pickle"
self.filename = "solution.pickle"
elif (
not isinstance(fname, str)
or os.path.splitext(fname)[-1].lower() != ".pickle"
):
raise Exception(
'fname must be a string with extension ".pickle". Please correct.'
)
if (
os.path.exists(fname) and overwrite is True
):  # Explicit overwrite done by user
print(
"Warning:",
fname,
"already exists; previous file will be overwritten.\n",
)
self.filename = fname
elif os.path.exists(fname) and overwrite is False:  # User is not overwriting
self.filename = (
os.path.splitext(fname)
+ "_v_"
+ pd.Timestamp.today().strftime("%m-%d-%y_%H%M%S")
+ ".pickle"
)
print(
"Warning:",
fname,
'already exists; results will be saved to "',
self.filename,
'".\n',
)
# self.filename = 'solution.pickle'
elif os.path.exists(fname) is False:
self.filename = fname

if isinstance(original_data_input, pd.DataFrame):
original_data = original_data_input.values
# FIXME: if we add an option to specify the response column, this needs to change
self.regression_data_columns = list(original_data_input.columns)[:-1]
elif isinstance(original_data_input, np.ndarray):
original_data = original_data_input
self.regression_data_columns = list(range(original_data_input.shape - 1))
else:
raise ValueError(
"original_data_input: Pandas dataframe or numpy array required."
)

if isinstance(regression_data_input, pd.DataFrame):
regression_data = regression_data_input.values
elif isinstance(regression_data_input, np.ndarray):
regression_data = regression_data_input
else:
raise ValueError(
"regression_data_input: Pandas dataframe or numpy array required."
)

# Check for potential dimensionality problems in input data
if regression_data.shape > original_data.shape:
raise Exception(
"The sampled data has more entries than the original dataset."
)
elif regression_data.shape != original_data.shape:
raise Exception(
"Dimensional discrepancies in the dimensions of the original and regression datasets."
)
elif (regression_data.shape == 1) or (original_data.shape == 1):
raise Exception(
"Input data requires at least two dimensions (X and Y data)."
)

self.original_data = original_data
self.regression_data = regression_data

if number_of_crossvalidations is None:
print("The number of cross-validation cases (3) is used.")
number_of_crossvalidations = 3
elif number_of_crossvalidations > 10:
warnings.warn(
"The number of cross-validations entered is large. The simulation may take a while to run"
)
self.number_of_crossvalidations = number_of_crossvalidations

if not isinstance(maximum_polynomial_order, int):
raise Exception("Maximum polynomial order must be an integer")
elif maximum_polynomial_order > 10:
warnings.warn(
"The maximum allowed polynomial order is 10. Value has been adjusted to 10."
)
maximum_polynomial_order = 10
self.max_polynomial_order = maximum_polynomial_order

self.number_of_x_vars = regression_data.shape - 1

if training_split is None:
print("The default training/cross-validation split of 0.75 is used.")
training_split = 0.75
elif training_split >= 1 or training_split <= 0:
raise Exception(
"Fraction of samples used for training must be between 0 and 1"
)
self.fraction_training = training_split

self.number_of_samples = regression_data.shape

if max_fraction_training_samples is None:
max_fraction_training_samples = 0.5
elif max_fraction_training_samples > 1 or max_fraction_training_samples < 0:
raise Exception(
"The fraction for the maximum number of training samples must be between 0 and 1"
)
self.max_fraction_training_samples = max_fraction_training_samples

if (regression_data.shape < original_data.shape) and max_iter is None:
max_iter = 10
if (
regression_data.shape == original_data.shape
):
print("No iterations will be run.")
max_iter = 0
self.max_iter = max_iter

# Ensure all other key variables are integers
if not isinstance(self.number_of_crossvalidations, int):
raise Exception("Number of cross-validations must be an integer")
raise Exception("Number of adaptive samples must be an integer")
elif not isinstance(self.max_iter, int):
raise Exception("Maximum number of iterations must be an integer")
elif self.max_polynomial_order >= regression_data.shape:
raise Exception(
"max_polynomial_order too high for the number of samples supplied"
)

if (self.max_polynomial_order <= 0) or (self.number_of_crossvalidations <= 0):
raise Exception(
"maximum_polynomial_order and number_of_crossvalidations must be positive, non-zero integers"
)
elif (self.no_adaptive_samples < 0) or (self.max_iter < 0):
raise Exception("no_adaptive_samples and max_iter must be positive")

if solution_method is None:
solution_method = "pyomo"
self.solution_method = solution_method
print("Default parameter estimation method is used.")
elif not isinstance(solution_method, string_types):
raise Exception("Invalid solution method. Must be of type <str>.")
elif (
(solution_method.lower() == "mle")
or (solution_method.lower() == "pyomo")
or (solution_method.lower() == "bfgs")
):
solution_method = solution_method.lower()
self.solution_method = solution_method
else:
raise Exception(
'Invalid parameter estimation method entered. Select one of maximum likelihood (solution_method="mle"), Pyomo optimization (solution_method="pyomo") or BFGS (solution_method="bfgs") methods. '
)
print("Parameter estimation method: ", self.solution_method, "\n")

if multinomials is None:
self.multinomials = 1
elif multinomials == 1:
self.multinomials = 1
elif multinomials == 0:
self.multinomials = 0
else:
raise Exception(
'Multinomial must be binary: input "1" for "Yes" and "0" for "No". '
)

self.feature_list = []

# Results
self.optimal_weights_array = None
self.final_polynomial_order = None
self.errors = None
self.number_of_iterations = None
self.iteration_summary = None
self.final_training_data = None
self.dataframe_of_optimal_weights_polynomial = None
self.dataframe_of_optimal_weights_extra_terms = None
self.extra_terms_feature_vector = None
self.fit_status = None

"""

The training_test_data_creation splits data into training and test data sets.

Given the number of cross-validations and the required training/test split, it:
- calculates the number of training samples as num_training = int(training_split x total number of samples),
- shuffles regression_data number_of_crossvalidations times,
- splits off the top num_training samples in each shuffle as individual training sets, and
- takes the bottom (total number of samples - num_training) samples in each shuffle to create its corresponding test dataset.

Args:
self: containing the number of training samples (self.number_of_samples), training/test split (self.fraction_training) and the required number of cross-validations (self.number_of_crossvalidations).

Keyword Args:
additional_features(NumPy Array): A numpy array containing additional features provided by the user. When supplied, additional_features is column-appended to self.regression data before the training and tests sets are created.

Returns:
Tuple(training_data, cross_val_data)

- training_data: Dictionary containing all the training datasets created.

* When no additional features have been specified, the dictionary has a length of number_of_crossvalidations.
* When additional features have been specified, the dictionary has a length of 2 * number_of_crossvalidations, with the training data for additional_features stored separately.

- cross_val_data: Dictionary containing all the test datasets created. Dictionary will have the same length as training_data.

"""
training_data = {}
cross_val_data = {}
num_training = int(np.around(self.number_of_samples * self.fraction_training))
if num_training == 0:
raise Exception("The inputted of fraction_training is too low.")
elif num_training == self.number_of_samples:
raise Exception("The inputted of fraction_training is too high.")
for i in range(1, self.number_of_crossvalidations + 1):
np.random.seed(i)
A = np.zeros(
(self.regression_data.shape, self.regression_data.shape)
)
A[:, :] = self.regression_data
np.random.shuffle(
A
)  # Shuffles the rows of the regression data randomly
training_data["training_set_" + str(i)] = A[0:num_training, :]
cross_val_data["test_set_" + str(i)] = A[num_training:, :]
A = np.zeros(
(
self.regression_data.shape,
)
)
A[:, 0 : self.regression_data.shape] = self.regression_data
np.random.shuffle(
A
)  # Shuffles the rows of the regression data randomly
training_data["training_set_" + str(i)] = A[
0:num_training, : self.regression_data.shape
]
training_data["training_extras_" + str(i)] = A[
0:num_training, self.regression_data.shape :
]
cross_val_data["test_set_" + str(i)] = A[
num_training:, : self.regression_data.shape
]
cross_val_data["test_extras_" + str(i)] = A[
num_training:, self.regression_data.shape :
]
return training_data, cross_val_data

@classmethod
def polygeneration(
self,
polynomial_order,
multinomials,
x_input_train_data,
):
"""

This function generates a x-variable vector for the required polynomial order. This is done in four stages:

- First, generates the pure mononomials are generated by increasing the polynomial degree  by 1 until polynomial_order is reached.
- Next, the first-order multinomials are generated if self.multinomials = 1. This is implemented in suci a way that each multinomial appears only once, i.e. x_i.x_j = x_j.x_i. The multinomial columns are appended to the enx of the array.
- Next, a column of ones is inserted in the first column space to represent the constant term.
- Finally, the columns containing the extra terms supplied by the user are added to the end of the array (when available).

Thus, the format of the output array is [constant, nmononomials, multinomials, extra terms]

Args:
polynomial_order(int):                    The polynomial order currently under consideration
multinomials(bool):                       Boolean variable that determines whether or not multinomial terms are considered during polynomial fitting.
x_input_train_data(NumPy Array):           Input data containing features supplied by the user

Keyword Args:

Returns:
x_train_data(NumPy Array):                 Array containing all polynomial features to be considered during regression

Example:
if polynomial_order=2, numtinomials=1, x_input_train_data = [x1, x2, x3], additional_x_training_data = [sin(x1), tanh(x3)], then x_train_data will contain the regression features
x_train_data = [1, x1, x2, x3, x1^2, x2^2, x3^2, x1.x2, x1.x3, x2.x3, sin(x1), tanh(x3)]

"""
N = x_input_train_data.shape
x_train_data = x_input_train_data
# Generate the constant and pure power terms
for i in range(2, polynomial_order + 1):
x_train_data = np.concatenate(
(x_train_data, x_input_train_data ** i), axis=1
)

if multinomials == 1:
# Next, generate first order multinomials
for i in range(0, x_input_train_data.shape):
for j in range(0, i):
x_train_data = np.concatenate(
(
x_train_data,
(
x_input_train_data[:, i] * x_input_train_data[:, j]
).reshape(N, 1),
),
axis=1,
)

# Concatenate to generate full dataset.
x_train_data = np.concatenate((np.ones((N, 1)), x_train_data), axis=1)

x_train_data = np.concatenate(
)

return x_train_data

@staticmethod
def cost_function(theta, x, y, reg_parameter):
"""

This function is an implementation of the cost function for linear regression:
cost = [sum of square errors over m samples / (2 * m)]  + [reg_parameter * theta*2 / (2 * m)]

This is the objective function for the BFGS optimization problem.

Args:
theta        : polynomial coefficients/weights,  (n x 1) in size
x            : array of features, (m x n) in size
y            : actual output vector, size (m x 1)
reg_parameter: reqularization parameter, set to

Returns:
cost_value   : the cost value for the fit, the objective value of the optimization problem

"""

y = y.reshape(y.shape, 1)
y_prediction = np.matmul(x, theta)
y_prediction = y_prediction.reshape(y_prediction.shape, 1)
cost_value = (0.5 / x.shape) * (np.sum((y - y_prediction) ** 2))
cost_penalty = (reg_parameter * 0.5 / x.shape) * (np.sum(theta ** 2))
cost_value = cost_value + cost_penalty
return cost_value

@staticmethod
"""

This function is an implementation of the gradient function for linear regression:
if
cost = [(A.x - y)^2 / 2m] + [reg_parameter * theta*2/ (2 * m)],
then
gradient = [((A.x - y)* A) / m] + [reg_parameter * theta/ m]

This is the gradient function supplied to the BFGS optimization algorithm.

Args:
theta        : polynomial coefficients/weights,  (n x 1) in size
x            : array of features, (m x n) in size
y            : actual output vector, size (m x 1)
reg_parameter: reqularization parameter

Returns:
grad_value   : the cost gradients for the fit, size (n x 1)

"""
y = y.reshape(y.shape, 1)
y_prediction = np.matmul(x, theta)
y_prediction = y_prediction.reshape(y_prediction.shape, 1)
t1 = (y_prediction - y) * x
grad_values = (1 / x.shape) * np.sum(t1, axis=0)
gradient_penalty = (reg_parameter / x.shape) * theta
theta.size,
)

def bfgs_parameter_optimization(self, x, y):
"""
This function performs parameter optimization using scipy's BFGS algorithm.
It takes in the functions pre-defined functions cost_function and gradient_function as the cost and gradient functions.

Args:
x            : array of features, (m x n) in size
y            : actual output vector, size (m x 1)

Initialization:
The regularization parameter and initial weights are set to zero,
reg_parameter = 0
init_theta = 0

Returns:
theta: The optimal linear regression weights found

"""
init_theta = np.zeros((x.shape, 1))
reg_parameter = 0.0
other_args = (x, y, reg_parameter)
theta = opt.fmin_bfgs(
self.cost_function,
init_theta,
args=other_args,
)
return theta

@staticmethod
def MLE_estimate(x, y):
"""

Maximum likelihood estimate method for solving polynomial regression problems:

If
Ax = B,
then
x = inv_A * B

where the inv_A is called the Moore-Penrose inverse.

Numpy's pseudoinverse function has been used to calculate the inverse here.

Args:
x            : array of features, (m x n) in size
y            : actual output vector, size (m x 1)

Returns:
phi: The optimal linear regression weights found

For more details about the maximum likelihood estimate methos, see to Forrester et al.

"""
moore_penrose_inverse = np.linalg.pinv(x)  # Moore Penrose inverse of vector x
phi = np.matmul(moore_penrose_inverse, y)
return phi

@staticmethod
def pyomo_optimization(x, y):
"""
Pyomo implementation of least squares optimization problem:

Minimize cost = (y' - y) ^ 2
subject to: y' = Ax

The problem is solved within Pyomo's framework using IPOPT as solver.

Args:
x            : array of features, (m x n) in size
y            : actual output vector, size (m x 1)

Returns:
phi: The optimal linear regression weights found
"""

model = ConcreteModel()

x_data = pd.DataFrame(x)
y_data = pd.DataFrame(y)

model.M = Set(initialize=x_data.index.values)  # Rows indices passed into set
model.N = Set(
initialize=x_data.columns.values
)  # x column indices passed into set
model.P = Set(
initialize=y_data.columns.values
)  # y column index passed into set

model.x = Param(model.M, model.N, initialize=x_data.stack().to_dict())
model.y_real = Param(model.M, model.P, initialize=y_data.stack().to_dict())

# Define variables
model.theta = Var(model.N, initialize=0.1, domain=Reals)

# constraint y_p = theta.X
def xy_product(model, i, k):
return sum(model.theta[j] * model.x[i, j] for j in model.N for k in model.P)

model.y_predictions = Expression(
model.M, model.P, rule=xy_product, doc="Predicted value calc: y = hx"
)

# Cost function - RMSE
def model_rms_error(model):
cost_value = (1 / len(model.M)) * sum(
((model.y_real[i, k] - model.y_predictions[i, k]) ** 2)
for i in model.M
for k in model.P
)
return cost_value

model.prediction_error = Objective(
rule=model_rms_error, sense=minimize, doc="Minimum RMSE error"
)

instance = model
opt = SolverFactory("ipopt")
opt.options["max_iter"] = 10000000
result = opt.solve(instance)  # , tee=True)

# Convert theta variable into numpy array
phi = np.zeros((len(instance.theta), 1))
iterator = 0
for s in instance.N:
phi[iterator, 0] = instance.theta[s].value
iterator += 1
return phi

@staticmethod
def cross_validation_error_calculation(phi, x_test_data, y_test_data):
"""

This function calculates the average sum of square errors between the actual and predicted output values,
ss_error = sum of squared errors / number of samples

Args:
phi             : optimal weight vector obtained by optimization
x_test_data     : vector of features x_test_data
y_test_data     : actual output values associated with

Returns:
ss_error        : The average sum of squared errors

"""
y_test_prediction = np.matmul(x_test_data, phi)
ss_error = (1 / y_test_data.shape) * (
np.sum((y_test_data - y_test_prediction) ** 2)
)
return ss_error

def polyregression(
self,
poly_order,
training_data,
test_data,
):
"""

Function that performs polynomial regression on a given dataset. It returns the estimated parameters and the fitting errors. It

- calls the method self.polygeneration to generate the required polynomial/feature array based on the current polynomial order poly_order,
- calls the pre-selected solution algorithm to solve the least squares problem, and
- calls the cross_validation_error_calculation method to calculate the training and cross-validation errors.

Args:
poly_order(int)           : The polynomial order currently being considered - between 1 and max_polynomial_order
training_data(NumPy Array) : The training data to be regressed
test_data(NumPy Array)    : The test data to be used to cross-validate the polynomial fit

Keyword Args:
additional_x_training_data  : Array containing additional training features based on additional_features list supplied by the user. Will have same number of rows as training_data.
additional_x_test_data      : Array of additional cross-validation features based on additional_features list supplied by the user. Will have same number of rows as test_data.

Returns:
phi_vector                  : the optimal weight vector for the polynomial considered here, returns zeros when problem is underspecified, i.e number of features > number of training samples.
training_error              : the average SSE estimate in the training dataset, returns Inf when number of features > number of training samples (DoF < 0).
crossval_error             : the average SSE estimate on the cross-validation dataset, returns Inf when number of features > number of training samples (DoF < 0).

"""
x_training_data = training_data[:, :-1]
y_training_data = training_data[:, -1]
x_test_data = test_data[:, :-1]
y_test_data = test_data[:, -1]
x_polynomial_data = self.polygeneration(
)

# Check that the problem has more samples than features - necessary for fitting. If not, return Infinity.
if x_polynomial_data.shape >= x_polynomial_data.shape:
if self.solution_method == "mle":
phi_vector = self.MLE_estimate(
x_polynomial_data,
y_training_data.reshape(y_training_data.shape, 1),
)
elif self.solution_method == "bfgs":
phi_vector = self.bfgs_parameter_optimization(
x_polynomial_data, y_training_data
)
elif self.solution_method == "pyomo":
phi_vector = self.pyomo_optimization(x_polynomial_data, y_training_data)
phi_vector = phi_vector.reshape(
phi_vector.shape, 1
)  # Pseudo-inverse approach

x_polynomial_data_test = self.polygeneration(
)
training_error = self.cross_validation_error_calculation(
phi_vector,
x_polynomial_data,
y_training_data.reshape(y_training_data.shape, 1),
)
crossval_error = self.cross_validation_error_calculation(
phi_vector,
x_polynomial_data_test,
y_test_data.reshape(y_test_data.shape, 1),
)

else:
phi_vector = np.zeros((x_polynomial_data.shape, 1))
phi_vector[:, 0] = np.Inf
training_error = np.Inf
crossval_error = np.Inf

# print(poly_order, x_polynomial_data.shape, x_polynomial_data.shape, training_error, crossval_error)

return phi_vector, training_error, crossval_error

def surrogate_performance(
):
"""

This function evaluates the performance of the surrogate model on the entire dataset.
1. A vector is created to hold the original input data and the predicted y values from the surrogate model is created - comparison_vector
2. The predicted values from the surrogate model are then evaluated.
3. The errors on each datapoint(individual error), the mean absolute error and the mean square errors are calculated.
4. The R-square coefficient is then calculated.
5. The adjusted R2 is calculated next, taking into account the number of terms in the equation

The comparison vector is sorted based on the performance of the surrogate model in its prediction - best to worst.
Note that the error on each data point is based on the error maximization function in ALAMO (Cozad et al., Eq. 7)

"""

comparison_vector = np.zeros(
(self.original_data.shape, self.original_data.shape + 1)
)
comparison_vector[:, : self.original_data.shape] = self.original_data[:, :]

# Create x terms for the whole input data, and evaluate the predicted y's as phi.X.
x_evaluation_data = self.polygeneration(
order_best,
self.multinomials,
self.original_data[:, 0 : self.original_data.shape - 1],
)
y_prediction = np.matmul(x_evaluation_data, phi_best)
y_prediction = y_prediction.reshape(y_prediction.shape, 1)
comparison_vector[:, self.original_data.shape] = y_prediction[:, 0]

# Error calculations:
den = np.max(comparison_vector[:, -2]) - np.min(comparison_vector[:, -2])
individual_error = (
(comparison_vector[:, -1] - comparison_vector[:, -2]) / den
) ** 2
mae_error = (1 / comparison_vector.shape) * np.sum(
np.abs(comparison_vector[:, -1] - comparison_vector[:, -2])
)
mse_error = (1 / comparison_vector.shape) * np.sum(
(comparison_vector[:, -1] - comparison_vector[:, -2]) ** 2
)
# R-squared coefficient calculation:
input_y_mean = np.mean(comparison_vector[:, -2], axis=0)
ss_total = np.sum((comparison_vector[:, -2] - input_y_mean) ** 2)
ss_residual = np.sum((comparison_vector[:, -1] - comparison_vector[:, -2]) ** 2)
r_square = 1 - (ss_residual / ss_total)

# Sort comparison vector based on error in predictions
individual_error = individual_error.reshape(individual_error.shape, 1)
comparison_vector = np.append(comparison_vector, individual_error, 1)
sorted_comparison_vector = comparison_vector[comparison_vector[:, -1].argsort()]

samp_size = self.original_data.shape
no_nonzero_terms = np.count_nonzero(phi_best[1:, 0])
# Evaluate R2_adjusted only if R2>0: Fit is better than mean value. When fit is worse than hor. line, return 0
if r_square > 0:
(1 - r_square) * ((samp_size - 1) / (samp_size - no_nonzero_terms - 1))
)
else:

return sorted_comparison_vector, mae_error, mse_error, r_square, r2_adjusted

def results_generation(self, beta, order):
"""
This function prints the results of the fitting to the screen.
"""
results_df = pd.Series()
counter = 1
print("\n------------------------------------------------------------")
print("The final coefficients of the regression terms are: \n")
print("k               |", beta[0, 0])
results_df = results_df.append(pd.Series({"k": beta[0, 0]}))
if self.multinomials == 1:
for i in range(1, order + 1):
for j in range(1, self.number_of_x_vars + 1):
print("(x_", j, ")^", i, "     |", beta[counter, 0])
col_name = "(x_" + str(j) + ")^" + str(i)
results_df = results_df.append(
pd.Series({col_name: beta[counter, 0]})
)
counter += 1
for i in range(1, self.number_of_x_vars + 1):
for j in range(1, self.number_of_x_vars + 1):
if i > j:
print("x_", j, ".x_", i, "     |", beta[counter, 0])
col_name = "(x_" + str(j) + ")" + ".(x_" + str(i) + ")"
results_df = results_df.append(
pd.Series({col_name: beta[counter, 0]})
)
counter += 1

else:
for i in range(1, order + 1):
for j in range(1, self.number_of_x_vars + 1):
print("(x_", j, ")^", i, "     |", beta[counter, 0])
col_name = "(x_" + str(j) + ")^" + str(i)
results_df = results_df.append(
pd.Series({col_name: beta[counter, 0]})
)
counter += 1

return results_df

@staticmethod
def error_plotting(vector_of_results):
"""
This function generates displays a plot of the different errors
"""
ax1 = plt.subplot(2, 2, 1)
ax1.plot(
vector_of_results[:, 0],
vector_of_results[:, 2],
"green",
vector_of_results[:, 0],
vector_of_results[:, 3],
"red",
)
ax1.set_title("Training (green) vs Cross-validation error (red)")

ax2 = plt.subplot(2, 2, 2)
ax2.plot(vector_of_results[:, 0], vector_of_results[:, 4], "green")
ax2.set_title("MAE")

ax3 = plt.subplot(2, 2, 3)
ax3.plot(vector_of_results[:, 0], vector_of_results[:, 5], "blue")
ax3.set_title("MSE")

ax4 = plt.subplot(2, 2, 4)
ax4.plot(
vector_of_results[:, 0],
vector_of_results[:, 6],
"blue",
vector_of_results[:, 0],
vector_of_results[:, 7],
"red",
)
ax4.set_title("R-squared (blue) and Adjusted R-squared (red)")

plt.show()

return ax1, ax2, ax3, ax4

"""

This function generates a 2D array of the additional features from the list supplied by the user.
Note: It assumes that each list element is 1D

Args:
additional_regression_features(list): a list of features to be added to the regression problem. Each element of the list must have the same number of entries as self.number_of_samples

Returns:

Raises:
Exception:
* when additional_regression_features is not a list
Exception:
* when the entries in additional_regression_features are not of type 1-D NumPy Array or Pandas Series
Exception:
* when the length of the entries in additional_regression_features do not match the number of rows in self.regression_data

"""
# Check for list
# Determine number of additional features, and create 2D array for additional features
)
# If entry is an array and is of the right sizes
if (
and (
== self.regression_data.shape
)
):
elif (
and (
== self.regression_data.shape
)
):
i
].values
elif (
and (
== self.regression_data.shape
)
):
i
].values
else:
raise Exception(
"Wrong data dimensions or type - additional_regression_features contain 1-D vectors, have same number of entries as regression_data and be of type pd.Series, pd.Dataframe or np.ndarray."
)

"""

polynomial_regression_fitting is the core method which is called in the PolynomialRegression class.
It ties together all the other functions in the class.

For each polynomial order, it
- calls the function user_defined_terms to generate the array of additional features (when required),
- calls the function training_test_data_creation to generate the training and test data sets,
- calls the function polyregression to determine the optimal weight vector and the fitting errors,
- determines whether the new fit improves is the best so far by the crossvalidation error of the current fit to the previous best,
- calls the function surrogate_performance to calculate the errors and R-values of the current fit, and
- returns results to user.

When adaptive sampling is done, the function also
- selects the adaptive samples to be added to the training data based on the magnitudes of the prediction errors of individual samples in self.original_data, and
- determines when the the stopping conditions have been satisfied.

The following stopping conditions are considered when adaptive sampling is in use:
- The maximum number of training samples allowed by the user has been exceeded
- Mean absolute error ~= 0
- Mean squared error ~= 0
- R^2 = 1
- The preset iteration number given by the user has been reached
- All available points in self.original_data have been used for training.

Keyword Args:
additional_regression_features(<list>):     Additional features the user wants the algorithm to consider during regression.
It should be noted that adaptive sampling is not available when additional features have been supplied by the user, i.e. when len(additional_regression_features) > 0.

Returns:
results:                                    Python object containing the results of the polynomial regression process including the polynomial order
(results.polynomial_order), polynomial coefficients (results.optimal_weights_array) and fit errors (results.errors).
See information on ResultReport class for details on contents.

"""
# Parameters that represent the best solution found at each iteration based on the cross-validation error
best_error = 1e20
train_error_fit = 1e20
phi_best = 0
order_best = 0

if (additional_regression_features is None) or (
):
print(
"max_fraction_training_samples set at ",
self.max_fraction_training_samples,
)
print(
)
print("Maximum number of iterations (Max_iter) set at: ", self.max_iter)

training_data, cross_val_data = self.training_test_data_creation()
for poly_order in range(1, self.max_polynomial_order + 1):
for cv_number in range(1, self.number_of_crossvalidations + 1):
phi, train_error, cv_error = self.polyregression(
poly_order,
training_data["training_set_" + str(cv_number)],
cross_val_data["test_set_" + str(cv_number)],
)
if cv_error < best_error:
best_error = cv_error
phi_best = phi
order_best = poly_order
train_error_fit = train_error
print(
"\nInitial surrogate model is of order",
order_best,
" with a cross-val error of %4f" % best_error,
)
(
sorted_comparison_vector,
mae_error,
mse_error,
r_square,
) = self.surrogate_performance(phi_best, order_best)
print(
"Initial Regression Model Performance:\nOrder: ",
order_best,
" / MAE: %4f" % mae_error,
" / MSE: %4f" % mse_error,
" / R^2: %4f" % r_square,
)

# Parameters that retain the previous best solutions. They are compared to the best solution at each iteration based on the R-square coefficient.
(
order_opt,
train_error_opt,
best_error_opt,
mae_error_opt,
mse_error_opt,
r_square_opt,
phi_opt,
) = (
order_best,
train_error_fit,
best_error,
mae_error,
mse_error,
r_square,
phi_best,
)

eps_neg = 1e-6
eps_pos = 0.999999
iteration_number = 1
stopping_criterion = int(
np.ceil(
self.max_fraction_training_samples * self.original_data.shape
)
)
vector_of_results = np.zeros((stopping_criterion, 9))
while (
(self.regression_data.shape < stopping_criterion)
and (mae_error > eps_neg)
and (mse_error > eps_neg)
and (r_square < eps_pos)
and (iteration_number < self.max_iter)
and (
< self.original_data.shape
)
):
print("\n-------------------------------------------------")
print("\nIteration ", iteration_number)
best_error = 1e20

# Select n_adaptive_samples worst fitting points to be added to the dataset used in the previous evaluation.
scv_input_data = sorted_comparison_vector[:, :-2]
sorted_comparison_vector_unique = scv_input_data[
np.all(
np.any(
(scv_input_data - self.regression_data[:, None]), axis=2
),
axis=0,
)
]
# PYLINT-WHY: pylint considers self.no_adaptive_samples to be None here
# pylint: disable=invalid-unary-operand-type
# pylint: enable=invalid-unary-operand-type
]
self.regression_data = np.concatenate(
)
self.number_of_samples = self.regression_data.shape[
0
]  # Never forget to update
print(
"\n",
" additional points added to training data. New number of training samples: ",
self.regression_data.shape,
)

training_data, cross_val_data = self.training_test_data_creation()

for poly_order in range(1, self.max_polynomial_order + 1):
for cv_number in range(1, self.number_of_crossvalidations + 1):
phi, train_error, cv_error = self.polyregression(
poly_order,
training_data["training_set_" + str(cv_number)],
cross_val_data["test_set_" + str(cv_number)],
)
if cv_error < best_error:
best_error = cv_error
phi_best = phi
order_best = poly_order
train_error_fit = train_error
print(
"\nThe best regression model is of order",
order_best,
" with a cross-val error of %4f" % best_error,
)

(
sorted_comparison_vector,
mae_error,
mse_error,
r_square,
) = self.surrogate_performance(phi_best, order_best)
print(
"Regression performance on full data in iteration",
iteration_number,
"\nOrder: ",
order_best,
" / MAE: %4f" % mae_error,
" / MSE: %4f" % mse_error,
" / R_sq: %4f" % r_square,
)

# Determine if solution is improved. If yes, update solution. if no, retain previous best.
(
phi_opt,
order_opt,
mae_error_opt,
mse_error_opt,
r_square_opt,
train_error_opt,
best_error_opt,
) = (
phi_best,
order_best,
mae_error,
mse_error,
r_square,
train_error_fit,
best_error,
)
print("New solution found.")
else:
print("Previous solution retained.")
vector_of_results[iteration_number, :] = [
iteration_number,
order_opt,
train_error_opt,
best_error_opt,
mae_error_opt,
mse_error_opt,
r_square_opt,
self.regression_data.shape,
]
iteration_number += 1

# Remove all zero rows in the solution vector
vector_of_results = vector_of_results[
~np.all(vector_of_results == 0, axis=1)
]
# Round phi to 2.d.p and print results to screen
beta_vector = np.round(phi_opt, 6)
print("\nPolynomial regression performs poorly for this dataset.")
else:
print(
"\nPolynomial regression generates a good surrogate model for the input data."
)
if iteration_number > 1:
_, _, _, _ = self.error_plotting(vector_of_results)
print(
"\n-------------------------------------------------\n-------------------------------------------------"
)
print(
"Best solution found: ",
"\nOrder: ",
order_opt,
" / MAE: %4f" % mae_error_opt,
" / MSE: %4f" % mse_error_opt,
" / R_sq: %4f" % r_square_opt,
)
dataframe_coeffs = self.results_generation(beta_vector, order_opt)

vector_of_results_df = pd.DataFrame(
{
"Iteration_number": vector_of_results[:, 0],
"Polynomial order": vector_of_results[:, 1],
"Training error": vector_of_results[:, 2],
"Cross-val error": vector_of_results[:, 3],
"MAE": vector_of_results[:, 4],
"MSE": vector_of_results[:, 5],
"R2": vector_of_results[:, 6],
"Number of training samples": vector_of_results[:, 8],
}
)

extra_terms_feature_vector = list(
self.feature_list[i] for i in self.regression_data_columns
)

# Results
self.optimal_weights_array = phi_opt
self.final_polynomial_order = order_opt
self.errors = {
"MAE": mae_error_opt,
"MSE": mse_error_opt,
"R2": r_square_opt,
}
self.number_of_iterations = iteration_number
self.iteration_summary = vector_of_results_df
self.final_training_data = self.regression_data

self.dataframe_of_optimal_weights_polynomial = dataframe_coeffs
self.dataframe_of_optimal_weights_extra_terms = []
self.extra_terms_feature_vector = extra_terms_feature_vector
if r_square_opt > 0.95:
self.fit_status = "ok"
else:
warnings.warn(
"Polynomial regression generates poor fit for the dataset"
)
self.fit_status = "poor"

self.pickle_save({"model": self})
return self

else:
print("No iterations will be run.")
# Determine number of additional features based on length of list, and convert list into array
)

training_data, cross_val_data = self.training_test_data_creation(
)
for poly_order in range(1, self.max_polynomial_order + 1):
for cv_number in range(1, self.number_of_crossvalidations + 1):
phi, train_error, cv_error = self.polyregression(
poly_order,
training_data["training_set_" + str(cv_number)],
cross_val_data["test_set_" + str(cv_number)],
training_data["training_extras_" + str(cv_number)],
cross_val_data["test_extras_" + str(cv_number)],
)
if cv_error < best_error:
best_error = cv_error
phi_best = phi
order_best = poly_order
train_error_fit = train_error
print(
"\nBest surrogate model is of order",
order_best,
" with a cross-val S.S. Error  of %4f" % best_error,
)

# KEY: Modification of self variable outside initialization. Required to make @surrogate_performance work here.
self.original_data = self.regression_data
_, mae_error, mse_error, r_square, _ = self.surrogate_performance(
)

# Round solution to 6.d.p
beta_vector = np.round(phi_best, 6)

# Print results to screen
dataframe_coeffs = self.results_generation(beta_vector, order_best)

extra_terms_coeffs = pd.Series()
print(
"\nThe coefficients of the extra terms in additional_regression_features are:\n"
)
for af in range(number_additional_features, 0, -1):
print(
"]: ",
beta_vector[len(beta_vector) - af, 0],
)
col_name = (
+ str(number_additional_features - af + 1)
+ "]"
)
extra_terms_coeffs = extra_terms_coeffs.append(
pd.Series({col_name: beta_vector[len(beta_vector) - af, 0]})
)

# Print errors
print(
"\nRegression model performance on training data:\nOrder: ",
order_best,
" / MAE: %4f" % mae_error,
" / MSE: %4f" % mse_error,
" / R^2: %4f" % r_square,
)

extra_terms_feature_vector = list(
self.feature_list[i] for i in self.regression_data_columns
)

# Results
self.optimal_weights_array = phi_best
self.final_polynomial_order = order_best
self.errors = {"MAE": mae_error, "MSE": mse_error, "R2": r_square}
self.number_of_iterations = []
self.iteration_summary = []
self.final_training_data = self.regression_data
self.dataframe_of_optimal_weights_polynomial = dataframe_coeffs
self.dataframe_of_optimal_weights_extra_terms = extra_terms_coeffs
self.extra_terms_feature_vector = extra_terms_feature_vector
if r_square > 0.95:
self.fit_status = "ok"
else:
warnings.warn(
"Polynomial regression generates poor fit for the dataset"
)
self.fit_status = "poor"

self.pickle_save({"model": self})

return self

[docs]    def get_feature_vector(self):
"""

The get_feature_vector method generates the list of regression features from the column headers of the input dataset.

Returns:
Pyomo IndexedParam  : An indexed parameter list of the variables supplied in the original data

**Example:**

.. code-block:: python

# Create a small dataframe with three columns ('one', 'two', 'three') and two rows (A, B)
>>> xy_data = pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], orient='index', columns=['one', 'two', 'three'])

# Initialize the **PolynomialRegression** class and print the column headers for the variables
>>> f = PolynomialRegression(xy_data, xy_data, maximum_polynomial_order=1, multinomials=True, training_split=0.8)
>>> p = f.get_feature_vector()
>>> for i in p.keys():
>>>     print(i)
one
two

"""
p = Param(self.regression_data_columns, mutable=True, initialize=0)
p.index_set().construct()
p.construct()
self.feature_list = p
return p

"""

set_additional_terms accepts additional user-defined features for consideration during regression.

Args:
term_list (list) : List of additional terms to be considered as regression features. Each term in the list must be a Pyomo-supported intrinsic function.

**Example:**

.. code-block:: python

# To add the sine and cosine of a variable with header 'X1' in the dataset as additional regression features:
>>> xy_data = pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], orient='index', columns=['X1', 'X2', 'Y'])
>>> A = PolynomialRegression(xy_data, xy_data, maximum_polynomial_order=5)
>>> p = A.get_feature_vector()
>>> A.set_additional_terms([ pyo.sin(p['X1']) , pyo.cos(p['X1']) ])

"""

# def fit_surrogate(self):
[docs]    def training(self):
"""

The training method trains a polynomial model to an input dataset.
It calls the core method which is called in the PolynomialRegression class (polynomial_regression_fitting).
It accepts no user input, inheriting the information passed in class initialization.

Returns:
tuple   : Python Object (**results**) containing the results of the polynomial regression process including:
- the polynomial order  (**self.final_polynomial_order**)
- polynomial coefficients (**self.optimal_weights_array**), and
- MAE and MSE errors as well as the :math:R^{2} (**results.errors**).

"""

cMap = ComponentMap()
for i, col in enumerate(self.regression_data_columns):
cMap[self.feature_list[col]] = self.regression_data[:, i]
npe = NumpyEvaluator(cMap)
)

[docs]    def generate_expression(self, variable_list):
"""

The generate_expression method returns the Pyomo expression for the polynomial model trained.

The expression is constructed based on a supplied list of variables **variable_list** and the output of training.

Args:
variable_list(list)           : List of input variables to be used in generating expression. This can be the a list generated from the results of get_feature_vector. The user can also choose to supply a new list of the appropriate length.

Returns:
Pyomo Expression              : Pyomo expression of the polynomial model based on the variables provided in **variable_list**.

"""
terms = PolynomialRegression.polygeneration(
self.final_polynomial_order, self.multinomials, np.array([variable_list])
).transpose()
n = len(terms)

ans = sum(
w * t
for w, t in zip(
np.nditer(self.optimal_weights_array),
np.nditer(terms, flags=["refs_ok"]),
)
)

user_term_map = dict(
(id(a), b)
for a, b in zip(
self.extra_terms_feature_vector,
variable_list,
)
)
for w, expr in zip(
np.nditer(self.optimal_weights_array[n:]),
):
ans += float(w) * replace_expressions(expr, user_term_map)
return ans

[docs]    def predict_output(self, x_data):
"""

The predict_output method generates output predictions for input data x_data based a previously generated polynomial fitting.

Args:
x_data          : Numpy array of designs for which the output is to be evaluated/predicted.

Returns:
Numpy Array    : Output variable predictions based on the polynomial fit.

"""
nf = x_data.shape
x_list = [i for i in range(0, nf)]
import pyomo.environ as aml

m = aml.ConcreteModel()
i = aml.Set(initialize=x_list)
m.xx = aml.Var(i)
m.o2 = aml.Objective(expr=self.generate_expression([m.xx[i] for i in x_list]))
y_eq = np.zeros((x_data.shape, 1))
for j in range(0, x_data.shape):
for i in x_list:
m.xx[i] = x_data[j, i]
y_eq[j, 0] = aml.value(m.o2([m.xx[i] for i in x_list]))
return y_eq

def pickle_save(self, solutions):
"""
The training method saves the results of the run in a pickle object. It saves an object with two elements: the setup (index) and the results (index).
"""
try:
filehandler = open(self.filename, "wb")
pickle.dump(solutions, filehandler)
print("\nResults saved in ", str(self.filename))
except:
raise IOError("File could not be saved.")

@staticmethod
"""
pickle_load loads the results of a saved run 'file.obj'. It returns an array of two elements: the setup (index) and the results (index).

Input arguments:
solution_file            : Pickle object file containing previous solution to be loaded.

"""
try:
filehandler = open(solution_file, "rb")
except:
raise Exception("File could not be loaded.")

def parity_residual_plots(self):
"""

inputs:

Returns:

"""
y_predicted = self.predict_output(self.final_training_data[:, :-1])
fig1 = plt.figure(figsize=(16, 9), tight_layout=True)
ax.plot(self.final_training_data[:, -1], self.final_training_data[:, -1], "-")
ax.plot(self.final_training_data[:, -1], y_predicted, "o")
ax.set_xlabel(r"True data", fontsize=12)
ax.set_ylabel(r"Surrogate values", fontsize=12)
ax.set_title(r"Parity plot", fontsize=12)

ax2.plot(
self.final_training_data[:, -1],
self.final_training_data[:, -1]
- y_predicted[:,].reshape(
y_predicted.shape,
),
"s",
mfc="w",
mec="m",
ms=6,
)
ax2.axhline(y=0, xmin=0, xmax=1)
ax2.set_xlabel(r"True data", fontsize=12)
ax2.set_ylabel(r"Residuals", fontsize=12)
ax2.set_title(r"Residual plot", fontsize=12)

plt.show()

return

def _report(self):
## Will only work with Python > 3.5
var_list = []
eqn = self.generate_expression(var_list)

double_line = "=" * 120
s = (
f"\n{double_line}"
f"\nResults of polynomial regression run:\n"
f"\nPolynomial order                   : {self.final_polynomial_order}\n"
f"Number of terms in polynomial model: {self.optimal_weights_array.size}\n"
f"\nPolynomial Expression:\n"
f"--------------------------\n"
f"\n{eqn}\n"
f"--------------------------\n"
f"\nModel training errors:"
f"\n-----------------------\n"
f"Mean Squared Error (MSE)         : {self.errors['MSE']}\n"
f"Root Mean Squared Error (RMSE)   : {np.sqrt(self.errors['MSE'])}\n"
f"Mean Absolute error (MSE)        : {self.errors['MAE']}\n"
f"Goodness of fit (R2)             : {self.errors['R2']}\n"
f"\n{double_line}"
)
return s

def print_report(self):
s = self._report()
print(s)

def _repr_pretty_(self, p, cycle=False):

s = self._report()
p.text(s)

[docs]    def confint_regression(self, confidence=0.95):
"""
The confint_regression method prints the confidence intervals for the regression patamaters.

Args:
confidence      : Required confidence interval level, default = 0.95 (95%)

"""
from scipy.stats import t

data = self.final_training_data
y_pred = self.predict_output(data[:, :-1])
dof = (
data.shape - len(self.optimal_weights_array) + 1
)  # be careful when there are additional features
ssr = np.sum((data[:, -1] - y_pred[:, 0]) ** 2)
sig_sq = ssr / dof
if (self.additional_features_data is None) or (
):
x_exp = self.polygeneration(
self.final_polynomial_order, self.multinomials, data[:, :-1]
)  # will not account for additional features
else:
x_exp = self.polygeneration(
self.final_polynomial_order,
self.multinomials,
data[:, :-1],
)

covar = sig_sq * np.linalg.pinv(x_exp.transpose() @ x_exp)
ss_reg_params = np.sqrt(
np.diag(covar)
)  # standard error for each regression parameter
t_dist = t.ppf(
(1 + confidence) / 2, dof
)  # alternatively, t_dist_data = st.t.interval(0.99, 8)
# Evaluate confidence intervals, Tabulate and print results
c_data = np.zeros((self.optimal_weights_array.shape, 4))
c_data[:, 0] = self.optimal_weights_array[:, 0]
c_data[:, 1] = ss_reg_params[
:,
]
c_data[:, 2] = (
self.optimal_weights_array[:, 0]
- t_dist
* ss_reg_params[
:,
]
)
c_data[:, 3] = (
self.optimal_weights_array[:, 0]
+ t_dist
* ss_reg_params[
:,
]
)