Source code for pysindy.feature_library.polynomial_library

from itertools import chain
from itertools import combinations
from itertools import combinations_with_replacement as combinations_w_r

import numpy as np
from scipy import sparse
from sklearn import __version__
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing._csr_polynomial_expansion import _csr_polynomial_expansion
from sklearn.utils.validation import check_is_fitted

from ..utils import AxesArray
from ..utils import comprehend_axes
from ..utils import wrap_axes
from .base import BaseFeatureLibrary
from .base import x_sequence_or_item


[docs]class PolynomialLibrary(PolynomialFeatures, BaseFeatureLibrary): """Generate polynomial and interaction features. This is the same as :code:`sklearn.preprocessing.PolynomialFeatures`, but also adds the option to omit interaction features from the library. Parameters ---------- degree : integer, optional (default 2) The degree of the polynomial features. include_interaction : boolean, optional (default True) Determines whether interaction features are produced. If false, features are all of the form ``x[i] ** k``. interaction_only : boolean, optional (default False) If true, only interaction features are produced: features that are products of at most ``degree`` *distinct* input features (so not ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). include_bias : boolean, optional (default True) If True (default), then include a bias column, the feature in which all polynomial powers are zero (i.e. a column of ones - acts as an intercept term in a linear model). order : str in {'C', 'F'}, optional (default 'C') Order of output array in the dense case. 'F' order is faster to compute, but may slow down subsequent estimators. library_ensemble : boolean, optional (default False) Whether or not to use library bagging (regress on subset of the candidate terms in the library) ensemble_indices : integer array, optional (default [0]) The indices to use for ensembling the library. Attributes ---------- powers_ : array, shape (n_output_features, n_input_features) powers_[i, j] is the exponent of the jth input in the ith output. n_input_features_ : int The total number of input features. WARNING: This is deprecated in scikit-learn version 1.0 and higher so we check the sklearn.__version__ and switch to n_features_in if needed. n_output_features_ : int The total number of output features. This number is computed by iterating over all appropriately sized combinations of input features. """ def __init__( self, degree=2, include_interaction=True, interaction_only=False, include_bias=True, order="C", library_ensemble=False, ensemble_indices=[0], ): super(PolynomialLibrary, self).__init__( degree=degree, interaction_only=interaction_only, include_bias=include_bias, order=order, ) BaseFeatureLibrary.__init__( self, library_ensemble=library_ensemble, ensemble_indices=ensemble_indices ) if degree < 0 or not isinstance(degree, int): raise ValueError("degree must be a nonnegative integer") if (not include_interaction) and interaction_only: raise ValueError( "Can't have include_interaction be False and interaction_only" " be True" ) self.include_interaction = include_interaction @staticmethod def _combinations( n_features, degree, include_interaction, interaction_only, include_bias ): comb = combinations if interaction_only else combinations_w_r start = int(not include_bias) if not include_interaction: if include_bias: return chain( [()], chain.from_iterable( combinations_w_r([j], i) for i in range(1, degree + 1) for j in range(n_features) ), ) else: return chain.from_iterable( combinations_w_r([j], i) for i in range(1, degree + 1) for j in range(n_features) ) return chain.from_iterable( comb(range(n_features), i) for i in range(start, degree + 1) ) @property def powers_(self): check_is_fitted(self) if float(__version__[:3]) >= 1.0: n_features = self.n_features_in_ else: n_features = self.n_input_features_ combinations = self._combinations( n_features, self.degree, self.include_interaction, self.interaction_only, self.include_bias, ) return np.vstack([np.bincount(c, minlength=n_features) for c in combinations])
[docs] def get_feature_names(self, input_features=None): """Return feature names for output features. Parameters ---------- input_features : list of string, length n_features, optional String names for input features if available. By default, "x0", "x1", ... "xn_features" is used. Returns ------- output_feature_names : list of string, length n_output_features """ powers = self.powers_ if input_features is None: input_features = ["x%d" % i for i in range(powers.shape[1])] feature_names = [] for row in powers: inds = np.where(row)[0] if len(inds): name = " ".join( "%s^%d" % (input_features[ind], exp) if exp != 1 else input_features[ind] for ind, exp in zip(inds, row[inds]) ) else: name = "1" feature_names.append(name) return feature_names
[docs] @x_sequence_or_item def fit(self, x_full, y=None): """ Compute number of output features. Parameters ---------- x : array-like, shape (n_samples, n_features) The data. Returns ------- self : instance """ n_features = x_full[0].shape[x_full[0].ax_coord] combinations = self._combinations( n_features, self.degree, self.include_interaction, self.interaction_only, self.include_bias, ) if float(__version__[:3]) >= 1.0: self.n_features_in_ = n_features else: self.n_input_features_ = n_features self.n_output_features_ = sum(1 for _ in combinations) return self
[docs] @x_sequence_or_item def transform(self, x_full): """Transform data to polynomial features. Parameters ---------- x : array-like or CSR/CSC sparse matrix, shape (n_samples, n_features) The data to transform, row by row. Prefer CSR over CSC for sparse input (for speed), but CSC is required if the degree is 4 or higher. If the degree is less than 4 and the input format is CSC, it will be converted to CSR, have its polynomial features generated, then converted back to CSC. If the degree is 2 or 3, the method described in "Leveraging Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is used, which is much faster than the method used on CSC input. For this reason, a CSC input will be converted to CSR, and the output will be converted back to CSC prior to being returned, hence the preference of CSR. Returns ------- xp : np.ndarray or CSR/CSC sparse matrix, shape (n_samples, n_output_features) The matrix of features, where n_output_features is the number of polynomial features generated from the combination of inputs. """ check_is_fitted(self) xp_full = [] for x in x_full: if sparse.issparse(x) and x.format not in ["csr", "csc"]: # create new with correct sparse axes = comprehend_axes(x) x = x.asformat("csr") wrap_axes(axes, x) n_samples = x.shape[x.ax_time] n_features = x.shape[x.ax_coord] if float(__version__[:3]) >= 1.0: if n_features != self.n_features_in_: raise ValueError("x shape does not match training shape") else: if n_features != self.n_input_features_: raise ValueError("x shape does not match training shape") if sparse.isspmatrix_csr(x): if self.degree > 3: return sparse.csr_matrix(self.transform(x.tocsc())) to_stack = [] if self.include_bias: to_stack.append(np.ones(shape=(n_samples, 1), dtype=x.dtype)) to_stack.append(x) for deg in range(2, self.degree + 1): xp_next = _csr_polynomial_expansion( x.data, x.indices, x.indptr, x.shape[1], self.interaction_only, deg, ) if xp_next is None: break to_stack.append(xp_next) xp = sparse.hstack(to_stack, format="csr") elif sparse.isspmatrix_csc(x) and self.degree < 4: return sparse.csc_matrix(self.transform(x.tocsr())) else: combinations = self._combinations( n_features, self.degree, self.include_interaction, self.interaction_only, self.include_bias, ) if sparse.isspmatrix(x): columns = [] for comb in combinations: if comb: out_col = 1 for col_idx in comb: out_col = x[..., col_idx].multiply(out_col) columns.append(out_col) else: bias = sparse.csc_matrix(np.ones((x.shape[0], 1))) columns.append(bias) xp = sparse.hstack(columns, dtype=x.dtype).tocsc() else: xp = AxesArray( np.empty( (*x.shape[:-1], self.n_output_features_), dtype=x.dtype, order=self.order, ), x.__dict__, ) for i, comb in enumerate(combinations): xp[..., i] = x[..., comb].prod(-1) xp_full = xp_full + [xp] if self.library_ensemble: xp_full = self._ensemble(xp_full) return xp_full