From 6783a0a6010211fd61968223ed41ece9e5ec3835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 4 Sep 2024 09:26:32 -0500 Subject: [PATCH 01/75] docs: update title of pypi notebook example to reflect use of the PyPI public dataset In response to feedback on internal change 662899733. --- notebooks/dataframes/pypi.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/dataframes/pypi.ipynb b/notebooks/dataframes/pypi.ipynb index 3777e98d42..7b16412ff5 100644 --- a/notebooks/dataframes/pypi.ipynb +++ b/notebooks/dataframes/pypi.ipynb @@ -25,7 +25,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Analyzing Python dependencies with BigQuery DataFrames\n", + "# Analyzing package downloads from PyPI with BigQuery DataFrames\n", "\n", "In this notebook, you'll use the [PyPI public dataset](https://console.cloud.google.com/marketplace/product/gcp-public-data-pypi/pypi) and the [deps.dev public dataset](https://deps.dev/) to visualize Python package downloads for a package and its dependencies.\n", "\n", From 1d3956025146ae442f3e8f8b22d1e1660de068d3 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 27 Jan 2025 20:16:17 +0000 Subject: [PATCH 02/75] feat: add support for creating a Matrix Factorization model --- bigframes/ml/decomposition.py | 24 +++++++++++ bigframes/ml/loader.py | 1 + .../sklearn/decomposition/_mf.py | 40 +++++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100644 third_party/bigframes_vendored/sklearn/decomposition/_mf.py diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index c98e18322a..d34a0bfc13 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -19,6 +19,7 @@ from typing import List, Literal, Optional, Union +import bigframes_vendored.sklearn.decomposition._ml import bigframes_vendored.sklearn.decomposition._pca from google.cloud import bigquery @@ -197,3 +198,26 @@ def score( # TODO(b/291973741): X param is 
ignored. Update BQML supports input in ML.EVALUATE. return self._bqml_model.evaluate() + + +@log_adapter.class_logger +class MF( + base.UnsupervisedTrainablePredictor, + bigframes_vendored.sklearn.decomposition._mf.MF, +): + __doc__ = bigframes_vendored.sklearn.decomposition._mf.MF.__doc__ + + def __init__( + self, + n_components: Optional[Union[int, float]] = None, + *, + user_col: str, + item_col: str, + l2_reg: float, + ): + self.n_components = n_components + self.user_col = user_col + self.item_col = item_col + self.l2_reg = l2_reg + self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 5d52927ded..53961879e6 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -42,6 +42,7 @@ "LINEAR_REGRESSION": linear_model.LinearRegression, "LOGISTIC_REGRESSION": linear_model.LogisticRegression, "KMEANS": cluster.KMeans, + "MF": decomposition.MF, "PCA": decomposition.PCA, "BOOSTED_TREE_REGRESSOR": ensemble.XGBRegressor, "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py new file mode 100644 index 0000000000..d453645b1a --- /dev/null +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -0,0 +1,40 @@ +""" Matrix Factorization. +""" + +# Author: Alexandre Gramfort +# Olivier Grisel +# Mathieu Blondel +# Denis A. Engemann +# Michael Eickenberg +# Giorgio Patrini +# +# License: BSD 3 clause + +from abc import ABCMeta + +from bigframes_vendored.sklearn.base import BaseEstimator + +# from bigframes import constants + + +class PCA(BaseEstimator, metaclass=ABCMeta): + """Matrix Factorization (MF). 
+ + **Examples:** + + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import NMF + >>> model = NMF(n_components=2, init='random', random_state=0) + >>> W = model.fit_transform(X) + >>> H = model.components_ + + Args: + n_components (int, float or None, default None): + Number of components to keep. If n_components is not set, all + components are kept, n_components = min(n_samples, n_features). + If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. + svd_solver ("full", "randomized" or "auto", default "auto"): + The solver to use to calculate the principal components. Details: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-pca#pca_solver. + + """ From 1bef4a2f0f45507c954eaacdcbb14866a5ba4477 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 27 Jan 2025 22:30:24 +0000 Subject: [PATCH 03/75] feat: add support for creating a Matrix Factorization model --- bigframes/ml/decomposition.py | 2 + bigframes/ml/loader.py | 1 + .../sklearn/decomposition/_mf.py | 74 +++++-------------- 3 files changed, 21 insertions(+), 56 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 9e18276dd1..486979b832 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -211,11 +211,13 @@ def __init__( self, n_components: Optional[Union[int, float]] = None, *, + num_factors: int, user_col: str, item_col: str, l2_reg: float, ): self.n_components = n_components + self.num_factors = num_factors self.user_col = user_col self.item_col = item_col self.l2_reg = l2_reg diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 53961879e6..c8ed914468 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -83,6 +83,7 @@ def from_bq( session: bigframes.session.Session, bq_model: 
bigquery.Model ) -> Union[ + decomposition.MF, decomposition.PCA, cluster.KMeans, linear_model.LinearRegression, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index a4c175ab9a..bae62b9e85 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -17,15 +17,15 @@ from bigframes import constants -class PCA(BaseEstimator, metaclass=ABCMeta): +class MF(BaseEstimator, metaclass=ABCMeta): """Matrix Factorization (MF). **Examples:** - >>> import numpy as np - >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) - >>> from sklearn.decomposition import NMF - >>> model = NMF(n_components=2, init='random', random_state=0) + >>> import bigframes.pandas as bpd + >>> from bigframes.ml.decomposition import MF + >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> model = MF(n_components=2, init='random', random_state=0) >>> W = model.fit_transform(X) >>> H = model.components_ @@ -34,9 +34,18 @@ class PCA(BaseEstimator, metaclass=ABCMeta): Number of components to keep. If n_components is not set, all components are kept, n_components = min(n_samples, n_features). If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. - svd_solver ("full", "randomized" or "auto", default "auto"): - The solver to use to calculate the principal components. Details: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-pca#pca_solver. - + num_factors (int or auto, default auto): + Specifies the number of latent factors to use. + If you aren't running hyperparameter tuning, then you can specify an INT64 value between 2 and 200. The default value is log2(n), where n is the number of training examples. 
+ user_col (str): + The user column name. + item_col (str): + The item column name. + l2_reg (float, default 1.0): + If you aren't running hyperparameter tuning, then you can specify a FLOAT64 value. The default value is 1.0. + If you are running hyperparameter tuning, then you can use one of the following options: + The HPARAM_RANGE keyword and two FLOAT64 values that define the range to use for the hyperparameter. For example, L2_REG = HPARAM_RANGE(1.5, 5.0). + The HPARAM_CANDIDATES keyword and an array of FLOAT64 values that provide discrete values to use for the hyperparameter. For example, L2_REG = HPARAM_CANDIDATES([0, 1.0, 3.0, 5.0]). """ def fit(self, X, y=None): @@ -62,7 +71,7 @@ def score(self, X=None, y=None): .. note:: Output matches that of the BigQuery ML.EVALUATE function. - See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#pca_models + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#matrix_factorization_models for the outputs relevant to this model type. Args: @@ -86,50 +95,3 @@ def predict(self, X): Returns: bigframes.dataframe.DataFrame: Predicted DataFrames.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - - @property - def components_(self): - """Principal axes in feature space, representing the directions of maximum variance in the data. - - Returns: - bigframes.dataframe.DataFrame: DataFrame of principal components, containing following columns: - principal_component_id: An integer that identifies the principal component. - - feature: The column name that contains the feature. - - numerical_value: If feature is numeric, the value of feature for the principal component that principal_component_id identifies. If feature isn't numeric, the value is NULL. - - categorical_value: A list of mappings containing information about categorical features. 
Each mapping contains the following fields: - categorical_value.category: The name of each category. - - categorical_value.value: The value of categorical_value.category for the centroid that centroid_id identifies. - - The output contains one row per feature per component. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - - @property - def explained_variance_(self): - """The amount of variance explained by each of the selected components. - - Returns: - bigframes.dataframe.DataFrame: DataFrame containing following columns: - principal_component_id: An integer that identifies the principal component. - - explained_variance: The factor by which the eigenvector is scaled. Eigenvalue and explained variance are the same concepts in PCA. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - - @property - def explained_variance_ratio_(self): - """Percentage of variance explained by each of the selected components. - - Returns: - bigframes.dataframe.DataFrame: DataFrame containing following columns: - principal_component_id: An integer that identifies the principal component. - - explained_variance_ratio: the total variance is the sum of variances, also known as eigenvalues, of all - of the individual principal components. The explained variance ratio by a principal component is - the ratio between the variance, also known as eigenvalue, of that principal component and the total variance. 
- """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From e336bde7894bdddc8f8e2ac6fadec730d7ef213d Mon Sep 17 00:00:00 2001 From: rey-esp Date: Tue, 28 Jan 2025 11:10:06 -0600 Subject: [PATCH 04/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 486979b832..9acabc26bc 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -201,7 +201,7 @@ def score( @log_adapter.class_logger -class MF( +class MatrixFactorization( base.UnsupervisedTrainablePredictor, bigframes_vendored.sklearn.decomposition._mf.MF, ): From d5f713a4bd3616f8b8feefd9fff0afe22253497c Mon Sep 17 00:00:00 2001 From: rey-esp Date: Tue, 28 Jan 2025 11:11:17 -0600 Subject: [PATCH 05/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 9acabc26bc..5169fadfe5 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -209,7 +209,6 @@ class MatrixFactorization( def __init__( self, - n_components: Optional[Union[int, float]] = None, *, num_factors: int, user_col: str, From 5e3e4434176906c010936221b7e59551f2cb2d4f Mon Sep 17 00:00:00 2001 From: rey-esp Date: Tue, 28 Jan 2025 11:12:01 -0600 Subject: [PATCH 06/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py 
b/bigframes/ml/decomposition.py index 5169fadfe5..e995aee62f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -213,7 +213,8 @@ def __init__( num_factors: int, user_col: str, item_col: str, - l2_reg: float, + # TODO: Add support for hyperparameter tuning. + l2_reg: float = 1.0, ): self.n_components = n_components self.num_factors = num_factors From c116e8ad21e8b5e64623a03fbe9964244b89857a Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 28 Jan 2025 19:13:11 +0000 Subject: [PATCH 07/75] rating_col --- bigframes/ml/decomposition.py | 29 ++++++------------- bigframes/ml/loader.py | 4 +-- .../sklearn/decomposition/_mf.py | 10 ++----- 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index e995aee62f..fe2094630e 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -203,9 +203,9 @@ def score( @log_adapter.class_logger class MatrixFactorization( base.UnsupervisedTrainablePredictor, - bigframes_vendored.sklearn.decomposition._mf.MF, + bigframes_vendored.sklearn.decomposition._mf.MatrixFactorization, ): - __doc__ = bigframes_vendored.sklearn.decomposition._mf.MF.__doc__ + __doc__ = bigframes_vendored.sklearn.decomposition._mf.MatrixFactorization.__doc__ def __init__( self, @@ -213,13 +213,14 @@ def __init__( num_factors: int, user_col: str, item_col: str, + rating_col: Optional[str] = "rating", # TODO: Add support for hyperparameter tuning. 
l2_reg: float = 1.0, ): - self.n_components = n_components self.num_factors = num_factors self.user_col = user_col self.item_col = item_col + self.rating_col = rating_col self.l2_reg = l2_reg self._bqml_model: Optional[core.BqmlModel] = None self._bqml_model_factory = globals.bqml_model_factory() @@ -227,8 +228,8 @@ def __init__( @classmethod def _from_bq( cls, session: bigframes.session.Session, bq_model: bigquery.Model - ) -> MF: - assert bq_model.model_type == "MF" + ) -> MatrixFactorization: + assert bq_model.model_type == "MatrixFactorization" kwargs = utils.retrieve_params_from_bq_model( cls, bq_model, _BQML_PARAMS_MAPPING @@ -248,15 +249,9 @@ def _from_bq( def _bqml_options(self) -> dict: """The model options as they will be set for BQML""" options: dict = { - "model_type": "ML", + "model_type": "MatrixFactorization", } - assert self.n_components is not None - if 0 < self.n_components < 1: - options["pca_explained_variance_ratio"] = float(self.n_components) - elif self.n_components >= 1: - options["num_principal_components"] = int(self.n_components) - return options def _fit( @@ -264,17 +259,11 @@ def _fit( X: utils.ArrayType, y=None, transforms: Optional[List[str]] = None, - ) -> PCA: + ) -> MatrixFactorization: (X,) = utils.batch_convert_to_dataframe(X) # To mimic sklearn's behavior - if self.n_components is None: - self.n_components = min(X.shape) - self._bqml_model = self._bqml_model_factory.create_model( - X_train=X, - transforms=transforms, - options=self._bqml_options, - ) + return self @property diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index c8ed914468..7aa6d6708f 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -42,7 +42,7 @@ "LINEAR_REGRESSION": linear_model.LinearRegression, "LOGISTIC_REGRESSION": linear_model.LogisticRegression, "KMEANS": cluster.KMeans, - "MF": decomposition.MF, + "MatrixFactorization": decomposition.MatrixFactorization, "PCA": decomposition.PCA, "BOOSTED_TREE_REGRESSOR": 
ensemble.XGBRegressor, "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier, @@ -83,7 +83,7 @@ def from_bq( session: bigframes.session.Session, bq_model: bigquery.Model ) -> Union[ - decomposition.MF, + decomposition.MatrixFactorization, decomposition.PCA, cluster.KMeans, linear_model.LinearRegression, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index bae62b9e85..da5eacb18b 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -17,23 +17,19 @@ from bigframes import constants -class MF(BaseEstimator, metaclass=ABCMeta): +class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): """Matrix Factorization (MF). **Examples:** >>> import bigframes.pandas as bpd - >>> from bigframes.ml.decomposition import MF + >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) - >>> model = MF(n_components=2, init='random', random_state=0) + >>> model = MatrixFactorization(n_components=2, init='random', random_state=0) >>> W = model.fit_transform(X) >>> H = model.components_ Args: - n_components (int, float or None, default None): - Number of components to keep. If n_components is not set, all - components are kept, n_components = min(n_samples, n_features). - If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. num_factors (int or auto, default auto): Specifies the number of latent factors to use. If you aren't running hyperparameter tuning, then you can specify an INT64 value between 2 and 200. The default value is log2(n), where n is the number of training examples. 
From dedef3980dc516bd28a5a9d55f46fc15c71e2743 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 28 Jan 2025 21:14:14 +0000 Subject: [PATCH 08/75] (nearly) complete class --- bigframes/ml/decomposition.py | 97 +++++-------------- bigframes/ml/loader.py | 2 +- .../sklearn/decomposition/_mf.py | 2 +- 3 files changed, 25 insertions(+), 76 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index fe2094630e..1d5e8a9e07 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -19,7 +19,7 @@ from typing import List, Literal, Optional, Union -import bigframes_vendored.sklearn.decomposition._ml +import bigframes_vendored.sklearn.decomposition._mf import bigframes_vendored.sklearn.decomposition._pca from google.cloud import bigquery @@ -210,13 +210,15 @@ class MatrixFactorization( def __init__( self, *, + feedback_type: Literal["explicit", "implicit"] = "explicit", num_factors: int, user_col: str, item_col: str, - rating_col: Optional[str] = "rating", + rating_col: str = "rating", # TODO: Add support for hyperparameter tuning. 
l2_reg: float = 1.0, ): + self.feedback_type = feedback_type self.num_factors = num_factors self.user_col = user_col self.item_col = item_col @@ -229,18 +231,12 @@ def __init__( def _from_bq( cls, session: bigframes.session.Session, bq_model: bigquery.Model ) -> MatrixFactorization: - assert bq_model.model_type == "MatrixFactorization" + assert bq_model.model_type == "MATRIX_FACTORIZATION" kwargs = utils.retrieve_params_from_bq_model( cls, bq_model, _BQML_PARAMS_MAPPING ) - last_fitting = bq_model.training_runs[-1]["trainingOptions"] - if "numPrincipalComponents" in last_fitting: - kwargs["n_components"] = int(last_fitting["numPrincipalComponents"]) - # elif "pcaExplainedVarianceRatio" in last_fitting: - # kwargs["n_components"] = float(last_fitting["pcaExplainedVarianceRatio"]) - model = cls(**kwargs) model._bqml_model = core.BqmlModel(session, bq_model) return model @@ -249,9 +245,18 @@ def _from_bq( def _bqml_options(self) -> dict: """The model options as they will be set for BQML""" options: dict = { - "model_type": "MatrixFactorization", + "model_type": "matrix_factorization", + "feedback_type": self.feedback_type, + "user_col": self.user_col, + "item_col": self.item_col, + "rating_col": self.rating_col, + "l2_reg": self.l2_reg, } + if self.num_factors is not None: + options["num_factors"] = self.num_factors + + print(repr(options)) return options def _fit( @@ -262,79 +267,23 @@ def _fit( ) -> MatrixFactorization: (X,) = utils.batch_convert_to_dataframe(X) - # To mimic sklearn's behavior - + self._bqml_model = self._bqml_model_factory.create_model( + X_train=X, + transforms=transforms, + options=self._bqml_options, + ) return self - @property - def components_(self) -> bpd.DataFrame: - if not self._bqml_model: - raise RuntimeError("A model must be fitted before calling components_.") - - return self._bqml_model.principal_components() - - @property - def explained_variance_(self) -> bpd.DataFrame: - if not self._bqml_model: - raise RuntimeError( - "A model must 
be fitted before calling explained_variance_." - ) - - return self._bqml_model.principal_component_info()[ - ["principal_component_id", "eigenvalue"] - ].rename(columns={"eigenvalue": "explained_variance"}) - - @property - def explained_variance_ratio_(self) -> bpd.DataFrame: - if not self._bqml_model: - raise RuntimeError( - "A model must be fitted before calling explained_variance_ratio_." - ) - - return self._bqml_model.principal_component_info()[ - ["principal_component_id", "explained_variance_ratio"] - ] - def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + # TODO: Create recommend() return self._bqml_model.predict(X) - def detect_anomalies( - self, - X: utils.ArrayType, - *, - contamination: float = 0.1, - ) -> bpd.DataFrame: - """Detect the anomaly data points of the input. - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Series or a DataFrame to detect anomalies. - contamination (float, default 0.1): - Identifies the proportion of anomalies in the training dataset that are used to create the model. - The value must be in the range [0, 0.5]. - - Returns: - bigframes.dataframe.DataFrame: detected DataFrame.""" - if contamination < 0.0 or contamination > 0.5: - raise ValueError( - f"contamination must be [0.0, 0.5], but is {contamination}." - ) - - if not self._bqml_model: - raise RuntimeError("A model must be fitted before detect_anomalies") - - (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - - return self._bqml_model.detect_anomalies( - X, options={"contamination": contamination} - ) - - def to_gbq(self, model_name: str, replace: bool = False) -> PCA: + def to_gbq(self, model_name: str, replace: bool = False) -> MatrixFactorization: """Save the model to BigQuery. 
Args: @@ -344,7 +293,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> PCA: Determine whether to replace if the model already exists. Default to False. Returns: - PCA: Saved model.""" + MatrixFactorization: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 7aa6d6708f..1f62eec0ff 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -42,7 +42,7 @@ "LINEAR_REGRESSION": linear_model.LinearRegression, "LOGISTIC_REGRESSION": linear_model.LogisticRegression, "KMEANS": cluster.KMeans, - "MatrixFactorization": decomposition.MatrixFactorization, + "MATRIX_FACTORIZATION": decomposition.MatrixFactorization, "PCA": decomposition.PCA, "BOOSTED_TREE_REGRESSOR": ensemble.XGBRegressor, "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index da5eacb18b..1b371c1af5 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -57,7 +57,7 @@ def fit(self, X, y=None): Ignored. Returns: - PCA: Fitted estimator. + bigframes.ml.decomposition.MatrixFactorization: Fitted estimator. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 27871786d39c7d81f03cb347a08d93c1dab45a49 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 28 Jan 2025 21:32:56 +0000 Subject: [PATCH 09/75] removem print() --- bigframes/ml/decomposition.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 1d5e8a9e07..0f802ddebb 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -256,7 +256,6 @@ def _bqml_options(self) -> dict: if self.num_factors is not None: options["num_factors"] = self.num_factors - print(repr(options)) return options def _fit( From 086b4dd86ed4a09b859ce7b2a381fe4513549c0a Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 29 Jan 2025 00:07:37 +0000 Subject: [PATCH 10/75] adding recommend --- bigframes/ml/core.py | 6 ++++++ bigframes/ml/decomposition.py | 7 +++---- bigframes/ml/sql.py | 5 +++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index d038b8f4c0..4fafc470a5 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -117,6 +117,12 @@ def model(self) -> bigquery.Model: """Get the BQML model associated with this wrapper""" return self._model + def recommend(self, input_data: bpd.DataFrame) -> bpd.DataFrame: + return self._apply_ml_tvf( + input_data, + self._model_manipulation_sql_generator.ml_recommend, + ) + def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 0f802ddebb..574dadbe4d 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -273,14 +273,13 @@ def _fit( ) return self - def predict(self, X: utils.ArrayType) -> bpd.DataFrame: + def recommend(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: - raise RuntimeError("A model must be fitted before predict") + raise RuntimeError("A model 
must be fitted before recommend") (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - # TODO: Create recommend() - return self._bqml_model.predict(X) + return self._bqml_model.recommend(X) def to_gbq(self, model_name: str, replace: bool = False) -> MatrixFactorization: """Save the model to BigQuery. diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index b662d4c22c..d59f2013da 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -299,6 +299,11 @@ def alter_model( return "\n".join(parts) # ML prediction TVFs + def ml_recommend(self, source_sql: str) -> str: + """Encode ML.RECOMMEND for BQML""" + return f"""SELECT * FROM ML.RECOMMEND(MODEL {self._model_ref_sql()}, + ({source_sql}))""" + def ml_predict(self, source_sql: str) -> str: """Encode ML.PREDICT for BQML""" return f"""SELECT * FROM ML.PREDICT(MODEL {self._model_ref_sql()}, From 7c371ac847108491bcd6f01504fe41e6673afb89 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 30 Jan 2025 14:09:40 +0000 Subject: [PATCH 11/75] remove hyper parameter runing references --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 1b371c1af5..c088fa59cd 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -32,16 +32,12 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): Args: num_factors (int or auto, default auto): Specifies the number of latent factors to use. - If you aren't running hyperparameter tuning, then you can specify an INT64 value between 2 and 200. The default value is log2(n), where n is the number of training examples. user_col (str): The user column name. item_col (str): The item column name. 
l2_reg (float, default 1.0): - If you aren't running hyperparameter tuning, then you can specify a FLOAT64 value. The default value is 1.0. - If you are running hyperparameter tuning, then you can use one of the following options: - The HPARAM_RANGE keyword and two FLOAT64 values that define the range to use for the hyperparameter. For example, L2_REG = HPARAM_RANGE(1.5, 5.0). - The HPARAM_CANDIDATES keyword and an array of FLOAT64 values that provide discrete values to use for the hyperparameter. For example, L2_REG = HPARAM_CANDIDATES([0, 1.0, 3.0, 5.0]). + A floating point value for L2 regularization. The default value is 1.0. """ def fit(self, X, y=None): From 8de384a768edefd4103cd93b2a8dcbf7fa75d1d9 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Feb 2025 15:29:18 +0000 Subject: [PATCH 12/75] swap predict in _mf for recommend --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index c088fa59cd..83a2f9bea8 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -77,7 +77,7 @@ def score(self, X=None, y=None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def predict(self, X): + def recommend(self, X): """Predict the closest cluster for each sample in X. 
Args: From 647532b1ebfb0b638cffcc8565d0271d3217bd2d Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Feb 2025 19:39:46 +0000 Subject: [PATCH 13/75] recommend -> predict --- bigframes/ml/decomposition.py | 2 +- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 574dadbe4d..1ea7d98177 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -273,7 +273,7 @@ def _fit( ) return self - def recommend(self, X: utils.ArrayType) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before recommend") diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 83a2f9bea8..c088fa59cd 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -77,7 +77,7 @@ def score(self, X=None, y=None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def recommend(self, X): + def predict(self, X): """Predict the closest cluster for each sample in X. 
Args: From b340c4fb48bbbda8a040608b6255dd88d8b27f9c Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Feb 2025 20:27:56 +0000 Subject: [PATCH 14/75] update predict doc string --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index c088fa59cd..2d9ec4e1a1 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -78,7 +78,7 @@ def score(self, X=None, y=None): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def predict(self, X): - """Predict the closest cluster for each sample in X. + """Generate a predicted rating for every user-item row combination for a matrix factorization model. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): From 4c90c1d84116eac05ea29ea75127b7f186f39016 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 10 Feb 2025 22:55:28 +0000 Subject: [PATCH 15/75] Merge branch 'main' into b338873783-matrix-factorization --- bigframes/core/__init__.py | 24 +- bigframes/core/blocks.py | 20 + bigframes/core/compile/compiled.py | 2 +- bigframes/core/compile/compiler.py | 22 +- bigframes/core/nodes.py | 423 +++++++++--------- bigframes/core/rewrite/__init__.py | 2 + bigframes/core/rewrite/implicit_align.py | 40 +- bigframes/core/rewrite/legacy_align.py | 6 +- bigframes/core/rewrite/order.py | 16 +- bigframes/core/rewrite/pruning.py | 195 ++++++++ bigframes/core/rewrite/slices.py | 4 +- bigframes/dataframe.py | 15 +- bigframes/ml/llm.py | 148 ++++++ bigframes/ml/loader.py | 2 + bigframes/ml/utils.py | 3 + bigframes/operations/timedelta_ops.py | 6 +- bigframes/pandas/core/tools/timedeltas.py | 20 +- bigframes/streaming/dataframe.py | 32 +- docs/templates/toc.yml | 2 +- 
.../bq_dataframes_template.ipynb | 2 +- tests/system/small/test_dataframe.py | 11 +- tests/system/small/test_pandas.py | 38 +- .../ibis/backends/sql/datatypes.py | 1 - .../ibis/backends/sql/rewrites.py | 2 +- .../bigframes_vendored/ibis/common/graph.py | 5 + 25 files changed, 759 insertions(+), 282 deletions(-) create mode 100644 bigframes/core/rewrite/pruning.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 5f64bf68dd..dc9b8e3b9b 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -304,18 +304,25 @@ def assign(self, source_id: str, destination_id: str) -> ArrayValue: if destination_id in self.column_ids: # Mutate case exprs = [ ( - ex.deref(source_id if (col_id == destination_id) else col_id), - ids.ColumnId(col_id), + bigframes.core.nodes.AliasedRef( + ex.deref(source_id if (col_id == destination_id) else col_id), + ids.ColumnId(col_id), + ) ) for col_id in self.column_ids ] else: # append case self_projection = ( - (ex.deref(col_id), ids.ColumnId(col_id)) for col_id in self.column_ids + bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) + for col_id in self.column_ids ) exprs = [ *self_projection, - (ex.deref(source_id), ids.ColumnId(destination_id)), + ( + bigframes.core.nodes.AliasedRef( + ex.deref(source_id), ids.ColumnId(destination_id) + ) + ), ] return ArrayValue( nodes.SelectionNode( @@ -337,7 +344,10 @@ def create_constant( def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: # This basically just drops and reorders columns - logically a no-op except as a final step - selections = ((ex.deref(col_id), ids.ColumnId(col_id)) for col_id in column_ids) + selections = ( + bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) + for col_id in column_ids + ) return ArrayValue( nodes.SelectionNode( child=self.node, @@ -488,7 +498,9 @@ def prepare_join_names( nodes.SelectionNode( other.node, tuple( - (ex.deref(old_id), ids.ColumnId(new_id)) + 
bigframes.core.nodes.AliasedRef( + ex.deref(old_id), ids.ColumnId(new_id) + ) for old_id, new_id in r_mapping.items() ), ), diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index b1f4ed35cc..c6e3096e51 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -276,6 +276,26 @@ def label_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]: mapping[label] = (*mapping.get(label, ()), id) return mapping + def resolve_label_exact(self, label: Label) -> Optional[str]: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such a column, returns None.""" + matches = self.label_to_col_id.get(label, []) + if len(matches) > 1: + raise ValueError( + f"Multiple columns matching id {label} were found. {constants.FEEDBACK_LINK}" + ) + return matches[0] if len(matches) != 0 else None + + def resolve_label_exact_or_error(self, label: Label) -> str: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such a column, raises an error too.""" + col_id = self.resolve_label_exact(label) + if col_id is None: + raise ValueError(f"Label {label} not found. {constants.FEEDBACK_LINK}") + return col_id + @functools.cached_property def col_id_to_index_name(self) -> typing.Mapping[str, Label]: """Get column label for value columns, or index name for index columns""" diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 906bdb1f0d..93be998b5b 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -184,7 +184,7 @@ def _to_ibis_expr( # Special case for empty tables, since we can't create an empty # projection. 
if not self._columns: - return bigframes_vendored.ibis.memtable([]) + return self._table.select([bigframes_vendored.ibis.literal(1)]) table = self._table.select(self._columns) if fraction is not None: diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index dca204401e..ff5f1d61c8 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -62,9 +62,11 @@ def compile_sql( if ordered: node, limit = rewrites.pullup_limit_from_slice(node) node = nodes.bottom_up(node, rewrites.rewrite_slice) + # TODO: Extract out CTEs node, ordering = rewrites.pull_up_order( node, order_root=True, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) ir = self.compile_node(node) return ir.to_sql( order_by=ordering.all_ordering_columns, @@ -76,6 +78,7 @@ def compile_sql( node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) ir = self.compile_node(node) return ir.to_sql(selections=output_ids) @@ -86,6 +89,7 @@ def compile_peek_sql(self, node: nodes.BigFrameNode, n_rows: int) -> str: node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) return self.compile_node(node).to_sql(limit=n_rows, selections=ids) def compile_raw( @@ -97,6 +101,7 @@ def compile_raw( node = nodes.bottom_up(node, rewrites.rewrite_slice) node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) node, ordering = rewrites.pull_up_order(node, ordered_joins=self.strict) + node = rewrites.column_pruning(node) ir = self.compile_node(node) sql = ir.to_sql() return sql, node.schema.to_bigquery(), ordering @@ -192,31 +197,34 @@ def compile_readtable(self, node: nodes.ReadTableNode): return self.compile_read_table_unordered(node.source, node.scan_list) def read_table_as_unordered_ibis( - self, source: nodes.BigqueryDataSource + self, + source: nodes.BigqueryDataSource, + scan_cols: typing.Sequence[str], ) -> 
ibis_types.Table: full_table_name = f"{source.table.project_id}.{source.table.dataset_id}.{source.table.table_id}" - used_columns = tuple(col.name for col in source.table.physical_schema) # Physical schema might include unused columns, unsupported datatypes like JSON physical_schema = ibis_bigquery.BigQuerySchema.to_ibis( - list(i for i in source.table.physical_schema if i.name in used_columns) + list(source.table.physical_schema) ) if source.at_time is not None or source.sql_predicate is not None: import bigframes.session._io.bigquery sql = bigframes.session._io.bigquery.to_query( full_table_name, - columns=used_columns, + columns=scan_cols, sql_predicate=source.sql_predicate, time_travel_timestamp=source.at_time, ) return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql) else: - return ibis_api.table(physical_schema, full_table_name) + return ibis_api.table(physical_schema, full_table_name).select(scan_cols) def compile_read_table_unordered( self, source: nodes.BigqueryDataSource, scan: nodes.ScanList ): - ibis_table = self.read_table_as_unordered_ibis(source) + ibis_table = self.read_table_as_unordered_ibis( + source, scan_cols=[col.source_id for col in scan.items] + ) return compiled.UnorderedIR( ibis_table, tuple( @@ -291,7 +299,7 @@ def set_output_names( return nodes.SelectionNode( node, tuple( - (ex.DerefOp(old_id), ids.ColumnId(out_id)) + bigframes.core.nodes.AliasedRef(ex.DerefOp(old_id), ids.ColumnId(out_id)) for old_id, out_id in zip(node.ids, output_ids) ), ) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 085d52daa6..88e084d79c 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -20,7 +20,7 @@ import functools import itertools import typing -from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple +from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple, TypeVar import google.cloud.bigquery as bq @@ -44,6 +44,8 @@ COLUMN_SET = frozenset[bfet_ids.ColumnId] 
+Self = TypeVar("Self") + @dataclasses.dataclass(frozen=True) class Field: @@ -87,10 +89,17 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def row_count(self) -> typing.Optional[int]: return None + @abc.abstractmethod + def remap_vars( + self: Self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> Self: + """Remap defined (in this node only) variables.""" + ... + @abc.abstractmethod def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self: Self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> Self: """Remap variable references""" ... @@ -100,6 +109,10 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: """The variables defined in this node (as opposed to by child nodes).""" ... + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + @functools.cached_property def session(self): sessions = [] @@ -248,18 +261,11 @@ def planning_complexity(self) -> int: @abc.abstractmethod def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + self: Self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> Self: """Apply a function to each child node.""" ... - @abc.abstractmethod - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: - """Remap defined (in this node only) variables.""" - ... 
- @property def defines_namespace(self) -> bool: """ @@ -269,16 +275,6 @@ def defines_namespace(self) -> bool: """ return False - @functools.cached_property - def defined_variables(self) -> set[str]: - """Full set of variables defined in the namespace, even if not selected.""" - self_defined_variables = set(self.schema.names) - if self.defines_namespace: - return self_defined_variables - return self_defined_variables.union( - *(child.defined_variables for child in self.child_nodes) - ) - def get_type(self, id: bfet_ids.ColumnId) -> bigframes.dtypes.Dtype: return self._dtype_lookup[id] @@ -286,9 +282,6 @@ def get_type(self, id: bfet_ids.ColumnId) -> bigframes.dtypes.Dtype: def _dtype_lookup(self): return {field.id: field.dtype for field in self.fields} - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - return self.transform_children(lambda x: x.prune(used_cols)) - class AdditiveNode: """Definition of additive - if you drop added_fields, you end up with the descendent. @@ -336,7 +329,7 @@ def explicitly_ordered(self) -> bool: def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> UnaryNode: transformed = dataclasses.replace(self, child=t(self.child)) if self == transformed: # reusing existing object speeds up eq, and saves a small amount of memory @@ -406,12 +399,18 @@ def row_count(self) -> typing.Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> SliceNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> SliceNode: return self @@ -483,6 +482,10 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return 
(self.indicator_col,) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset({self.left_col.id, self.right_col.id}) + @property def additive_base(self) -> BigFrameNode: return self.left_child @@ -490,9 +493,7 @@ def additive_base(self) -> BigFrameNode: def replace_additive_base(self, node: BigFrameNode): return dataclasses.replace(self, left_child=node) - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> InNode: transformed = dataclasses.replace( self, left_child=t(self.left_child), right_child=t(self.right_child) ) @@ -501,17 +502,16 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - return self - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> InNode: return dataclasses.replace( self, indicator_col=mappings.get(self.indicator_col, self.indicator_col) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> InNode: return dataclasses.replace(self, left_col=self.left_col.remap_column_refs(mappings, allow_partial_bindings=True), right_col=self.right_col.remap_column_refs(mappings, allow_partial_bindings=True)) # type: ignore @@ -574,9 +574,20 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + (*l_cond.column_references, *r_cond.column_references) + for l_cond, r_cond in self.conditions + ) + ) + + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(*self.ids, *self.referenced_ids) + + def transform_children(self, 
t: Callable[[BigFrameNode], BigFrameNode]) -> JoinNode: transformed = dataclasses.replace( self, left_child=t(self.left_child), right_child=t(self.right_child) ) @@ -585,21 +596,14 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # If this is a cross join, make sure to select at least one column from each side - condition_cols = used_cols.union( - map(lambda x: x.id, itertools.chain.from_iterable(self.conditions)) - ) - return self.transform_children( - lambda x: x.prune(frozenset([*condition_cols, *used_cols])) - ) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> JoinNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> JoinNode: new_conds = tuple( ( l_cond.remap_column_refs(mappings, allow_partial_bindings=True), @@ -665,7 +669,7 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> ConcatNode: transformed = dataclasses.replace( self, children=tuple(t(child) for child in self.children) ) @@ -674,17 +678,15 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Make concat prunable, probably by redefining - return self - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ConcatNode: new_ids = tuple(mappings.get(id, id) for id in self.output_ids) return dataclasses.replace(self, output_ids=new_ids) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ConcatNode: return self @@ -735,25 +737,23 @@ def defines_namespace(self) -> bool: def transform_children( self, 
t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> FromRangeNode: transformed = dataclasses.replace(self, start=t(self.start), end=t(self.end)) if self == transformed: # reusing existing object speeds up eq, and saves a small amount of memory return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Make FromRangeNode prunable (or convert to other node types) - return self - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> FromRangeNode: return dataclasses.replace( self, output_id=mappings.get(self.output_id, self.output_id) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> FromRangeNode: return self @@ -774,9 +774,7 @@ def fast_offsets(self) -> bool: def fast_ordered_limit(self) -> bool: return False - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> LeafNode: return self @@ -785,6 +783,9 @@ class ScanItem(typing.NamedTuple): dtype: bigframes.dtypes.Dtype # Might be multiple logical types for a given physical source type source_id: str # Flexible enough for both local data and bq data + def with_id(self, id: bfet_ids.ColumnId) -> ScanItem: + return ScanItem(id, self.dtype, self.source_id) + @dataclasses.dataclass(frozen=True) class ScanList: @@ -841,25 +842,9 @@ def row_count(self) -> typing.Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(item.id for item in self.fields) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # Don't preoduce empty scan list no matter what, will result in broken sql syntax - # TODO: Handle more elegantly - new_scan_list = ScanList( - tuple(item for item in self.scan_list.items if item.id in used_cols) - or 
(self.scan_list.items[0],) - ) - return ReadLocalNode( - self.feather_bytes, - self.data_schema, - self.n_rows, - new_scan_list, - self.offsets_col, - self.session, - ) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ReadLocalNode: new_scan_list = ScanList( tuple( ScanItem(mappings.get(item.id, item.id), item.dtype, item.source_id) @@ -875,7 +860,9 @@ def remap_vars( self, scan_list=new_scan_list, offsets_col=new_offsets_col ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ReadLocalNode: return self @@ -1003,16 +990,9 @@ def row_count(self) -> typing.Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(item.id for item in self.scan_list.items) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - new_scan_list = ScanList( - tuple(item for item in self.scan_list.items if item.id in used_cols) - or (self.scan_list.items[0],) - ) - return dataclasses.replace(self, scan_list=new_scan_list) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ReadTableNode: new_scan_list = ScanList( tuple( ScanItem(mappings.get(item.id, item.id), item.dtype, item.source_id) @@ -1021,7 +1001,9 @@ def remap_vars( ) return dataclasses.replace(self, scan_list=new_scan_list) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ReadTableNode: return self def with_order_cols(self): @@ -1089,6 +1071,10 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.col_id,) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + @property def added_fields(self) -> Tuple[Field, ...]: return (Field(self.col_id, 
bigframes.dtypes.INT_DTYPE),) @@ -1097,22 +1083,17 @@ def added_fields(self) -> Tuple[Field, ...]: def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> PromoteOffsetsNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - if self.col_id not in used_cols: - return self.child.prune(used_cols) - else: - new_used = used_cols.difference([self.col_id]) - return self.transform_children(lambda x: x.prune(new_used)) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> PromoteOffsetsNode: return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> PromoteOffsetsNode: return self @@ -1136,17 +1117,22 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - consumed_ids = used_cols.union(self.predicate.column_references) - pruned_child = self.child.prune(consumed_ids) - return FilterNode(pruned_child, self.predicate) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(self.ids) | self.referenced_ids + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset(self.predicate.column_references) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> FilterNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> FilterNode: return dataclasses.replace( self, predicate=self.predicate.remap_column_refs( @@ -1183,20 +1169,24 @@ def row_count(self) -> 
Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - ordering_cols = itertools.chain.from_iterable( - map(lambda x: x.referenced_columns, self.by) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(self.ids) | self.referenced_ids + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) - consumed_ids = used_cols.union(ordering_cols) - pruned_child = self.child.prune(consumed_ids) - return OrderByNode(pruned_child, self.by) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> OrderByNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> OrderByNode: all_refs = set( itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) @@ -1233,20 +1223,43 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ReversedNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ReversedNode: return self +class AliasedRef(typing.NamedTuple): + ref: ex.DerefOp + id: bfet_ids.ColumnId + + @classmethod + def identity(cls, id: bfet_ids.ColumnId) -> AliasedRef: + return cls(ex.DerefOp(id), id) + + def remap_vars( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> AliasedRef: + return AliasedRef(self.ref, mappings.get(self.id, self.id)) + + def remap_refs( + self, mappings: 
Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> AliasedRef: + return AliasedRef(ex.DerefOp(mappings.get(self.ref.id, self.ref.id)), self.id) + + @dataclasses.dataclass(frozen=True, eq=False) class SelectionNode(UnaryNode): - input_output_pairs: typing.Tuple[ - typing.Tuple[ex.DerefOp, bigframes.core.identifiers.ColumnId], ... - ] + input_output_pairs: Tuple[AliasedRef, ...] def _validate(self): for ref, _ in self.input_output_pairs: @@ -1280,33 +1293,26 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.input_output_pairs) - def get_id_mapping(self) -> dict[bfet_ids.ColumnId, bfet_ids.ColumnId]: - return {ref.id: out_id for ref, out_id in self.input_output_pairs} - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - pruned_selections = ( - tuple( - select for select in self.input_output_pairs if select[1] in used_cols - ) - or self.input_output_pairs[:1] - ) - consumed_ids = frozenset(i[0].id for i in pruned_selections) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(ref.id for ref, id in self.input_output_pairs) - pruned_child = self.child.prune(consumed_ids) - return SelectionNode(pruned_child, pruned_selections) + def get_id_mapping(self) -> dict[bfet_ids.ColumnId, bfet_ids.ColumnId]: + return {ref.id: id for ref, id in self.input_output_pairs} def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: - new_pairs = tuple( - (ref, mappings.get(id, id)) for ref, id in self.input_output_pairs + ) -> SelectionNode: + new_fields = tuple( + item.remap_vars(mappings) for item in self.input_output_pairs ) - return dataclasses.replace(self, input_output_pairs=new_pairs) + return dataclasses.replace(self, input_output_pairs=new_fields) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> 
SelectionNode: new_fields = tuple( - (ex.remap_column_refs(mappings, allow_partial_bindings=True), id) - for ex, id in self.input_output_pairs + item.remap_refs(mappings) for item in self.input_output_pairs ) return dataclasses.replace(self, input_output_pairs=new_fields) # type: ignore @@ -1353,30 +1359,38 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.assignments) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + i[0].column_references for i in self.assignments + ) + ) + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + ex.column_references for ex, id in self.assignments + ) + ) + @property def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> ProjectionNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - pruned_assignments = tuple(i for i in self.assignments if i[1] in used_cols) - if len(pruned_assignments) == 0: - return self.child.prune(used_cols) - consumed_ids = itertools.chain.from_iterable( - i[0].column_references for i in pruned_assignments - ) - pruned_child = self.child.prune(used_cols.union(consumed_ids)) - return ProjectionNode(pruned_child, pruned_assignments) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ProjectionNode: new_fields = tuple((ex, mappings.get(id, id)) for ex, id in self.assignments) return dataclasses.replace(self, assignments=new_fields) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ProjectionNode: new_fields = tuple( (ex.remap_column_refs(mappings, 
allow_partial_bindings=True), id) for ex, id in self.assignments @@ -1418,16 +1432,18 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.col_id,) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> RowCountNode: return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): - return self - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Handle row count pruning + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> RowCountNode: return self @@ -1487,33 +1503,31 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.aggregations) @property - def has_ordered_ops(self) -> bool: - return not all( - aggregate.op.order_independent for aggregate, _ in self.aggregations - ) - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + def consumed_ids(self) -> COLUMN_SET: by_ids = (ref.id for ref in self.by_column_ids) - pruned_aggs = ( - tuple(agg for agg in self.aggregations if agg[1] in used_cols) - or self.aggregations[:1] - ) agg_inputs = itertools.chain.from_iterable( - agg.column_references for agg, _ in pruned_aggs + agg.column_references for agg, _ in self.aggregations ) - consumed_ids = frozenset(itertools.chain(by_ids, agg_inputs)) - pruned_child = self.child.prune(consumed_ids) - return AggregateNode( - pruned_child, pruned_aggs, self.by_column_ids, dropna=self.dropna + order_ids = itertools.chain.from_iterable( + part.scalar_expression.column_references for part in self.order_by + ) + return frozenset(itertools.chain(by_ids, agg_inputs, order_ids)) + + @property + def has_ordered_ops(self) -> bool: + return not all( + aggregate.op.order_independent for aggregate, _ in 
self.aggregations ) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> AggregateNode: new_aggs = tuple((agg, mappings.get(id, id)) for agg, id in self.aggregations) return dataclasses.replace(self, aggregations=new_aggs) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> AggregateNode: new_aggs = tuple( (agg.remap_column_refs(mappings, allow_partial_bindings=True), id) for agg, id in self.aggregations @@ -1578,6 +1592,20 @@ def added_field(self) -> Field: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.output_name,) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset( + set(self.ids).difference([self.output_name]).union(self.referenced_ids) + ) + + @property + def referenced_ids(self) -> COLUMN_SET: + return ( + frozenset() + .union(self.expression.column_references) + .union(self.window_spec.all_referenced_columns) + ) + @property def inherits_order(self) -> bool: # does the op both use ordering at all? and if so, can it inherit order? 
@@ -1590,27 +1618,19 @@ def inherits_order(self) -> bool: def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> WindowOpNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - if self.output_name not in used_cols: - return self.child.prune(used_cols) - consumed_ids = ( - used_cols.difference([self.output_name]) - .union(self.expression.column_references) - .union(self.window_spec.all_referenced_columns) - ) - return self.transform_children(lambda x: x.prune(consumed_ids)) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> WindowOpNode: return dataclasses.replace( self, output_name=mappings.get(self.output_name, self.output_name) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> WindowOpNode: return dataclasses.replace( self, expression=self.expression.remap_column_refs( @@ -1646,14 +1666,18 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> RandomSampleNode: return self def remap_refs( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> RandomSampleNode: return self @@ -1703,21 +1727,20 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.offsets_col,) if (self.offsets_col is not None) else () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # Cannot prune explode op - consumed_ids = used_cols.union(ref.id for ref in self.column_ids) - return self.transform_children(lambda 
x: x.prune(consumed_ids)) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset(ref.id for ref in self.column_ids) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ExplodeNode: if (self.offsets_col is not None) and self.offsets_col in mappings: return dataclasses.replace(self, offsets_col=mappings[self.offsets_col]) return self def remap_refs( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ExplodeNode: new_ids = tuple(id.remap_column_refs(mappings) for id in self.column_ids) return dataclasses.replace(self, column_ids=new_ids) # type: ignore diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index f93186bf36..bf93fa51b6 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -17,6 +17,7 @@ from bigframes.core.rewrite.legacy_align import legacy_join_as_projection from bigframes.core.rewrite.operators import rewrite_timedelta_ops from bigframes.core.rewrite.order import pull_up_order +from bigframes.core.rewrite.pruning import column_pruning from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice __all__ = [ @@ -27,4 +28,5 @@ "pullup_limit_from_slice", "remap_variables", "pull_up_order", + "column_pruning", ] diff --git a/bigframes/core/rewrite/implicit_align.py b/bigframes/core/rewrite/implicit_align.py index 1b864fb919..1989b1a543 100644 --- a/bigframes/core/rewrite/implicit_align.py +++ b/bigframes/core/rewrite/implicit_align.py @@ -113,7 +113,7 @@ def try_row_join( r_node, r_selection = pull_up_selection( r_node, stop=divergent_node, rename_vars=True ) # Rename only right vars to avoid collisions with left vars - combined_selection = (*l_selection, *r_selection) + combined_selection = l_selection + r_selection def _linearize_trees( base_tree: bigframes.core.nodes.BigFrameNode, @@ -139,10 +139,7 @@ def pull_up_selection( rename_vars: bool = False, ) -> 
Tuple[ bigframes.core.nodes.BigFrameNode, - Tuple[ - Tuple[bigframes.core.expression.DerefOp, bigframes.core.identifiers.ColumnId], - ..., - ], + Tuple[bigframes.core.nodes.AliasedRef, ...], ]: """Remove all selection nodes above the base node. Returns stripped tree. @@ -157,8 +154,7 @@ def pull_up_selection( """ if node == stop: # base case return node, tuple( - (bigframes.core.expression.DerefOp(field.id), field.id) - for field in node.fields + bigframes.core.nodes.AliasedRef.identity(field.id) for field in node.fields ) # InNode needs special handling, as its a binary node, but row identity is from left side only. # TODO: Merge code with unary op paths @@ -179,11 +175,15 @@ def pull_up_selection( {node.indicator_col: bigframes.core.identifiers.ColumnId.unique()} ), ) - added_selection = ( - bigframes.core.expression.DerefOp(new_in_node.indicator_col), - node.indicator_col, + added_selection = tuple( + ( + bigframes.core.nodes.AliasedRef( + bigframes.core.expression.DerefOp(new_in_node.indicator_col), + node.indicator_col, + ), + ) ) - new_selection = (*child_selections, added_selection) + new_selection = child_selections + added_selection return new_in_node, new_selection if isinstance(node, bigframes.core.nodes.AdditiveNode): @@ -204,28 +204,20 @@ def pull_up_selection( else: var_renames = {} assert isinstance(new_node, bigframes.core.nodes.AdditiveNode) - added_selections = ( - ( - bigframes.core.expression.DerefOp(var_renames.get(field.id, field.id)), - field.id, - ) + added_selections = tuple( + bigframes.core.nodes.AliasedRef.identity(field.id).remap_refs(var_renames) for field in node.added_fields ) - new_selection = (*child_selections, *added_selections) + new_selection = child_selections + added_selections return new_node, new_selection elif isinstance(node, bigframes.core.nodes.SelectionNode): child_node, child_selections = pull_up_selection( node.child, stop, rename_vars=rename_vars ) mapping = {out: ref.id for ref, out in child_selections} - 
new_selection = tuple( - ( - bigframes.core.expression.DerefOp(mapping[ref.id]), - out, - ) - for ref, out in node.input_output_pairs + return child_node, tuple( + ref.remap_refs(mapping) for ref in node.input_output_pairs ) - return child_node, new_selection raise ValueError(f"Couldn't pull up select from node: {node}") diff --git a/bigframes/core/rewrite/legacy_align.py b/bigframes/core/rewrite/legacy_align.py index 05641130fb..573a7026e4 100644 --- a/bigframes/core/rewrite/legacy_align.py +++ b/bigframes/core/rewrite/legacy_align.py @@ -57,7 +57,7 @@ def from_node_span( if isinstance(node, nodes.SelectionNode): return cls.from_node_span(node.child, target).select( - node.input_output_pairs + tuple(node.input_output_pairs) ) elif isinstance(node, nodes.ProjectionNode): return cls.from_node_span(node.child, target).project(node.assignments) @@ -228,7 +228,9 @@ def expand(self) -> nodes.BigFrameNode: root = nodes.FilterNode(child=root, predicate=self.predicate) if self.ordering: root = nodes.OrderByNode(child=root, by=self.ordering) - selection = tuple((scalar_exprs.DerefOp(id), id) for _, id in self.columns) + selection = tuple( + bigframes.core.nodes.AliasedRef.identity(id) for _, id in self.columns + ) return nodes.SelectionNode( child=nodes.ProjectionNode(child=root, assignments=self.columns), input_output_pairs=selection, diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py index 3f8c409b76..18e5004e1d 100644 --- a/bigframes/core/rewrite/order.py +++ b/bigframes/core/rewrite/order.py @@ -180,14 +180,10 @@ def pull_up_order_inner( col: bigframes.core.ids.ColumnId.unique() for col in unselected_order_cols } - all_selections = ( - *node.input_output_pairs, - *( - (bigframes.core.expression.DerefOp(k), v) - for k, v in new_selections.items() - ), + all_selections = node.input_output_pairs + tuple( + bigframes.core.nodes.AliasedRef(bigframes.core.expression.DerefOp(k), v) + for k, v in new_selections.items() ) - new_select_node = 
dataclasses.replace( node, child=child_result, input_output_pairs=all_selections ) @@ -288,7 +284,7 @@ def pull_order_concat( ) selection = tuple( ( - (bigframes.core.expression.DerefOp(id), id) + bigframes.core.nodes.AliasedRef.identity(id) for id in (*source.ids, table_id, offsets_id) ) ) @@ -396,7 +392,7 @@ def remove_order_strict( if result.ids != node.ids: return bigframes.core.nodes.SelectionNode( result, - tuple((bigframes.core.expression.DerefOp(id), id) for id in node.ids), + tuple(bigframes.core.nodes.AliasedRef.identity(id) for id in node.ids), ) return result @@ -428,7 +424,7 @@ def rename_cols( result_node = bigframes.core.nodes.SelectionNode( node, tuple( - (bigframes.core.expression.DerefOp(id), mappings.get(id, id)) + bigframes.core.nodes.AliasedRef.identity(id).remap_vars(mappings) for id in node.ids ), ) diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py new file mode 100644 index 0000000000..0b8534116d --- /dev/null +++ b/bigframes/core/rewrite/pruning.py @@ -0,0 +1,195 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import dataclasses +import functools +from typing import AbstractSet + +import bigframes.core.identifiers +import bigframes.core.nodes + + +def column_pruning( + root: bigframes.core.nodes.BigFrameNode, +) -> bigframes.core.nodes.BigFrameNode: + return bigframes.core.nodes.top_down(root, prune_columns) + + +def to_fixed(max_iterations: int = 100): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + previous_result = None + current_result = func(*args, **kwargs) + attempts = 1 + + while attempts < max_iterations: + if current_result == previous_result: + return current_result + previous_result = current_result + current_result = func(current_result) + attempts += 1 + + return current_result + + return wrapper + + return decorator + + +@to_fixed(max_iterations=100) +def prune_columns(node: bigframes.core.nodes.BigFrameNode): + if isinstance(node, bigframes.core.nodes.SelectionNode): + result = prune_selection_child(node) + elif isinstance(node, bigframes.core.nodes.AggregateNode): + result = node.replace_child(prune_node(node.child, node.consumed_ids)) + elif isinstance(node, bigframes.core.nodes.InNode): + result = dataclasses.replace( + node, + right_child=prune_node(node.right_child, frozenset([node.right_col.id])), + ) + else: + result = node + return result + + +def prune_selection_child( + selection: bigframes.core.nodes.SelectionNode, +) -> bigframes.core.nodes.BigFrameNode: + child = selection.child + + # Important to check this first + if list(selection.ids) == list(child.ids): + return child + + if isinstance(child, bigframes.core.nodes.SelectionNode): + return selection.remap_refs( + {id: ref.id for ref, id in child.input_output_pairs} + ).replace_child(child.child) + elif isinstance(child, bigframes.core.nodes.AdditiveNode): + if not set(field.id for field in child.added_fields) & selection.consumed_ids: + return selection.replace_child(child.additive_base) + return selection.replace_child( + child.replace_additive_base( + 
prune_node( + child.additive_base, selection.consumed_ids | child.referenced_ids + ) + ) + ) + elif isinstance(child, bigframes.core.nodes.ConcatNode): + indices = [ + list(child.ids).index(ref.id) for ref, _ in selection.input_output_pairs + ] + new_children = [] + for concat_node in child.child_nodes: + cc_ids = tuple(concat_node.ids) + sub_selection = tuple( + bigframes.core.nodes.AliasedRef.identity(cc_ids[i]) for i in indices + ) + new_children.append( + bigframes.core.nodes.SelectionNode(concat_node, sub_selection) + ) + return bigframes.core.nodes.ConcatNode( + children=tuple(new_children), output_ids=tuple(selection.ids) + ) + # Nodes that pass through input columns + elif isinstance( + child, + ( + bigframes.core.nodes.RandomSampleNode, + bigframes.core.nodes.ReversedNode, + bigframes.core.nodes.OrderByNode, + bigframes.core.nodes.FilterNode, + bigframes.core.nodes.SliceNode, + bigframes.core.nodes.JoinNode, + bigframes.core.nodes.ExplodeNode, + ), + ): + ids = selection.consumed_ids | child.referenced_ids + return selection.replace_child( + child.transform_children(lambda x: prune_node(x, ids)) + ) + elif isinstance(child, bigframes.core.nodes.AggregateNode): + return selection.replace_child(prune_aggregate(child, selection.consumed_ids)) + elif isinstance(child, bigframes.core.nodes.LeafNode): + return selection.replace_child(prune_leaf(child, selection.consumed_ids)) + return selection + + +def prune_node( + node: bigframes.core.nodes.BigFrameNode, + ids: AbstractSet[bigframes.core.ids.ColumnId], +): + # This clause is important, ensures idempotency, so can reach fixed point + if not (set(node.ids) - ids): + return node + else: + return bigframes.core.nodes.SelectionNode( + node, + tuple( + bigframes.core.nodes.AliasedRef.identity(id) + for id in node.ids + if id in ids + ), + ) + + +def prune_aggregate( + node: bigframes.core.nodes.AggregateNode, + used_cols: AbstractSet[bigframes.core.ids.ColumnId], +) -> bigframes.core.nodes.AggregateNode: + 
pruned_aggs = tuple(agg for agg in node.aggregations if agg[1] in used_cols) + return dataclasses.replace(node, aggregations=pruned_aggs) + + +@functools.singledispatch +def prune_leaf( + node: bigframes.core.nodes.BigFrameNode, + used_cols: AbstractSet[bigframes.core.ids.ColumnId], +): + ... + + +@prune_leaf.register +def prune_readlocal( + node: bigframes.core.nodes.ReadLocalNode, + selection: AbstractSet[bigframes.core.ids.ColumnId], +) -> bigframes.core.nodes.ReadLocalNode: + new_scan_list = filter_scanlist(node.scan_list, selection) + return dataclasses.replace( + node, + scan_list=new_scan_list, + offsets_col=node.offsets_col if (node.offsets_col in selection) else None, + ) + + +@prune_leaf.register +def prune_readtable( + node: bigframes.core.nodes.ReadTableNode, + selection: AbstractSet[bigframes.core.ids.ColumnId], +) -> bigframes.core.nodes.ReadTableNode: + new_scan_list = filter_scanlist(node.scan_list, selection) + return dataclasses.replace(node, scan_list=new_scan_list) + + +def filter_scanlist( + scanlist: bigframes.core.nodes.ScanList, + ids: AbstractSet[bigframes.core.ids.ColumnId], +): + result = bigframes.core.nodes.ScanList( + tuple(item for item in scanlist.items if item.id in ids) + ) + if len(result.items) == 0: + # We need to select something, or stuff breaks + result = bigframes.core.nodes.ScanList(scanlist.items[:1]) + return result diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index 102ffcf773..87a7720e2f 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -120,7 +120,9 @@ def drop_cols( ) -> nodes.SelectionNode: # adding a whole node that redefines the schema is a lot of overhead, should do something more efficient selections = tuple( - (scalar_exprs.DerefOp(id), id) for id in node.ids if id not in drop_cols + nodes.AliasedRef(scalar_exprs.DerefOp(id), id) + for id in node.ids + if id not in drop_cols ) return nodes.SelectionNode(node, selections) diff --git 
a/bigframes/dataframe.py b/bigframes/dataframe.py index 20f636b681..4ffa56c2e5 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -180,7 +180,10 @@ def __init__( ) block = block.set_index([r_mapping[idx_col] for idx_col in idx_cols]) if columns: - block = block.select_columns(list(columns)) # type:ignore + column_ids = [ + block.resolve_label_exact_or_error(label) for label in list(columns) + ] + block = block.select_columns(column_ids) # type:ignore if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) @@ -238,15 +241,7 @@ def _find_indices( return [self._block.value_columns.index(col_id) for col_id in col_ids] def _resolve_label_exact(self, label) -> Optional[str]: - """Returns the column id matching the label if there is exactly - one such column. If there are multiple columns with the same name, - raises an error. If there is no such column, returns None.""" - matches = self._block.label_to_col_id.get(label, []) - if len(matches) > 1: - raise ValueError( - f"Multiple columns matching id {label} were found. 
{constants.FEEDBACK_LINK}" - ) - return matches[0] if len(matches) != 0 else None + return self._block.resolve_label_exact(label) def _sql_names( self, diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 7b66191a11..72c49e124b 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -57,6 +57,8 @@ _TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT, ) +_MULTIMODAL_EMBEDDING_001_ENDPOINT = "multimodalembedding@001" + _GEMINI_PRO_ENDPOINT = "gemini-pro" _GEMINI_1P5_PRO_PREVIEW_ENDPOINT = "gemini-1.5-pro-preview-0514" _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT = "gemini-1.5-flash-preview-0514" @@ -762,6 +764,152 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TextEmbeddingGenerat return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger +class MultimodalEmbeddingGenerator(base.RetriableRemotePredictor): + """Multimodal embedding generator LLM model. + + .. note:: + BigFrames Blob is still under experiments. It may not work and subject to change in the future. + + Args: + model_name (str, Default to "multimodalembedding@001"): + The model for multimodal embedding. Can set to "multimodalembedding@001". Multimodal-embedding models returns model embeddings for text, image and video inputs. + Default to "multimodalembedding@001". + session (bigframes.Session or None): + BQ session to create the model. If None, use the global default session. + connection_name (str or None): + Connection to connect with remote service. str of the format ... + If None, use default connection in session context. 
+ """ + + def __init__( + self, + *, + model_name: Literal["multimodalembedding@001"] = "multimodalembedding@001", + session: Optional[bigframes.Session] = None, + connection_name: Optional[str] = None, + ): + if not bigframes.options.experiments.blob: + raise NotImplementedError() + self.model_name = model_name + self.session = session or global_session.get_global_session() + self.connection_name = connection_name + + self._bqml_model_factory = globals.bqml_model_factory() + self._bqml_model: core.BqmlModel = self._create_bqml_model() + + def _create_bqml_model(self): + # Parse and create connection if needed. + self.connection_name = self.session._create_bq_connection( + connection=self.connection_name, iam_role="aiplatform.user" + ) + + if self.model_name != _MULTIMODAL_EMBEDDING_001_ENDPOINT: + msg = _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=_MULTIMODAL_EMBEDDING_001_ENDPOINT, + ) + warnings.warn(msg) + + options = { + "endpoint": self.model_name, + } + return self._bqml_model_factory.create_remote_model( + session=self.session, connection_name=self.connection_name, options=options + ) + + @classmethod + def _from_bq( + cls, session: bigframes.Session, bq_model: bigquery.Model + ) -> MultimodalEmbeddingGenerator: + assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in bq_model._properties + assert "endpoint" in bq_model._properties["remoteModelInfo"] + assert "connection" in bq_model._properties["remoteModelInfo"] + + # Parse the remote model endpoint + bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] + model_connection = bq_model._properties["remoteModelInfo"]["connection"] + model_endpoint = bqml_endpoint.split("/")[-1] + + model = cls( + session=session, + model_name=model_endpoint, # type: ignore + connection_name=model_connection, + ) + + model._bqml_model = core.BqmlModel(session, bq_model) + return model + + @property + def _predict_func( + self, + ) -> Callable[ + 
[bigframes.dataframe.DataFrame, Mapping], bigframes.dataframe.DataFrame + ]: + return self._bqml_model.generate_embedding + + @property + def _status_col(self) -> str: + return _ML_GENERATE_EMBEDDING_STATUS + + def predict( + self, X: utils.ArrayType, *, max_retries: int = 0 + ) -> bigframes.dataframe.DataFrame: + """Predict the result from input DataFrame. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. + The content column must be of string type or BigFrames Blob of image or video. + + max_retries (int, default 0): + Max number of retries if the prediction for any rows failed. Each try needs to make progress (i.e. has successfully predicted rows) to continue the retry. + Each retry will append newly succeeded rows. When the max retries are reached, the remaining rows (the ones without successful predictions) will be appended to the end of the result. + + Returns: + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. + """ + if max_retries < 0: + raise ValueError( + f"max_retries must be larger than or equal to 0, but is {max_retries}." 
+ ) + + (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + + if len(X.columns) == 1: + # BQML identified the column by name + col_label = cast(blocks.Label, X.columns[0]) + X = X.rename(columns={col_label: "content"}) + + # TODO(garrettwu): remove transform to ObjRefRuntime when BQML supports ObjRef as input + if X["content"].dtype == dtypes.OBJ_REF_DTYPE: + X["content"] = X["content"].blob._get_runtime("R", with_metadata=True) + + options = { + "flatten_json_output": True, + } + + return self._predict_and_retry(X, options=options, max_retries=max_retries) + + def to_gbq( + self, model_name: str, replace: bool = False + ) -> MultimodalEmbeddingGenerator: + """Save the model to BigQuery. + + Args: + model_name (str): + The name of the model. + replace (bool, default False): + Determine whether to replace if the model already exists. Default to False. + + Returns: + MultimodalEmbeddingGenerator: Saved model.""" + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) + + @log_adapter.class_logger class GeminiTextGenerator(base.RetriableRemotePredictor): """Gemini text generator LLM model. 
diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 1f62eec0ff..3bba3699f3 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -76,6 +76,7 @@ llm._TEXT_EMBEDDING_005_ENDPOINT: llm.TextEmbeddingGenerator, llm._TEXT_EMBEDDING_004_ENDPOINT: llm.TextEmbeddingGenerator, llm._TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT: llm.TextEmbeddingGenerator, + llm._MULTIMODAL_EMBEDDING_001_ENDPOINT: llm.MultimodalEmbeddingGenerator, } ) @@ -100,6 +101,7 @@ def from_bq( llm.PaLM2TextEmbeddingGenerator, llm.Claude3TextGenerator, llm.TextEmbeddingGenerator, + llm.MultimodalEmbeddingGenerator, pipeline.Pipeline, compose.ColumnTransformer, preprocessing.PreprocessingType, diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index e1620485d5..e034fd00f7 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -100,6 +100,9 @@ def parse_model_endpoint(model_endpoint: str) -> tuple[str, Optional[str]]: model_name = model_endpoint version = None + if model_endpoint.startswith("multimodalembedding"): + return model_name, version + at_idx = model_endpoint.find("@") if at_idx != -1: version = model_endpoint[at_idx + 1 :] diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index e212381557..f5b82c2331 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -26,6 +26,6 @@ class ToTimedeltaOp(base_ops.UnaryOp): unit: typing.Literal["us", "ms", "s", "m", "h", "d", "W"] def output_type(self, *input_types): - if input_types[0] is not dtypes.INT_DTYPE: - raise TypeError("expected integer input") - return dtypes.TIMEDELTA_DTYPE + if input_types[0] in (dtypes.INT_DTYPE, dtypes.FLOAT_DTYPE): + return dtypes.TIMEDELTA_DTYPE + raise TypeError("expected integer or float input") diff --git a/bigframes/pandas/core/tools/timedeltas.py b/bigframes/pandas/core/tools/timedeltas.py index 0cedf425fe..070a41d62d 100644 --- a/bigframes/pandas/core/tools/timedeltas.py +++ 
b/bigframes/pandas/core/tools/timedeltas.py @@ -18,20 +18,26 @@ timedeltas as vendored_pandas_timedeltas, ) import pandas as pd +import pandas.api.types as pdtypes from bigframes import operations as ops -from bigframes import series +from bigframes import series, session def to_timedelta( - arg: typing.Union[series.Series, str, int, float], + arg, unit: typing.Optional[vendored_pandas_timedeltas.UnitChoices] = None, -) -> typing.Union[series.Series, pd.Timedelta]: - if not isinstance(arg, series.Series): - return pd.to_timedelta(arg, unit) + *, + session: typing.Optional[session.Session] = None, +): + if isinstance(arg, series.Series): + canonical_unit = "us" if unit is None else _canonicalize_unit(unit) + return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) - canonical_unit = "us" if unit is None else _canonicalize_unit(unit) - return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) + if pdtypes.is_list_like(arg): + return to_timedelta(series.Series(arg), unit, session=session) + + return pd.to_timedelta(arg, unit) to_timedelta.__doc__ = vendored_pandas_timedeltas.to_timedelta.__doc__ diff --git a/bigframes/streaming/dataframe.py b/bigframes/streaming/dataframe.py index 90c638b82e..2180a66207 100644 --- a/bigframes/streaming/dataframe.py +++ b/bigframes/streaming/dataframe.py @@ -24,7 +24,7 @@ from google.cloud import bigquery from bigframes import dataframe -from bigframes.core import log_adapter +from bigframes.core import log_adapter, nodes import bigframes.exceptions as bfe import bigframes.session @@ -54,7 +54,7 @@ def _curate_df_doc(doc: Optional[str]): class StreamingBase: - sql: str + _appends_sql: str _session: bigframes.session.Session def to_bigtable( @@ -124,7 +124,7 @@ def to_bigtable( can be examined. """ return _to_bigtable( - self.sql, + self._appends_sql, instance=instance, table=table, service_account_email=service_account_email, @@ -181,7 +181,7 @@ def to_pubsub( can be examined. 
""" return _to_pubsub( - self.sql, + self._appends_sql, topic=topic, service_account_email=service_account_email, session=self._session, @@ -218,6 +218,19 @@ def __init__(self, df: dataframe.DataFrame, *, create_key=0): def _from_table_df(cls, df: dataframe.DataFrame) -> StreamingDataFrame: return cls(df, create_key=cls._create_key) + @property + def _original_table(self): + def traverse(node: nodes.BigFrameNode): + if isinstance(node, nodes.ReadTableNode): + return f"{node.source.table.project_id}.{node.source.table.dataset_id}.{node.source.table.table_id}" + for child in node.child_nodes: + original_table = traverse(child) + if original_table: + return original_table + return None + + return traverse(self._df._block._expr.node) + def __getitem__(self, *args, **kwargs): return _return_type_wrapper(self._df.__getitem__, StreamingDataFrame)( *args, **kwargs @@ -266,6 +279,17 @@ def sql(self): sql.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.sql)) + # Patch for the required APPENDS clause + @property + def _appends_sql(self): + sql_str = self.sql + original_table = self._original_table + assert original_table is not None + + appends_clause = f"APPENDS(TABLE `{original_table}`, NULL, NULL)" + sql_str = sql_str.replace(f"`{original_table}`", appends_clause) + return sql_str + @property def _session(self): return self._df._session diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index c17a1788df..d57ab1c8ac 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -209,7 +209,7 @@ name: bigframes.bigquery - items: - name: GeoSeries - uid: bigframes.geopandas + uid: bigframes.geopandas.GeoSeries name: bigframes.geopandas - items: - name: Overview diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index 90186b297d..6b0682bb1a 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -118,7 
+118,7 @@ "metadata": {}, "outputs": [], "source": [ - "#%pip install --upgrade" + "#%pip install --upgrade bigframes" ] }, { diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e7556043af..1db89a074a 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -44,8 +44,15 @@ def test_df_construct_copy(scalars_dfs): columns = ["int64_col", "string_col", "float64_col"] scalars_df, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame(scalars_df, columns=columns).to_pandas() - pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) + # Make the mapping from label to col_id non-trivial + bf_df = scalars_df.copy() + bf_df["int64_col"] = bf_df["int64_col"] / 2 + pd_df = scalars_pandas_df.copy() + pd_df["int64_col"] = pd_df["int64_col"] / 2 + + bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas() + + pd_result = pd.DataFrame(pd_df, columns=columns) pandas.testing.assert_frame_equal(bf_result, pd_result) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index e46d073056..4b4264e33c 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -763,7 +763,7 @@ def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc): "micros", ], ) -def test_to_timedelta_with_bf_series(session, unit): +def test_to_timedelta_with_bf_integer_series(session, unit): bf_series = bpd.Series([1, 2, 3], session=session) pd_series = pd.Series([1, 2, 3]) @@ -779,6 +779,42 @@ def test_to_timedelta_with_bf_series(session, unit): ) +def test_to_timedelta_with_bf_float_series_value_rounded_down(session): + bf_series = bpd.Series([1.2, 2.9], session=session) + + actual_result = ( + typing.cast(bpd.Series, bpd.to_timedelta(bf_series, "us")) + .to_pandas() + .astype("timedelta64[ns]") + ) + + expected_result = pd.Series([pd.Timedelta(1, "us"), pd.Timedelta(2, "us")]) + pd.testing.assert_series_equal( + actual_result, 
expected_result, check_index_type=False + ) + + +@pytest.mark.parametrize( + "input", + [ + pytest.param([1, 2, 3], id="list"), + pytest.param((1, 2, 3), id="tuple"), + pytest.param(pd.Series([1, 2, 3]), id="pandas-series"), + ], +) +def test_to_timedelta_with_list_like_input(session, input): + actual_result = ( + typing.cast(bpd.Series, bpd.to_timedelta(input, "s", session=session)) + .to_pandas() + .astype("timedelta64[ns]") + ) + + expected_result = pd.Series(pd.to_timedelta(input, "s")) + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + @pytest.mark.parametrize( "unit", ["Y", "M", "whatever"], diff --git a/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py b/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py index 2fd0e9186e..fce0643783 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py @@ -63,7 +63,6 @@ typecode.VARBINARY: dt.Binary, typecode.VARCHAR: dt.String, typecode.VARIANT: dt.JSON, - typecode.UNIQUEIDENTIFIER: dt.UUID, typecode.SET: partial(dt.Array, dt.string), ############################# # Unsupported sqlglot types # diff --git a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py index 652f04757b..a252f116dd 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py @@ -359,7 +359,7 @@ def wrap(node, _, **kwargs): return CTE(new) if node in ctes else new result = simplified.replace(wrap) - ctes = reversed([cte.parent for cte in result.find(CTE)]) + ctes = [cte.parent for cte in result.find(CTE, ordered=True)] return result, ctes diff --git a/third_party/bigframes_vendored/ibis/common/graph.py b/third_party/bigframes_vendored/ibis/common/graph.py index 1a3fc6c543..6e7995ec03 100644 --- a/third_party/bigframes_vendored/ibis/common/graph.py 
+++ b/third_party/bigframes_vendored/ibis/common/graph.py @@ -343,6 +343,7 @@ def find( finder: FinderLike, filter: Optional[FinderLike] = None, context: Optional[dict] = None, + ordered: bool = False, ) -> list[Node]: """Find all nodes matching a given pattern or type in the graph. @@ -360,6 +361,8 @@ def find( the given filter and stop otherwise. context Optional context to use if `finder` or `filter` is a pattern. + ordered + Emit nodes in topological order if `True`. Returns ------- @@ -369,6 +372,8 @@ def find( """ graph = Graph.from_bfs(self, filter=filter, context=context) finder = _coerce_finder(finder, context) + if ordered: + graph, _ = graph.toposort() return [node for node in graph.nodes() if finder(node)] @experimental From ba5beb397d578ee5438c8ab64cfa05ab10954281 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Feb 2025 18:59:08 +0000 Subject: [PATCH 16/75] preparing test files --- tests/data/ratings.jsonl | 0 tests/data/ratings_schema.json | 17 ++++++++++++++ tests/system/conftest.py | 14 ++++++++++++ tests/system/large/ml/test_decomposition.py | 25 +++++++++++++++++++++ 4 files changed, 56 insertions(+) create mode 100644 tests/data/ratings.jsonl create mode 100644 tests/data/ratings_schema.json diff --git a/tests/data/ratings.jsonl b/tests/data/ratings.jsonl new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/ratings_schema.json b/tests/data/ratings_schema.json new file mode 100644 index 0000000000..ca34a530ee --- /dev/null +++ b/tests/data/ratings_schema.json @@ -0,0 +1,17 @@ +[ + { + "mode": "NULLABLE", + "name": "user_id", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "item_id", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "ratings", + "type": "FLOAT" + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 29234bc4ef..e4bff8cdcc 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -305,6 +305,7 @@ def load_test_data_tables( ("repeated", 
"repeated_schema.json", "repeated.jsonl"), ("json", "json_schema.json", "json.jsonl"), ("penguins", "penguins_schema.json", "penguins.jsonl"), + ("ratings", "ratings_schema.json", "ratings.jsonl"), ("time_series", "time_series_schema.json", "time_series.jsonl"), ("hockey_players", "hockey_players.json", "hockey_players.jsonl"), ("matrix_2by3", "matrix_2by3.json", "matrix_2by3.jsonl"), @@ -401,6 +402,11 @@ def penguins_table_id(test_data_tables) -> str: return test_data_tables["penguins"] +@pytest.fixture(scope="session") +def ratings_table_id(test_data_tables) -> str: + return test_data_tables["ratings"] + + @pytest.fixture(scope="session") def urban_areas_table_id(test_data_tables) -> str: return test_data_tables["urban_areas"] @@ -743,6 +749,14 @@ def penguins_df_null_index( return unordered_session.read_gbq(penguins_table_id) +@pytest.fixture(scope="session") +def ratings_df_default_index( + ratings_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """DataFrame pointing at test data.""" + return session.read_gbq(ratings_table_id) + + @pytest.fixture(scope="session") def time_series_df_default_index( time_series_table_id: str, session: bigframes.Session diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 49aa985189..2544e8dba0 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -163,3 +163,28 @@ def test_decomposition_configure_fit_load_none_component( in reloaded_model._bqml_model.model_name ) assert reloaded_model.n_components == 7 + + +def test_decomposition_mf_configure_fit_load_none_component( + ratings_df_default_index, dataset_id +): + model = decomposition.MatrixFactorization( + num_factors=6, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model.fit(ratings_df_default_index) + + # save, load, check n_components. 
Here n_components is the column size of the training input. + # reloaded_model = model.to_gbq( + # f"{dataset_id}.temp_configured_pca_model", replace=True + # ) + # assert reloaded_model._bqml_model is not None + # assert ( + # f"{dataset_id}.temp_configured_pca_model" + # in reloaded_model._bqml_model.model_name + # ) + assert model.num_factors == 6 From 857783301e558c8af6b362d3e32a40ce9f7b5c15 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 13 Feb 2025 20:44:56 +0000 Subject: [PATCH 17/75] add test data --- tests/data/ratings.jsonl | 20 ++++++++++++++++++++ tests/system/large/ml/test_decomposition.py | 4 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/data/ratings.jsonl b/tests/data/ratings.jsonl index e69de29bb2..919b61c350 100644 --- a/tests/data/ratings.jsonl +++ b/tests/data/ratings.jsonl @@ -0,0 +1,20 @@ +{"user_id": 1, "item_id": 2, "ratings": 4.0} +{"user_id": 1, "item_id": 5, "ratings": 3.0} +{"user_id": 2, "item_id": 1, "ratings": 5.0} +{"user_id": 2, "item_id": 3, "ratings": 2.0} +{"user_id": 3, "item_id": 4, "ratings": 4.5} +{"user_id": 3, "item_id": 7, "ratings": 3.5} +{"user_id": 4, "item_id": 2, "ratings": 1.0} +{"user_id": 4, "item_id": 8, "ratings": 5.0} +{"user_id": 5, "item_id": 3, "ratings": 4.0} +{"user_id": 5, "item_id": 9, "ratings": 2.5} +{"user_id": 6, "item_id": 1, "ratings": 3.0} +{"user_id": 6, "item_id": 6, "ratings": 4.5} +{"user_id": 7, "item_id": 5, "ratings": 5.0} +{"user_id": 7, "item_id": 10, "ratings": 1.5} +{"user_id": 8, "item_id": 4, "ratings": 2.0} +{"user_id": 8, "item_id": 7, "ratings": 4.0} +{"user_id": 9, "item_id": 2, "ratings": 3.5} +{"user_id": 9, "item_id": 9, "ratings": 5.0} +{"user_id": 10, "item_id": 3, "ratings": 4.5} +{"user_id": 10, "item_id": 8, "ratings": 2.5} diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 2544e8dba0..d2320c570a 100644 --- a/tests/system/large/ml/test_decomposition.py +++ 
b/tests/system/large/ml/test_decomposition.py @@ -172,8 +172,8 @@ def test_decomposition_mf_configure_fit_load_none_component( num_factors=6, feedback_type="explicit", user_col="user_id", - item_col="item_col", - rating_col="rating_col", + item_col="item_id", + rating_col="ratings", l2_reg=9.83, ) model.fit(ratings_df_default_index) From 4b7b4dba0d043d93dc41186fde8cc9d597d224e4 Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 21 Feb 2025 01:40:07 +0000 Subject: [PATCH 18/75] new error: to_gbq column names need to be changed? --- demo.ipynb | 758 ++++++++++++++++++++ tests/data/ratings_schema.json | 2 +- tests/system/large/ml/test_decomposition.py | 35 +- 3 files changed, 782 insertions(+), 13 deletions(-) create mode 100644 demo.ipynb diff --git a/demo.ipynb b/demo.ipynb new file mode 100644 index 0000000000..93e6f121f9 --- /dev/null +++ b/demo.ipynb @@ -0,0 +1,758 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 39ca6c3f-1c37-4f8e-8252-33cf6abfa340 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 7dda7bc2-75b2-42b5-918b-41dd0540eb53 is DONE. 24.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 4b99d068-1e68-4a86-bd0b-52d40ef6a270 is DONE. 40.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_iditem_idrating
043549684.0
1362235215.0
255439202.0
344531755.0
455352354.0
5442210974.0
6311913564.0
7603712314.0
88511963.0
931114353.0
1054036485.0
11360127343.0
12465529494.0
13127430935.0
1415213504.0
1530724543.0
16331413304.0
17376227191.0
18168721693.0
1997030814.0
20126522485.0
2115021044.0
221945004.0
23352110883.0
24188935673.0
\n", + "

25 rows × 3 columns

\n", + "
[1000209 rows x 3 columns in total]" + ], + "text/plain": [ + " user_id item_id rating\n", + "0 4354 968 4.0\n", + "1 3622 3521 5.0\n", + "2 5543 920 2.0\n", + "3 445 3175 5.0\n", + "4 5535 235 4.0\n", + "5 4422 1097 4.0\n", + "6 3119 1356 4.0\n", + "7 6037 1231 4.0\n", + "8 851 196 3.0\n", + "9 3111 435 3.0\n", + "10 5403 648 5.0\n", + "11 3601 2734 3.0\n", + "12 4655 2949 4.0\n", + "13 1274 3093 5.0\n", + "14 1521 350 4.0\n", + "15 3072 454 3.0\n", + "16 3314 1330 4.0\n", + "17 3762 2719 1.0\n", + "18 1687 2169 3.0\n", + "19 970 3081 4.0\n", + "20 1265 2248 5.0\n", + "21 1502 104 4.0\n", + "22 194 500 4.0\n", + "23 3521 1088 3.0\n", + "24 1889 3567 3.0\n", + "...\n", + "\n", + "[1000209 rows x 3 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import bigframes.pandas as bpd\n", + "from bigframes.ml import decomposition\n", + "\n", + "bq_df = bpd.read_gbq('bqml_tutorial.ratings', columns=('user_id', 'item_id', 'rating'))\n", + "bq_df" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MatrixFactorization(item_col='item_col', l2_reg=9.83, num_factors=34,\n", + " rating_col='rating_col', user_col='user_id')" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = decomposition.MatrixFactorization(\n", + " num_factors=34,\n", + " feedback_type='explicit',\n", + " user_col='user_id',\n", + " item_col='item_col',\n", + " rating_col='rating_col',\n", + " l2_reg=9.83,\n", + ")\n", + "\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 50f616db-afae-40da-bc95-f724bb8a5c84 is DONE. 24.0 MB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d13d556d-e011-40a0-9da8-5c0918cf1ef1 is DONE. 537.2 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "MatrixFactorization(item_col='item_col', l2_reg=9.83, num_factors=34,\n", + " rating_col='rating_col', user_col='user_id')" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fitted = model.fit(bq_df.rename(columns={'rating': 'rating_col', 'item_id': 'item_col'}))\n", + "fitted\n", + "# scored = model.score(fitted)\n", + "\n", + "# scored" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 66684505-f14b-423b-8105-93521064036a is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 4ec28d78-f0c1-4456-8c08-60b6982ee52f is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
00.4852820.3953410.0255350.3899060.6831990.683199
\n", + "

1 rows × 6 columns

\n", + "
[1 rows x 6 columns in total]" + ], + "text/plain": [ + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + "0 0.485282 0.395341 0.025535 \n", + "\n", + " median_absolute_error r2_score explained_variance \n", + "0 0.389906 0.683199 0.683199 \n", + "\n", + "[1 rows x 6 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scored = model.score(fitted)\n", + "\n", + "scored" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job e7dcfb81-70af-4d65-9c2a-b42591812d0e is DONE. 29.5 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job fcb53dd1-f9cb-4872-b7bf-3d2f0da89b00 is DONE. 40.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 10436512-dada-4dfc-a3ff-94b480a5e890 is DONE. 48.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
predicted_rating_coluser_iditem_colrating
03.34813143549684.0
15.22349362235215.0
21.82017355439202.0
34.70222844531755.0
43.20694955352354.0
54.690283442210974.0
63.944585311913564.0
74.275766603712314.0
83.4345798511963.0
91.82747331114353.0
104.13092854036485.0
113.231195360127343.0
123.750037465529494.0
133.858951127430935.0
143.3485215213504.0
152.95628430724543.0
163.831856331413304.0
170.805804376227191.0
183.65957168721693.0
193.0319797030814.0
203.384926126522485.0
214.17324315021044.0
223.9184351945004.0
232.451965352110883.0
242.982963188935673.0
\n", + "

25 rows × 4 columns

\n", + "
[1000209 rows x 4 columns in total]" + ], + "text/plain": [ + " predicted_rating_col user_id item_col rating\n", + "0 3.348131 4354 968 4.0\n", + "1 5.22349 3622 3521 5.0\n", + "2 1.820173 5543 920 2.0\n", + "3 4.702228 445 3175 5.0\n", + "4 3.206949 5535 235 4.0\n", + "5 4.690283 4422 1097 4.0\n", + "6 3.944585 3119 1356 4.0\n", + "7 4.275766 6037 1231 4.0\n", + "8 3.434579 851 196 3.0\n", + "9 1.827473 3111 435 3.0\n", + "10 4.130928 5403 648 5.0\n", + "11 3.231195 3601 2734 3.0\n", + "12 3.750037 4655 2949 4.0\n", + "13 3.858951 1274 3093 5.0\n", + "14 3.34852 1521 350 4.0\n", + "15 2.956284 3072 454 3.0\n", + "16 3.831856 3314 1330 4.0\n", + "17 0.805804 3762 2719 1.0\n", + "18 3.65957 1687 2169 3.0\n", + "19 3.03197 970 3081 4.0\n", + "20 3.384926 1265 2248 5.0\n", + "21 4.173243 1502 104 4.0\n", + "22 3.918435 194 500 4.0\n", + "23 2.451965 3521 1088 3.0\n", + "24 2.982963 1889 3567 3.0\n", + "...\n", + "\n", + "[1000209 rows x 4 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# predict_df = scored[['user_id']['item_col']]\n", + "# model.predict(predict_df)\n", + "model.predict(bq_df.rename(columns={'item_id': 'item_col'}))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/data/ratings_schema.json b/tests/data/ratings_schema.json index ca34a530ee..1867a8c801 100644 --- a/tests/data/ratings_schema.json +++ b/tests/data/ratings_schema.json @@ -7,7 +7,7 @@ { "mode": "NULLABLE", "name": "item_id", - "type": "STRING" + "type": "INT64" }, { "mode": "NULLABLE", diff --git 
a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index d2320c570a..36f5d83c75 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -165,8 +165,8 @@ def test_decomposition_configure_fit_load_none_component( assert reloaded_model.n_components == 7 -def test_decomposition_mf_configure_fit_load_none_component( - ratings_df_default_index, dataset_id +def test_decomposition_mf_configure_fit_load( + session, ratings_df_default_index, dataset_id ): model = decomposition.MatrixFactorization( num_factors=6, @@ -178,13 +178,24 @@ def test_decomposition_mf_configure_fit_load_none_component( ) model.fit(ratings_df_default_index) - # save, load, check n_components. Here n_components is the column size of the training input. - # reloaded_model = model.to_gbq( - # f"{dataset_id}.temp_configured_pca_model", replace=True - # ) - # assert reloaded_model._bqml_model is not None - # assert ( - # f"{dataset_id}.temp_configured_pca_model" - # in reloaded_model._bqml_model.model_name - # ) - assert model.num_factors == 6 + reloaded_model = model.to_gbq( + f"{dataset_id}.temp_configured_mf_model", replace=True + ) + + new_ratings = session.read_pandas( + pd.DataFrame( + { + "user_id": ["11", "12", "13"], + "item_id": [1, 2, 3], + "ratings": [1.0, 2.0, 3.0], + } + ) + ) + + reloaded_model.score(new_ratings) + + result = reloaded_model.predict( + new_ratings.rename(columns={"item_id": "item_col"}) + ).to_pandas() + + assert result is not None From 8d55eac81aec7888e6eefa08fd86ed921c3b115a Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 24 Feb 2025 21:51:40 +0000 Subject: [PATCH 19/75] Merge branch 'main' into b338873783-matrix-factorization --- CHANGELOG.md | 32 ++++ bigframes/core/compile/aggregate_compiler.py | 14 +- bigframes/core/compile/ibis_types.py | 17 ++- bigframes/core/compile/scalar_op_compiler.py | 5 + bigframes/core/rewrite/timedeltas.py | 55 ++++++- bigframes/core/schema.py | 6 
+- bigframes/dataframe.py | 13 +- bigframes/dtypes.py | 1 + bigframes/functions/_function_session.py | 6 + bigframes/functions/function.py | 12 +- bigframes/ml/metrics/_metrics.py | 7 +- bigframes/operations/__init__.py | 2 + bigframes/operations/aggregations.py | 25 ++- bigframes/operations/json_ops.py | 14 +- bigframes/operations/remote_function_ops.py | 40 ++--- bigframes/operations/timedelta_ops.py | 27 +++- bigframes/series.py | 22 ++- bigframes/version.py | 2 +- noxfile.py | 20 ++- samples/snippets/bigquery_modules_test.py | 69 +++++++++ ...ingle_timeseries_forecasting_model_test.py | 64 ++++++++ scripts/test_publish_api_coverage.py | 2 + setup.py | 11 +- testing/constraints-3.9.txt | 1 - tests/system/conftest.py | 5 + .../large/functions/test_remote_function.py | 4 + .../small/functions/test_remote_function.py | 143 ++++++++++++++++-- tests/system/small/ml/test_metrics.py | 14 +- .../small/operations/test_timedeltas.py | 46 ++++++ tests/system/small/test_pandas.py | 15 ++ tests/system/small/test_series.py | 2 + tests/unit/functions/test_remote_function.py | 6 + tests/unit/ml/test_api_primitives.py | 5 +- tests/unit/ml/test_compose.py | 4 +- tests/unit/ml/test_pipeline.py | 9 +- .../bigframes_vendored/pandas/core/frame.py | 4 +- .../bigframes_vendored/pandas/core/series.py | 4 +- .../sklearn/metrics/_ranking.py | 20 ++- .../bigframes_vendored/tpch/queries/q9.py | 14 +- third_party/bigframes_vendored/version.py | 2 +- 40 files changed, 635 insertions(+), 129 deletions(-) create mode 100644 samples/snippets/bigquery_modules_test.py create mode 100644 samples/snippets/limit_single_timeseries_forecasting_model_test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b301f85a6a..24a1d8cb62 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,38 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.38.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.37.0...v1.38.0) (2025-02-24) + + +### Features + +* (Preview) Support diff aggregation 
for timestamp series. ([#1405](https://github.com/googleapis/python-bigquery-dataframes/issues/1405)) ([abe48d6](https://github.com/googleapis/python-bigquery-dataframes/commit/abe48d6f13a954534460fa14c9337e1085d9fbb3)) +* Add `GeoSeries.from_wkt() `and `GeoSeries.to_wkt()` ([#1401](https://github.com/googleapis/python-bigquery-dataframes/issues/1401)) ([2993b28](https://github.com/googleapis/python-bigquery-dataframes/commit/2993b283966960430ad8482f40f177e276db2d64)) +* Support DF.__array__(copy=True) ([#1403](https://github.com/googleapis/python-bigquery-dataframes/issues/1403)) ([693ed8c](https://github.com/googleapis/python-bigquery-dataframes/commit/693ed8cfb1ecc3af161801225d3e9cda489c29dd)) +* Support routines with ARRAY return type in `read_gbq_function` ([#1412](https://github.com/googleapis/python-bigquery-dataframes/issues/1412)) ([4b60049](https://github.com/googleapis/python-bigquery-dataframes/commit/4b60049e8362bfb07c136d8b2eb02b984d71f084)) + + +### Bug Fixes + +* Calling to_timdelta() over timedeltas no longer changes their values ([#1411](https://github.com/googleapis/python-bigquery-dataframes/issues/1411)) ([650a190](https://github.com/googleapis/python-bigquery-dataframes/commit/650a1907fdf84897eb7aa288863ee27d938e0879)) +* Replace empty dict with None to avoid mutable default arguments ([#1416](https://github.com/googleapis/python-bigquery-dataframes/issues/1416)) ([fa4e3ad](https://github.com/googleapis/python-bigquery-dataframes/commit/fa4e3ad8bcd5db56fa26b26609cc7e58b1edf498)) + + +### Performance Improvements + +* Avoid redundant SQL casts ([#1399](https://github.com/googleapis/python-bigquery-dataframes/issues/1399)) ([6ee48d5](https://github.com/googleapis/python-bigquery-dataframes/commit/6ee48d5c16870f1caa99c3f658c2c1a0e14be749)) + + +### Dependencies + +* Remove scikit-learn and sqlalchemy as required dependencies ([#1296](https://github.com/googleapis/python-bigquery-dataframes/issues/1296)) 
([fd8bc89](https://github.com/googleapis/python-bigquery-dataframes/commit/fd8bc894bdbdf551ebbec1fb93832588371ae6af)) + + +### Documentation + +* Add samples using SQL methods via the `bigframes.bigquery` module ([#1358](https://github.com/googleapis/python-bigquery-dataframes/issues/1358)) ([f54e768](https://github.com/googleapis/python-bigquery-dataframes/commit/f54e7688fda6372c6decc9b61796b0272d803c79)) +* Add snippets for visualizing a time series and creating a time series model for the Limit forecasted values in time series model tutorial ([#1310](https://github.com/googleapis/python-bigquery-dataframes/issues/1310)) ([c6c9120](https://github.com/googleapis/python-bigquery-dataframes/commit/c6c9120e839647e5b3cb97f04a8d90cc8690b8a3)) + ## [1.37.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.36.0...v1.37.0) (2025-02-19) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 4ec0b270ed..a17b69815c 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -231,7 +231,11 @@ def _( column: ibis_types.NumericColumn, window=None, ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.quantile(op.q), window) + result = column.quantile(op.q) + if op.should_floor_result: + result = result.floor() # type:ignore + + return _apply_window_if_present(result, window) @compile_unary_agg.register @@ -242,7 +246,8 @@ def _( window=None, # order_by: typing.Sequence[ibis_types.Value] = [], ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.mean(), window) + result = column.mean().floor() if op.should_floor_result else column.mean() + return _apply_window_if_present(result, window) @compile_unary_agg.register @@ -306,10 +311,11 @@ def _( @numeric_op def _( op: agg_ops.StdOp, - x: ibis_types.Column, + x: ibis_types.NumericColumn, window=None, ) -> ibis_types.Value: - return 
_apply_window_if_present(cast(ibis_types.NumericColumn, x).std(), window) + result = x.std().floor() if op.should_floor_result else x.std() + return _apply_window_if_present(result, window) @compile_unary_agg.register diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 2dcc1b3c8a..c47c6cf07b 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -463,10 +463,19 @@ def ibis_array_output_type_from_python_type(t: type) -> ibis_dtypes.DataType: return python_type_to_ibis_type(t) -def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> ibis_dtypes.DataType: +def ibis_type_from_bigquery_type( + type_: bigquery.StandardSqlDataType, +) -> ibis_dtypes.DataType: """Convert bq type to ibis. Only to be used for remote functions, does not handle all types.""" - if tk not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: + if type_.type_kind not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: raise UnsupportedTypeError( - tk, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + type_.type_kind, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + ) + elif type_.type_kind == "ARRAY": + return ibis_dtypes.Array( + value_type=ibis_type_from_bigquery_type( + typing.cast(bigquery.StandardSqlDataType, type_.array_element_type) + ) ) - return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) + else: + return third_party_ibis_bqtypes.BigQueryType.to_ibis(type_.type_kind) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 923ec8c81d..7111406646 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1186,6 +1186,11 @@ def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp): ).floor() +@scalar_op_compiler.register_unary_op(ops.timedelta_floor_op) +def timedelta_floor_op_impl(x: ibis_types.NumericValue): + return x.floor() + + 
@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): ibis_node = getattr(op.func, "ibis_node", None) diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index dad474e5a1..e21e0b6bf2 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -70,6 +70,19 @@ def rewrite_timedelta_expressions(root: nodes.BigFrameNode) -> nodes.BigFrameNod root.skip_reproject_unsafe, ) + if isinstance(root, nodes.AggregateNode): + updated_aggregations = tuple( + (_rewrite_aggregation(agg, root.child.schema), col_id) + for agg, col_id in root.aggregations + ) + return nodes.AggregateNode( + root.child, + updated_aggregations, + root.by_column_ids, + root.order_by, + root.dropna, + ) + return root @@ -125,6 +138,9 @@ def _rewrite_op_expr( # but for timedeltas: int(timedelta) // float => int(timedelta) return _rewrite_floordiv_op(inputs[0], inputs[1]) + if isinstance(expr.op, ops.ToTimedeltaOp): + return _rewrite_to_timedelta_op(expr.op, inputs[0]) + return _TypedExpr.create_op_expr(expr.op, *inputs) @@ -154,9 +170,9 @@ def _rewrite_mul_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.mul_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) if dtypes.is_numeric(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -165,7 +181,7 @@ def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.div_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return 
_TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -174,28 +190,53 @@ def _rewrite_floordiv_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.floordiv_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result +def _rewrite_to_timedelta_op(op: ops.ToTimedeltaOp, arg: _TypedExpr): + if arg.dtype is dtypes.TIMEDELTA_DTYPE: + # Do nothing for values that are already timedeltas + return arg + + return _TypedExpr.create_op_expr(op, arg) + + @functools.cache def _rewrite_aggregation( aggregation: ex.Aggregation, schema: schema.ArraySchema ) -> ex.Aggregation: if not isinstance(aggregation, ex.UnaryAggregation): return aggregation - if not isinstance(aggregation.op, aggs.DiffOp): - return aggregation if isinstance(aggregation.arg, ex.DerefOp): input_type = schema.get_type(aggregation.arg.id.sql) else: input_type = aggregation.arg.dtype - if dtypes.is_datetime_like(input_type): + if isinstance(aggregation.op, aggs.DiffOp) and dtypes.is_datetime_like(input_type): return ex.UnaryAggregation( aggs.TimeSeriesDiffOp(aggregation.op.periods), aggregation.arg ) + if isinstance(aggregation.op, aggs.StdOp) and input_type is dtypes.TIMEDELTA_DTYPE: + return ex.UnaryAggregation( + aggs.StdOp(should_floor_result=True), aggregation.arg + ) + + if isinstance(aggregation.op, aggs.MeanOp) and input_type is dtypes.TIMEDELTA_DTYPE: + return ex.UnaryAggregation( + aggs.MeanOp(should_floor_result=True), aggregation.arg + ) + + if ( + isinstance(aggregation.op, aggs.QuantileOp) + and input_type is dtypes.TIMEDELTA_DTYPE + ): + return ex.UnaryAggregation( + aggs.QuantileOp(q=aggregation.op.q, should_floor_result=True), + aggregation.arg, + ) + return aggregation diff --git 
a/bigframes/core/schema.py b/bigframes/core/schema.py index e3808dfffd..c379db72be 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -41,8 +41,12 @@ class ArraySchema: def from_bq_table( cls, table: google.cloud.bigquery.Table, - column_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = {}, + column_type_overrides: typing.Optional[ + typing.Dict[str, bigframes.dtypes.Dtype] + ] = None, ): + if column_type_overrides is None: + column_type_overrides = {} items = tuple( SchemaItem(name, column_type_overrides.get(name, dtype)) for name, dtype in bigframes.dtypes.bf_type_from_type_kind( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c02b182ee3..caf1b62e07 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3705,7 +3705,9 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: + if copy is False: + raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) __array__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__array__) @@ -4086,9 +4088,12 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) result_series.name = None - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended + # to be an array, reconstruct the array from the string assuming it + # is a json serialized form of the array. 
+ if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index e4db904210..54b621a0f8 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -874,4 +874,5 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: "INT64", "INTEGER", "STRING", + "ARRAY", } diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index a0518978a3..93b5c4c596 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -501,6 +501,7 @@ def try_delattr(attr): try_delattr("bigframes_remote_function") try_delattr("input_dtypes") try_delattr("output_dtype") + try_delattr("bigframes_bigquery_function_output_dtype") try_delattr("is_row_processor") try_delattr("ibis_node") @@ -589,6 +590,11 @@ def try_delattr(attr): ibis_signature.output_type ) ) + func.bigframes_bigquery_function_output_dtype = ( + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_output_type_for_bqrf + ) + ) func.is_row_processor = is_row_processor func.ibis_node = node diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index ef2c81a953..c2809b96eb 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -56,8 +56,10 @@ class ReturnTypeMissingError(ValueError): # TODO: Move this to compile folder def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignature: if routine.return_type: - ibis_output_type = bigframes.core.compile.ibis_types.ibis_type_from_type_kind( - routine.return_type.type_kind + ibis_output_type = ( + bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( + routine.return_type + ) ) else: raise ReturnTypeMissingError @@ -82,8 +84,8 @@ def ibis_signature_from_routine(routine: bigquery.Routine) -> 
_utils.IbisSignatu return _utils.IbisSignature( parameter_names=[arg.name for arg in routine.arguments], input_types=[ - bigframes.core.compile.ibis_types.ibis_type_from_type_kind( - arg.data_type.type_kind + bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( + arg.data_type ) if arg.data_type else None @@ -233,6 +235,8 @@ def func(*bigframes_args, **bigframes_kwargs): else ibis_signature.output_type ) + func.bigframes_bigquery_function_output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_signature.output_type) # type: ignore + func.is_row_processor = is_row_processor # type: ignore func.ibis_node = node # type: ignore return func diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 90df6f9539..658818b261 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -25,7 +25,6 @@ import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression import numpy as np import pandas as pd -import sklearn.metrics as sklearn_metrics # type: ignore from bigframes.ml import utils import bigframes.pandas as bpd @@ -176,9 +175,9 @@ def auc( ) -> float: x_series, y_series = utils.batch_convert_to_series(x, y) - # TODO(b/286410053) Support ML exceptions and error handling. 
- auc = sklearn_metrics.auc(x_series.to_pandas(), y_series.to_pandas()) - return auc + x_pandas = x_series.to_pandas() + y_pandas = y_series.to_pandas() + return vendored_metrics_ranking.auc(x_pandas, y_pandas) auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e4e4bf7ef3..7e6f1f793c 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -184,6 +184,7 @@ from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op from bigframes.operations.timedelta_ops import ( + timedelta_floor_op, timestamp_add_op, timestamp_sub_op, ToTimedeltaOp, @@ -259,6 +260,7 @@ "second_op", "normalize_op", # Timedelta ops + "timedelta_floor_op", "timestamp_add_op", "timestamp_sub_op", "ToTimedeltaOp", diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index e9d102b42d..bf6016bb2e 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -142,13 +142,16 @@ class SumOp(UnaryAggregateOp): name: ClassVar[str] = "sum" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if not dtypes.is_numeric(input_types[0]): - raise TypeError(f"Type {input_types[0]} is not numeric") - if pd.api.types.is_bool_dtype(input_types[0]): - return dtypes.INT_DTYPE - else: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + + if dtypes.is_numeric(input_types[0]): + if pd.api.types.is_bool_dtype(input_types[0]): + return dtypes.INT_DTYPE return input_types[0] + raise TypeError(f"Type {input_types[0]} is not numeric or timedelta") + @dataclasses.dataclass(frozen=True) class MedianOp(UnaryAggregateOp): @@ -171,6 +174,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) class 
QuantileOp(UnaryAggregateOp): q: float + should_floor_result: bool = False @property def name(self): @@ -181,6 +185,8 @@ def order_independent(self) -> bool: return True def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -224,7 +230,11 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class MeanOp(UnaryAggregateOp): name: ClassVar[str] = "mean" + should_floor_result: bool = False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -262,7 +272,12 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class StdOp(UnaryAggregateOp): name: ClassVar[str] = "std" + should_floor_result: bool = False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + return signatures.FixedOutputType( dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" ).output_type(input_types[0]) diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 1daacf4e6b..c9ce633cae 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -31,7 +31,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." 
+ f" Received type: {input_type}" ) return input_type @@ -46,7 +46,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return pd.ArrowDtype( @@ -63,7 +63,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return pd.ArrowDtype( @@ -79,7 +79,7 @@ def output_type(self, *input_types): input_type = input_types[0] if input_type != dtypes.STRING_DTYPE: raise TypeError( - "Input type must be an valid JSON-formatted string type." + "Input type must be a valid JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.JSON_DTYPE @@ -93,7 +93,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE @@ -109,7 +109,7 @@ def output_type(self, *input_types): right_type = input_types[1] if not dtypes.is_json_like(left_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." 
+ f" Received type: {left_type}" ) if not dtypes.is_json_encoding_type(right_type): @@ -130,7 +130,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index 5b738c0bb5..8505fd1607 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -15,7 +15,6 @@ import dataclasses import typing -from bigframes import dtypes from bigframes.operations import base_ops @@ -31,17 +30,10 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. 
- return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -55,17 +47,10 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. - return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -79,14 +64,7 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. 
- return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 689966e21b..364154f728 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -36,7 +36,26 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) -class TimestampAdd(base_ops.BinaryOp): +class TimedeltaFloorOp(base_ops.UnaryOp): + """Floors the numeric value to the nearest integer and use it to represent a timedelta. + + This operator is only meant to be used during expression tree rewrites. Do not use it anywhere else! + """ + + name: typing.ClassVar[str] = "timedelta_floor" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + input_type = input_types[0] + if dtypes.is_numeric(input_type) or input_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"unsupported type: {input_type}") + + +timedelta_floor_op = TimedeltaFloorOp() + + +@dataclasses.dataclass(frozen=True) +class TimestampAddOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_add" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -57,10 +76,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_add_op = TimestampAdd() +timestamp_add_op = TimestampAddOp() -class TimestampSub(base_ops.BinaryOp): +class TimestampSubOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_sub" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -76,4 +95,4 @@ def output_type(self, *input_types: 
dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_sub_op = TimestampSub() +timestamp_sub_op = TimestampSubOp() diff --git a/bigframes/series.py b/bigframes/series.py index fe2d1aae0e..5a84dee32f 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1545,9 +1545,12 @@ def apply( ops.RemoteFunctionOp(func=func, apply_on_null=True) ) - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended to + # be an array, reconstruct the array from the string assuming it is a + # json serialized form of the array. + if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1585,9 +1588,12 @@ def combine( other, ops.BinaryRemoteFunctionOp(func=func) ) - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended to + # be an array, reconstruct the array from the string assuming it is a + # json serialized form of the array. 
+ if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1812,7 +1818,9 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: + if copy is False: + raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) __array__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__array__) diff --git a/bigframes/version.py b/bigframes/version.py index 27dfb23603..762deda9ff 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.37.0" +__version__ = "1.38.0" diff --git a/noxfile.py b/noxfile.py index b851bf160d..bffb6ebaa0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -72,7 +72,9 @@ UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = [] -UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]} +UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.12": ["polars", "scikit-learn"], +} # 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. 
For more information, search @@ -96,8 +98,13 @@ ] SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = [] SYSTEM_TEST_DEPENDENCIES: List[str] = [] -SYSTEM_TEST_EXTRAS: List[str] = ["tests"] -SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} +SYSTEM_TEST_EXTRAS: List[str] = [] +SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.9": ["tests"], + "3.10": ["tests"], + "3.12": ["tests", "scikit-learn"], + "3.13": ["tests"], +} LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -468,8 +475,7 @@ def cover(session): @nox.session(python=DEFAULT_PYTHON_VERSION) def docs(session): """Build the docs for this library.""" - - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -510,7 +516,7 @@ def docs(session): def docfx(session): """Build the docfx yaml files for this library.""" - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -652,6 +658,8 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( if match.group(1) not in already_installed ] + print(already_installed) + # We use --no-deps to ensure that pre-release versions aren't overwritten # by the version ranges in setup.py. session.install(*deps) diff --git a/samples/snippets/bigquery_modules_test.py b/samples/snippets/bigquery_modules_test.py new file mode 100644 index 0000000000..1a15790815 --- /dev/null +++ b/samples/snippets/bigquery_modules_test.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_examples() -> None: + # [START bigquery_dataframes_bigquery_methods_struct] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # Create a new STRUCT Series with subfields for each column in a DataFrames. + lengths = bbq.struct( + bq_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + lengths.peek() + # 146 {'culmen_length_mm': 51.1, 'culmen_depth_mm': ... + # 278 {'culmen_length_mm': 48.2, 'culmen_depth_mm': ... + # 337 {'culmen_length_mm': 36.4, 'culmen_depth_mm': ... + # 154 {'culmen_length_mm': 46.5, 'culmen_depth_mm': ... + # 185 {'culmen_length_mm': 50.1, 'culmen_depth_mm': ... + # dtype: struct[pyarrow] + # [END bigquery_dataframes_bigquery_methods_struct] + + # [START bigquery_dataframes_bigquery_methods_scalar] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + + # The sql_scalar function can be used to inject SQL syntax that is not supported + # or difficult to express with the bigframes.pandas APIs. 
+ bq_df = bpd.read_gbq(query_or_table) + shortest = bbq.sql_scalar( + "LEAST({0}, {1}, {2})", + columns=[ + bq_df["culmen_depth_mm"], + bq_df["culmen_length_mm"], + bq_df["flipper_length_mm"], + ], + ) + + shortest.peek() + # 0 + # 149 18.9 + # 33 16.3 + # 296 17.2 + # 287 17.0 + # 307 15.0 + # dtype: Float64 + # [END bigquery_dataframes_bigquery_methods_scalar] + assert bq_df is not None + assert lengths is not None + assert shortest is not None diff --git a/samples/snippets/limit_single_timeseries_forecasting_model_test.py b/samples/snippets/limit_single_timeseries_forecasting_model_test.py new file mode 100644 index 0000000000..6a9f14e383 --- /dev/null +++ b/samples/snippets/limit_single_timeseries_forecasting_model_test.py @@ -0,0 +1,64 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (t +# you may not use this file except in compliance wi +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in +# distributed under the License is distributed on a +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, eit +# See the License for the specific language governi +# limitations under the License. 
+ + +def test_limit_single_timeseries(random_model_id: str) -> None: + your_model_id = random_model_id + + # [START bigquery_dataframes_bqml_limit_forecast_visualize] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date"]).count() + + num_trips.plot.line() + # [END bigquery_dataframes_bqml_limit_forecast_visualize] + + # [START bigquery_dataframes_bqml_limit_forecast_create] + from bigframes.ml import forecasting + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "start_station_id": df["start_station_id"], + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date", "start_station_id"], as_index=False).count() + model = forecasting.ARIMAPlus() + + X = num_trips[["date"]] + y = num_trips[["num_trips"]] + id_col = num_trips[["start_station_id"]] + + model.fit(X, y, id_col=id_col) + + model.to_gbq( + your_model_id, # For example: "bqml_tutorial.nyc_citibike_arima_model", + replace=True, + ) + # [END bigquery_dataframes_bqml_limit_forecast_create] + assert df is not None + assert features is not None + assert num_trips is not None diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 034a266177..6dea10b608 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -19,6 +19,8 @@ from . 
import publish_api_coverage +pytest.importorskip("sklearn") + @pytest.fixture def api_coverage_df(): diff --git a/setup.py b/setup.py index 4386177a5e..1f6114b634 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,6 @@ "pyarrow >=10.0.1", "pydata-google-auth >=1.8.2", "requests >=2.27.1", - "scikit-learn >=1.2.2", - "sqlalchemy >=1.4,<3.0dev", "sqlglot >=23.6.3", "tabulate >=0.9", "ipywidgets >=7.7.1", @@ -77,8 +75,15 @@ "tests": [], # used for local engine, which is only needed for unit tests at present. "polars": ["polars >= 1.7.0"], + "scikit-learn": ["scikit-learn>=1.2.2"], # Packages required for basic development flow. - "dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"], + "dev": [ + "pytest", + "pytest-mock", + "pre-commit", + "nox", + "google-cloud-testutils", + ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 8b7ad892c0..30d5c1c3a7 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -20,7 +20,6 @@ pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 -sqlalchemy==1.4 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index e4bff8cdcc..f69f08b1ae 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -251,6 +251,11 @@ def table_id_unique(dataset_id: str): return f"{dataset_id}.{prefixer.create_prefix()}" +@pytest.fixture(scope="function") +def routine_id_unique(dataset_id: str): + return f"{dataset_id}.{prefixer.create_prefix()}" + + @pytest.fixture(scope="session") def scalars_schema(bigquery_client: bigquery.Client): # TODO(swast): Add missing scalar data types such as BIGNUMERIC. 
diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 54ba0549a0..7363e370bb 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -2193,6 +2193,10 @@ def foo(x, y, z): ) ) ) + assert ( + getattr(foo, "bigframes_bigquery_function_output_dtype") + == bigframes.dtypes.STRING_DTYPE + ) # Fails to apply on dataframe with incompatible number of columns with pytest.raises( diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 0dc8960f62..99a017c917 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -14,6 +14,7 @@ import inspect import re +import textwrap import google.api_core.exceptions from google.cloud import bigquery @@ -27,6 +28,7 @@ import bigframes.exceptions from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff +import bigframes.session._io.bigquery from tests.system.utils import assert_pandas_df_equal _prefixer = test_utils.prefixer.Prefixer("bigframes", "") @@ -632,7 +634,6 @@ def add_one(x): )(add_one) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_detects_invalid_function(session, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) with pytest.raises(ValueError) as e: @@ -705,21 +706,133 @@ def square1(x): assert_pandas_df_equal(s1_result.to_pandas(), s2_result.to_pandas()) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_runs_existing_udf(session): func = session.read_gbq_function("bqutil.fn.cw_lower_case_ascii_only") got = func("AURÉLIE") assert got == "aurÉlie" -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_runs_existing_udf_4_params(session): func = session.read_gbq_function("bqutil.fn.cw_instr4") got = func("TestStr123456Str", 
"Str", 1, 2) assert got == 14 -@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_unique): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING) + RETURNS ARRAY + AS ( + [x, x] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello") + assert got == ["hello", "hello"] + + # Test on a series, assert pandas parity + pd_s = pd.Series(["alpha", "beta", "gamma"]) + bf_s = session.read_pandas(pd_s) + pd_result = pd_s.apply(func) + bf_result = bf_s.apply(func) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_read_gbq_function_runs_existing_udf_2_params_array_output( + session, routine_id_unique +): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING, y STRING) + RETURNS ARRAY + AS ( + [x, y] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello", "world") + assert got == ["hello", "world"] + + # Test on series, assert pandas parity + pd_df = pd.DataFrame( + {"col0": ["alpha", "beta", "gamma"], "col1": ["delta", "theta", "phi"]} + ) + bf_df = session.read_pandas(pd_df) + pd_result = pd_df["col0"].combine(pd_df["col1"], func) + bf_result = bf_df["col0"].combine(bf_df["col1"], func) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_read_gbq_function_runs_existing_udf_4_params_array_output( + 
session, routine_id_unique +): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING, y BOOL, z INT64, w FLOAT64) + RETURNS ARRAY + AS ( + [x, CAST(y AS STRING), CAST(z AS STRING), CAST(w AS STRING)] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello", True, 1, 2.3) + assert got == ["hello", "true", "1", "2.3"] + + # Test on a dataframe, assert pandas parity + pd_df = pd.DataFrame( + { + "col0": ["alpha", "beta", "gamma"], + "col1": [True, False, True], + "col2": [1, 2, 3], + "col3": [4.5, 6, 7.75], + } + ) + bf_df = session.read_pandas(pd_df) + # Simulate the result directly, since the function cannot be applied + # directly on a pandas dataframe with axis=1, as this is a special type of + # function with multiple params supported only on bigframes dataframe. + pd_result = pd.Series( + [ + ["alpha", "true", "1", "4.5"], + ["beta", "false", "2", "6"], + ["gamma", "true", "3", "7.75"], + ] + ) + bf_result = bf_df.apply(func, axis=1) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( @@ -754,6 +867,10 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): assert square.bigframes_remote_function == str(routine.reference) assert square.input_dtypes == (bigframes.dtypes.INT_DTYPE,) assert square.output_dtype == bigframes.dtypes.INT_DTYPE + assert ( + square.bigframes_bigquery_function_output_dtype + == bigframes.dtypes.INT_DTYPE + ) src = {"x": [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]} @@ -772,7 +889,6 @@ def 
test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_requires_explicit_types( session, bigquery_client, dataset_id ): @@ -863,7 +979,6 @@ def test_read_gbq_function_requires_explicit_types( ), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_respects_python_output_type( request, session_fixture, bigquery_client, dataset_id, array_type, expected_data ): @@ -906,7 +1021,6 @@ def test_read_gbq_function_respects_python_output_type( pytest.param(list[str], id="list-str"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( session, bigquery_client, dataset_id, array_type ): @@ -945,7 +1059,6 @@ def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( pytest.param(list[str], id="list-str"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_supported_python_output_type( session, bigquery_client, dataset_id, array_type ): @@ -992,7 +1105,6 @@ def test_df_apply_scalar_func(session, scalars_dfs): ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_multiple_inputs_not_a_row_processor(session): with pytest.raises(ValueError) as context: # The remote function has two args, which cannot be row processed. 
Throw @@ -1214,20 +1326,19 @@ def should_mask(name: str) -> bool: repr(s.mask(should_mask, "REDACTED")) -@pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_application_repr(session, dataset_id, scalars_df_index): - gbq_function = f"{dataset_id}.should_mask" - +def test_read_gbq_function_application_repr( + session, routine_id_unique, scalars_df_index +): # This function deliberately has a param with name "name", this is to test # a specific ibis' internal handling of object names session.bqclient.query_and_wait( - f"CREATE OR REPLACE FUNCTION `{gbq_function}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" + f"CREATE OR REPLACE FUNCTION `{routine_id_unique}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" ) - routine = session.bqclient.get_routine(gbq_function) + routine = session.bqclient.get_routine(routine_id_unique) assert "name" in [arg.name for arg in routine.arguments] # read the function and apply to dataframe - should_mask = session.read_gbq_function(gbq_function) + should_mask = session.read_gbq_function(routine_id_unique) s = scalars_df_index["string_col"] diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index 81e1b2f77f..b80202bdbe 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd import pytest -import sklearn.metrics as sklearn_metrics # type: ignore import bigframes from bigframes.ml import metrics @@ -66,6 +65,7 @@ def test_r2_score_force_finite(session): def test_r2_score_ok_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -113,6 +113,7 @@ def test_accuracy_score_not_normailze(session): def test_accuracy_score_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = 
pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -203,6 +204,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): def test_roc_curve_binary_classification_prediction_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -294,6 +296,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): def test_roc_curve_binary_classification_decision_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") # Instead of operating on probabilities, assume a 70% decision threshold # has been applied, and operate on the final output y_score = [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45] @@ -420,6 +423,7 @@ def test_roc_auc_score_returns_expected(session): def test_roc_auc_score_returns_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -525,6 +529,7 @@ def test_confusion_matrix_column_index(session): def test_confusion_matrix_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 3, 3, 3, 4, 1], @@ -543,6 +548,7 @@ def test_confusion_matrix_matches_sklearn(session): def test_confusion_matrix_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -603,6 +609,7 @@ def test_recall_score(session): def test_recall_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -620,6 +627,7 @@ def test_recall_score_matches_sklearn(session): def test_recall_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { 
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -673,6 +681,7 @@ def test_precision_score(session): def test_precision_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -695,6 +704,7 @@ def test_precision_score_matches_sklearn(session): def test_precision_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -752,6 +762,7 @@ def test_f1_score(session): def test_f1_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -769,6 +780,7 @@ def test_f1_score_matches_sklearn(session): def test_f1_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 356000b3f6..723481b1d1 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -465,3 +465,49 @@ def test_timedelta_ordering(session): pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) + + +def test_timedelta_cumsum(temporal_dfs): + bf_df, pd_df = temporal_dfs + + actual_result = bf_df["timedelta_col_1"].cumsum().to_pandas() + + expected_result = pd_df["timedelta_col_1"].cumsum() + _assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize( + "agg_func", + [ + pytest.param(lambda x: x.min(), id="min"), + pytest.param(lambda x: x.max(), id="max"), + pytest.param(lambda x: x.sum(), id="sum"), + pytest.param(lambda x: x.mean(), id="mean"), + pytest.param(lambda x: x.median(), id="median"), + pytest.param(lambda x: x.quantile(0.5), id="quantile"), + 
pytest.param(lambda x: x.std(), id="std"), + ], +) +def test_timedelta_agg__timedelta_result(temporal_dfs, agg_func): + bf_df, pd_df = temporal_dfs + + actual_result = agg_func(bf_df["timedelta_col_1"]) + + expected_result = agg_func(pd_df["timedelta_col_1"]).floor("us") + assert actual_result == expected_result + + +@pytest.mark.parametrize( + "agg_func", + [ + pytest.param(lambda x: x.count(), id="count"), + pytest.param(lambda x: x.nunique(), id="nunique"), + ], +) +def test_timedelta_agg__int_result(temporal_dfs, agg_func): + bf_df, pd_df = temporal_dfs + + actual_result = agg_func(bf_df["timedelta_col_1"]) + + expected_result = agg_func(pd_df["timedelta_col_1"]) + assert actual_result == expected_result diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 4b4264e33c..da78432cdb 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -829,3 +829,18 @@ def test_to_timedelta_with_bf_series_invalid_unit(session, unit): @pytest.mark.parametrize("input", [1, 1.2, "1s"]) def test_to_timedelta_non_bf_series(input): assert bpd.to_timedelta(input) == pd.to_timedelta(input) + + +def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs): + bf_df, pd_df = scalars_dfs + bf_series = bpd.to_timedelta(bf_df["int64_too"], unit="us") + pd_series = pd.to_timedelta(pd_df["int64_too"], unit="us") + + actual_result = ( + bpd.to_timedelta(bf_series, unit="s").to_pandas().astype("timedelta64[ns]") + ) + + expected_result = pd.to_timedelta(pd_series, unit="s") + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 00f47c754e..2daa7dd825 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -641,6 +641,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict): ), ) def test_series_interpolate(method): + pytest.importorskip("scipy") + 
values = [None, 1, 2, None, None, 16, None] index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] pd_series = pd.Series(values, index) diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index 413a694680..d377fb4d49 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -66,6 +66,12 @@ def test_supported_types_correspond(): ibis_types_from_bigquery = { third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) for tk in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + # TODO(b/284515241): ARRAY is the only exception because it is supported + # as an output type of the BQ routine in the read_gbq_function path but + # not in the remote function path. Remove this handline once BQ remote + # functions supports ARRAY output and the bigframes remote functions + # utilizes that to support array output. + if tk != "ARRAY" } assert ibis_types_from_python == ibis_types_from_bigquery diff --git a/tests/unit/ml/test_api_primitives.py b/tests/unit/ml/test_api_primitives.py index 00a51ccfe9..dd2ceff143 100644 --- a/tests/unit/ml/test_api_primitives.py +++ b/tests/unit/ml/test_api_primitives.py @@ -13,8 +13,6 @@ # limitations under the License. 
import pytest -import sklearn.decomposition as sklearn_decomposition # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore import bigframes.ml.decomposition import bigframes.ml.linear_model @@ -35,8 +33,9 @@ def test_base_estimator_repr(): assert pca_estimator.__repr__() == "PCA(n_components=7)" -@pytest.mark.skipif(sklearn_linear_model is None, reason="requires sklearn") def test_base_estimator_repr_matches_sklearn(): + sklearn_decomposition = pytest.importorskip("sklearn.decomposition") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") estimator = bigframes.ml.linear_model.LinearRegression() sklearn_estimator = sklearn_linear_model.LinearRegression() assert estimator.__repr__() == sklearn_estimator.__repr__() diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 395296f3e4..450ce8d6ee 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -15,8 +15,6 @@ from google.cloud import bigquery import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, preprocessing from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer @@ -119,6 +117,8 @@ def test_columntransformer_repr(): def test_columntransformer_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_column_transformer = compose.ColumnTransformer( [ ( diff --git a/tests/unit/ml/test_pipeline.py b/tests/unit/ml/test_pipeline.py index ed5c621b1d..beebb9f282 100644 --- a/tests/unit/ml/test_pipeline.py +++ b/tests/unit/ml/test_pipeline.py @@ -13,10 +13,6 @@ # limitations under the License. 
import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore -import sklearn.pipeline as sklearn_pipeline # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing @@ -57,8 +53,11 @@ def test_pipeline_repr(): ) -@pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn") def test_pipeline_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") + sklearn_pipeline = pytest.importorskip("sklearn.pipeline") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_pl = pipeline.Pipeline( [ ( diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f5aa23d00b..e296dcb9f6 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -7179,7 +7179,7 @@ def __len__(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def __array__(self): + def __array__(self, dtype=None, copy: Optional[bool] = None): """ Returns the rows as NumPy array. @@ -7210,6 +7210,8 @@ def __array__(self): dtype (str or numpy.dtype, optional): The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy (bool or None, optional): + Whether to copy the data, False is not supported. 
Returns: numpy.ndarray: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 57f7dfbb79..5e6f546d09 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -5941,7 +5941,7 @@ def size(self) -> int: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: """ Returns the values as NumPy array. @@ -5965,6 +5965,8 @@ def __array__(self, dtype=None) -> numpy.ndarray: dtype (str or numpy.dtype, optional): The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy (bool or None, optional): + Whether to copy the data, False is not supported. Returns: numpy.ndarray: diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 7b97526de2..9262ffbd3d 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -16,6 +16,8 @@ # Michal Karbownik # License: BSD 3 clause +import numpy as np + from bigframes import constants @@ -60,7 +62,23 @@ def auc(x, y) -> float: Returns: float: Area Under the Curve. 
""" - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + if len(x) < 2: + raise ValueError( + f"At least 2 points are needed to compute area under curve, but x.shape = {len(x)}" + ) + + if x.is_monotonic_decreasing: + d = -1 + elif x.is_monotonic_increasing: + d = 1 + else: + raise ValueError(f"x is neither increasing nor decreasing : {x}.") + + if hasattr(np, "trapezoid"): + # new in numpy 2.0 + return d * np.trapezoid(y, x) + # np.trapz has been deprecated in 2.0 + return d * np.trapz(y, x) # type: ignore def roc_auc_score(y_true, y_score) -> float: diff --git a/third_party/bigframes_vendored/tpch/queries/q9.py b/third_party/bigframes_vendored/tpch/queries/q9.py index 6af33f7569..5c9ca1e9c3 100644 --- a/third_party/bigframes_vendored/tpch/queries/q9.py +++ b/third_party/bigframes_vendored/tpch/queries/q9.py @@ -33,13 +33,17 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) q_final = ( - part.merge(partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY") - .merge(supplier, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") - .merge( + part.merge( lineitem, - left_on=["P_PARTKEY", "PS_SUPPKEY"], - right_on=["L_PARTKEY", "L_SUPPKEY"], + left_on="P_PARTKEY", + right_on="L_PARTKEY", + ) + .merge( + partsupp, + left_on=["L_SUPPKEY", "L_PARTKEY"], + right_on=["PS_SUPPKEY", "PS_PARTKEY"], ) + .merge(supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY") .merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY") .merge(nation, left_on="S_NATIONKEY", right_on="N_NATIONKEY") ) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 27dfb23603..762deda9ff 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.37.0" +__version__ = "1.38.0" From faa4d6b29c94477da783268b0243112c7eae2d22 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 24 Feb 2025 22:06:49 +0000 Subject: [PATCH 20/75] Merge branch 'main' into b338873783-matrix-factorization --- CHANGELOG.md | 32 ++++ bigframes/core/compile/aggregate_compiler.py | 14 +- bigframes/core/compile/ibis_types.py | 17 ++- bigframes/core/compile/scalar_op_compiler.py | 5 + bigframes/core/rewrite/timedeltas.py | 55 ++++++- bigframes/core/schema.py | 6 +- bigframes/dataframe.py | 13 +- bigframes/dtypes.py | 1 + bigframes/functions/_function_session.py | 6 + bigframes/functions/function.py | 12 +- bigframes/ml/metrics/_metrics.py | 7 +- bigframes/operations/__init__.py | 2 + bigframes/operations/aggregations.py | 25 ++- bigframes/operations/json_ops.py | 14 +- bigframes/operations/remote_function_ops.py | 40 ++--- bigframes/operations/timedelta_ops.py | 27 +++- bigframes/series.py | 22 ++- bigframes/version.py | 2 +- noxfile.py | 20 ++- samples/snippets/bigquery_modules_test.py | 69 +++++++++ ...ingle_timeseries_forecasting_model_test.py | 64 ++++++++ scripts/test_publish_api_coverage.py | 2 + setup.py | 11 +- testing/constraints-3.9.txt | 1 - tests/system/conftest.py | 5 + .../large/functions/test_remote_function.py | 4 + .../small/functions/test_remote_function.py | 143 ++++++++++++++++-- tests/system/small/ml/test_metrics.py | 14 +- .../small/operations/test_timedeltas.py | 46 ++++++ tests/system/small/test_pandas.py | 15 ++ tests/system/small/test_series.py | 2 + tests/unit/functions/test_remote_function.py | 6 + tests/unit/ml/test_api_primitives.py | 5 +- tests/unit/ml/test_compose.py | 4 +- tests/unit/ml/test_pipeline.py | 9 +- .../bigframes_vendored/pandas/core/frame.py | 4 +- .../bigframes_vendored/pandas/core/series.py | 4 +- .../sklearn/metrics/_ranking.py | 20 ++- .../bigframes_vendored/tpch/queries/q9.py | 14 +- third_party/bigframes_vendored/version.py | 2 +- 40 files changed, 635 insertions(+), 129 
deletions(-) create mode 100644 samples/snippets/bigquery_modules_test.py create mode 100644 samples/snippets/limit_single_timeseries_forecasting_model_test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b301f85a6a..24a1d8cb62 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,38 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.38.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.37.0...v1.38.0) (2025-02-24) + + +### Features + +* (Preview) Support diff aggregation for timestamp series. ([#1405](https://github.com/googleapis/python-bigquery-dataframes/issues/1405)) ([abe48d6](https://github.com/googleapis/python-bigquery-dataframes/commit/abe48d6f13a954534460fa14c9337e1085d9fbb3)) +* Add `GeoSeries.from_wkt() `and `GeoSeries.to_wkt()` ([#1401](https://github.com/googleapis/python-bigquery-dataframes/issues/1401)) ([2993b28](https://github.com/googleapis/python-bigquery-dataframes/commit/2993b283966960430ad8482f40f177e276db2d64)) +* Support DF.__array__(copy=True) ([#1403](https://github.com/googleapis/python-bigquery-dataframes/issues/1403)) ([693ed8c](https://github.com/googleapis/python-bigquery-dataframes/commit/693ed8cfb1ecc3af161801225d3e9cda489c29dd)) +* Support routines with ARRAY return type in `read_gbq_function` ([#1412](https://github.com/googleapis/python-bigquery-dataframes/issues/1412)) ([4b60049](https://github.com/googleapis/python-bigquery-dataframes/commit/4b60049e8362bfb07c136d8b2eb02b984d71f084)) + + +### Bug Fixes + +* Calling to_timdelta() over timedeltas no longer changes their values ([#1411](https://github.com/googleapis/python-bigquery-dataframes/issues/1411)) ([650a190](https://github.com/googleapis/python-bigquery-dataframes/commit/650a1907fdf84897eb7aa288863ee27d938e0879)) +* Replace empty dict with None to avoid mutable default arguments ([#1416](https://github.com/googleapis/python-bigquery-dataframes/issues/1416)) 
([fa4e3ad](https://github.com/googleapis/python-bigquery-dataframes/commit/fa4e3ad8bcd5db56fa26b26609cc7e58b1edf498)) + + +### Performance Improvements + +* Avoid redundant SQL casts ([#1399](https://github.com/googleapis/python-bigquery-dataframes/issues/1399)) ([6ee48d5](https://github.com/googleapis/python-bigquery-dataframes/commit/6ee48d5c16870f1caa99c3f658c2c1a0e14be749)) + + +### Dependencies + +* Remove scikit-learn and sqlalchemy as required dependencies ([#1296](https://github.com/googleapis/python-bigquery-dataframes/issues/1296)) ([fd8bc89](https://github.com/googleapis/python-bigquery-dataframes/commit/fd8bc894bdbdf551ebbec1fb93832588371ae6af)) + + +### Documentation + +* Add samples using SQL methods via the `bigframes.bigquery` module ([#1358](https://github.com/googleapis/python-bigquery-dataframes/issues/1358)) ([f54e768](https://github.com/googleapis/python-bigquery-dataframes/commit/f54e7688fda6372c6decc9b61796b0272d803c79)) +* Add snippets for visualizing a time series and creating a time series model for the Limit forecasted values in time series model tutorial ([#1310](https://github.com/googleapis/python-bigquery-dataframes/issues/1310)) ([c6c9120](https://github.com/googleapis/python-bigquery-dataframes/commit/c6c9120e839647e5b3cb97f04a8d90cc8690b8a3)) + ## [1.37.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.36.0...v1.37.0) (2025-02-19) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 4ec0b270ed..a17b69815c 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -231,7 +231,11 @@ def _( column: ibis_types.NumericColumn, window=None, ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.quantile(op.q), window) + result = column.quantile(op.q) + if op.should_floor_result: + result = result.floor() # type:ignore + + return _apply_window_if_present(result, window) 
@compile_unary_agg.register @@ -242,7 +246,8 @@ def _( window=None, # order_by: typing.Sequence[ibis_types.Value] = [], ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.mean(), window) + result = column.mean().floor() if op.should_floor_result else column.mean() + return _apply_window_if_present(result, window) @compile_unary_agg.register @@ -306,10 +311,11 @@ def _( @numeric_op def _( op: agg_ops.StdOp, - x: ibis_types.Column, + x: ibis_types.NumericColumn, window=None, ) -> ibis_types.Value: - return _apply_window_if_present(cast(ibis_types.NumericColumn, x).std(), window) + result = x.std().floor() if op.should_floor_result else x.std() + return _apply_window_if_present(result, window) @compile_unary_agg.register diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 2dcc1b3c8a..c47c6cf07b 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -463,10 +463,19 @@ def ibis_array_output_type_from_python_type(t: type) -> ibis_dtypes.DataType: return python_type_to_ibis_type(t) -def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> ibis_dtypes.DataType: +def ibis_type_from_bigquery_type( + type_: bigquery.StandardSqlDataType, +) -> ibis_dtypes.DataType: """Convert bq type to ibis. 
Only to be used for remote functions, does not handle all types.""" - if tk not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: + if type_.type_kind not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: raise UnsupportedTypeError( - tk, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + type_.type_kind, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + ) + elif type_.type_kind == "ARRAY": + return ibis_dtypes.Array( + value_type=ibis_type_from_bigquery_type( + typing.cast(bigquery.StandardSqlDataType, type_.array_element_type) + ) ) - return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) + else: + return third_party_ibis_bqtypes.BigQueryType.to_ibis(type_.type_kind) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 923ec8c81d..7111406646 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1186,6 +1186,11 @@ def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp): ).floor() +@scalar_op_compiler.register_unary_op(ops.timedelta_floor_op) +def timedelta_floor_op_impl(x: ibis_types.NumericValue): + return x.floor() + + @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): ibis_node = getattr(op.func, "ibis_node", None) diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index dad474e5a1..e21e0b6bf2 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -70,6 +70,19 @@ def rewrite_timedelta_expressions(root: nodes.BigFrameNode) -> nodes.BigFrameNod root.skip_reproject_unsafe, ) + if isinstance(root, nodes.AggregateNode): + updated_aggregations = tuple( + (_rewrite_aggregation(agg, root.child.schema), col_id) + for agg, col_id in root.aggregations + ) + return nodes.AggregateNode( + root.child, + updated_aggregations, + root.by_column_ids, + 
root.order_by, + root.dropna, + ) + return root @@ -125,6 +138,9 @@ def _rewrite_op_expr( # but for timedeltas: int(timedelta) // float => int(timedelta) return _rewrite_floordiv_op(inputs[0], inputs[1]) + if isinstance(expr.op, ops.ToTimedeltaOp): + return _rewrite_to_timedelta_op(expr.op, inputs[0]) + return _TypedExpr.create_op_expr(expr.op, *inputs) @@ -154,9 +170,9 @@ def _rewrite_mul_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.mul_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) if dtypes.is_numeric(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -165,7 +181,7 @@ def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.div_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -174,28 +190,53 @@ def _rewrite_floordiv_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.floordiv_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result +def _rewrite_to_timedelta_op(op: ops.ToTimedeltaOp, arg: _TypedExpr): + if arg.dtype is dtypes.TIMEDELTA_DTYPE: + # Do nothing for values that are already timedeltas + return arg + + return _TypedExpr.create_op_expr(op, arg) + + @functools.cache def _rewrite_aggregation( aggregation: 
ex.Aggregation, schema: schema.ArraySchema ) -> ex.Aggregation: if not isinstance(aggregation, ex.UnaryAggregation): return aggregation - if not isinstance(aggregation.op, aggs.DiffOp): - return aggregation if isinstance(aggregation.arg, ex.DerefOp): input_type = schema.get_type(aggregation.arg.id.sql) else: input_type = aggregation.arg.dtype - if dtypes.is_datetime_like(input_type): + if isinstance(aggregation.op, aggs.DiffOp) and dtypes.is_datetime_like(input_type): return ex.UnaryAggregation( aggs.TimeSeriesDiffOp(aggregation.op.periods), aggregation.arg ) + if isinstance(aggregation.op, aggs.StdOp) and input_type is dtypes.TIMEDELTA_DTYPE: + return ex.UnaryAggregation( + aggs.StdOp(should_floor_result=True), aggregation.arg + ) + + if isinstance(aggregation.op, aggs.MeanOp) and input_type is dtypes.TIMEDELTA_DTYPE: + return ex.UnaryAggregation( + aggs.MeanOp(should_floor_result=True), aggregation.arg + ) + + if ( + isinstance(aggregation.op, aggs.QuantileOp) + and input_type is dtypes.TIMEDELTA_DTYPE + ): + return ex.UnaryAggregation( + aggs.QuantileOp(q=aggregation.op.q, should_floor_result=True), + aggregation.arg, + ) + return aggregation diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py index e3808dfffd..c379db72be 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -41,8 +41,12 @@ class ArraySchema: def from_bq_table( cls, table: google.cloud.bigquery.Table, - column_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = {}, + column_type_overrides: typing.Optional[ + typing.Dict[str, bigframes.dtypes.Dtype] + ] = None, ): + if column_type_overrides is None: + column_type_overrides = {} items = tuple( SchemaItem(name, column_type_overrides.get(name, dtype)) for name, dtype in bigframes.dtypes.bf_type_from_type_kind( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c02b182ee3..caf1b62e07 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3705,7 +3705,9 @@ def to_numpy( ) -> 
numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: + if copy is False: + raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) __array__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__array__) @@ -4086,9 +4088,12 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) result_series.name = None - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended + # to be an array, reconstruct the array from the string assuming it + # is a json serialized form of the array. + if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index e4db904210..54b621a0f8 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -874,4 +874,5 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: "INT64", "INTEGER", "STRING", + "ARRAY", } diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index a0518978a3..93b5c4c596 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -501,6 +501,7 @@ def try_delattr(attr): try_delattr("bigframes_remote_function") try_delattr("input_dtypes") try_delattr("output_dtype") + try_delattr("bigframes_bigquery_function_output_dtype") try_delattr("is_row_processor") try_delattr("ibis_node") @@ -589,6 +590,11 @@ def try_delattr(attr): ibis_signature.output_type ) ) + func.bigframes_bigquery_function_output_dtype = ( + 
bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_output_type_for_bqrf + ) + ) func.is_row_processor = is_row_processor func.ibis_node = node diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index ef2c81a953..c2809b96eb 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -56,8 +56,10 @@ class ReturnTypeMissingError(ValueError): # TODO: Move this to compile folder def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignature: if routine.return_type: - ibis_output_type = bigframes.core.compile.ibis_types.ibis_type_from_type_kind( - routine.return_type.type_kind + ibis_output_type = ( + bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( + routine.return_type + ) ) else: raise ReturnTypeMissingError @@ -82,8 +84,8 @@ def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignatu return _utils.IbisSignature( parameter_names=[arg.name for arg in routine.arguments], input_types=[ - bigframes.core.compile.ibis_types.ibis_type_from_type_kind( - arg.data_type.type_kind + bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( + arg.data_type ) if arg.data_type else None @@ -233,6 +235,8 @@ def func(*bigframes_args, **bigframes_kwargs): else ibis_signature.output_type ) + func.bigframes_bigquery_function_output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_signature.output_type) # type: ignore + func.is_row_processor = is_row_processor # type: ignore func.ibis_node = node # type: ignore return func diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 90df6f9539..658818b261 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -25,7 +25,6 @@ import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression import numpy as np import pandas as pd -import sklearn.metrics as sklearn_metrics # type: ignore from 
bigframes.ml import utils import bigframes.pandas as bpd @@ -176,9 +175,9 @@ def auc( ) -> float: x_series, y_series = utils.batch_convert_to_series(x, y) - # TODO(b/286410053) Support ML exceptions and error handling. - auc = sklearn_metrics.auc(x_series.to_pandas(), y_series.to_pandas()) - return auc + x_pandas = x_series.to_pandas() + y_pandas = y_series.to_pandas() + return vendored_metrics_ranking.auc(x_pandas, y_pandas) auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e4e4bf7ef3..7e6f1f793c 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -184,6 +184,7 @@ from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op from bigframes.operations.timedelta_ops import ( + timedelta_floor_op, timestamp_add_op, timestamp_sub_op, ToTimedeltaOp, @@ -259,6 +260,7 @@ "second_op", "normalize_op", # Timedelta ops + "timedelta_floor_op", "timestamp_add_op", "timestamp_sub_op", "ToTimedeltaOp", diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index e9d102b42d..bf6016bb2e 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -142,13 +142,16 @@ class SumOp(UnaryAggregateOp): name: ClassVar[str] = "sum" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if not dtypes.is_numeric(input_types[0]): - raise TypeError(f"Type {input_types[0]} is not numeric") - if pd.api.types.is_bool_dtype(input_types[0]): - return dtypes.INT_DTYPE - else: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + + if dtypes.is_numeric(input_types[0]): + if pd.api.types.is_bool_dtype(input_types[0]): + return dtypes.INT_DTYPE return input_types[0] + raise TypeError(f"Type {input_types[0]} is not numeric or timedelta") + 
@dataclasses.dataclass(frozen=True) class MedianOp(UnaryAggregateOp): @@ -171,6 +174,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) class QuantileOp(UnaryAggregateOp): q: float + should_floor_result: bool = False @property def name(self): @@ -181,6 +185,8 @@ def order_independent(self) -> bool: return True def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -224,7 +230,11 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class MeanOp(UnaryAggregateOp): name: ClassVar[str] = "mean" + should_floor_result: bool = False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -262,7 +272,12 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class StdOp(UnaryAggregateOp): name: ClassVar[str] = "std" + should_floor_result: bool = False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + return signatures.FixedOutputType( dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" ).output_type(input_types[0]) diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 1daacf4e6b..c9ce633cae 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -31,7 +31,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." 
+ "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return input_type @@ -46,7 +46,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return pd.ArrowDtype( @@ -63,7 +63,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return pd.ArrowDtype( @@ -79,7 +79,7 @@ def output_type(self, *input_types): input_type = input_types[0] if input_type != dtypes.STRING_DTYPE: raise TypeError( - "Input type must be an valid JSON-formatted string type." + "Input type must be a valid JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.JSON_DTYPE @@ -93,7 +93,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE @@ -109,7 +109,7 @@ def output_type(self, *input_types): right_type = input_types[1] if not dtypes.is_json_like(left_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." 
+ f" Received type: {left_type}" ) if not dtypes.is_json_encoding_type(right_type): @@ -130,7 +130,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index 5b738c0bb5..8505fd1607 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -15,7 +15,6 @@ import dataclasses import typing -from bigframes import dtypes from bigframes.operations import base_ops @@ -31,17 +30,10 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. 
- return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -55,17 +47,10 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. - return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -79,14 +64,7 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. 
- return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 689966e21b..364154f728 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -36,7 +36,26 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) -class TimestampAdd(base_ops.BinaryOp): +class TimedeltaFloorOp(base_ops.UnaryOp): + """Floors the numeric value to the nearest integer and use it to represent a timedelta. + + This operator is only meant to be used during expression tree rewrites. Do not use it anywhere else! + """ + + name: typing.ClassVar[str] = "timedelta_floor" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + input_type = input_types[0] + if dtypes.is_numeric(input_type) or input_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"unsupported type: {input_type}") + + +timedelta_floor_op = TimedeltaFloorOp() + + +@dataclasses.dataclass(frozen=True) +class TimestampAddOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_add" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -57,10 +76,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_add_op = TimestampAdd() +timestamp_add_op = TimestampAddOp() -class TimestampSub(base_ops.BinaryOp): +class TimestampSubOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_sub" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -76,4 +95,4 @@ def output_type(self, *input_types: 
dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_sub_op = TimestampSub() +timestamp_sub_op = TimestampSubOp() diff --git a/bigframes/series.py b/bigframes/series.py index fe2d1aae0e..5a84dee32f 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1545,9 +1545,12 @@ def apply( ops.RemoteFunctionOp(func=func, apply_on_null=True) ) - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended to + # be an array, reconstruct the array from the string assuming it is a + # json serialized form of the array. + if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1585,9 +1588,12 @@ def combine( other, ops.BinaryRemoteFunctionOp(func=func) ) - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended to + # be an array, reconstruct the array from the string assuming it is a + # json serialized form of the array. 
+ if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1812,7 +1818,9 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: + if copy is False: + raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) __array__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__array__) diff --git a/bigframes/version.py b/bigframes/version.py index 27dfb23603..762deda9ff 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.37.0" +__version__ = "1.38.0" diff --git a/noxfile.py b/noxfile.py index b851bf160d..bffb6ebaa0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -72,7 +72,9 @@ UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = [] -UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]} +UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.12": ["polars", "scikit-learn"], +} # 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. 
For more information, search @@ -96,8 +98,13 @@ ] SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = [] SYSTEM_TEST_DEPENDENCIES: List[str] = [] -SYSTEM_TEST_EXTRAS: List[str] = ["tests"] -SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} +SYSTEM_TEST_EXTRAS: List[str] = [] +SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.9": ["tests"], + "3.10": ["tests"], + "3.12": ["tests", "scikit-learn"], + "3.13": ["tests"], +} LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -468,8 +475,7 @@ def cover(session): @nox.session(python=DEFAULT_PYTHON_VERSION) def docs(session): """Build the docs for this library.""" - - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -510,7 +516,7 @@ def docs(session): def docfx(session): """Build the docfx yaml files for this library.""" - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -652,6 +658,8 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( if match.group(1) not in already_installed ] + print(already_installed) + # We use --no-deps to ensure that pre-release versions aren't overwritten # by the version ranges in setup.py. session.install(*deps) diff --git a/samples/snippets/bigquery_modules_test.py b/samples/snippets/bigquery_modules_test.py new file mode 100644 index 0000000000..1a15790815 --- /dev/null +++ b/samples/snippets/bigquery_modules_test.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_examples() -> None: + # [START bigquery_dataframes_bigquery_methods_struct] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # Create a new STRUCT Series with subfields for each column in a DataFrames. + lengths = bbq.struct( + bq_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + lengths.peek() + # 146 {'culmen_length_mm': 51.1, 'culmen_depth_mm': ... + # 278 {'culmen_length_mm': 48.2, 'culmen_depth_mm': ... + # 337 {'culmen_length_mm': 36.4, 'culmen_depth_mm': ... + # 154 {'culmen_length_mm': 46.5, 'culmen_depth_mm': ... + # 185 {'culmen_length_mm': 50.1, 'culmen_depth_mm': ... + # dtype: struct[pyarrow] + # [END bigquery_dataframes_bigquery_methods_struct] + + # [START bigquery_dataframes_bigquery_methods_scalar] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + + # The sql_scalar function can be used to inject SQL syntax that is not supported + # or difficult to express with the bigframes.pandas APIs. 
+ bq_df = bpd.read_gbq(query_or_table) + shortest = bbq.sql_scalar( + "LEAST({0}, {1}, {2})", + columns=[ + bq_df["culmen_depth_mm"], + bq_df["culmen_length_mm"], + bq_df["flipper_length_mm"], + ], + ) + + shortest.peek() + # 0 + # 149 18.9 + # 33 16.3 + # 296 17.2 + # 287 17.0 + # 307 15.0 + # dtype: Float64 + # [END bigquery_dataframes_bigquery_methods_scalar] + assert bq_df is not None + assert lengths is not None + assert shortest is not None diff --git a/samples/snippets/limit_single_timeseries_forecasting_model_test.py b/samples/snippets/limit_single_timeseries_forecasting_model_test.py new file mode 100644 index 0000000000..6a9f14e383 --- /dev/null +++ b/samples/snippets/limit_single_timeseries_forecasting_model_test.py @@ -0,0 +1,64 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_limit_single_timeseries(random_model_id: str) -> None: + your_model_id = random_model_id + + # [START bigquery_dataframes_bqml_limit_forecast_visualize] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date"]).count() + + num_trips.plot.line() + # [END bigquery_dataframes_bqml_limit_forecast_visualize] + + # [START bigquery_dataframes_bqml_limit_forecast_create] + from bigframes.ml import forecasting + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "start_station_id": df["start_station_id"], + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date", "start_station_id"], as_index=False).count() + model = forecasting.ARIMAPlus() + + X = num_trips[["date"]] + y = num_trips[["num_trips"]] + id_col = num_trips[["start_station_id"]] + + model.fit(X, y, id_col=id_col) + + model.to_gbq( + your_model_id, # For example: "bqml_tutorial.nyc_citibike_arima_model", + replace=True, + ) + # [END bigquery_dataframes_bqml_limit_forecast_create] + assert df is not None + assert features is not None + assert num_trips is not None diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 034a266177..6dea10b608 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -19,6 +19,8 @@ from . 
import publish_api_coverage +pytest.importorskip("sklearn") + @pytest.fixture def api_coverage_df(): diff --git a/setup.py b/setup.py index 4386177a5e..1f6114b634 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,6 @@ "pyarrow >=10.0.1", "pydata-google-auth >=1.8.2", "requests >=2.27.1", - "scikit-learn >=1.2.2", - "sqlalchemy >=1.4,<3.0dev", "sqlglot >=23.6.3", "tabulate >=0.9", "ipywidgets >=7.7.1", @@ -77,8 +75,15 @@ "tests": [], # used for local engine, which is only needed for unit tests at present. "polars": ["polars >= 1.7.0"], + "scikit-learn": ["scikit-learn>=1.2.2"], # Packages required for basic development flow. - "dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"], + "dev": [ + "pytest", + "pytest-mock", + "pre-commit", + "nox", + "google-cloud-testutils", + ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 8b7ad892c0..30d5c1c3a7 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -20,7 +20,6 @@ pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 -sqlalchemy==1.4 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index e4bff8cdcc..f69f08b1ae 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -251,6 +251,11 @@ def table_id_unique(dataset_id: str): return f"{dataset_id}.{prefixer.create_prefix()}" +@pytest.fixture(scope="function") +def routine_id_unique(dataset_id: str): + return f"{dataset_id}.{prefixer.create_prefix()}" + + @pytest.fixture(scope="session") def scalars_schema(bigquery_client: bigquery.Client): # TODO(swast): Add missing scalar data types such as BIGNUMERIC. 
diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 54ba0549a0..7363e370bb 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -2193,6 +2193,10 @@ def foo(x, y, z): ) ) ) + assert ( + getattr(foo, "bigframes_bigquery_function_output_dtype") + == bigframes.dtypes.STRING_DTYPE + ) # Fails to apply on dataframe with incompatible number of columns with pytest.raises( diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 0dc8960f62..99a017c917 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -14,6 +14,7 @@ import inspect import re +import textwrap import google.api_core.exceptions from google.cloud import bigquery @@ -27,6 +28,7 @@ import bigframes.exceptions from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff +import bigframes.session._io.bigquery from tests.system.utils import assert_pandas_df_equal _prefixer = test_utils.prefixer.Prefixer("bigframes", "") @@ -632,7 +634,6 @@ def add_one(x): )(add_one) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_detects_invalid_function(session, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) with pytest.raises(ValueError) as e: @@ -705,21 +706,133 @@ def square1(x): assert_pandas_df_equal(s1_result.to_pandas(), s2_result.to_pandas()) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_runs_existing_udf(session): func = session.read_gbq_function("bqutil.fn.cw_lower_case_ascii_only") got = func("AURÉLIE") assert got == "aurÉlie" -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_runs_existing_udf_4_params(session): func = session.read_gbq_function("bqutil.fn.cw_instr4") got = func("TestStr123456Str", 
"Str", 1, 2) assert got == 14 -@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_unique): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING) + RETURNS ARRAY + AS ( + [x, x] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello") + assert got == ["hello", "hello"] + + # Test on a series, assert pandas parity + pd_s = pd.Series(["alpha", "beta", "gamma"]) + bf_s = session.read_pandas(pd_s) + pd_result = pd_s.apply(func) + bf_result = bf_s.apply(func) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_read_gbq_function_runs_existing_udf_2_params_array_output( + session, routine_id_unique +): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING, y STRING) + RETURNS ARRAY + AS ( + [x, y] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello", "world") + assert got == ["hello", "world"] + + # Test on series, assert pandas parity + pd_df = pd.DataFrame( + {"col0": ["alpha", "beta", "gamma"], "col1": ["delta", "theta", "phi"]} + ) + bf_df = session.read_pandas(pd_df) + pd_result = pd_df["col0"].combine(pd_df["col1"], func) + bf_result = bf_df["col0"].combine(bf_df["col1"], func) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_read_gbq_function_runs_existing_udf_4_params_array_output( + 
session, routine_id_unique +): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING, y BOOL, z INT64, w FLOAT64) + RETURNS ARRAY + AS ( + [x, CAST(y AS STRING), CAST(z AS STRING), CAST(w AS STRING)] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello", True, 1, 2.3) + assert got == ["hello", "true", "1", "2.3"] + + # Test on a dataframe, assert pandas parity + pd_df = pd.DataFrame( + { + "col0": ["alpha", "beta", "gamma"], + "col1": [True, False, True], + "col2": [1, 2, 3], + "col3": [4.5, 6, 7.75], + } + ) + bf_df = session.read_pandas(pd_df) + # Simulate the result directly, since the function cannot be applied + # directly on a pandas dataframe with axis=1, as this is a special type of + # function with multiple params supported only on bigframes dataframe. + pd_result = pd.Series( + [ + ["alpha", "true", "1", "4.5"], + ["beta", "false", "2", "6"], + ["gamma", "true", "3", "7.75"], + ] + ) + bf_result = bf_df.apply(func, axis=1) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( @@ -754,6 +867,10 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): assert square.bigframes_remote_function == str(routine.reference) assert square.input_dtypes == (bigframes.dtypes.INT_DTYPE,) assert square.output_dtype == bigframes.dtypes.INT_DTYPE + assert ( + square.bigframes_bigquery_function_output_dtype + == bigframes.dtypes.INT_DTYPE + ) src = {"x": [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]} @@ -772,7 +889,6 @@ def 
test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_requires_explicit_types( session, bigquery_client, dataset_id ): @@ -863,7 +979,6 @@ def test_read_gbq_function_requires_explicit_types( ), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_respects_python_output_type( request, session_fixture, bigquery_client, dataset_id, array_type, expected_data ): @@ -906,7 +1021,6 @@ def test_read_gbq_function_respects_python_output_type( pytest.param(list[str], id="list-str"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( session, bigquery_client, dataset_id, array_type ): @@ -945,7 +1059,6 @@ def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( pytest.param(list[str], id="list-str"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_supported_python_output_type( session, bigquery_client, dataset_id, array_type ): @@ -992,7 +1105,6 @@ def test_df_apply_scalar_func(session, scalars_dfs): ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_multiple_inputs_not_a_row_processor(session): with pytest.raises(ValueError) as context: # The remote function has two args, which cannot be row processed. 
Throw @@ -1214,20 +1326,19 @@ def should_mask(name: str) -> bool: repr(s.mask(should_mask, "REDACTED")) -@pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_application_repr(session, dataset_id, scalars_df_index): - gbq_function = f"{dataset_id}.should_mask" - +def test_read_gbq_function_application_repr( + session, routine_id_unique, scalars_df_index +): # This function deliberately has a param with name "name", this is to test # a specific ibis' internal handling of object names session.bqclient.query_and_wait( - f"CREATE OR REPLACE FUNCTION `{gbq_function}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" + f"CREATE OR REPLACE FUNCTION `{routine_id_unique}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" ) - routine = session.bqclient.get_routine(gbq_function) + routine = session.bqclient.get_routine(routine_id_unique) assert "name" in [arg.name for arg in routine.arguments] # read the function and apply to dataframe - should_mask = session.read_gbq_function(gbq_function) + should_mask = session.read_gbq_function(routine_id_unique) s = scalars_df_index["string_col"] diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index 81e1b2f77f..b80202bdbe 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd import pytest -import sklearn.metrics as sklearn_metrics # type: ignore import bigframes from bigframes.ml import metrics @@ -66,6 +65,7 @@ def test_r2_score_force_finite(session): def test_r2_score_ok_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -113,6 +113,7 @@ def test_accuracy_score_not_normailze(session): def test_accuracy_score_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = 
pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -203,6 +204,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): def test_roc_curve_binary_classification_prediction_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -294,6 +296,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): def test_roc_curve_binary_classification_decision_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") # Instead of operating on probabilities, assume a 70% decision threshold # has been applied, and operate on the final output y_score = [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45] @@ -420,6 +423,7 @@ def test_roc_auc_score_returns_expected(session): def test_roc_auc_score_returns_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -525,6 +529,7 @@ def test_confusion_matrix_column_index(session): def test_confusion_matrix_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 3, 3, 3, 4, 1], @@ -543,6 +548,7 @@ def test_confusion_matrix_matches_sklearn(session): def test_confusion_matrix_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -603,6 +609,7 @@ def test_recall_score(session): def test_recall_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -620,6 +627,7 @@ def test_recall_score_matches_sklearn(session): def test_recall_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { 
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -673,6 +681,7 @@ def test_precision_score(session): def test_precision_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -695,6 +704,7 @@ def test_precision_score_matches_sklearn(session): def test_precision_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -752,6 +762,7 @@ def test_f1_score(session): def test_f1_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -769,6 +780,7 @@ def test_f1_score_matches_sklearn(session): def test_f1_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 356000b3f6..723481b1d1 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -465,3 +465,49 @@ def test_timedelta_ordering(session): pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) + + +def test_timedelta_cumsum(temporal_dfs): + bf_df, pd_df = temporal_dfs + + actual_result = bf_df["timedelta_col_1"].cumsum().to_pandas() + + expected_result = pd_df["timedelta_col_1"].cumsum() + _assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize( + "agg_func", + [ + pytest.param(lambda x: x.min(), id="min"), + pytest.param(lambda x: x.max(), id="max"), + pytest.param(lambda x: x.sum(), id="sum"), + pytest.param(lambda x: x.mean(), id="mean"), + pytest.param(lambda x: x.median(), id="median"), + pytest.param(lambda x: x.quantile(0.5), id="quantile"), + 
pytest.param(lambda x: x.std(), id="std"), + ], +) +def test_timedelta_agg__timedelta_result(temporal_dfs, agg_func): + bf_df, pd_df = temporal_dfs + + actual_result = agg_func(bf_df["timedelta_col_1"]) + + expected_result = agg_func(pd_df["timedelta_col_1"]).floor("us") + assert actual_result == expected_result + + +@pytest.mark.parametrize( + "agg_func", + [ + pytest.param(lambda x: x.count(), id="count"), + pytest.param(lambda x: x.nunique(), id="nunique"), + ], +) +def test_timedelta_agg__int_result(temporal_dfs, agg_func): + bf_df, pd_df = temporal_dfs + + actual_result = agg_func(bf_df["timedelta_col_1"]) + + expected_result = agg_func(pd_df["timedelta_col_1"]) + assert actual_result == expected_result diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 4b4264e33c..da78432cdb 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -829,3 +829,18 @@ def test_to_timedelta_with_bf_series_invalid_unit(session, unit): @pytest.mark.parametrize("input", [1, 1.2, "1s"]) def test_to_timedelta_non_bf_series(input): assert bpd.to_timedelta(input) == pd.to_timedelta(input) + + +def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs): + bf_df, pd_df = scalars_dfs + bf_series = bpd.to_timedelta(bf_df["int64_too"], unit="us") + pd_series = pd.to_timedelta(pd_df["int64_too"], unit="us") + + actual_result = ( + bpd.to_timedelta(bf_series, unit="s").to_pandas().astype("timedelta64[ns]") + ) + + expected_result = pd.to_timedelta(pd_series, unit="s") + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 00f47c754e..2daa7dd825 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -641,6 +641,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict): ), ) def test_series_interpolate(method): + pytest.importorskip("scipy") + 
values = [None, 1, 2, None, None, 16, None] index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] pd_series = pd.Series(values, index) diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index 413a694680..d377fb4d49 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -66,6 +66,12 @@ def test_supported_types_correspond(): ibis_types_from_bigquery = { third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) for tk in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + # TODO(b/284515241): ARRAY is the only exception because it is supported + # as an output type of the BQ routine in the read_gbq_function path but + # not in the remote function path. Remove this handline once BQ remote + # functions supports ARRAY output and the bigframes remote functions + # utilizes that to support array output. + if tk != "ARRAY" } assert ibis_types_from_python == ibis_types_from_bigquery diff --git a/tests/unit/ml/test_api_primitives.py b/tests/unit/ml/test_api_primitives.py index 00a51ccfe9..dd2ceff143 100644 --- a/tests/unit/ml/test_api_primitives.py +++ b/tests/unit/ml/test_api_primitives.py @@ -13,8 +13,6 @@ # limitations under the License. 
import pytest -import sklearn.decomposition as sklearn_decomposition # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore import bigframes.ml.decomposition import bigframes.ml.linear_model @@ -35,8 +33,9 @@ def test_base_estimator_repr(): assert pca_estimator.__repr__() == "PCA(n_components=7)" -@pytest.mark.skipif(sklearn_linear_model is None, reason="requires sklearn") def test_base_estimator_repr_matches_sklearn(): + sklearn_decomposition = pytest.importorskip("sklearn.decomposition") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") estimator = bigframes.ml.linear_model.LinearRegression() sklearn_estimator = sklearn_linear_model.LinearRegression() assert estimator.__repr__() == sklearn_estimator.__repr__() diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 395296f3e4..450ce8d6ee 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -15,8 +15,6 @@ from google.cloud import bigquery import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, preprocessing from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer @@ -119,6 +117,8 @@ def test_columntransformer_repr(): def test_columntransformer_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_column_transformer = compose.ColumnTransformer( [ ( diff --git a/tests/unit/ml/test_pipeline.py b/tests/unit/ml/test_pipeline.py index ed5c621b1d..beebb9f282 100644 --- a/tests/unit/ml/test_pipeline.py +++ b/tests/unit/ml/test_pipeline.py @@ -13,10 +13,6 @@ # limitations under the License. 
import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore -import sklearn.pipeline as sklearn_pipeline # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing @@ -57,8 +53,11 @@ def test_pipeline_repr(): ) -@pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn") def test_pipeline_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") + sklearn_pipeline = pytest.importorskip("sklearn.pipeline") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_pl = pipeline.Pipeline( [ ( diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f5aa23d00b..e296dcb9f6 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -7179,7 +7179,7 @@ def __len__(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def __array__(self): + def __array__(self, dtype=None, copy: Optional[bool] = None): """ Returns the rows as NumPy array. @@ -7210,6 +7210,8 @@ def __array__(self): dtype (str or numpy.dtype, optional): The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy (bool or None, optional): + Whether to copy the data, False is not supported. 
Returns: numpy.ndarray: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 57f7dfbb79..5e6f546d09 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -5941,7 +5941,7 @@ def size(self) -> int: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: """ Returns the values as NumPy array. @@ -5965,6 +5965,8 @@ def __array__(self, dtype=None) -> numpy.ndarray: dtype (str or numpy.dtype, optional): The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy (bool or None, optional): + Whether to copy the data, False is not supported. Returns: numpy.ndarray: diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 7b97526de2..9262ffbd3d 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -16,6 +16,8 @@ # Michal Karbownik # License: BSD 3 clause +import numpy as np + from bigframes import constants @@ -60,7 +62,23 @@ def auc(x, y) -> float: Returns: float: Area Under the Curve. 
""" - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + if len(x) < 2: + raise ValueError( + f"At least 2 points are needed to compute area under curve, but x.shape = {len(x)}" + ) + + if x.is_monotonic_decreasing: + d = -1 + elif x.is_monotonic_increasing: + d = 1 + else: + raise ValueError(f"x is neither increasing nor decreasing : {x}.") + + if hasattr(np, "trapezoid"): + # new in numpy 2.0 + return d * np.trapezoid(y, x) + # np.trapz has been deprecated in 2.0 + return d * np.trapz(y, x) # type: ignore def roc_auc_score(y_true, y_score) -> float: diff --git a/third_party/bigframes_vendored/tpch/queries/q9.py b/third_party/bigframes_vendored/tpch/queries/q9.py index 6af33f7569..5c9ca1e9c3 100644 --- a/third_party/bigframes_vendored/tpch/queries/q9.py +++ b/third_party/bigframes_vendored/tpch/queries/q9.py @@ -33,13 +33,17 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) q_final = ( - part.merge(partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY") - .merge(supplier, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") - .merge( + part.merge( lineitem, - left_on=["P_PARTKEY", "PS_SUPPKEY"], - right_on=["L_PARTKEY", "L_SUPPKEY"], + left_on="P_PARTKEY", + right_on="L_PARTKEY", + ) + .merge( + partsupp, + left_on=["L_SUPPKEY", "L_PARTKEY"], + right_on=["PS_SUPPKEY", "PS_PARTKEY"], ) + .merge(supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY") .merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY") .merge(nation, left_on="S_NATIONKEY", right_on="N_NATIONKEY") ) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 27dfb23603..762deda9ff 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.37.0" +__version__ = "1.38.0" From bef780818f4ac3fbabe956fadeff9c5a658d2308 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 24 Feb 2025 16:11:11 -0600 Subject: [PATCH 21/75] Delete demo.ipynb --- demo.ipynb | 758 ----------------------------------------------------- 1 file changed, 758 deletions(-) delete mode 100644 demo.ipynb diff --git a/demo.ipynb b/demo.ipynb deleted file mode 100644 index 93e6f121f9..0000000000 --- a/demo.ipynb +++ /dev/null @@ -1,758 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 39ca6c3f-1c37-4f8e-8252-33cf6abfa340 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 7dda7bc2-75b2-42b5-918b-41dd0540eb53 is DONE. 24.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4b99d068-1e68-4a86-bd0b-52d40ef6a270 is DONE. 40.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_iditem_idrating
043549684.0
1362235215.0
255439202.0
344531755.0
455352354.0
5442210974.0
6311913564.0
7603712314.0
88511963.0
931114353.0
1054036485.0
11360127343.0
12465529494.0
13127430935.0
1415213504.0
1530724543.0
16331413304.0
17376227191.0
18168721693.0
1997030814.0
20126522485.0
2115021044.0
221945004.0
23352110883.0
24188935673.0
\n", - "

25 rows × 3 columns

\n", - "
[1000209 rows x 3 columns in total]" - ], - "text/plain": [ - " user_id item_id rating\n", - "0 4354 968 4.0\n", - "1 3622 3521 5.0\n", - "2 5543 920 2.0\n", - "3 445 3175 5.0\n", - "4 5535 235 4.0\n", - "5 4422 1097 4.0\n", - "6 3119 1356 4.0\n", - "7 6037 1231 4.0\n", - "8 851 196 3.0\n", - "9 3111 435 3.0\n", - "10 5403 648 5.0\n", - "11 3601 2734 3.0\n", - "12 4655 2949 4.0\n", - "13 1274 3093 5.0\n", - "14 1521 350 4.0\n", - "15 3072 454 3.0\n", - "16 3314 1330 4.0\n", - "17 3762 2719 1.0\n", - "18 1687 2169 3.0\n", - "19 970 3081 4.0\n", - "20 1265 2248 5.0\n", - "21 1502 104 4.0\n", - "22 194 500 4.0\n", - "23 3521 1088 3.0\n", - "24 1889 3567 3.0\n", - "...\n", - "\n", - "[1000209 rows x 3 columns]" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import bigframes.pandas as bpd\n", - "from bigframes.ml import decomposition\n", - "\n", - "bq_df = bpd.read_gbq('bqml_tutorial.ratings', columns=('user_id', 'item_id', 'rating'))\n", - "bq_df" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MatrixFactorization(item_col='item_col', l2_reg=9.83, num_factors=34,\n", - " rating_col='rating_col', user_col='user_id')" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model = decomposition.MatrixFactorization(\n", - " num_factors=34,\n", - " feedback_type='explicit',\n", - " user_col='user_id',\n", - " item_col='item_col',\n", - " rating_col='rating_col',\n", - " l2_reg=9.83,\n", - ")\n", - "\n", - "model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 50f616db-afae-40da-bc95-f724bb8a5c84 is DONE. 24.0 MB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d13d556d-e011-40a0-9da8-5c0918cf1ef1 is DONE. 537.2 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "MatrixFactorization(item_col='item_col', l2_reg=9.83, num_factors=34,\n", - " rating_col='rating_col', user_col='user_id')" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fitted = model.fit(bq_df.rename(columns={'rating': 'rating_col', 'item_id': 'item_col'}))\n", - "fitted\n", - "# scored = model.score(fitted)\n", - "\n", - "# scored" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 66684505-f14b-423b-8105-93521064036a is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4ec28d78-f0c1-4456-8c08-60b6982ee52f is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
00.4852820.3953410.0255350.3899060.6831990.683199
\n", - "

1 rows × 6 columns

\n", - "
[1 rows x 6 columns in total]" - ], - "text/plain": [ - " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 0.485282 0.395341 0.025535 \n", - "\n", - " median_absolute_error r2_score explained_variance \n", - "0 0.389906 0.683199 0.683199 \n", - "\n", - "[1 rows x 6 columns]" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scored = model.score(fitted)\n", - "\n", - "scored" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job e7dcfb81-70af-4d65-9c2a-b42591812d0e is DONE. 29.5 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job fcb53dd1-f9cb-4872-b7bf-3d2f0da89b00 is DONE. 40.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 10436512-dada-4dfc-a3ff-94b480a5e890 is DONE. 48.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predicted_rating_coluser_iditem_colrating
03.34813143549684.0
15.22349362235215.0
21.82017355439202.0
34.70222844531755.0
43.20694955352354.0
54.690283442210974.0
63.944585311913564.0
74.275766603712314.0
83.4345798511963.0
91.82747331114353.0
104.13092854036485.0
113.231195360127343.0
123.750037465529494.0
133.858951127430935.0
143.3485215213504.0
152.95628430724543.0
163.831856331413304.0
170.805804376227191.0
183.65957168721693.0
193.0319797030814.0
203.384926126522485.0
214.17324315021044.0
223.9184351945004.0
232.451965352110883.0
242.982963188935673.0
\n", - "

25 rows × 4 columns

\n", - "
[1000209 rows x 4 columns in total]" - ], - "text/plain": [ - " predicted_rating_col user_id item_col rating\n", - "0 3.348131 4354 968 4.0\n", - "1 5.22349 3622 3521 5.0\n", - "2 1.820173 5543 920 2.0\n", - "3 4.702228 445 3175 5.0\n", - "4 3.206949 5535 235 4.0\n", - "5 4.690283 4422 1097 4.0\n", - "6 3.944585 3119 1356 4.0\n", - "7 4.275766 6037 1231 4.0\n", - "8 3.434579 851 196 3.0\n", - "9 1.827473 3111 435 3.0\n", - "10 4.130928 5403 648 5.0\n", - "11 3.231195 3601 2734 3.0\n", - "12 3.750037 4655 2949 4.0\n", - "13 3.858951 1274 3093 5.0\n", - "14 3.34852 1521 350 4.0\n", - "15 2.956284 3072 454 3.0\n", - "16 3.831856 3314 1330 4.0\n", - "17 0.805804 3762 2719 1.0\n", - "18 3.65957 1687 2169 3.0\n", - "19 3.03197 970 3081 4.0\n", - "20 3.384926 1265 2248 5.0\n", - "21 4.173243 1502 104 4.0\n", - "22 3.918435 194 500 4.0\n", - "23 2.451965 3521 1088 3.0\n", - "24 2.982963 1889 3567 3.0\n", - "...\n", - "\n", - "[1000209 rows x 4 columns]" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# predict_df = scored[['user_id']['item_col']]\n", - "# model.predict(predict_df)\n", - "model.predict(bq_df.rename(columns={'item_id': 'item_col'}))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.19" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 0dd033da3ea4e70e024f0cb3f0ad20db4e934a37 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 25 Feb 2025 22:10:24 +0000 Subject: [PATCH 22/75] passing system test --- bigframes/ml/decomposition.py | 10 +++++++++- tests/system/large/ml/test_decomposition.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git 
a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 1ea7d98177..065c1fdb5f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -28,7 +28,15 @@ import bigframes.pandas as bpd import bigframes.session -_BQML_PARAMS_MAPPING = {"svd_solver": "pcaSolver"} +_BQML_PARAMS_MAPPING = { + "svd_solver": "pcaSolver", + "feedback_type": "feedbackType", + "num_factors": "numFactors", + "user_col": "userColumn", + "item_col": "itemColumn", + # TODO: Add rating_col + "l2_reg": "l2Regularization", +} @log_adapter.class_logger diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 36f5d83c75..0a25187935 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -198,4 +198,15 @@ def test_decomposition_mf_configure_fit_load( new_ratings.rename(columns={"item_id": "item_col"}) ).to_pandas() + assert reloaded_model._bqml_model is not None + assert ( + f"{dataset_id}.temp_configured_mf_model" + in reloaded_model._bqml_model.model_name + ) assert result is not None + assert reloaded_model.feedback_type == "EXPLICIT" + assert reloaded_model.num_factors == 6 + assert reloaded_model.user_col == "user_id" + assert reloaded_model.item_col == "item_id" + assert reloaded_model.rating_col == "rating" + assert reloaded_model.l2_reg == 9.83 From 1f85b75449363f707190b9426264f8d5df3d6a94 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 26 Feb 2025 19:55:03 +0000 Subject: [PATCH 23/75] preparing to add unit tests --- tests/unit/ml/test_golden_sql.py | 54 +++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 97d1d2d7d1..31ee73d0ea 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -20,7 +20,7 @@ import pytest_mock import bigframes -from bigframes.ml import core, linear_model +from bigframes.ml import 
core, decomposition, linear_model import bigframes.pandas as bpd TEMP_MODEL_ID = bigquery.ModelReference.from_string( @@ -207,3 +207,55 @@ def test_logistic_regression_score(mock_session, bqml_model, mock_X, mock_y): mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_y_sql))" ) + + +def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X, mock_y): + model = decomposition.MatrixFactorization( # revise + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model._bqml_model_factory = bqml_model_factory + model.fit(mock_X, mock_y) + + mock_session._start_query_ml_ddl.assert_called_once_with( # revice + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='MATRIX_FACTORIZATION',\n " + ) + + +def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): + model = decomposition.MatrixFactorization( # revise + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model._bqml_model = bqml_model + model.predict(mock_X) # mock x requires item_col + + mock_session.read_gbq.assert_called_once_with( # revise + "SELECT * FROM ML.PREDICT(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", + index_col=["index_column_id"], + ) + + +def test_decomposition_mf_score(mock_session, bqml_model, mock_X, mock_y): + model = decomposition.MatrixFactorization( # revise + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model._bqml_model = bqml_model + model.score(mock_X, mock_y) + + mock_session.read_gbq.assert_called_once_with( # revise + "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_y_sql))" + ) From 
a45763946a8dbaca8488141b1cdb463b891f6f50 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 27 Feb 2025 20:30:03 +0000 Subject: [PATCH 24/75] 2 out of 3 (so far) passing unit tests --- tests/unit/ml/test_golden_sql.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 31ee73d0ea..a3ae3f50de 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -221,13 +221,13 @@ def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X, model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query_ml_ddl.assert_called_once_with( # revice - "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='MATRIX_FACTORIZATION',\n " + mock_session._start_query_ml_ddl.assert_called_once_with( + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_y_no_index_sql" ) def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): - model = decomposition.MatrixFactorization( # revise + model = decomposition.MatrixFactorization( num_factors=34, feedback_type="explicit", user_col="user_id", @@ -236,10 +236,10 @@ def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): l2_reg=9.83, ) model._bqml_model = bqml_model - model.predict(mock_X) # mock x requires item_col + model.predict(mock_X) - mock_session.read_gbq.assert_called_once_with( # revise - "SELECT * FROM ML.PREDICT(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", + mock_session.read_gbq.assert_called_once_with( + "SELECT * FROM ML.RECOMMEND(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", index_col=["index_column_id"], ) @@ -257,5 +257,5 @@ def 
test_decomposition_mf_score(mock_session, bqml_model, mock_X, mock_y): model.score(mock_X, mock_y) mock_session.read_gbq.assert_called_once_with( # revise - "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_y_sql))" + "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`)" ) From 512332e4969158009ead4656fc911e96d4c073e6 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 3 Mar 2025 16:27:55 +0000 Subject: [PATCH 25/75] attempted mocking --- tests/unit/ml/test_golden_sql.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index a3ae3f50de..09745a3bf4 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -219,6 +219,11 @@ def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X, l2_reg=9.83, ) model._bqml_model_factory = bqml_model_factory + mock_start_query_ml_ddl = mock.Mock( + return_value="CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_y_no_index_sql" + ) + mock_create_model = mock.PropertyMock(return_value=mock_start_query_ml_ddl) + type(model)._start_query_ml_ddl = mock_create_model model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( From 408e8073ac9cbacf85cc58c3ec89f36ad595ca33 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Mar 2025 20:08:13 +0000 Subject: [PATCH 26/75] fix tests --- bigframes/ml/decomposition.py | 5 +++++ tests/unit/ml/test_golden_sql.py | 19 +++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 065c1fdb5f..ea68fd5e6b 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -272,6 +272,11 @@ 
def _fit( y=None, transforms: Optional[List[str]] = None, ) -> MatrixFactorization: + if y is not None: + raise ValueError( + "Label column not supported for Matrix Factorization model but y was not `None`" + ) + (X,) = utils.batch_convert_to_dataframe(X) self._bqml_model = self._bqml_model_factory.create_model( diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 09745a3bf4..d1aae60744 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -79,6 +79,7 @@ def mock_X(mock_y, mock_session): ["index_column_id"], ["index_column_label"], ) + mock_X.reset_index(drop=True).cache().sql = "input_X_no_index_sql" mock_X.join(mock_y).sql = "input_X_y_sql" mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y) mock_X.join(mock_y)._to_sql_query.return_value = ( @@ -209,7 +210,7 @@ def test_logistic_regression_score(mock_session, bqml_model, mock_X, mock_y): ) -def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X, mock_y): +def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X): model = decomposition.MatrixFactorization( # revise num_factors=34, feedback_type="explicit", @@ -219,15 +220,13 @@ def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X, l2_reg=9.83, ) model._bqml_model_factory = bqml_model_factory - mock_start_query_ml_ddl = mock.Mock( - return_value="CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_y_no_index_sql" - ) + mock_start_query_ml_ddl = mock.Mock() mock_create_model = mock.PropertyMock(return_value=mock_start_query_ml_ddl) type(model)._start_query_ml_ddl = mock_create_model - model.fit(mock_X, mock_y) + model.fit(mock_X) mock_session._start_query_ml_ddl.assert_called_once_with( - "CREATE OR REPLACE MODEL 
`test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_y_no_index_sql" + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_no_index_sql" ) @@ -249,8 +248,8 @@ def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): ) -def test_decomposition_mf_score(mock_session, bqml_model, mock_X, mock_y): - model = decomposition.MatrixFactorization( # revise +def test_decomposition_mf_score(mock_session, bqml_model, mock_X): + model = decomposition.MatrixFactorization( num_factors=34, feedback_type="explicit", user_col="user_id", @@ -259,8 +258,8 @@ def test_decomposition_mf_score(mock_session, bqml_model, mock_X, mock_y): l2_reg=9.83, ) model._bqml_model = bqml_model - model.score(mock_X, mock_y) + model.score(mock_X) - mock_session.read_gbq.assert_called_once_with( # revise + mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`)" ) From 19e423bbefea2376b7b1feddd03d0530d90edbf7 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Mar 2025 23:54:05 +0000 Subject: [PATCH 27/75] new test file for model creation unit tests --- tests/unit/ml/test_matrix_factorization.py | 70 ++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tests/unit/ml/test_matrix_factorization.py diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py new file mode 100644 index 0000000000..3e5a9b0034 --- /dev/null +++ b/tests/unit/ml/test_matrix_factorization.py @@ -0,0 +1,70 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you 
may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# import re + +# import pytest + +from bigframes.ml import decomposition + + +def test_decomposition_mf_num_factors(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + assert model.num_factors == 16 + + +# def test_decomposition_mf_num_factors_invalid_raises(): +# # with pytest.raises(TypeError): +# model = decomposition.MatrixFactorization( +# num_factors=0.5, +# feedback_type="explicit", +# user_col="user_id", +# item_col="item_col", +# rating_col="rating_col", +# l2_reg=9.83, +# ) +# # passing test -> should raise error? +# assert model.num_factors == 0.5 + + +def test_decomposition_mf_feedback_type(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="implicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + assert model.feedback_type == "implicit" + + +# def test_decomposition_mf_feedback_type_raises(): +# model = decomposition.MatrixFactorization( +# num_factors=16, +# feedback_type="implexpl", +# user_col="user_id", +# item_col="item_col", +# rating_col="rating_col", +# l2_reg=9.83, +# ) +# # passing test -> should raise error? 
+# assert model.feedback_type == "implexpl" From 5f1a19aa4b420a55b3b1da09f1663b195488c09e Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 6 Mar 2025 01:43:17 +0000 Subject: [PATCH 28/75] add unit tests for num_factors, user_col, and item_col --- bigframes/ml/decomposition.py | 12 +++ tests/unit/ml/test_matrix_factorization.py | 90 ++++++++++++++-------- 2 files changed, 72 insertions(+), 30 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index ea68fd5e6b..be78b1848b 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -227,8 +227,20 @@ def __init__( l2_reg: float = 1.0, ): self.feedback_type = feedback_type + + if type(num_factors) is not int: + raise (TypeError) + self.num_factors = num_factors + + if type(user_col) is not str: + raise (TypeError) + self.user_col = user_col + + if type(item_col) is not str: + raise (TypeError) + self.item_col = item_col self.rating_col = rating_col self.l2_reg = l2_reg diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 3e5a9b0034..a8d3c650fc 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -12,59 +12,89 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# import re # import pytest from bigframes.ml import decomposition -def test_decomposition_mf_num_factors(): +def test_decomposition_mf_model(): model = decomposition.MatrixFactorization( num_factors=16, - feedback_type="explicit", + feedback_type="implicit", user_col="user_id", item_col="item_col", rating_col="rating_col", l2_reg=9.83, ) assert model.num_factors == 16 + assert model.feedback_type == "implicit" + assert model.user_col == "user_id" + assert model.item_col == "item_col" + assert model.rating_col == "rating_col" + + +def test_decomposition_mf_feedback_type_explicit(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + assert model.feedback_type == "explicit" -# def test_decomposition_mf_num_factors_invalid_raises(): -# # with pytest.raises(TypeError): -# model = decomposition.MatrixFactorization( -# num_factors=0.5, -# feedback_type="explicit", -# user_col="user_id", -# item_col="item_col", -# rating_col="rating_col", -# l2_reg=9.83, -# ) -# # passing test -> should raise error? 
-# assert model.num_factors == 0.5 +# test_decomposition_mf_invalid_feedback_type_raises -def test_decomposition_mf_feedback_type(): +def test_decomposition_mf_num_factors_low(): model = decomposition.MatrixFactorization( - num_factors=16, - feedback_type="implicit", + num_factors=0, + feedback_type="explicit", user_col="user_id", item_col="item_col", rating_col="rating_col", l2_reg=9.83, ) - assert model.feedback_type == "implicit" + assert model.num_factors == 0 + + +# test_decomposition_mf_negative_num_factors_raises + +# def test_decomposition_mf_invalid_num_factors_raises(): +# num_factors = 0.5 +# with pytest.raises(TypeError): +# decomposition.MatrixFactorization( +# num_factors=num_factors, +# feedback_type="explicit", +# user_col="user_id", +# item_col="item_col", +# rating_col="rating_col", +# l2_reg=9.83, +# ) + + +# def test_decomposition_mf_invalid_user_col_raises(): +# with pytest.raises(TypeError): +# decomposition.MatrixFactorization( +# num_factors=16, +# feedback_type="explicit", +# user_col=123, +# item_col="item_col", +# rating_col="rating_col", +# l2_reg=9.83, +# ) -# def test_decomposition_mf_feedback_type_raises(): -# model = decomposition.MatrixFactorization( -# num_factors=16, -# feedback_type="implexpl", -# user_col="user_id", -# item_col="item_col", -# rating_col="rating_col", -# l2_reg=9.83, -# ) -# # passing test -> should raise error? 
-# assert model.feedback_type == "implexpl" +# def test_decomposition_mf_invalid_item_col_raises(): +# with pytest.raises(TypeError): +# decomposition.MatrixFactorization( +# num_factors=16, +# feedback_type="explicit", +# user_col="user_col", +# item_col=123, +# rating_col="rating_col", +# l2_reg=9.83, +# ) From 33f3069d90b19aec79b7429a34d4361d58047f62 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 6 Mar 2025 18:40:39 -0600 Subject: [PATCH 29/75] Update tests/unit/ml/test_matrix_factorization.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- tests/unit/ml/test_matrix_factorization.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index a8d3c650fc..1496f9a074 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -67,7 +67,10 @@ def test_decomposition_mf_num_factors_low(): # num_factors = 0.5 # with pytest.raises(TypeError): # decomposition.MatrixFactorization( -# num_factors=num_factors, +# # Intentionally pass in the wrong type. This will fail if the user is using +# # a type checker, but we can't assume that everyone is doing so, especially +# # not in notebook environments. 
+# num_factors=num_factors, # type: ignore # feedback_type="explicit", # user_col="user_id", # item_col="item_col", From 1ff6aaa8e40c3eea154f27e26af09099f833ff16 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 6 Mar 2025 18:40:50 -0600 Subject: [PATCH 30/75] Update tests/unit/ml/test_matrix_factorization.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- tests/unit/ml/test_matrix_factorization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 1496f9a074..1af86f7729 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -65,7 +65,7 @@ def test_decomposition_mf_num_factors_low(): # def test_decomposition_mf_invalid_num_factors_raises(): # num_factors = 0.5 -# with pytest.raises(TypeError): +# with pytest.raises(TypeError, match="num_factors"): # decomposition.MatrixFactorization( # # Intentionally pass in the wrong type. This will fail if the user is using # # a type checker, but we can't assume that everyone is doing so, especially From c84dd7ecbeea9f425145ab4157cfa4fb44a9bfe1 Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 7 Mar 2025 00:43:38 +0000 Subject: [PATCH 31/75] uncomment one test --- tests/unit/ml/test_matrix_factorization.py | 31 +++++++++++----------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 1af86f7729..9413fb036a 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -13,7 +13,7 @@ # limitations under the License. 
-# import pytest +import pytest from bigframes.ml import decomposition @@ -63,20 +63,21 @@ def test_decomposition_mf_num_factors_low(): # test_decomposition_mf_negative_num_factors_raises -# def test_decomposition_mf_invalid_num_factors_raises(): -# num_factors = 0.5 -# with pytest.raises(TypeError, match="num_factors"): -# decomposition.MatrixFactorization( -# # Intentionally pass in the wrong type. This will fail if the user is using -# # a type checker, but we can't assume that everyone is doing so, especially -# # not in notebook environments. -# num_factors=num_factors, # type: ignore -# feedback_type="explicit", -# user_col="user_id", -# item_col="item_col", -# rating_col="rating_col", -# l2_reg=9.83, -# ) + +def test_decomposition_mf_invalid_num_factors_raises(): + num_factors = 0.5 + with pytest.raises(TypeError, match="num_factors"): + decomposition.MatrixFactorization( + # Intentionally pass in the wrong type. This will fail if the user is using + # a type checker, but we can't assume that everyone is doing so, especially + # not in notebook environments. 
+ num_factors=num_factors, # type: ignore + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) # def test_decomposition_mf_invalid_user_col_raises(): From 3473037dfbec13ec251758babce00ddcc14c2966 Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 7 Mar 2025 01:05:46 +0000 Subject: [PATCH 32/75] uncomment test --- bigframes/ml/decomposition.py | 9 ++++-- tests/unit/ml/test_matrix_factorization.py | 37 ++++++++++++++-------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index be78b1848b..42fb4187d7 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -229,17 +229,22 @@ def __init__( self.feedback_type = feedback_type if type(num_factors) is not int: - raise (TypeError) + raise TypeError( + f"Expected num_factors to be INT64 but got {type(num_factors)}" + ) self.num_factors = num_factors if type(user_col) is not str: raise (TypeError) + # if user_col is not "user_id": + # raise ValueError(match="") + self.user_col = user_col if type(item_col) is not str: - raise (TypeError) + raise TypeError(f"Expected item_col to be STR but got {type(item_col)}") self.item_col = item_col self.rating_col = rating_col diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 9413fb036a..a410db524d 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -66,7 +66,9 @@ def test_decomposition_mf_num_factors_low(): def test_decomposition_mf_invalid_num_factors_raises(): num_factors = 0.5 - with pytest.raises(TypeError, match="num_factors"): + with pytest.raises( + TypeError, match=f"Expected num_factors to be INT64 but got {type(num_factors)}" + ): decomposition.MatrixFactorization( # Intentionally pass in the wrong type. 
This will fail if the user is using # a type checker, but we can't assume that everyone is doing so, especially @@ -81,24 +83,31 @@ def test_decomposition_mf_invalid_num_factors_raises(): # def test_decomposition_mf_invalid_user_col_raises(): -# with pytest.raises(TypeError): +# user_col = 123 +# with pytest.raises(TypeError, match="user_col"): # decomposition.MatrixFactorization( # num_factors=16, -# feedback_type="explicit", -# user_col=123, +# # Intentionally pass in the wrong type. This will fail if the user is using +# # a type checker, but we can't assume that everyone is doing so, especially +# # not in notebook environments. +# feedback_type="explicit", # type: ignore +# user_col=user_col, # item_col="item_col", # rating_col="rating_col", # l2_reg=9.83, # ) -# def test_decomposition_mf_invalid_item_col_raises(): -# with pytest.raises(TypeError): -# decomposition.MatrixFactorization( -# num_factors=16, -# feedback_type="explicit", -# user_col="user_col", -# item_col=123, -# rating_col="rating_col", -# l2_reg=9.83, -# ) +def test_decomposition_mf_invalid_item_col_raises(): + item_col = 123 + with pytest.raises( + TypeError, match=f"Expected item_col to be STR but got {type(item_col)}" + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_col", + item_col=item_col, # type: ignore + rating_col="rating_col", + l2_reg=9.83, + ) From b3809e561677be35e63e4d9c2149a7b151939e6e Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 7 Mar 2025 01:07:02 +0000 Subject: [PATCH 33/75] uncomment test --- bigframes/ml/decomposition.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 42fb4187d7..09bc402ecb 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -246,6 +246,9 @@ def __init__( if type(item_col) is not str: raise TypeError(f"Expected item_col to be STR but got {type(item_col)}") + # if item_col is not "item_col": + # 
raise ValueError(match=f"item_col") + self.item_col = item_col self.rating_col = rating_col self.l2_reg = l2_reg From 7e8a5b6aa4046295c2c7792776b173b9494d47ef Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 7 Mar 2025 01:09:35 +0000 Subject: [PATCH 34/75] uncomment test --- bigframes/ml/decomposition.py | 4 ++-- tests/unit/ml/test_matrix_factorization.py | 27 +++++++++++----------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 09bc402ecb..8a4f60962c 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -236,7 +236,7 @@ def __init__( self.num_factors = num_factors if type(user_col) is not str: - raise (TypeError) + raise TypeError(f"Expected item_col to be STR but got {type(user_col)}") # if user_col is not "user_id": # raise ValueError(match="") @@ -247,7 +247,7 @@ def __init__( raise TypeError(f"Expected item_col to be STR but got {type(item_col)}") # if item_col is not "item_col": - # raise ValueError(match=f"item_col") + # raise ValueError(match=f"") self.item_col = item_col self.rating_col = rating_col diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index a410db524d..67b45d8551 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -82,20 +82,19 @@ def test_decomposition_mf_invalid_num_factors_raises(): ) -# def test_decomposition_mf_invalid_user_col_raises(): -# user_col = 123 -# with pytest.raises(TypeError, match="user_col"): -# decomposition.MatrixFactorization( -# num_factors=16, -# # Intentionally pass in the wrong type. This will fail if the user is using -# # a type checker, but we can't assume that everyone is doing so, especially -# # not in notebook environments. 
-# feedback_type="explicit", # type: ignore -# user_col=user_col, -# item_col="item_col", -# rating_col="rating_col", -# l2_reg=9.83, -# ) +def test_decomposition_mf_invalid_user_col_raises(): + user_col = 123 + with pytest.raises( + TypeError, match=f"Expected item_col to be STR but got {type(user_col)}" + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col=user_col, # type: ignore + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) def test_decomposition_mf_invalid_item_col_raises(): From 8599d8848bb2dca03af1e77209cfe581267eb3e7 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 10 Mar 2025 16:24:34 +0000 Subject: [PATCH 35/75] nearly all tests --- bigframes/ml/decomposition.py | 33 ++++++-- tests/unit/ml/test_matrix_factorization.py | 92 ++++++++++++++++++++-- 2 files changed, 110 insertions(+), 15 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 8a4f60962c..81f1d619c8 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -230,27 +230,46 @@ def __init__( if type(num_factors) is not int: raise TypeError( - f"Expected num_factors to be INT64 but got {type(num_factors)}" + f"Expected num_factors to be INT64, but got {type(num_factors)}." + ) + + if num_factors < 0: + raise ValueError( + f"Expected num_factors to be a positive integer, but got {num_factors}." ) self.num_factors = num_factors if type(user_col) is not str: - raise TypeError(f"Expected item_col to be STR but got {type(user_col)}") + raise TypeError(f"Expected user_col to be STR, but got {type(user_col)}.") - # if user_col is not "user_id": - # raise ValueError(match="") + if user_col != "user_id": + raise ValueError( + f"Expected user_col column to be `user_id`, but got {user_col}." 
+ ) self.user_col = user_col if type(item_col) is not str: - raise TypeError(f"Expected item_col to be STR but got {type(item_col)}") + raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") - # if item_col is not "item_col": - # raise ValueError(match=f"") + if item_col != "item_col": + raise ValueError( + f"Expected item_col column to be `item_col`, but got {item_col}." + ) self.item_col = item_col + + if type(rating_col) is not str: + raise TypeError( + f"Expected rating_col to be STR, but got {type(rating_col)}." + ) + self.rating_col = rating_col + + if type(l2_reg) is not float: + raise TypeError(f"Expected l2_reg to be FLOAT, but got {type(l2_reg)}.") + self.l2_reg = l2_reg self._bqml_model: Optional[core.BqmlModel] = None self._bqml_model_factory = globals.bqml_model_factory() diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 67b45d8551..c7adf4f6ed 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -61,18 +61,32 @@ def test_decomposition_mf_num_factors_low(): assert model.num_factors == 0 -# test_decomposition_mf_negative_num_factors_raises +def test_decomposition_mf_negative_num_factors_raises(): + num_factors = -2 + with pytest.raises( + ValueError, + match=f"Expected num_factors to be a positive integer, but got {num_factors}.", + ): + decomposition.MatrixFactorization( + # Intentionally pass in the wrong type. This will fail if the user is using + # a type checker, but we can't assume that everyone is doing so, especially + # not in notebook environments. 
+ num_factors=num_factors, # type: ignore + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) def test_decomposition_mf_invalid_num_factors_raises(): num_factors = 0.5 with pytest.raises( - TypeError, match=f"Expected num_factors to be INT64 but got {type(num_factors)}" + TypeError, + match=f"Expected num_factors to be INT64, but got {type(num_factors)}.", ): decomposition.MatrixFactorization( - # Intentionally pass in the wrong type. This will fail if the user is using - # a type checker, but we can't assume that everyone is doing so, especially - # not in notebook environments. num_factors=num_factors, # type: ignore feedback_type="explicit", user_col="user_id", @@ -85,7 +99,23 @@ def test_decomposition_mf_invalid_num_factors_raises(): def test_decomposition_mf_invalid_user_col_raises(): user_col = 123 with pytest.raises( - TypeError, match=f"Expected item_col to be STR but got {type(user_col)}" + TypeError, match=f"Expected user_col to be STR, but got {type(user_col)}." + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col=user_col, # type: ignore + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + + +def test_decomposition_mf_label_user_col_raises(): + user_col = "user_col" + with pytest.raises( + ValueError, + match=f"Expected user_col column to be `user_id`, but got {user_col}.", ): decomposition.MatrixFactorization( num_factors=16, @@ -100,13 +130,59 @@ def test_decomposition_mf_invalid_user_col_raises(): def test_decomposition_mf_invalid_item_col_raises(): item_col = 123 with pytest.raises( - TypeError, match=f"Expected item_col to be STR but got {type(item_col)}" + TypeError, match=f"Expected item_col to be STR, but got {type(item_col)}." 
): decomposition.MatrixFactorization( num_factors=16, feedback_type="explicit", - user_col="user_col", + user_col="user_id", item_col=item_col, # type: ignore rating_col="rating_col", l2_reg=9.83, ) + + +def test_decomposition_mf_label_item_col_raises(): + item_col = "item_id" + with pytest.raises( + ValueError, + match=f"Expected item_col column to be `item_col`, but got {item_col}.", + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col=item_col, # type: ignore + rating_col="rating_col", + l2_reg=9.83, + ) + + +def test_decomposition_mf_invalid_rating_col_raises(): + rating_col = 4 + with pytest.raises( + TypeError, match=f"Expected rating_col to be STR, but got {type(rating_col)}." + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col=rating_col, # type: ignore + l2_reg=9.83, + ) + + +def test_decomposition_mf_invalid_l2_reg_raises(): + l2_reg = "6.02" + with pytest.raises( + TypeError, match=f"Expected l2_reg to be FLOAT, but got {type(l2_reg)}." + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=l2_reg, # type: ignore + ) From 8ab88187f7cb53e5aeeb20fda5a327d54d9d04fa Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 10 Mar 2025 18:46:43 +0000 Subject: [PATCH 36/75] tests complete and passing --- bigframes/ml/decomposition.py | 6 ++++++ tests/unit/ml/test_matrix_factorization.py | 21 +++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 81f1d619c8..2128470b97 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -226,6 +226,12 @@ def __init__( # TODO: Add support for hyperparameter tuning. 
l2_reg: float = 1.0, ): + + if feedback_type not in ("explicit", "implicit"): + raise ValueError( + f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}" + ) + self.feedback_type = feedback_type if type(num_factors) is not int: diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index c7adf4f6ed..ca256a419d 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -46,7 +46,23 @@ def test_decomposition_mf_feedback_type_explicit(): assert model.feedback_type == "explicit" -# test_decomposition_mf_invalid_feedback_type_raises +def test_decomposition_mf_invalid_feedback_type_raises(): + feedback_type = "explimp" + with pytest.raises( + ValueError, + match=f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}", + ): + decomposition.MatrixFactorization( + # Intentionally pass in the wrong type. This will fail if the user is using + # a type checker, but we can't assume that everyone is doing so, especially + # not in notebook environments. + num_factors=16, + feedback_type=feedback_type, # type: ignore + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) def test_decomposition_mf_num_factors_low(): @@ -68,9 +84,6 @@ def test_decomposition_mf_negative_num_factors_raises(): match=f"Expected num_factors to be a positive integer, but got {num_factors}.", ): decomposition.MatrixFactorization( - # Intentionally pass in the wrong type. This will fail if the user is using - # a type checker, but we can't assume that everyone is doing so, especially - # not in notebook environments. 
num_factors=num_factors, # type: ignore feedback_type="explicit", user_col="user_id", From b4d357865548022cc23402411d9f34f4cccd674b Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 10 Mar 2025 20:58:48 +0000 Subject: [PATCH 37/75] seeing if test causes kokoro failure --- tests/unit/ml/test_matrix_factorization.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index ca256a419d..63f5a58fbe 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -155,20 +155,20 @@ def test_decomposition_mf_invalid_item_col_raises(): ) -def test_decomposition_mf_label_item_col_raises(): - item_col = "item_id" - with pytest.raises( - ValueError, - match=f"Expected item_col column to be `item_col`, but got {item_col}.", - ): - decomposition.MatrixFactorization( - num_factors=16, - feedback_type="explicit", - user_col="user_id", - item_col=item_col, # type: ignore - rating_col="rating_col", - l2_reg=9.83, - ) +# def test_decomposition_mf_label_item_col_raises(): +# item_col = "item_id" +# with pytest.raises( +# ValueError, +# match=f"Expected item_col column to be `item_col`, but got {item_col}.", +# ): +# decomposition.MatrixFactorization( +# num_factors=16, +# feedback_type="explicit", +# user_col="user_id", +# item_col=item_col, # type: ignore +# rating_col="rating_col", +# l2_reg=9.83, +# ) def test_decomposition_mf_invalid_rating_col_raises(): From a63cb90d0099b54404db5d41e7c4b3c7b75b6d14 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 10 Mar 2025 21:11:01 +0000 Subject: [PATCH 38/75] uncomment test-kokoro still failing --- tests/unit/ml/test_matrix_factorization.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 63f5a58fbe..ca256a419d 100644 --- 
a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -155,20 +155,20 @@ def test_decomposition_mf_invalid_item_col_raises(): ) -# def test_decomposition_mf_label_item_col_raises(): -# item_col = "item_id" -# with pytest.raises( -# ValueError, -# match=f"Expected item_col column to be `item_col`, but got {item_col}.", -# ): -# decomposition.MatrixFactorization( -# num_factors=16, -# feedback_type="explicit", -# user_col="user_id", -# item_col=item_col, # type: ignore -# rating_col="rating_col", -# l2_reg=9.83, -# ) +def test_decomposition_mf_label_item_col_raises(): + item_col = "item_id" + with pytest.raises( + ValueError, + match=f"Expected item_col column to be `item_col`, but got {item_col}.", + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col=item_col, # type: ignore + rating_col="rating_col", + l2_reg=9.83, + ) def test_decomposition_mf_invalid_rating_col_raises(): From e69438d39e6e725dc5763cf2fecd6903753ca159 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 11 Mar 2025 16:42:58 +0000 Subject: [PATCH 39/75] remove comment --- tests/unit/ml/test_golden_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 18043426a9..32c2250ec2 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -213,7 +213,7 @@ def test_logistic_regression_score(mock_session, bqml_model, mock_X, mock_y): def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X): - model = decomposition.MatrixFactorization( # revise + model = decomposition.MatrixFactorization( num_factors=34, feedback_type="explicit", user_col="user_id", From 087953f0eac9ed29fa30c79e16a983196d2897db Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 11 Mar 2025 18:40:55 +0000 Subject: [PATCH 40/75] fix test --- tests/unit/ml/test_golden_sql.py | 3 --- 1 file changed, 
3 deletions(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 32c2250ec2..03695a20e4 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -222,9 +222,6 @@ def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X): l2_reg=9.83, ) model._bqml_model_factory = bqml_model_factory - mock_start_query_ml_ddl = mock.Mock() - mock_create_model = mock.PropertyMock(return_value=mock_start_query_ml_ddl) - type(model)._start_query_ml_ddl = mock_create_model model.fit(mock_X) mock_session._start_query_ml_ddl.assert_called_once_with( From 8912663ba7b72abf9943789741be710174b83e55 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 13:42:33 +0000 Subject: [PATCH 41/75] test kokoro --- bigframes/ml/decomposition.py | 8 +++---- tests/unit/ml/test_matrix_factorization.py | 28 +++++++++++----------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 2128470b97..6b396709b6 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -259,10 +259,10 @@ def __init__( if type(item_col) is not str: raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") - if item_col != "item_col": - raise ValueError( - f"Expected item_col column to be `item_col`, but got {item_col}." - ) + # if item_col != "item_col": + # raise ValueError( + # f"Expected item_col column to be `item_col`, but got {item_col}." 
+ # ) self.item_col = item_col diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index ca256a419d..63f5a58fbe 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -155,20 +155,20 @@ def test_decomposition_mf_invalid_item_col_raises(): ) -def test_decomposition_mf_label_item_col_raises(): - item_col = "item_id" - with pytest.raises( - ValueError, - match=f"Expected item_col column to be `item_col`, but got {item_col}.", - ): - decomposition.MatrixFactorization( - num_factors=16, - feedback_type="explicit", - user_col="user_id", - item_col=item_col, # type: ignore - rating_col="rating_col", - l2_reg=9.83, - ) +# def test_decomposition_mf_label_item_col_raises(): +# item_col = "item_id" +# with pytest.raises( +# ValueError, +# match=f"Expected item_col column to be `item_col`, but got {item_col}.", +# ): +# decomposition.MatrixFactorization( +# num_factors=16, +# feedback_type="explicit", +# user_col="user_id", +# item_col=item_col, # type: ignore +# rating_col="rating_col", +# l2_reg=9.83, +# ) def test_decomposition_mf_invalid_rating_col_raises(): From 35a8c1822e880ef45e373d116ad92b2ea1de4f3e Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 15:04:53 +0000 Subject: [PATCH 42/75] test_decomposition.py failing and now feedback_type attr does not exist --- bigframes/ml/decomposition.py | 8 +++---- tests/unit/ml/test_matrix_factorization.py | 28 +++++++++++----------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 6b396709b6..2128470b97 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -259,10 +259,10 @@ def __init__( if type(item_col) is not str: raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") - # if item_col != "item_col": - # raise ValueError( - # f"Expected item_col column to be `item_col`, but got 
{item_col}." - # ) + if item_col != "item_col": + raise ValueError( + f"Expected item_col column to be `item_col`, but got {item_col}." + ) self.item_col = item_col diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 63f5a58fbe..ca256a419d 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -155,20 +155,20 @@ def test_decomposition_mf_invalid_item_col_raises(): ) -# def test_decomposition_mf_label_item_col_raises(): -# item_col = "item_id" -# with pytest.raises( -# ValueError, -# match=f"Expected item_col column to be `item_col`, but got {item_col}.", -# ): -# decomposition.MatrixFactorization( -# num_factors=16, -# feedback_type="explicit", -# user_col="user_id", -# item_col=item_col, # type: ignore -# rating_col="rating_col", -# l2_reg=9.83, -# ) +def test_decomposition_mf_label_item_col_raises(): + item_col = "item_id" + with pytest.raises( + ValueError, + match=f"Expected item_col column to be `item_col`, but got {item_col}.", + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col=item_col, # type: ignore + rating_col="rating_col", + l2_reg=9.83, + ) def test_decomposition_mf_invalid_rating_col_raises(): From ff58ff501888f54af73ffdd4b266bc0a4b0ab037 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 20:10:39 +0000 Subject: [PATCH 43/75] passing tests --- bigframes/ml/decomposition.py | 2 +- tests/system/large/ml/test_decomposition.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 2128470b97..04e1efc83e 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -227,7 +227,7 @@ def __init__( l2_reg: float = 1.0, ): - if feedback_type not in ("explicit", "implicit"): + if feedback_type not in ("explicit", "implicit", "EXPLICIT", "IMPLICIT"): raise ValueError( 
f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}" ) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 0a25187935..c9c3d7433e 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -172,11 +172,15 @@ def test_decomposition_mf_configure_fit_load( num_factors=6, feedback_type="explicit", user_col="user_id", - item_col="item_id", + item_col="item_col", rating_col="ratings", l2_reg=9.83, ) - model.fit(ratings_df_default_index) + model.fit( + ratings_df_default_index.rename( + columns={"rating": "rating_col", "item_id": "item_col"} + ) + ) reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_mf_model", replace=True @@ -207,6 +211,6 @@ def test_decomposition_mf_configure_fit_load( assert reloaded_model.feedback_type == "EXPLICIT" assert reloaded_model.num_factors == 6 assert reloaded_model.user_col == "user_id" - assert reloaded_model.item_col == "item_id" + assert reloaded_model.item_col == "item_col" assert reloaded_model.rating_col == "rating" assert reloaded_model.l2_reg == 9.83 From f0a6ba21d4b2a6a61e49b00de967a802c2b7101e Mon Sep 17 00:00:00 2001 From: rey-esp Date: Wed, 12 Mar 2025 15:24:32 -0500 Subject: [PATCH 44/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 04e1efc83e..216557b859 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -227,7 +227,8 @@ def __init__( l2_reg: float = 1.0, ): - if feedback_type not in ("explicit", "implicit", "EXPLICIT", "IMPLICIT"): + feedback_type = feedback_type.lower() + if feedback_type not in ("explicit", "implicit"): raise ValueError( f"Expected feedback_type to be 
`explicit` or `implicit`, but got {feedback_type}" ) From b586c5ce875364206749203af482795e9a6237b7 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Wed, 12 Mar 2025 15:44:15 -0500 Subject: [PATCH 45/75] Update tests/system/large/ml/test_decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- tests/system/large/ml/test_decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index c9c3d7433e..a2aeaabd9e 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -208,7 +208,7 @@ def test_decomposition_mf_configure_fit_load( in reloaded_model._bqml_model.model_name ) assert result is not None - assert reloaded_model.feedback_type == "EXPLICIT" + assert reloaded_model.feedback_type == "explicit" assert reloaded_model.num_factors == 6 assert reloaded_model.user_col == "user_id" assert reloaded_model.item_col == "item_col" From 565138aed2c24fba554eb508154f54018ed20fad Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 23:15:18 +0000 Subject: [PATCH 46/75] doc attempt - _mf.py example --- .../bigframes_vendored/sklearn/decomposition/_mf.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 2d9ec4e1a1..2d33a177b2 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -25,11 +25,13 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) - >>> model = MatrixFactorization(n_components=2, init='random', 
random_state=0) - >>> W = model.fit_transform(X) - >>> H = model.components_ + >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) + >>> W = model.fit(X) + >>> result = model.global_explain() Args: + feedback_type ('explicit' | 'implicit'): + Specifies the feedback type for the model. The feedback type determines the algorithm that is used during training. num_factors (int or auto, default auto): Specifies the number of latent factors to use. user_col (str): From c0ef08f09c46dad3c1f82589dad55857e0d0801d Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 13 Mar 2025 14:31:48 +0000 Subject: [PATCH 47/75] feedback_type case ignore --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 216557b859..183e266e30 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -227,7 +227,7 @@ def __init__( l2_reg: float = 1.0, ): - feedback_type = feedback_type.lower() + feedback_type = feedback_type.lower() # type: ignore if feedback_type not in ("explicit", "implicit"): raise ValueError( f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}" From 664de04d9fd76a19b1d2cb9c40e13a8748b4a1b1 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Tue, 18 Mar 2025 15:33:28 -0500 Subject: [PATCH 48/75] Update _mf.py - remove global_explain() --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 2d33a177b2..6bcae7a206 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -27,7 +27,6 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], 
[4, 1], [5, 0.8], [6, 1]]) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) >>> W = model.fit(X) - >>> result = model.global_explain() Args: feedback_type ('explicit' | 'implicit'): From 63e8e9c3b0fbf4a4c62130355eb021aff34868a4 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 18 Mar 2025 22:56:41 +0000 Subject: [PATCH 49/75] fit --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 2d33a177b2..64b77f7730 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,8 +26,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) - >>> W = model.fit(X) - >>> result = model.global_explain() + >>> W = model.fit(model.fit(X.rename(columns={'rating': 'rating_col', 'item_id': 'item_col'}))) Args: feedback_type ('explicit' | 'implicit'): From c2e9a5fd3325d9ee0cb3a0704ec515934fe2d800 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 18 Mar 2025 23:31:38 +0000 Subject: [PATCH 50/75] W --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 6bcae7a206..5abb6bb549 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,7 +26,7 @@ class 
MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) - >>> W = model.fit(X) + >>> W = model.fit(model.fit(X.rename(columns={0:'user_id', 1: 'item_col', 2: 'rating_col'}))) Args: feedback_type ('explicit' | 'implicit'): From 193b9c8c8aef13b991e2b8a031ac0d013344baa3 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 24 Mar 2025 17:54:42 +0000 Subject: [PATCH 51/75] fix docs (maybe) --- bigframes/ml/decomposition.py | 12 +++--------- .../bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 183e266e30..17f91683af 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -229,9 +229,7 @@ def __init__( feedback_type = feedback_type.lower() # type: ignore if feedback_type not in ("explicit", "implicit"): - raise ValueError( - f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}" - ) + raise ValueError("Expected feedback_type to be `explicit` or `implicit`.") self.feedback_type = feedback_type @@ -251,9 +249,7 @@ def __init__( raise TypeError(f"Expected user_col to be STR, but got {type(user_col)}.") if user_col != "user_id": - raise ValueError( - f"Expected user_col column to be `user_id`, but got {user_col}." - ) + raise ValueError("Expected user_col column to be `user_id`.") self.user_col = user_col @@ -261,9 +257,7 @@ def __init__( raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") if item_col != "item_col": - raise ValueError( - f"Expected item_col column to be `item_col`, but got {item_col}." 
- ) + raise ValueError("Expected item_col column to be `item_col`.") self.item_col = item_col diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 5abb6bb549..d8a1f0eb04 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,7 +26,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) - >>> W = model.fit(model.fit(X.rename(columns={0:'user_id', 1: 'item_col', 2: 'rating_col'}))) + >>> W = model.fit(X.rename(columns={0:'user_id', 2: 'rating_col', 1: 'item_col'})) Args: feedback_type ('explicit' | 'implicit'): From 5a547f82df503f2ab8f7dc32990d6fdc4a182e39 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 24 Mar 2025 13:09:13 -0500 Subject: [PATCH 52/75] Update test_matrix_factorization.py with updated error messages --- tests/unit/ml/test_matrix_factorization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index ca256a419d..bd1b61778d 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -50,7 +50,7 @@ def test_decomposition_mf_invalid_feedback_type_raises(): feedback_type = "explimp" with pytest.raises( ValueError, - match=f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}", + match=f"Expected feedback_type to be `explicit` or `implicit`.", ): decomposition.MatrixFactorization( # Intentionally pass in the wrong type. 
This will fail if the user is using @@ -128,7 +128,7 @@ def test_decomposition_mf_label_user_col_raises(): user_col = "user_col" with pytest.raises( ValueError, - match=f"Expected user_col column to be `user_id`, but got {user_col}.", + match=f"Expected user_col column to be `user_id`.", ): decomposition.MatrixFactorization( num_factors=16, @@ -159,7 +159,7 @@ def test_decomposition_mf_label_item_col_raises(): item_col = "item_id" with pytest.raises( ValueError, - match=f"Expected item_col column to be `item_col`, but got {item_col}.", + match=f"Expected item_col column to be `item_col`.", ): decomposition.MatrixFactorization( num_factors=16, From 23d8fc896876829c52d90ffd44fbf04187a2ba4b Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 24 Mar 2025 18:16:09 +0000 Subject: [PATCH 53/75] lint --- tests/unit/ml/test_matrix_factorization.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index bd1b61778d..e4e44d0f99 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -50,7 +50,7 @@ def test_decomposition_mf_invalid_feedback_type_raises(): feedback_type = "explimp" with pytest.raises( ValueError, - match=f"Expected feedback_type to be `explicit` or `implicit`.", + match="Expected feedback_type to be `explicit` or `implicit`.", ): decomposition.MatrixFactorization( # Intentionally pass in the wrong type. 
This will fail if the user is using @@ -128,7 +128,7 @@ def test_decomposition_mf_label_user_col_raises(): user_col = "user_col" with pytest.raises( ValueError, - match=f"Expected user_col column to be `user_id`.", + match="Expected user_col column to be `user_id`.", ): decomposition.MatrixFactorization( num_factors=16, @@ -143,7 +143,7 @@ def test_decomposition_mf_label_user_col_raises(): def test_decomposition_mf_invalid_item_col_raises(): item_col = 123 with pytest.raises( - TypeError, match=f"Expected item_col to be STR, but got {type(item_col)}." + TypeError, match="Expected item_col to be STR, but got {type(item_col)}." ): decomposition.MatrixFactorization( num_factors=16, @@ -159,7 +159,7 @@ def test_decomposition_mf_label_item_col_raises(): item_col = "item_id" with pytest.raises( ValueError, - match=f"Expected item_col column to be `item_col`.", + match="Expected item_col column to be `item_col`.", ): decomposition.MatrixFactorization( num_factors=16, From ed99ad7c67e5b42a7ea6c2ddf1b95c465fbdb170 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 24 Mar 2025 13:27:56 -0500 Subject: [PATCH 54/75] Update test_matrix_factorization.py - add 'f' --- tests/unit/ml/test_matrix_factorization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index e4e44d0f99..047bd367b8 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -143,7 +143,7 @@ def test_decomposition_mf_label_user_col_raises(): def test_decomposition_mf_invalid_item_col_raises(): item_col = 123 with pytest.raises( - TypeError, match="Expected item_col to be STR, but got {type(item_col)}." + TypeError, match=f"Expected item_col to be STR, but got {type(item_col)}." 
): decomposition.MatrixFactorization( num_factors=16, From e305950d917d92100bd5db5fb07b015e0a2605c9 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 24 Mar 2025 20:35:25 +0000 Subject: [PATCH 55/75] improve errors and update tests --- bigframes/ml/decomposition.py | 16 +++++++++------- tests/unit/ml/test_matrix_factorization.py | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 17f91683af..6768c2db25 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -233,9 +233,9 @@ def __init__( self.feedback_type = feedback_type - if type(num_factors) is not int: + if not isinstance(num_factors, int): raise TypeError( - f"Expected num_factors to be INT64, but got {type(num_factors)}." + f"Expected num_factors to be INT, but got {type(num_factors)}." ) if num_factors < 0: @@ -245,7 +245,7 @@ def __init__( self.num_factors = num_factors - if type(user_col) is not str: + if not isinstance(user_col, str): raise TypeError(f"Expected user_col to be STR, but got {type(user_col)}.") if user_col != "user_id": @@ -253,7 +253,7 @@ def __init__( self.user_col = user_col - if type(item_col) is not str: + if not isinstance(item_col, str): raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") if item_col != "item_col": @@ -261,15 +261,17 @@ def __init__( self.item_col = item_col - if type(rating_col) is not str: + if not isinstance(rating_col, str): raise TypeError( f"Expected rating_col to be STR, but got {type(rating_col)}." ) self.rating_col = rating_col - if type(l2_reg) is not float: - raise TypeError(f"Expected l2_reg to be FLOAT, but got {type(l2_reg)}.") + if not isinstance(l2_reg, (float, int)): + raise TypeError( + f"Expected l2_reg to be FLOAT or INT, but got {type(l2_reg)}." 
+ ) self.l2_reg = l2_reg self._bqml_model: Optional[core.BqmlModel] = None diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 047bd367b8..826681eace 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -25,7 +25,7 @@ def test_decomposition_mf_model(): user_col="user_id", item_col="item_col", rating_col="rating_col", - l2_reg=9.83, + l2_reg=9, ) assert model.num_factors == 16 assert model.feedback_type == "implicit" @@ -97,7 +97,7 @@ def test_decomposition_mf_invalid_num_factors_raises(): num_factors = 0.5 with pytest.raises( TypeError, - match=f"Expected num_factors to be INT64, but got {type(num_factors)}.", + match=f"Expected num_factors to be INT, but got {type(num_factors)}.", ): decomposition.MatrixFactorization( num_factors=num_factors, # type: ignore @@ -186,10 +186,22 @@ def test_decomposition_mf_invalid_rating_col_raises(): ) +def test_decomposition_mf_l2_reg(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=6.02, # type: ignore + ) + assert model.l2_reg == 6.02 + + def test_decomposition_mf_invalid_l2_reg_raises(): l2_reg = "6.02" with pytest.raises( - TypeError, match=f"Expected l2_reg to be FLOAT, but got {type(l2_reg)}." + TypeError, match=f"Expected l2_reg to be FLOAT or INT, but got {type(l2_reg)}." 
): decomposition.MatrixFactorization( num_factors=16, From 32917e5c5cc4bf4c54bbb8c9a5b3edb12611f2cf Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 08:37:22 -0500 Subject: [PATCH 56/75] Update tests/system/large/ml/test_decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- tests/system/large/ml/test_decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index a2aeaabd9e..add05c9abe 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -212,5 +212,5 @@ def test_decomposition_mf_configure_fit_load( assert reloaded_model.num_factors == 6 assert reloaded_model.user_col == "user_id" assert reloaded_model.item_col == "item_col" - assert reloaded_model.rating_col == "rating" + assert reloaded_model.rating_col == "ratings" assert reloaded_model.l2_reg == 9.83 From e485d3b9a2dccce5b2f717a2f8ae2ec70d668242 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 08:39:09 -0500 Subject: [PATCH 57/75] Update bigframes/ml/decomposition.py - num_factors error messsage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 6768c2db25..5911e9285e 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -235,7 +235,7 @@ def __init__( if not isinstance(num_factors, int): raise TypeError( - f"Expected num_factors to be INT, but got {type(num_factors)}." + f"Expected num_factors to be an int, but got {type(num_factors)}." 
) if num_factors < 0: From 6a27083edfbf5edd51673e94360ac0de9ea4ff92 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 08:39:36 -0500 Subject: [PATCH 58/75] Update bigframes/ml/decomposition.py - user_col error message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 5911e9285e..864df46924 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -246,7 +246,7 @@ def __init__( self.num_factors = num_factors if not isinstance(user_col, str): - raise TypeError(f"Expected user_col to be STR, but got {type(user_col)}.") + raise TypeError(f"Expected user_col to be a str, but got {type(user_col)}.") if user_col != "user_id": raise ValueError("Expected user_col column to be `user_id`.") From 6e2d902d94bdc3ac2e1c20f9e25357b9eac86107 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 09:59:15 -0500 Subject: [PATCH 59/75] Update bigframes/ml/decomposition.py - rating_col error message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 864df46924..7ea0ccfd74 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -263,7 +263,7 @@ def __init__( if not isinstance(rating_col, str): raise TypeError( - f"Expected rating_col to be STR, but got {type(rating_col)}." + f"Expected rating_col to be a str, but got {type(rating_col)}." 
) self.rating_col = rating_col From b65c63789b2bfc2f16ca8467e4742ef140ad86cc Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 09:59:44 -0500 Subject: [PATCH 60/75] Update bigframes/ml/decomposition.py - l2_reg error msg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 7ea0ccfd74..59d2b0277f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -270,7 +270,7 @@ def __init__( if not isinstance(l2_reg, (float, int)): raise TypeError( - f"Expected l2_reg to be FLOAT or INT, but got {type(l2_reg)}." + f"Expected l2_reg to be a float or int, but got {type(l2_reg)}." ) self.l2_reg = l2_reg From 74ebe27a9dca3f140059de2d8917c7150e8cc7f8 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 27 Mar 2025 15:09:18 +0000 Subject: [PATCH 61/75] fix tests to match updated error messages --- tests/unit/ml/test_matrix_factorization.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 826681eace..6e4cf04b79 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -97,7 +97,7 @@ def test_decomposition_mf_invalid_num_factors_raises(): num_factors = 0.5 with pytest.raises( TypeError, - match=f"Expected num_factors to be INT, but got {type(num_factors)}.", + match=f"Expected num_factors to be an int, but got {type(num_factors)}.", ): decomposition.MatrixFactorization( num_factors=num_factors, # type: ignore @@ -112,7 +112,7 @@ def test_decomposition_mf_invalid_num_factors_raises(): def test_decomposition_mf_invalid_user_col_raises(): user_col = 123 with pytest.raises( - TypeError, match=f"Expected user_col to be STR, but got {type(user_col)}." 
+ TypeError, match=f"Expected user_col to be a str, but got {type(user_col)}." ): decomposition.MatrixFactorization( num_factors=16, @@ -174,7 +174,7 @@ def test_decomposition_mf_label_item_col_raises(): def test_decomposition_mf_invalid_rating_col_raises(): rating_col = 4 with pytest.raises( - TypeError, match=f"Expected rating_col to be STR, but got {type(rating_col)}." + TypeError, match=f"Expected rating_col to be a str, but got {type(rating_col)}." ): decomposition.MatrixFactorization( num_factors=16, @@ -201,7 +201,8 @@ def test_decomposition_mf_l2_reg(): def test_decomposition_mf_invalid_l2_reg_raises(): l2_reg = "6.02" with pytest.raises( - TypeError, match=f"Expected l2_reg to be FLOAT or INT, but got {type(l2_reg)}." + TypeError, + match=f"Expected l2_reg to be a float or int, but got {type(l2_reg)}.", ): decomposition.MatrixFactorization( num_factors=16, From 3f40763951cd7b67bda8b5c146545765cad653e8 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 10:52:17 -0500 Subject: [PATCH 62/75] Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py - docs df MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index d8a1f0eb04..865db974e2 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -24,7 +24,11 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization - >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> X = bpd.DataFrame({ + ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], + ... 
"column": [0, 1] * 6, + ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], + ... }) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) >>> W = model.fit(X.rename(columns={0:'user_id', 2: 'rating_col', 1: 'item_col'})) From 2cbc2e3f5839557969ef2ee0bf5e7317a92fe383 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 10:53:06 -0500 Subject: [PATCH 63/75] Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py - docs model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 865db974e2..1d84ebc374 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -29,7 +29,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): ... "column": [0, 1] * 6, ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], ... 
}) - >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) + >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) >>> W = model.fit(X.rename(columns={0:'user_id', 2: 'rating_col', 1: 'item_col'})) Args: From 0a5aefb5c9b33fb98bd414fe1d14cee9b031ede8 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 10:53:33 -0500 Subject: [PATCH 64/75] Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py - docs fit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 1d84ebc374..c506a10671 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -30,7 +30,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], ... 
}) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) - >>> W = model.fit(X.rename(columns={0:'user_id', 2: 'rating_col', 1: 'item_col'})) + >>> W = model.fit(X) Args: feedback_type ('explicit' | 'implicit'): From 366e0ab755a9596aa934a03f586d46ee030e06b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 28 Mar 2025 11:25:57 -0500 Subject: [PATCH 65/75] Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index c506a10671..41a2693a14 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,7 +26,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], - ... "column": [0, 1] * 6, + ... "column": [0, 1] * 7, ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], ... 
}) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) From 56ee62399822a629484254c393041f01238bcd88 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 31 Mar 2025 15:01:00 +0000 Subject: [PATCH 66/75] remove errors and tests --- bigframes/ml/decomposition.py | 6 ---- tests/unit/ml/test_matrix_factorization.py | 32 ---------------------- 2 files changed, 38 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 59d2b0277f..e99853dfa3 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -248,17 +248,11 @@ def __init__( if not isinstance(user_col, str): raise TypeError(f"Expected user_col to be a str, but got {type(user_col)}.") - if user_col != "user_id": - raise ValueError("Expected user_col column to be `user_id`.") - self.user_col = user_col if not isinstance(item_col, str): raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") - if item_col != "item_col": - raise ValueError("Expected item_col column to be `item_col`.") - self.item_col = item_col if not isinstance(rating_col, str): diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 6e4cf04b79..92691ba9d4 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -124,22 +124,6 @@ def test_decomposition_mf_invalid_user_col_raises(): ) -def test_decomposition_mf_label_user_col_raises(): - user_col = "user_col" - with pytest.raises( - ValueError, - match="Expected user_col column to be `user_id`.", - ): - decomposition.MatrixFactorization( - num_factors=16, - feedback_type="explicit", - user_col=user_col, # type: ignore - item_col="item_col", - rating_col="rating_col", - l2_reg=9.83, - ) - - def test_decomposition_mf_invalid_item_col_raises(): item_col = 123 with pytest.raises( @@ -155,22 +139,6 @@ def 
test_decomposition_mf_invalid_item_col_raises(): ) -def test_decomposition_mf_label_item_col_raises(): - item_col = "item_id" - with pytest.raises( - ValueError, - match="Expected item_col column to be `item_col`.", - ): - decomposition.MatrixFactorization( - num_factors=16, - feedback_type="explicit", - user_col="user_id", - item_col=item_col, # type: ignore - rating_col="rating_col", - l2_reg=9.83, - ) - - def test_decomposition_mf_invalid_rating_col_raises(): rating_col = 4 with pytest.raises( From c9424183486d39c52ff462bb794cc9b40a23313b Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 31 Mar 2025 10:14:35 -0500 Subject: [PATCH 67/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index e99853dfa3..1488a8270e 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -260,7 +260,7 @@ def __init__( f"Expected rating_col to be a str, but got {type(rating_col)}." 
) - self.rating_col = rating_col + self._input_label_columns = [rating_col] if not isinstance(l2_reg, (float, int)): raise TypeError( From e0ef53e18627bafb70427d0d7af68f7ff1db6245 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 31 Mar 2025 10:14:56 -0500 Subject: [PATCH 68/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 1488a8270e..4f0ff94617 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -34,7 +34,7 @@ "num_factors": "numFactors", "user_col": "userColumn", "item_col": "itemColumn", - # TODO: Add rating_col + "_input_label_columns": "inputLabelColumns", "l2_reg": "l2Regularization", } From 5018182095b482b9e55c5af3d7bfb77af04f40dd Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 31 Mar 2025 10:15:09 -0500 Subject: [PATCH 69/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 4f0ff94617..54c2fce9ff 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -271,6 +271,11 @@ def __init__( self._bqml_model: Optional[core.BqmlModel] = None self._bqml_model_factory = globals.bqml_model_factory() + @property + def rating_col(self) -> str: + """str: The rating column name. 
Defaults to 'rating'."""" + return self._input_label_columns[0] + @classmethod def _from_bq( cls, session: bigframes.session.Session, bq_model: bigquery.Model From f9397f19b91fbf153dd6f93440686bbeb80c6dd9 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 31 Mar 2025 20:21:23 +0000 Subject: [PATCH 70/75] passing system test --- bigframes/ml/decomposition.py | 2 +- tests/data/ratings.jsonl | 40 ++++++++++----------- tests/data/ratings_schema.json | 2 +- tests/system/large/ml/test_decomposition.py | 21 +++++------ tests/unit/ml/test_golden_sql.py | 2 +- 5 files changed, 31 insertions(+), 36 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 54c2fce9ff..ece950a5a2 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -273,7 +273,7 @@ def __init__( @property def rating_col(self) -> str: - """str: The rating column name. Defaults to 'rating'."""" + """str: The rating column name. Defaults to 'rating'.""" return self._input_label_columns[0] @classmethod diff --git a/tests/data/ratings.jsonl b/tests/data/ratings.jsonl index 919b61c350..b7cd350d08 100644 --- a/tests/data/ratings.jsonl +++ b/tests/data/ratings.jsonl @@ -1,20 +1,20 @@ -{"user_id": 1, "item_id": 2, "ratings": 4.0} -{"user_id": 1, "item_id": 5, "ratings": 3.0} -{"user_id": 2, "item_id": 1, "ratings": 5.0} -{"user_id": 2, "item_id": 3, "ratings": 2.0} -{"user_id": 3, "item_id": 4, "ratings": 4.5} -{"user_id": 3, "item_id": 7, "ratings": 3.5} -{"user_id": 4, "item_id": 2, "ratings": 1.0} -{"user_id": 4, "item_id": 8, "ratings": 5.0} -{"user_id": 5, "item_id": 3, "ratings": 4.0} -{"user_id": 5, "item_id": 9, "ratings": 2.5} -{"user_id": 6, "item_id": 1, "ratings": 3.0} -{"user_id": 6, "item_id": 6, "ratings": 4.5} -{"user_id": 7, "item_id": 5, "ratings": 5.0} -{"user_id": 7, "item_id": 10, "ratings": 1.5} -{"user_id": 8, "item_id": 4, "ratings": 2.0} -{"user_id": 8, "item_id": 7, "ratings": 4.0} -{"user_id": 9, "item_id": 2, "ratings": 3.5} 
-{"user_id": 9, "item_id": 9, "ratings": 5.0} -{"user_id": 10, "item_id": 3, "ratings": 4.5} -{"user_id": 10, "item_id": 8, "ratings": 2.5} +{"user_id": 1, "item_id": 2, "rating": 4.0} +{"user_id": 1, "item_id": 5, "rating": 3.0} +{"user_id": 2, "item_id": 1, "rating": 5.0} +{"user_id": 2, "item_id": 3, "rating": 2.0} +{"user_id": 3, "item_id": 4, "rating": 4.5} +{"user_id": 3, "item_id": 7, "rating": 3.5} +{"user_id": 4, "item_id": 2, "rating": 1.0} +{"user_id": 4, "item_id": 8, "rating": 5.0} +{"user_id": 5, "item_id": 3, "rating": 4.0} +{"user_id": 5, "item_id": 9, "rating": 2.5} +{"user_id": 6, "item_id": 1, "rating": 3.0} +{"user_id": 6, "item_id": 6, "rating": 4.5} +{"user_id": 7, "item_id": 5, "rating": 5.0} +{"user_id": 7, "item_id": 10, "rating": 1.5} +{"user_id": 8, "item_id": 4, "rating": 2.0} +{"user_id": 8, "item_id": 7, "rating": 4.0} +{"user_id": 9, "item_id": 2, "rating": 3.5} +{"user_id": 9, "item_id": 9, "rating": 5.0} +{"user_id": 10, "item_id": 3, "rating": 4.5} +{"user_id": 10, "item_id": 8, "rating": 2.5} diff --git a/tests/data/ratings_schema.json b/tests/data/ratings_schema.json index 1867a8c801..9fd0101ec8 100644 --- a/tests/data/ratings_schema.json +++ b/tests/data/ratings_schema.json @@ -11,7 +11,7 @@ }, { "mode": "NULLABLE", - "name": "ratings", + "name": "rating", "type": "FLOAT" } ] diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index add05c9abe..d1a5f9f2aa 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -172,15 +172,12 @@ def test_decomposition_mf_configure_fit_load( num_factors=6, feedback_type="explicit", user_col="user_id", - item_col="item_col", - rating_col="ratings", + item_col="item_id", + rating_col="rating", l2_reg=9.83, ) - model.fit( - ratings_df_default_index.rename( - columns={"rating": "rating_col", "item_id": "item_col"} - ) - ) + + model.fit(ratings_df_default_index) reloaded_model = model.to_gbq( 
f"{dataset_id}.temp_configured_mf_model", replace=True @@ -191,16 +188,14 @@ def test_decomposition_mf_configure_fit_load( { "user_id": ["11", "12", "13"], "item_id": [1, 2, 3], - "ratings": [1.0, 2.0, 3.0], + "rating": [1.0, 2.0, 3.0], } ) ) reloaded_model.score(new_ratings) - result = reloaded_model.predict( - new_ratings.rename(columns={"item_id": "item_col"}) - ).to_pandas() + result = reloaded_model.predict(new_ratings).to_pandas() assert reloaded_model._bqml_model is not None assert ( @@ -211,6 +206,6 @@ def test_decomposition_mf_configure_fit_load( assert reloaded_model.feedback_type == "explicit" assert reloaded_model.num_factors == 6 assert reloaded_model.user_col == "user_id" - assert reloaded_model.item_col == "item_col" - assert reloaded_model.rating_col == "ratings" + assert reloaded_model.item_col == "item_id" + assert reloaded_model.rating_col == "rating" assert reloaded_model.l2_reg == 9.83 diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 03695a20e4..083dc25661 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -243,7 +243,7 @@ def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.RECOMMEND(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", - index_col=["index_column_id"], + trial_id=["index_column_id"], ) From b43912073353fbc4f6328584b758fa8f7d56325d Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 31 Mar 2025 20:44:56 +0000 Subject: [PATCH 71/75] E AssertionError: expected call not found. 
E Expected: read_gbq('SELECT * FROM ML.RECOMMEND(MODEL ..,\n (input_X_sql))', trial_id=['index_column_id']) E Actual: read_gbq('SELECT * FROM ML.RECOMMEND(MODEL ..,\n (input_X_sql))', index_col=['index_column_id']) --- tests/unit/ml/test_golden_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 083dc25661..03695a20e4 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -243,7 +243,7 @@ def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.RECOMMEND(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", - trial_id=["index_column_id"], + index_col=["index_column_id"], ) From 8a614c5f42696e2e423378441a33fe8f217369d4 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 1 Apr 2025 15:58:45 +0000 Subject: [PATCH 72/75] same # of elements in each --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 41a2693a14..79b90bd13e 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,8 +26,8 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], - ... "column": [0, 1] * 7, - ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], + ... "column": [0, 1, 1, 1, 0, 3, 1, 0, 2, 1, 1, 0, 2, 1], + ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1, 2, 1], ... 
}) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) >>> W = model.fit(X) From c2b47950850a4bfc8da04754652a8bce24a62343 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 1 Apr 2025 18:26:54 +0000 Subject: [PATCH 73/75] attempt --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 79b90bd13e..9d6283e756 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,8 +26,8 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], - ... "column": [0, 1, 1, 1, 0, 3, 1, 0, 2, 1, 1, 0, 2, 1], - ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1, 2, 1], + ... "column": [[0, 1], [1, 1], [0, 3], [1, 0], [2, 1], [1, 0], [2, 1]], + ... "value": [[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1], [2, 1]], ... 
}) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) >>> W = model.fit(X) From cf6e5be73b64392f1b628b4799ac644946d0445c Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 1 Apr 2025 18:40:17 +0000 Subject: [PATCH 74/75] doc fix --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 9d6283e756..302cf018ee 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,8 +26,8 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], - ... "column": [[0, 1], [1, 1], [0, 3], [1, 0], [2, 1], [1, 0], [2, 1]], - ... "value": [[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1], [2, 1]], + ... "column": [0,1] * 7, + ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], ... 
}) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) >>> W = model.fit(X) From da230b497268c43539109cae7d5e03d95307c870 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 1 Apr 2025 18:45:28 +0000 Subject: [PATCH 75/75] doc fix --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 302cf018ee..fb29cc8984 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -24,10 +24,11 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], ... "column": [0,1] * 7, - ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], + ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1, 2, 3], ... }) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) >>> W = model.fit(X)