From 6783a0a6010211fd61968223ed41ece9e5ec3835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 4 Sep 2024 09:26:32 -0500 Subject: [PATCH 01/75] docs: update title of pypi notebook example to reflect use of the PyPI public dataset In response to feedback on internal change 662899733. --- notebooks/dataframes/pypi.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/dataframes/pypi.ipynb b/notebooks/dataframes/pypi.ipynb index 3777e98d42..7b16412ff5 100644 --- a/notebooks/dataframes/pypi.ipynb +++ b/notebooks/dataframes/pypi.ipynb @@ -25,7 +25,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Analyzing Python dependencies with BigQuery DataFrames\n", + "# Analyzing package downloads from PyPI with BigQuery DataFrames\n", "\n", "In this notebook, you'll use the [PyPI public dataset](https://console.cloud.google.com/marketplace/product/gcp-public-data-pypi/pypi) and the [deps.dev public dataset](https://deps.dev/) to visualize Python package downloads for a package and its dependencies.\n", "\n", From 1d3956025146ae442f3e8f8b22d1e1660de068d3 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 27 Jan 2025 20:16:17 +0000 Subject: [PATCH 02/75] feat: add support for creating a Matrix Factorization model --- bigframes/ml/decomposition.py | 24 +++++++++++ bigframes/ml/loader.py | 1 + .../sklearn/decomposition/_mf.py | 40 +++++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100644 third_party/bigframes_vendored/sklearn/decomposition/_mf.py diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index c98e18322a..d34a0bfc13 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -19,6 +19,7 @@ from typing import List, Literal, Optional, Union +import bigframes_vendored.sklearn.decomposition._ml import bigframes_vendored.sklearn.decomposition._pca from google.cloud import bigquery @@ -197,3 +198,26 @@ def score( # TODO(b/291973741): X param is 
ignored. Update BQML supports input in ML.EVALUATE. return self._bqml_model.evaluate() + + +@log_adapter.class_logger +class MF( + base.UnsupervisedTrainablePredictor, + bigframes_vendored.sklearn.decomposition._mf.MF, +): + __doc__ = bigframes_vendored.sklearn.decomposition._mf.MF.__doc__ + + def __init__( + self, + n_components: Optional[Union[int, float]] = None, + *, + user_col: str, + item_col: str, + l2_reg: float, + ): + self.n_components = n_components + self.user_col = user_col + self.item_col = item_col + self.l2_reg = l2_reg + self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 5d52927ded..53961879e6 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -42,6 +42,7 @@ "LINEAR_REGRESSION": linear_model.LinearRegression, "LOGISTIC_REGRESSION": linear_model.LogisticRegression, "KMEANS": cluster.KMeans, + "MF": decomposition.MF, "PCA": decomposition.PCA, "BOOSTED_TREE_REGRESSOR": ensemble.XGBRegressor, "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py new file mode 100644 index 0000000000..d453645b1a --- /dev/null +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -0,0 +1,40 @@ +""" Matrix Factorization. +""" + +# Author: Alexandre Gramfort +# Olivier Grisel +# Mathieu Blondel +# Denis A. Engemann +# Michael Eickenberg +# Giorgio Patrini +# +# License: BSD 3 clause + +from abc import ABCMeta + +from bigframes_vendored.sklearn.base import BaseEstimator + +# from bigframes import constants + + +class PCA(BaseEstimator, metaclass=ABCMeta): + """Matrix Factorization (MF). 
+ + **Examples:** + + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import NMF + >>> model = NMF(n_components=2, init='random', random_state=0) + >>> W = model.fit_transform(X) + >>> H = model.components_ + + Args: + n_components (int, float or None, default None): + Number of components to keep. If n_components is not set, all + components are kept, n_components = min(n_samples, n_features). + If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. + svd_solver ("full", "randomized" or "auto", default "auto"): + The solver to use to calculate the principal components. Details: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-pca#pca_solver. + + """ From 1bef4a2f0f45507c954eaacdcbb14866a5ba4477 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 27 Jan 2025 22:30:24 +0000 Subject: [PATCH 03/75] feat: add support for creating a Matrix Factorization model --- bigframes/ml/decomposition.py | 2 + bigframes/ml/loader.py | 1 + .../sklearn/decomposition/_mf.py | 74 +++++-------------- 3 files changed, 21 insertions(+), 56 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 9e18276dd1..486979b832 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -211,11 +211,13 @@ def __init__( self, n_components: Optional[Union[int, float]] = None, *, + num_factors: int, user_col: str, item_col: str, l2_reg: float, ): self.n_components = n_components + self.num_factors = num_factors self.user_col = user_col self.item_col = item_col self.l2_reg = l2_reg diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 53961879e6..c8ed914468 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -83,6 +83,7 @@ def from_bq( session: bigframes.session.Session, bq_model: 
bigquery.Model ) -> Union[ + decomposition.MF, decomposition.PCA, cluster.KMeans, linear_model.LinearRegression, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index a4c175ab9a..bae62b9e85 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -17,15 +17,15 @@ from bigframes import constants -class PCA(BaseEstimator, metaclass=ABCMeta): +class MF(BaseEstimator, metaclass=ABCMeta): """Matrix Factorization (MF). **Examples:** - >>> import numpy as np - >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) - >>> from sklearn.decomposition import NMF - >>> model = NMF(n_components=2, init='random', random_state=0) + >>> import bigframes.pandas as bpd + >>> from bigframes.ml.decomposition import MF + >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> model = MF(n_components=2, init='random', random_state=0) >>> W = model.fit_transform(X) >>> H = model.components_ @@ -34,9 +34,18 @@ class PCA(BaseEstimator, metaclass=ABCMeta): Number of components to keep. If n_components is not set, all components are kept, n_components = min(n_samples, n_features). If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. - svd_solver ("full", "randomized" or "auto", default "auto"): - The solver to use to calculate the principal components. Details: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-pca#pca_solver. - + num_factors (int or auto, default auto): + Specifies the number of latent factors to use. + If you aren't running hyperparameter tuning, then you can specify an INT64 value between 2 and 200. The default value is log2(n), where n is the number of training examples. 
+ user_col (str): + The user column name. + item_col (str): + The item column name. + l2_reg (float, default 1.0): + If you aren't running hyperparameter tuning, then you can specify a FLOAT64 value. The default value is 1.0. + If you are running hyperparameter tuning, then you can use one of the following options: + The HPARAM_RANGE keyword and two FLOAT64 values that define the range to use for the hyperparameter. For example, L2_REG = HPARAM_RANGE(1.5, 5.0). + The HPARAM_CANDIDATES keyword and an array of FLOAT64 values that provide discrete values to use for the hyperparameter. For example, L2_REG = HPARAM_CANDIDATES([0, 1.0, 3.0, 5.0]). """ def fit(self, X, y=None): @@ -62,7 +71,7 @@ def score(self, X=None, y=None): .. note:: Output matches that of the BigQuery ML.EVALUATE function. - See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#pca_models + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#matrix_factorization_models for the outputs relevant to this model type. Args: @@ -86,50 +95,3 @@ def predict(self, X): Returns: bigframes.dataframe.DataFrame: Predicted DataFrames.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - - @property - def components_(self): - """Principal axes in feature space, representing the directions of maximum variance in the data. - - Returns: - bigframes.dataframe.DataFrame: DataFrame of principal components, containing following columns: - principal_component_id: An integer that identifies the principal component. - - feature: The column name that contains the feature. - - numerical_value: If feature is numeric, the value of feature for the principal component that principal_component_id identifies. If feature isn't numeric, the value is NULL. - - categorical_value: A list of mappings containing information about categorical features. 
Each mapping contains the following fields: - categorical_value.category: The name of each category. - - categorical_value.value: The value of categorical_value.category for the centroid that centroid_id identifies. - - The output contains one row per feature per component. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - - @property - def explained_variance_(self): - """The amount of variance explained by each of the selected components. - - Returns: - bigframes.dataframe.DataFrame: DataFrame containing following columns: - principal_component_id: An integer that identifies the principal component. - - explained_variance: The factor by which the eigenvector is scaled. Eigenvalue and explained variance are the same concepts in PCA. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - - @property - def explained_variance_ratio_(self): - """Percentage of variance explained by each of the selected components. - - Returns: - bigframes.dataframe.DataFrame: DataFrame containing following columns: - principal_component_id: An integer that identifies the principal component. - - explained_variance_ratio: the total variance is the sum of variances, also known as eigenvalues, of all - of the individual principal components. The explained variance ratio by a principal component is - the ratio between the variance, also known as eigenvalue, of that principal component and the total variance. 
- """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From e336bde7894bdddc8f8e2ac6fadec730d7ef213d Mon Sep 17 00:00:00 2001 From: rey-esp Date: Tue, 28 Jan 2025 11:10:06 -0600 Subject: [PATCH 04/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 486979b832..9acabc26bc 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -201,7 +201,7 @@ def score( @log_adapter.class_logger -class MF( +class MatrixFactorization( base.UnsupervisedTrainablePredictor, bigframes_vendored.sklearn.decomposition._mf.MF, ): From d5f713a4bd3616f8b8feefd9fff0afe22253497c Mon Sep 17 00:00:00 2001 From: rey-esp Date: Tue, 28 Jan 2025 11:11:17 -0600 Subject: [PATCH 05/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 9acabc26bc..5169fadfe5 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -209,7 +209,6 @@ class MatrixFactorization( def __init__( self, - n_components: Optional[Union[int, float]] = None, *, num_factors: int, user_col: str, From 5e3e4434176906c010936221b7e59551f2cb2d4f Mon Sep 17 00:00:00 2001 From: rey-esp Date: Tue, 28 Jan 2025 11:12:01 -0600 Subject: [PATCH 06/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py 
b/bigframes/ml/decomposition.py index 5169fadfe5..e995aee62f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -213,7 +213,8 @@ def __init__( num_factors: int, user_col: str, item_col: str, - l2_reg: float, + # TODO: Add support for hyperparameter tuning. + l2_reg: float = 1.0, ): self.n_components = n_components self.num_factors = num_factors From c116e8ad21e8b5e64623a03fbe9964244b89857a Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 28 Jan 2025 19:13:11 +0000 Subject: [PATCH 07/75] rating_col --- bigframes/ml/decomposition.py | 29 ++++++------------- bigframes/ml/loader.py | 4 +-- .../sklearn/decomposition/_mf.py | 10 ++----- 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index e995aee62f..fe2094630e 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -203,9 +203,9 @@ def score( @log_adapter.class_logger class MatrixFactorization( base.UnsupervisedTrainablePredictor, - bigframes_vendored.sklearn.decomposition._mf.MF, + bigframes_vendored.sklearn.decomposition._mf.MatrixFactorization, ): - __doc__ = bigframes_vendored.sklearn.decomposition._mf.MF.__doc__ + __doc__ = bigframes_vendored.sklearn.decomposition._mf.MatrixFactorization.__doc__ def __init__( self, @@ -213,13 +213,14 @@ def __init__( num_factors: int, user_col: str, item_col: str, + rating_col: Optional[str] = "rating", # TODO: Add support for hyperparameter tuning. 
l2_reg: float = 1.0, ): - self.n_components = n_components self.num_factors = num_factors self.user_col = user_col self.item_col = item_col + self.rating_col = rating_col self.l2_reg = l2_reg self._bqml_model: Optional[core.BqmlModel] = None self._bqml_model_factory = globals.bqml_model_factory() @@ -227,8 +228,8 @@ def __init__( @classmethod def _from_bq( cls, session: bigframes.session.Session, bq_model: bigquery.Model - ) -> MF: - assert bq_model.model_type == "MF" + ) -> MatrixFactorization: + assert bq_model.model_type == "MatrixFactorization" kwargs = utils.retrieve_params_from_bq_model( cls, bq_model, _BQML_PARAMS_MAPPING @@ -248,15 +249,9 @@ def _from_bq( def _bqml_options(self) -> dict: """The model options as they will be set for BQML""" options: dict = { - "model_type": "ML", + "model_type": "MatrixFactorization", } - assert self.n_components is not None - if 0 < self.n_components < 1: - options["pca_explained_variance_ratio"] = float(self.n_components) - elif self.n_components >= 1: - options["num_principal_components"] = int(self.n_components) - return options def _fit( @@ -264,17 +259,11 @@ def _fit( X: utils.ArrayType, y=None, transforms: Optional[List[str]] = None, - ) -> PCA: + ) -> MatrixFactorization: (X,) = utils.batch_convert_to_dataframe(X) # To mimic sklearn's behavior - if self.n_components is None: - self.n_components = min(X.shape) - self._bqml_model = self._bqml_model_factory.create_model( - X_train=X, - transforms=transforms, - options=self._bqml_options, - ) + return self @property diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index c8ed914468..7aa6d6708f 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -42,7 +42,7 @@ "LINEAR_REGRESSION": linear_model.LinearRegression, "LOGISTIC_REGRESSION": linear_model.LogisticRegression, "KMEANS": cluster.KMeans, - "MF": decomposition.MF, + "MatrixFactorization": decomposition.MatrixFactorization, "PCA": decomposition.PCA, "BOOSTED_TREE_REGRESSOR": 
ensemble.XGBRegressor, "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier, @@ -83,7 +83,7 @@ def from_bq( session: bigframes.session.Session, bq_model: bigquery.Model ) -> Union[ - decomposition.MF, + decomposition.MatrixFactorization, decomposition.PCA, cluster.KMeans, linear_model.LinearRegression, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index bae62b9e85..da5eacb18b 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -17,23 +17,19 @@ from bigframes import constants -class MF(BaseEstimator, metaclass=ABCMeta): +class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): """Matrix Factorization (MF). **Examples:** >>> import bigframes.pandas as bpd - >>> from bigframes.ml.decomposition import MF + >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) - >>> model = MF(n_components=2, init='random', random_state=0) + >>> model = MatrixFactorization(n_components=2, init='random', random_state=0) >>> W = model.fit_transform(X) >>> H = model.components_ Args: - n_components (int, float or None, default None): - Number of components to keep. If n_components is not set, all - components are kept, n_components = min(n_samples, n_features). - If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. num_factors (int or auto, default auto): Specifies the number of latent factors to use. If you aren't running hyperparameter tuning, then you can specify an INT64 value between 2 and 200. The default value is log2(n), where n is the number of training examples. 
From dedef3980dc516bd28a5a9d55f46fc15c71e2743 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 28 Jan 2025 21:14:14 +0000 Subject: [PATCH 08/75] (nearly) complete class --- bigframes/ml/decomposition.py | 97 +++++-------------- bigframes/ml/loader.py | 2 +- .../sklearn/decomposition/_mf.py | 2 +- 3 files changed, 25 insertions(+), 76 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index fe2094630e..1d5e8a9e07 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -19,7 +19,7 @@ from typing import List, Literal, Optional, Union -import bigframes_vendored.sklearn.decomposition._ml +import bigframes_vendored.sklearn.decomposition._mf import bigframes_vendored.sklearn.decomposition._pca from google.cloud import bigquery @@ -210,13 +210,15 @@ class MatrixFactorization( def __init__( self, *, + feedback_type: Literal["explicit", "implicit"] = "explicit", num_factors: int, user_col: str, item_col: str, - rating_col: Optional[str] = "rating", + rating_col: str = "rating", # TODO: Add support for hyperparameter tuning. 
l2_reg: float = 1.0, ): + self.feedback_type = feedback_type self.num_factors = num_factors self.user_col = user_col self.item_col = item_col @@ -229,18 +231,12 @@ def __init__( def _from_bq( cls, session: bigframes.session.Session, bq_model: bigquery.Model ) -> MatrixFactorization: - assert bq_model.model_type == "MatrixFactorization" + assert bq_model.model_type == "MATRIX_FACTORIZATION" kwargs = utils.retrieve_params_from_bq_model( cls, bq_model, _BQML_PARAMS_MAPPING ) - last_fitting = bq_model.training_runs[-1]["trainingOptions"] - if "numPrincipalComponents" in last_fitting: - kwargs["n_components"] = int(last_fitting["numPrincipalComponents"]) - # elif "pcaExplainedVarianceRatio" in last_fitting: - # kwargs["n_components"] = float(last_fitting["pcaExplainedVarianceRatio"]) - model = cls(**kwargs) model._bqml_model = core.BqmlModel(session, bq_model) return model @@ -249,9 +245,18 @@ def _from_bq( def _bqml_options(self) -> dict: """The model options as they will be set for BQML""" options: dict = { - "model_type": "MatrixFactorization", + "model_type": "matrix_factorization", + "feedback_type": self.feedback_type, + "user_col": self.user_col, + "item_col": self.item_col, + "rating_col": self.rating_col, + "l2_reg": self.l2_reg, } + if self.num_factors is not None: + options["num_factors"] = self.num_factors + + print(repr(options)) return options def _fit( @@ -262,79 +267,23 @@ def _fit( ) -> MatrixFactorization: (X,) = utils.batch_convert_to_dataframe(X) - # To mimic sklearn's behavior - + self._bqml_model = self._bqml_model_factory.create_model( + X_train=X, + transforms=transforms, + options=self._bqml_options, + ) return self - @property - def components_(self) -> bpd.DataFrame: - if not self._bqml_model: - raise RuntimeError("A model must be fitted before calling components_.") - - return self._bqml_model.principal_components() - - @property - def explained_variance_(self) -> bpd.DataFrame: - if not self._bqml_model: - raise RuntimeError( - "A model must 
be fitted before calling explained_variance_." - ) - - return self._bqml_model.principal_component_info()[ - ["principal_component_id", "eigenvalue"] - ].rename(columns={"eigenvalue": "explained_variance"}) - - @property - def explained_variance_ratio_(self) -> bpd.DataFrame: - if not self._bqml_model: - raise RuntimeError( - "A model must be fitted before calling explained_variance_ratio_." - ) - - return self._bqml_model.principal_component_info()[ - ["principal_component_id", "explained_variance_ratio"] - ] - def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + # TODO: Create recommend() return self._bqml_model.predict(X) - def detect_anomalies( - self, - X: utils.ArrayType, - *, - contamination: float = 0.1, - ) -> bpd.DataFrame: - """Detect the anomaly data points of the input. - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Series or a DataFrame to detect anomalies. - contamination (float, default 0.1): - Identifies the proportion of anomalies in the training dataset that are used to create the model. - The value must be in the range [0, 0.5]. - - Returns: - bigframes.dataframe.DataFrame: detected DataFrame.""" - if contamination < 0.0 or contamination > 0.5: - raise ValueError( - f"contamination must be [0.0, 0.5], but is {contamination}." - ) - - if not self._bqml_model: - raise RuntimeError("A model must be fitted before detect_anomalies") - - (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - - return self._bqml_model.detect_anomalies( - X, options={"contamination": contamination} - ) - - def to_gbq(self, model_name: str, replace: bool = False) -> PCA: + def to_gbq(self, model_name: str, replace: bool = False) -> MatrixFactorization: """Save the model to BigQuery. 
Args: @@ -344,7 +293,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> PCA: Determine whether to replace if the model already exists. Default to False. Returns: - PCA: Saved model.""" + MatrixFactorization: Saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 7aa6d6708f..1f62eec0ff 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -42,7 +42,7 @@ "LINEAR_REGRESSION": linear_model.LinearRegression, "LOGISTIC_REGRESSION": linear_model.LogisticRegression, "KMEANS": cluster.KMeans, - "MatrixFactorization": decomposition.MatrixFactorization, + "MATRIX_FACTORIZATION": decomposition.MatrixFactorization, "PCA": decomposition.PCA, "BOOSTED_TREE_REGRESSOR": ensemble.XGBRegressor, "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier, diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index da5eacb18b..1b371c1af5 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -57,7 +57,7 @@ def fit(self, X, y=None): Ignored. Returns: - PCA: Fitted estimator. + bigframes.ml.decomposition.MatrixFactorization: Fitted estimator. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 27871786d39c7d81f03cb347a08d93c1dab45a49 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 28 Jan 2025 21:32:56 +0000 Subject: [PATCH 09/75] removem print() --- bigframes/ml/decomposition.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 1d5e8a9e07..0f802ddebb 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -256,7 +256,6 @@ def _bqml_options(self) -> dict: if self.num_factors is not None: options["num_factors"] = self.num_factors - print(repr(options)) return options def _fit( From 086b4dd86ed4a09b859ce7b2a381fe4513549c0a Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 29 Jan 2025 00:07:37 +0000 Subject: [PATCH 10/75] adding recommend --- bigframes/ml/core.py | 6 ++++++ bigframes/ml/decomposition.py | 7 +++---- bigframes/ml/sql.py | 5 +++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index d038b8f4c0..4fafc470a5 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -117,6 +117,12 @@ def model(self) -> bigquery.Model: """Get the BQML model associated with this wrapper""" return self._model + def recommend(self, input_data: bpd.DataFrame) -> bpd.DataFrame: + return self._apply_ml_tvf( + input_data, + self._model_manipulation_sql_generator.ml_recommend, + ) + def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 0f802ddebb..574dadbe4d 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -273,14 +273,13 @@ def _fit( ) return self - def predict(self, X: utils.ArrayType) -> bpd.DataFrame: + def recommend(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: - raise RuntimeError("A model must be fitted before predict") + raise RuntimeError("A model 
must be fitted before recommend") (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - # TODO: Create recommend() - return self._bqml_model.predict(X) + return self._bqml_model.recommend(X) def to_gbq(self, model_name: str, replace: bool = False) -> MatrixFactorization: """Save the model to BigQuery. diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index b662d4c22c..d59f2013da 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -299,6 +299,11 @@ def alter_model( return "\n".join(parts) # ML prediction TVFs + def ml_recommend(self, source_sql: str) -> str: + """Encode ML.RECOMMEND for BQML""" + return f"""SELECT * FROM ML.RECOMMEND(MODEL {self._model_ref_sql()}, + ({source_sql}))""" + def ml_predict(self, source_sql: str) -> str: """Encode ML.PREDICT for BQML""" return f"""SELECT * FROM ML.PREDICT(MODEL {self._model_ref_sql()}, From 7c371ac847108491bcd6f01504fe41e6673afb89 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 30 Jan 2025 14:09:40 +0000 Subject: [PATCH 11/75] remove hyper parameter runing references --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 1b371c1af5..c088fa59cd 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -32,16 +32,12 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): Args: num_factors (int or auto, default auto): Specifies the number of latent factors to use. - If you aren't running hyperparameter tuning, then you can specify an INT64 value between 2 and 200. The default value is log2(n), where n is the number of training examples. user_col (str): The user column name. item_col (str): The item column name. 
l2_reg (float, default 1.0): - If you aren't running hyperparameter tuning, then you can specify a FLOAT64 value. The default value is 1.0. - If you are running hyperparameter tuning, then you can use one of the following options: - The HPARAM_RANGE keyword and two FLOAT64 values that define the range to use for the hyperparameter. For example, L2_REG = HPARAM_RANGE(1.5, 5.0). - The HPARAM_CANDIDATES keyword and an array of FLOAT64 values that provide discrete values to use for the hyperparameter. For example, L2_REG = HPARAM_CANDIDATES([0, 1.0, 3.0, 5.0]). + A floating point value for L2 regularization. The default value is 1.0. """ def fit(self, X, y=None): From 8de384a768edefd4103cd93b2a8dcbf7fa75d1d9 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Feb 2025 15:29:18 +0000 Subject: [PATCH 12/75] swap predict in _mf for recommend --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index c088fa59cd..83a2f9bea8 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -77,7 +77,7 @@ def score(self, X=None, y=None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def predict(self, X): + def recommend(self, X): """Predict the closest cluster for each sample in X. 
Args: From 647532b1ebfb0b638cffcc8565d0271d3217bd2d Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Feb 2025 19:39:46 +0000 Subject: [PATCH 13/75] recommend -> predict --- bigframes/ml/decomposition.py | 2 +- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 574dadbe4d..1ea7d98177 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -273,7 +273,7 @@ def _fit( ) return self - def recommend(self, X: utils.ArrayType) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before recommend") diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 83a2f9bea8..c088fa59cd 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -77,7 +77,7 @@ def score(self, X=None, y=None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def recommend(self, X): + def predict(self, X): """Predict the closest cluster for each sample in X. 
Args: From b340c4fb48bbbda8a040608b6255dd88d8b27f9c Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Feb 2025 20:27:56 +0000 Subject: [PATCH 14/75] update predict doc string --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index c088fa59cd..2d9ec4e1a1 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -78,7 +78,7 @@ def score(self, X=None, y=None): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def predict(self, X): - """Predict the closest cluster for each sample in X. + """Generate a predicted rating for every user-item row combination for a matrix factorization model. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): From 4c90c1d84116eac05ea29ea75127b7f186f39016 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 10 Feb 2025 22:55:28 +0000 Subject: [PATCH 15/75] Merge branch 'main' into b338873783-matrix-factorization --- bigframes/core/__init__.py | 24 +- bigframes/core/blocks.py | 20 + bigframes/core/compile/compiled.py | 2 +- bigframes/core/compile/compiler.py | 22 +- bigframes/core/nodes.py | 423 +++++++++--------- bigframes/core/rewrite/__init__.py | 2 + bigframes/core/rewrite/implicit_align.py | 40 +- bigframes/core/rewrite/legacy_align.py | 6 +- bigframes/core/rewrite/order.py | 16 +- bigframes/core/rewrite/pruning.py | 195 ++++++++ bigframes/core/rewrite/slices.py | 4 +- bigframes/dataframe.py | 15 +- bigframes/ml/llm.py | 148 ++++++ bigframes/ml/loader.py | 2 + bigframes/ml/utils.py | 3 + bigframes/operations/timedelta_ops.py | 6 +- bigframes/pandas/core/tools/timedeltas.py | 20 +- bigframes/streaming/dataframe.py | 32 +- docs/templates/toc.yml | 2 +- 
.../bq_dataframes_template.ipynb | 2 +- tests/system/small/test_dataframe.py | 11 +- tests/system/small/test_pandas.py | 38 +- .../ibis/backends/sql/datatypes.py | 1 - .../ibis/backends/sql/rewrites.py | 2 +- .../bigframes_vendored/ibis/common/graph.py | 5 + 25 files changed, 759 insertions(+), 282 deletions(-) create mode 100644 bigframes/core/rewrite/pruning.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 5f64bf68dd..dc9b8e3b9b 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -304,18 +304,25 @@ def assign(self, source_id: str, destination_id: str) -> ArrayValue: if destination_id in self.column_ids: # Mutate case exprs = [ ( - ex.deref(source_id if (col_id == destination_id) else col_id), - ids.ColumnId(col_id), + bigframes.core.nodes.AliasedRef( + ex.deref(source_id if (col_id == destination_id) else col_id), + ids.ColumnId(col_id), + ) ) for col_id in self.column_ids ] else: # append case self_projection = ( - (ex.deref(col_id), ids.ColumnId(col_id)) for col_id in self.column_ids + bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) + for col_id in self.column_ids ) exprs = [ *self_projection, - (ex.deref(source_id), ids.ColumnId(destination_id)), + ( + bigframes.core.nodes.AliasedRef( + ex.deref(source_id), ids.ColumnId(destination_id) + ) + ), ] return ArrayValue( nodes.SelectionNode( @@ -337,7 +344,10 @@ def create_constant( def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: # This basically just drops and reorders columns - logically a no-op except as a final step - selections = ((ex.deref(col_id), ids.ColumnId(col_id)) for col_id in column_ids) + selections = ( + bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) + for col_id in column_ids + ) return ArrayValue( nodes.SelectionNode( child=self.node, @@ -488,7 +498,9 @@ def prepare_join_names( nodes.SelectionNode( other.node, tuple( - (ex.deref(old_id), ids.ColumnId(new_id)) + 
bigframes.core.nodes.AliasedRef( + ex.deref(old_id), ids.ColumnId(new_id) + ) for old_id, new_id in r_mapping.items() ), ), diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index b1f4ed35cc..c6e3096e51 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -276,6 +276,26 @@ def label_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]: mapping[label] = (*mapping.get(label, ()), id) return mapping + def resolve_label_exact(self, label: Label) -> Optional[str]: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such a column, returns None.""" + matches = self.label_to_col_id.get(label, []) + if len(matches) > 1: + raise ValueError( + f"Multiple columns matching id {label} were found. {constants.FEEDBACK_LINK}" + ) + return matches[0] if len(matches) != 0 else None + + def resolve_label_exact_or_error(self, label: Label) -> str: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such a column, raises an error too.""" + col_id = self.resolve_label_exact(label) + if col_id is None: + raise ValueError(f"Label {label} not found. {constants.FEEDBACK_LINK}") + return col_id + @functools.cached_property def col_id_to_index_name(self) -> typing.Mapping[str, Label]: """Get column label for value columns, or index name for index columns""" diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 906bdb1f0d..93be998b5b 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -184,7 +184,7 @@ def _to_ibis_expr( # Special case for empty tables, since we can't create an empty # projection. 
if not self._columns: - return bigframes_vendored.ibis.memtable([]) + return self._table.select([bigframes_vendored.ibis.literal(1)]) table = self._table.select(self._columns) if fraction is not None: diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index dca204401e..ff5f1d61c8 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -62,9 +62,11 @@ def compile_sql( if ordered: node, limit = rewrites.pullup_limit_from_slice(node) node = nodes.bottom_up(node, rewrites.rewrite_slice) + # TODO: Extract out CTEs node, ordering = rewrites.pull_up_order( node, order_root=True, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) ir = self.compile_node(node) return ir.to_sql( order_by=ordering.all_ordering_columns, @@ -76,6 +78,7 @@ def compile_sql( node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) ir = self.compile_node(node) return ir.to_sql(selections=output_ids) @@ -86,6 +89,7 @@ def compile_peek_sql(self, node: nodes.BigFrameNode, n_rows: int) -> str: node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) return self.compile_node(node).to_sql(limit=n_rows, selections=ids) def compile_raw( @@ -97,6 +101,7 @@ def compile_raw( node = nodes.bottom_up(node, rewrites.rewrite_slice) node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) node, ordering = rewrites.pull_up_order(node, ordered_joins=self.strict) + node = rewrites.column_pruning(node) ir = self.compile_node(node) sql = ir.to_sql() return sql, node.schema.to_bigquery(), ordering @@ -192,31 +197,34 @@ def compile_readtable(self, node: nodes.ReadTableNode): return self.compile_read_table_unordered(node.source, node.scan_list) def read_table_as_unordered_ibis( - self, source: nodes.BigqueryDataSource + self, + source: nodes.BigqueryDataSource, + scan_cols: typing.Sequence[str], ) -> 
ibis_types.Table: full_table_name = f"{source.table.project_id}.{source.table.dataset_id}.{source.table.table_id}" - used_columns = tuple(col.name for col in source.table.physical_schema) # Physical schema might include unused columns, unsupported datatypes like JSON physical_schema = ibis_bigquery.BigQuerySchema.to_ibis( - list(i for i in source.table.physical_schema if i.name in used_columns) + list(source.table.physical_schema) ) if source.at_time is not None or source.sql_predicate is not None: import bigframes.session._io.bigquery sql = bigframes.session._io.bigquery.to_query( full_table_name, - columns=used_columns, + columns=scan_cols, sql_predicate=source.sql_predicate, time_travel_timestamp=source.at_time, ) return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql) else: - return ibis_api.table(physical_schema, full_table_name) + return ibis_api.table(physical_schema, full_table_name).select(scan_cols) def compile_read_table_unordered( self, source: nodes.BigqueryDataSource, scan: nodes.ScanList ): - ibis_table = self.read_table_as_unordered_ibis(source) + ibis_table = self.read_table_as_unordered_ibis( + source, scan_cols=[col.source_id for col in scan.items] + ) return compiled.UnorderedIR( ibis_table, tuple( @@ -291,7 +299,7 @@ def set_output_names( return nodes.SelectionNode( node, tuple( - (ex.DerefOp(old_id), ids.ColumnId(out_id)) + bigframes.core.nodes.AliasedRef(ex.DerefOp(old_id), ids.ColumnId(out_id)) for old_id, out_id in zip(node.ids, output_ids) ), ) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 085d52daa6..88e084d79c 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -20,7 +20,7 @@ import functools import itertools import typing -from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple +from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple, TypeVar import google.cloud.bigquery as bq @@ -44,6 +44,8 @@ COLUMN_SET = frozenset[bfet_ids.ColumnId] 
+Self = TypeVar("Self") + @dataclasses.dataclass(frozen=True) class Field: @@ -87,10 +89,17 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def row_count(self) -> typing.Optional[int]: return None + @abc.abstractmethod + def remap_vars( + self: Self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> Self: + """Remap defined (in this node only) variables.""" + ... + @abc.abstractmethod def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self: Self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> Self: """Remap variable references""" ... @@ -100,6 +109,10 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: """The variables defined in this node (as opposed to by child nodes).""" ... + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + @functools.cached_property def session(self): sessions = [] @@ -248,18 +261,11 @@ def planning_complexity(self) -> int: @abc.abstractmethod def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + self: Self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> Self: """Apply a function to each child node.""" ... - @abc.abstractmethod - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: - """Remap defined (in this node only) variables.""" - ... 
- @property def defines_namespace(self) -> bool: """ @@ -269,16 +275,6 @@ def defines_namespace(self) -> bool: """ return False - @functools.cached_property - def defined_variables(self) -> set[str]: - """Full set of variables defined in the namespace, even if not selected.""" - self_defined_variables = set(self.schema.names) - if self.defines_namespace: - return self_defined_variables - return self_defined_variables.union( - *(child.defined_variables for child in self.child_nodes) - ) - def get_type(self, id: bfet_ids.ColumnId) -> bigframes.dtypes.Dtype: return self._dtype_lookup[id] @@ -286,9 +282,6 @@ def get_type(self, id: bfet_ids.ColumnId) -> bigframes.dtypes.Dtype: def _dtype_lookup(self): return {field.id: field.dtype for field in self.fields} - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - return self.transform_children(lambda x: x.prune(used_cols)) - class AdditiveNode: """Definition of additive - if you drop added_fields, you end up with the descendent. @@ -336,7 +329,7 @@ def explicitly_ordered(self) -> bool: def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> UnaryNode: transformed = dataclasses.replace(self, child=t(self.child)) if self == transformed: # reusing existing object speeds up eq, and saves a small amount of memory @@ -406,12 +399,18 @@ def row_count(self) -> typing.Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> SliceNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> SliceNode: return self @@ -483,6 +482,10 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return 
(self.indicator_col,) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset({self.left_col.id, self.right_col.id}) + @property def additive_base(self) -> BigFrameNode: return self.left_child @@ -490,9 +493,7 @@ def additive_base(self) -> BigFrameNode: def replace_additive_base(self, node: BigFrameNode): return dataclasses.replace(self, left_child=node) - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> InNode: transformed = dataclasses.replace( self, left_child=t(self.left_child), right_child=t(self.right_child) ) @@ -501,17 +502,16 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - return self - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> InNode: return dataclasses.replace( self, indicator_col=mappings.get(self.indicator_col, self.indicator_col) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> InNode: return dataclasses.replace(self, left_col=self.left_col.remap_column_refs(mappings, allow_partial_bindings=True), right_col=self.right_col.remap_column_refs(mappings, allow_partial_bindings=True)) # type: ignore @@ -574,9 +574,20 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + (*l_cond.column_references, *r_cond.column_references) + for l_cond, r_cond in self.conditions + ) + ) + + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(*self.ids, *self.referenced_ids) + + def transform_children(self, 
t: Callable[[BigFrameNode], BigFrameNode]) -> JoinNode: transformed = dataclasses.replace( self, left_child=t(self.left_child), right_child=t(self.right_child) ) @@ -585,21 +596,14 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # If this is a cross join, make sure to select at least one column from each side - condition_cols = used_cols.union( - map(lambda x: x.id, itertools.chain.from_iterable(self.conditions)) - ) - return self.transform_children( - lambda x: x.prune(frozenset([*condition_cols, *used_cols])) - ) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> JoinNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> JoinNode: new_conds = tuple( ( l_cond.remap_column_refs(mappings, allow_partial_bindings=True), @@ -665,7 +669,7 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> ConcatNode: transformed = dataclasses.replace( self, children=tuple(t(child) for child in self.children) ) @@ -674,17 +678,15 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Make concat prunable, probably by redefining - return self - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ConcatNode: new_ids = tuple(mappings.get(id, id) for id in self.output_ids) return dataclasses.replace(self, output_ids=new_ids) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ConcatNode: return self @@ -735,25 +737,23 @@ def defines_namespace(self) -> bool: def transform_children( self, 
t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> FromRangeNode: transformed = dataclasses.replace(self, start=t(self.start), end=t(self.end)) if self == transformed: # reusing existing object speeds up eq, and saves a small amount of memory return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Make FromRangeNode prunable (or convert to other node types) - return self - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> FromRangeNode: return dataclasses.replace( self, output_id=mappings.get(self.output_id, self.output_id) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> FromRangeNode: return self @@ -774,9 +774,7 @@ def fast_offsets(self) -> bool: def fast_ordered_limit(self) -> bool: return False - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> LeafNode: return self @@ -785,6 +783,9 @@ class ScanItem(typing.NamedTuple): dtype: bigframes.dtypes.Dtype # Might be multiple logical types for a given physical source type source_id: str # Flexible enough for both local data and bq data + def with_id(self, id: bfet_ids.ColumnId) -> ScanItem: + return ScanItem(id, self.dtype, self.source_id) + @dataclasses.dataclass(frozen=True) class ScanList: @@ -841,25 +842,9 @@ def row_count(self) -> typing.Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(item.id for item in self.fields) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # Don't preoduce empty scan list no matter what, will result in broken sql syntax - # TODO: Handle more elegantly - new_scan_list = ScanList( - tuple(item for item in self.scan_list.items if item.id in used_cols) - or 
(self.scan_list.items[0],) - ) - return ReadLocalNode( - self.feather_bytes, - self.data_schema, - self.n_rows, - new_scan_list, - self.offsets_col, - self.session, - ) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ReadLocalNode: new_scan_list = ScanList( tuple( ScanItem(mappings.get(item.id, item.id), item.dtype, item.source_id) @@ -875,7 +860,9 @@ def remap_vars( self, scan_list=new_scan_list, offsets_col=new_offsets_col ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ReadLocalNode: return self @@ -1003,16 +990,9 @@ def row_count(self) -> typing.Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(item.id for item in self.scan_list.items) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - new_scan_list = ScanList( - tuple(item for item in self.scan_list.items if item.id in used_cols) - or (self.scan_list.items[0],) - ) - return dataclasses.replace(self, scan_list=new_scan_list) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ReadTableNode: new_scan_list = ScanList( tuple( ScanItem(mappings.get(item.id, item.id), item.dtype, item.source_id) @@ -1021,7 +1001,9 @@ def remap_vars( ) return dataclasses.replace(self, scan_list=new_scan_list) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ReadTableNode: return self def with_order_cols(self): @@ -1089,6 +1071,10 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.col_id,) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + @property def added_fields(self) -> Tuple[Field, ...]: return (Field(self.col_id, 
bigframes.dtypes.INT_DTYPE),) @@ -1097,22 +1083,17 @@ def added_fields(self) -> Tuple[Field, ...]: def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> PromoteOffsetsNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - if self.col_id not in used_cols: - return self.child.prune(used_cols) - else: - new_used = used_cols.difference([self.col_id]) - return self.transform_children(lambda x: x.prune(new_used)) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> PromoteOffsetsNode: return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> PromoteOffsetsNode: return self @@ -1136,17 +1117,22 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - consumed_ids = used_cols.union(self.predicate.column_references) - pruned_child = self.child.prune(consumed_ids) - return FilterNode(pruned_child, self.predicate) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(self.ids) | self.referenced_ids + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset(self.predicate.column_references) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> FilterNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> FilterNode: return dataclasses.replace( self, predicate=self.predicate.remap_column_refs( @@ -1183,20 +1169,24 @@ def row_count(self) -> 
Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - ordering_cols = itertools.chain.from_iterable( - map(lambda x: x.referenced_columns, self.by) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(self.ids) | self.referenced_ids + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) - consumed_ids = used_cols.union(ordering_cols) - pruned_child = self.child.prune(consumed_ids) - return OrderByNode(pruned_child, self.by) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> OrderByNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> OrderByNode: all_refs = set( itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) @@ -1233,20 +1223,43 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ReversedNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ReversedNode: return self +class AliasedRef(typing.NamedTuple): + ref: ex.DerefOp + id: bfet_ids.ColumnId + + @classmethod + def identity(cls, id: bfet_ids.ColumnId) -> AliasedRef: + return cls(ex.DerefOp(id), id) + + def remap_vars( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> AliasedRef: + return AliasedRef(self.ref, mappings.get(self.id, self.id)) + + def remap_refs( + self, mappings: 
Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> AliasedRef: + return AliasedRef(ex.DerefOp(mappings.get(self.ref.id, self.ref.id)), self.id) + + @dataclasses.dataclass(frozen=True, eq=False) class SelectionNode(UnaryNode): - input_output_pairs: typing.Tuple[ - typing.Tuple[ex.DerefOp, bigframes.core.identifiers.ColumnId], ... - ] + input_output_pairs: Tuple[AliasedRef, ...] def _validate(self): for ref, _ in self.input_output_pairs: @@ -1280,33 +1293,26 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.input_output_pairs) - def get_id_mapping(self) -> dict[bfet_ids.ColumnId, bfet_ids.ColumnId]: - return {ref.id: out_id for ref, out_id in self.input_output_pairs} - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - pruned_selections = ( - tuple( - select for select in self.input_output_pairs if select[1] in used_cols - ) - or self.input_output_pairs[:1] - ) - consumed_ids = frozenset(i[0].id for i in pruned_selections) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(ref.id for ref, id in self.input_output_pairs) - pruned_child = self.child.prune(consumed_ids) - return SelectionNode(pruned_child, pruned_selections) + def get_id_mapping(self) -> dict[bfet_ids.ColumnId, bfet_ids.ColumnId]: + return {ref.id: id for ref, id in self.input_output_pairs} def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: - new_pairs = tuple( - (ref, mappings.get(id, id)) for ref, id in self.input_output_pairs + ) -> SelectionNode: + new_fields = tuple( + item.remap_vars(mappings) for item in self.input_output_pairs ) - return dataclasses.replace(self, input_output_pairs=new_pairs) + return dataclasses.replace(self, input_output_pairs=new_fields) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> 
SelectionNode: new_fields = tuple( - (ex.remap_column_refs(mappings, allow_partial_bindings=True), id) - for ex, id in self.input_output_pairs + item.remap_refs(mappings) for item in self.input_output_pairs ) return dataclasses.replace(self, input_output_pairs=new_fields) # type: ignore @@ -1353,30 +1359,38 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.assignments) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + i[0].column_references for i in self.assignments + ) + ) + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + ex.column_references for ex, id in self.assignments + ) + ) + @property def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> ProjectionNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - pruned_assignments = tuple(i for i in self.assignments if i[1] in used_cols) - if len(pruned_assignments) == 0: - return self.child.prune(used_cols) - consumed_ids = itertools.chain.from_iterable( - i[0].column_references for i in pruned_assignments - ) - pruned_child = self.child.prune(used_cols.union(consumed_ids)) - return ProjectionNode(pruned_child, pruned_assignments) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ProjectionNode: new_fields = tuple((ex, mappings.get(id, id)) for ex, id in self.assignments) return dataclasses.replace(self, assignments=new_fields) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ProjectionNode: new_fields = tuple( (ex.remap_column_refs(mappings, 
allow_partial_bindings=True), id) for ex, id in self.assignments @@ -1418,16 +1432,18 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.col_id,) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> RowCountNode: return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): - return self - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Handle row count pruning + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> RowCountNode: return self @@ -1487,33 +1503,31 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.aggregations) @property - def has_ordered_ops(self) -> bool: - return not all( - aggregate.op.order_independent for aggregate, _ in self.aggregations - ) - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + def consumed_ids(self) -> COLUMN_SET: by_ids = (ref.id for ref in self.by_column_ids) - pruned_aggs = ( - tuple(agg for agg in self.aggregations if agg[1] in used_cols) - or self.aggregations[:1] - ) agg_inputs = itertools.chain.from_iterable( - agg.column_references for agg, _ in pruned_aggs + agg.column_references for agg, _ in self.aggregations ) - consumed_ids = frozenset(itertools.chain(by_ids, agg_inputs)) - pruned_child = self.child.prune(consumed_ids) - return AggregateNode( - pruned_child, pruned_aggs, self.by_column_ids, dropna=self.dropna + order_ids = itertools.chain.from_iterable( + part.scalar_expression.column_references for part in self.order_by + ) + return frozenset(itertools.chain(by_ids, agg_inputs, order_ids)) + + @property + def has_ordered_ops(self) -> bool: + return not all( + aggregate.op.order_independent for aggregate, _ in 
self.aggregations ) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> AggregateNode: new_aggs = tuple((agg, mappings.get(id, id)) for agg, id in self.aggregations) return dataclasses.replace(self, aggregations=new_aggs) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> AggregateNode: new_aggs = tuple( (agg.remap_column_refs(mappings, allow_partial_bindings=True), id) for agg, id in self.aggregations @@ -1578,6 +1592,20 @@ def added_field(self) -> Field: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.output_name,) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset( + set(self.ids).difference([self.output_name]).union(self.referenced_ids) + ) + + @property + def referenced_ids(self) -> COLUMN_SET: + return ( + frozenset() + .union(self.expression.column_references) + .union(self.window_spec.all_referenced_columns) + ) + @property def inherits_order(self) -> bool: # does the op both use ordering at all? and if so, can it inherit order? 
@@ -1590,27 +1618,19 @@ def inherits_order(self) -> bool: def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> WindowOpNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - if self.output_name not in used_cols: - return self.child.prune(used_cols) - consumed_ids = ( - used_cols.difference([self.output_name]) - .union(self.expression.column_references) - .union(self.window_spec.all_referenced_columns) - ) - return self.transform_children(lambda x: x.prune(consumed_ids)) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> WindowOpNode: return dataclasses.replace( self, output_name=mappings.get(self.output_name, self.output_name) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> WindowOpNode: return dataclasses.replace( self, expression=self.expression.remap_column_refs( @@ -1646,14 +1666,18 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> RandomSampleNode: return self def remap_refs( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> RandomSampleNode: return self @@ -1703,21 +1727,20 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.offsets_col,) if (self.offsets_col is not None) else () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # Cannot prune explode op - consumed_ids = used_cols.union(ref.id for ref in self.column_ids) - return self.transform_children(lambda 
x: x.prune(consumed_ids)) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset(ref.id for ref in self.column_ids) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ExplodeNode: if (self.offsets_col is not None) and self.offsets_col in mappings: return dataclasses.replace(self, offsets_col=mappings[self.offsets_col]) return self def remap_refs( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ExplodeNode: new_ids = tuple(id.remap_column_refs(mappings) for id in self.column_ids) return dataclasses.replace(self, column_ids=new_ids) # type: ignore diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index f93186bf36..bf93fa51b6 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -17,6 +17,7 @@ from bigframes.core.rewrite.legacy_align import legacy_join_as_projection from bigframes.core.rewrite.operators import rewrite_timedelta_ops from bigframes.core.rewrite.order import pull_up_order +from bigframes.core.rewrite.pruning import column_pruning from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice __all__ = [ @@ -27,4 +28,5 @@ "pullup_limit_from_slice", "remap_variables", "pull_up_order", + "column_pruning", ] diff --git a/bigframes/core/rewrite/implicit_align.py b/bigframes/core/rewrite/implicit_align.py index 1b864fb919..1989b1a543 100644 --- a/bigframes/core/rewrite/implicit_align.py +++ b/bigframes/core/rewrite/implicit_align.py @@ -113,7 +113,7 @@ def try_row_join( r_node, r_selection = pull_up_selection( r_node, stop=divergent_node, rename_vars=True ) # Rename only right vars to avoid collisions with left vars - combined_selection = (*l_selection, *r_selection) + combined_selection = l_selection + r_selection def _linearize_trees( base_tree: bigframes.core.nodes.BigFrameNode, @@ -139,10 +139,7 @@ def pull_up_selection( rename_vars: bool = False, ) -> 
Tuple[ bigframes.core.nodes.BigFrameNode, - Tuple[ - Tuple[bigframes.core.expression.DerefOp, bigframes.core.identifiers.ColumnId], - ..., - ], + Tuple[bigframes.core.nodes.AliasedRef, ...], ]: """Remove all selection nodes above the base node. Returns stripped tree. @@ -157,8 +154,7 @@ def pull_up_selection( """ if node == stop: # base case return node, tuple( - (bigframes.core.expression.DerefOp(field.id), field.id) - for field in node.fields + bigframes.core.nodes.AliasedRef.identity(field.id) for field in node.fields ) # InNode needs special handling, as its a binary node, but row identity is from left side only. # TODO: Merge code with unary op paths @@ -179,11 +175,15 @@ def pull_up_selection( {node.indicator_col: bigframes.core.identifiers.ColumnId.unique()} ), ) - added_selection = ( - bigframes.core.expression.DerefOp(new_in_node.indicator_col), - node.indicator_col, + added_selection = tuple( + ( + bigframes.core.nodes.AliasedRef( + bigframes.core.expression.DerefOp(new_in_node.indicator_col), + node.indicator_col, + ), + ) ) - new_selection = (*child_selections, added_selection) + new_selection = child_selections + added_selection return new_in_node, new_selection if isinstance(node, bigframes.core.nodes.AdditiveNode): @@ -204,28 +204,20 @@ def pull_up_selection( else: var_renames = {} assert isinstance(new_node, bigframes.core.nodes.AdditiveNode) - added_selections = ( - ( - bigframes.core.expression.DerefOp(var_renames.get(field.id, field.id)), - field.id, - ) + added_selections = tuple( + bigframes.core.nodes.AliasedRef.identity(field.id).remap_refs(var_renames) for field in node.added_fields ) - new_selection = (*child_selections, *added_selections) + new_selection = child_selections + added_selections return new_node, new_selection elif isinstance(node, bigframes.core.nodes.SelectionNode): child_node, child_selections = pull_up_selection( node.child, stop, rename_vars=rename_vars ) mapping = {out: ref.id for ref, out in child_selections} - 
new_selection = tuple( - ( - bigframes.core.expression.DerefOp(mapping[ref.id]), - out, - ) - for ref, out in node.input_output_pairs + return child_node, tuple( + ref.remap_refs(mapping) for ref in node.input_output_pairs ) - return child_node, new_selection raise ValueError(f"Couldn't pull up select from node: {node}") diff --git a/bigframes/core/rewrite/legacy_align.py b/bigframes/core/rewrite/legacy_align.py index 05641130fb..573a7026e4 100644 --- a/bigframes/core/rewrite/legacy_align.py +++ b/bigframes/core/rewrite/legacy_align.py @@ -57,7 +57,7 @@ def from_node_span( if isinstance(node, nodes.SelectionNode): return cls.from_node_span(node.child, target).select( - node.input_output_pairs + tuple(node.input_output_pairs) ) elif isinstance(node, nodes.ProjectionNode): return cls.from_node_span(node.child, target).project(node.assignments) @@ -228,7 +228,9 @@ def expand(self) -> nodes.BigFrameNode: root = nodes.FilterNode(child=root, predicate=self.predicate) if self.ordering: root = nodes.OrderByNode(child=root, by=self.ordering) - selection = tuple((scalar_exprs.DerefOp(id), id) for _, id in self.columns) + selection = tuple( + bigframes.core.nodes.AliasedRef.identity(id) for _, id in self.columns + ) return nodes.SelectionNode( child=nodes.ProjectionNode(child=root, assignments=self.columns), input_output_pairs=selection, diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py index 3f8c409b76..18e5004e1d 100644 --- a/bigframes/core/rewrite/order.py +++ b/bigframes/core/rewrite/order.py @@ -180,14 +180,10 @@ def pull_up_order_inner( col: bigframes.core.ids.ColumnId.unique() for col in unselected_order_cols } - all_selections = ( - *node.input_output_pairs, - *( - (bigframes.core.expression.DerefOp(k), v) - for k, v in new_selections.items() - ), + all_selections = node.input_output_pairs + tuple( + bigframes.core.nodes.AliasedRef(bigframes.core.expression.DerefOp(k), v) + for k, v in new_selections.items() ) - new_select_node = 
dataclasses.replace( node, child=child_result, input_output_pairs=all_selections ) @@ -288,7 +284,7 @@ def pull_order_concat( ) selection = tuple( ( - (bigframes.core.expression.DerefOp(id), id) + bigframes.core.nodes.AliasedRef.identity(id) for id in (*source.ids, table_id, offsets_id) ) ) @@ -396,7 +392,7 @@ def remove_order_strict( if result.ids != node.ids: return bigframes.core.nodes.SelectionNode( result, - tuple((bigframes.core.expression.DerefOp(id), id) for id in node.ids), + tuple(bigframes.core.nodes.AliasedRef.identity(id) for id in node.ids), ) return result @@ -428,7 +424,7 @@ def rename_cols( result_node = bigframes.core.nodes.SelectionNode( node, tuple( - (bigframes.core.expression.DerefOp(id), mappings.get(id, id)) + bigframes.core.nodes.AliasedRef.identity(id).remap_vars(mappings) for id in node.ids ), ) diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py new file mode 100644 index 0000000000..0b8534116d --- /dev/null +++ b/bigframes/core/rewrite/pruning.py @@ -0,0 +1,195 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import dataclasses +import functools +from typing import AbstractSet + +import bigframes.core.identifiers +import bigframes.core.nodes + + +def column_pruning( + root: bigframes.core.nodes.BigFrameNode, +) -> bigframes.core.nodes.BigFrameNode: + return bigframes.core.nodes.top_down(root, prune_columns) + + +def to_fixed(max_iterations: int = 100): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + previous_result = None + current_result = func(*args, **kwargs) + attempts = 1 + + while attempts < max_iterations: + if current_result == previous_result: + return current_result + previous_result = current_result + current_result = func(current_result) + attempts += 1 + + return current_result + + return wrapper + + return decorator + + +@to_fixed(max_iterations=100) +def prune_columns(node: bigframes.core.nodes.BigFrameNode): + if isinstance(node, bigframes.core.nodes.SelectionNode): + result = prune_selection_child(node) + elif isinstance(node, bigframes.core.nodes.AggregateNode): + result = node.replace_child(prune_node(node.child, node.consumed_ids)) + elif isinstance(node, bigframes.core.nodes.InNode): + result = dataclasses.replace( + node, + right_child=prune_node(node.right_child, frozenset([node.right_col.id])), + ) + else: + result = node + return result + + +def prune_selection_child( + selection: bigframes.core.nodes.SelectionNode, +) -> bigframes.core.nodes.BigFrameNode: + child = selection.child + + # Important to check this first + if list(selection.ids) == list(child.ids): + return child + + if isinstance(child, bigframes.core.nodes.SelectionNode): + return selection.remap_refs( + {id: ref.id for ref, id in child.input_output_pairs} + ).replace_child(child.child) + elif isinstance(child, bigframes.core.nodes.AdditiveNode): + if not set(field.id for field in child.added_fields) & selection.consumed_ids: + return selection.replace_child(child.additive_base) + return selection.replace_child( + child.replace_additive_base( + 
prune_node( + child.additive_base, selection.consumed_ids | child.referenced_ids + ) + ) + ) + elif isinstance(child, bigframes.core.nodes.ConcatNode): + indices = [ + list(child.ids).index(ref.id) for ref, _ in selection.input_output_pairs + ] + new_children = [] + for concat_node in child.child_nodes: + cc_ids = tuple(concat_node.ids) + sub_selection = tuple( + bigframes.core.nodes.AliasedRef.identity(cc_ids[i]) for i in indices + ) + new_children.append( + bigframes.core.nodes.SelectionNode(concat_node, sub_selection) + ) + return bigframes.core.nodes.ConcatNode( + children=tuple(new_children), output_ids=tuple(selection.ids) + ) + # Nodes that pass through input columns + elif isinstance( + child, + ( + bigframes.core.nodes.RandomSampleNode, + bigframes.core.nodes.ReversedNode, + bigframes.core.nodes.OrderByNode, + bigframes.core.nodes.FilterNode, + bigframes.core.nodes.SliceNode, + bigframes.core.nodes.JoinNode, + bigframes.core.nodes.ExplodeNode, + ), + ): + ids = selection.consumed_ids | child.referenced_ids + return selection.replace_child( + child.transform_children(lambda x: prune_node(x, ids)) + ) + elif isinstance(child, bigframes.core.nodes.AggregateNode): + return selection.replace_child(prune_aggregate(child, selection.consumed_ids)) + elif isinstance(child, bigframes.core.nodes.LeafNode): + return selection.replace_child(prune_leaf(child, selection.consumed_ids)) + return selection + + +def prune_node( + node: bigframes.core.nodes.BigFrameNode, + ids: AbstractSet[bigframes.core.ids.ColumnId], +): + # This clause is important, ensures idempotency, so can reach fixed point + if not (set(node.ids) - ids): + return node + else: + return bigframes.core.nodes.SelectionNode( + node, + tuple( + bigframes.core.nodes.AliasedRef.identity(id) + for id in node.ids + if id in ids + ), + ) + + +def prune_aggregate( + node: bigframes.core.nodes.AggregateNode, + used_cols: AbstractSet[bigframes.core.ids.ColumnId], +) -> bigframes.core.nodes.AggregateNode: + 
pruned_aggs = tuple(agg for agg in node.aggregations if agg[1] in used_cols) + return dataclasses.replace(node, aggregations=pruned_aggs) + + +@functools.singledispatch +def prune_leaf( + node: bigframes.core.nodes.BigFrameNode, + used_cols: AbstractSet[bigframes.core.ids.ColumnId], +): + ... + + +@prune_leaf.register +def prune_readlocal( + node: bigframes.core.nodes.ReadLocalNode, + selection: AbstractSet[bigframes.core.ids.ColumnId], +) -> bigframes.core.nodes.ReadLocalNode: + new_scan_list = filter_scanlist(node.scan_list, selection) + return dataclasses.replace( + node, + scan_list=new_scan_list, + offsets_col=node.offsets_col if (node.offsets_col in selection) else None, + ) + + +@prune_leaf.register +def prune_readtable( + node: bigframes.core.nodes.ReadTableNode, + selection: AbstractSet[bigframes.core.ids.ColumnId], +) -> bigframes.core.nodes.ReadTableNode: + new_scan_list = filter_scanlist(node.scan_list, selection) + return dataclasses.replace(node, scan_list=new_scan_list) + + +def filter_scanlist( + scanlist: bigframes.core.nodes.ScanList, + ids: AbstractSet[bigframes.core.ids.ColumnId], +): + result = bigframes.core.nodes.ScanList( + tuple(item for item in scanlist.items if item.id in ids) + ) + if len(result.items) == 0: + # We need to select something, or stuff breaks + result = bigframes.core.nodes.ScanList(scanlist.items[:1]) + return result diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index 102ffcf773..87a7720e2f 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -120,7 +120,9 @@ def drop_cols( ) -> nodes.SelectionNode: # adding a whole node that redefines the schema is a lot of overhead, should do something more efficient selections = tuple( - (scalar_exprs.DerefOp(id), id) for id in node.ids if id not in drop_cols + nodes.AliasedRef(scalar_exprs.DerefOp(id), id) + for id in node.ids + if id not in drop_cols ) return nodes.SelectionNode(node, selections) diff --git 
a/bigframes/dataframe.py b/bigframes/dataframe.py index 20f636b681..4ffa56c2e5 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -180,7 +180,10 @@ def __init__( ) block = block.set_index([r_mapping[idx_col] for idx_col in idx_cols]) if columns: - block = block.select_columns(list(columns)) # type:ignore + column_ids = [ + block.resolve_label_exact_or_error(label) for label in list(columns) + ] + block = block.select_columns(column_ids) # type:ignore if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) @@ -238,15 +241,7 @@ def _find_indices( return [self._block.value_columns.index(col_id) for col_id in col_ids] def _resolve_label_exact(self, label) -> Optional[str]: - """Returns the column id matching the label if there is exactly - one such column. If there are multiple columns with the same name, - raises an error. If there is no such column, returns None.""" - matches = self._block.label_to_col_id.get(label, []) - if len(matches) > 1: - raise ValueError( - f"Multiple columns matching id {label} were found. 
{constants.FEEDBACK_LINK}" - ) - return matches[0] if len(matches) != 0 else None + return self._block.resolve_label_exact(label) def _sql_names( self, diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 7b66191a11..72c49e124b 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -57,6 +57,8 @@ _TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT, ) +_MULTIMODAL_EMBEDDING_001_ENDPOINT = "multimodalembedding@001" + _GEMINI_PRO_ENDPOINT = "gemini-pro" _GEMINI_1P5_PRO_PREVIEW_ENDPOINT = "gemini-1.5-pro-preview-0514" _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT = "gemini-1.5-flash-preview-0514" @@ -762,6 +764,152 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TextEmbeddingGenerat return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger +class MultimodalEmbeddingGenerator(base.RetriableRemotePredictor): + """Multimodal embedding generator LLM model. + + .. note:: + BigFrames Blob is still under experiments. It may not work and subject to change in the future. + + Args: + model_name (str, Default to "multimodalembedding@001"): + The model for multimodal embedding. Can set to "multimodalembedding@001". Multimodal-embedding models returns model embeddings for text, image and video inputs. + Default to "multimodalembedding@001". + session (bigframes.Session or None): + BQ session to create the model. If None, use the global default session. + connection_name (str or None): + Connection to connect with remote service. str of the format ... + If None, use default connection in session context. 
+ """ + + def __init__( + self, + *, + model_name: Literal["multimodalembedding@001"] = "multimodalembedding@001", + session: Optional[bigframes.Session] = None, + connection_name: Optional[str] = None, + ): + if not bigframes.options.experiments.blob: + raise NotImplementedError() + self.model_name = model_name + self.session = session or global_session.get_global_session() + self.connection_name = connection_name + + self._bqml_model_factory = globals.bqml_model_factory() + self._bqml_model: core.BqmlModel = self._create_bqml_model() + + def _create_bqml_model(self): + # Parse and create connection if needed. + self.connection_name = self.session._create_bq_connection( + connection=self.connection_name, iam_role="aiplatform.user" + ) + + if self.model_name != _MULTIMODAL_EMBEDDING_001_ENDPOINT: + msg = _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=_MULTIMODAL_EMBEDDING_001_ENDPOINT, + ) + warnings.warn(msg) + + options = { + "endpoint": self.model_name, + } + return self._bqml_model_factory.create_remote_model( + session=self.session, connection_name=self.connection_name, options=options + ) + + @classmethod + def _from_bq( + cls, session: bigframes.Session, bq_model: bigquery.Model + ) -> MultimodalEmbeddingGenerator: + assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in bq_model._properties + assert "endpoint" in bq_model._properties["remoteModelInfo"] + assert "connection" in bq_model._properties["remoteModelInfo"] + + # Parse the remote model endpoint + bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] + model_connection = bq_model._properties["remoteModelInfo"]["connection"] + model_endpoint = bqml_endpoint.split("/")[-1] + + model = cls( + session=session, + model_name=model_endpoint, # type: ignore + connection_name=model_connection, + ) + + model._bqml_model = core.BqmlModel(session, bq_model) + return model + + @property + def _predict_func( + self, + ) -> Callable[ + 
[bigframes.dataframe.DataFrame, Mapping], bigframes.dataframe.DataFrame + ]: + return self._bqml_model.generate_embedding + + @property + def _status_col(self) -> str: + return _ML_GENERATE_EMBEDDING_STATUS + + def predict( + self, X: utils.ArrayType, *, max_retries: int = 0 + ) -> bigframes.dataframe.DataFrame: + """Predict the result from input DataFrame. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. + The content column must be of string type or BigFrames Blob of image or video. + + max_retries (int, default 0): + Max number of retries if the prediction for any rows failed. Each try needs to make progress (i.e. has successfully predicted rows) to continue the retry. + Each retry will append newly succeeded rows. When the max retries are reached, the remaining rows (the ones without successful predictions) will be appended to the end of the result. + + Returns: + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. + """ + if max_retries < 0: + raise ValueError( + f"max_retries must be larger than or equal to 0, but is {max_retries}." 
+ ) + + (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + + if len(X.columns) == 1: + # BQML identified the column by name + col_label = cast(blocks.Label, X.columns[0]) + X = X.rename(columns={col_label: "content"}) + + # TODO(garrettwu): remove transform to ObjRefRuntime when BQML supports ObjRef as input + if X["content"].dtype == dtypes.OBJ_REF_DTYPE: + X["content"] = X["content"].blob._get_runtime("R", with_metadata=True) + + options = { + "flatten_json_output": True, + } + + return self._predict_and_retry(X, options=options, max_retries=max_retries) + + def to_gbq( + self, model_name: str, replace: bool = False + ) -> MultimodalEmbeddingGenerator: + """Save the model to BigQuery. + + Args: + model_name (str): + The name of the model. + replace (bool, default False): + Determine whether to replace if the model already exists. Default to False. + + Returns: + MultimodalEmbeddingGenerator: Saved model.""" + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) + + @log_adapter.class_logger class GeminiTextGenerator(base.RetriableRemotePredictor): """Gemini text generator LLM model. 
diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 1f62eec0ff..3bba3699f3 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -76,6 +76,7 @@ llm._TEXT_EMBEDDING_005_ENDPOINT: llm.TextEmbeddingGenerator, llm._TEXT_EMBEDDING_004_ENDPOINT: llm.TextEmbeddingGenerator, llm._TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT: llm.TextEmbeddingGenerator, + llm._MULTIMODAL_EMBEDDING_001_ENDPOINT: llm.MultimodalEmbeddingGenerator, } ) @@ -100,6 +101,7 @@ def from_bq( llm.PaLM2TextEmbeddingGenerator, llm.Claude3TextGenerator, llm.TextEmbeddingGenerator, + llm.MultimodalEmbeddingGenerator, pipeline.Pipeline, compose.ColumnTransformer, preprocessing.PreprocessingType, diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index e1620485d5..e034fd00f7 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -100,6 +100,9 @@ def parse_model_endpoint(model_endpoint: str) -> tuple[str, Optional[str]]: model_name = model_endpoint version = None + if model_endpoint.startswith("multimodalembedding"): + return model_name, version + at_idx = model_endpoint.find("@") if at_idx != -1: version = model_endpoint[at_idx + 1 :] diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index e212381557..f5b82c2331 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -26,6 +26,6 @@ class ToTimedeltaOp(base_ops.UnaryOp): unit: typing.Literal["us", "ms", "s", "m", "h", "d", "W"] def output_type(self, *input_types): - if input_types[0] is not dtypes.INT_DTYPE: - raise TypeError("expected integer input") - return dtypes.TIMEDELTA_DTYPE + if input_types[0] in (dtypes.INT_DTYPE, dtypes.FLOAT_DTYPE): + return dtypes.TIMEDELTA_DTYPE + raise TypeError("expected integer or float input") diff --git a/bigframes/pandas/core/tools/timedeltas.py b/bigframes/pandas/core/tools/timedeltas.py index 0cedf425fe..070a41d62d 100644 --- a/bigframes/pandas/core/tools/timedeltas.py +++ 
b/bigframes/pandas/core/tools/timedeltas.py @@ -18,20 +18,26 @@ timedeltas as vendored_pandas_timedeltas, ) import pandas as pd +import pandas.api.types as pdtypes from bigframes import operations as ops -from bigframes import series +from bigframes import series, session def to_timedelta( - arg: typing.Union[series.Series, str, int, float], + arg, unit: typing.Optional[vendored_pandas_timedeltas.UnitChoices] = None, -) -> typing.Union[series.Series, pd.Timedelta]: - if not isinstance(arg, series.Series): - return pd.to_timedelta(arg, unit) + *, + session: typing.Optional[session.Session] = None, +): + if isinstance(arg, series.Series): + canonical_unit = "us" if unit is None else _canonicalize_unit(unit) + return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) - canonical_unit = "us" if unit is None else _canonicalize_unit(unit) - return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) + if pdtypes.is_list_like(arg): + return to_timedelta(series.Series(arg), unit, session=session) + + return pd.to_timedelta(arg, unit) to_timedelta.__doc__ = vendored_pandas_timedeltas.to_timedelta.__doc__ diff --git a/bigframes/streaming/dataframe.py b/bigframes/streaming/dataframe.py index 90c638b82e..2180a66207 100644 --- a/bigframes/streaming/dataframe.py +++ b/bigframes/streaming/dataframe.py @@ -24,7 +24,7 @@ from google.cloud import bigquery from bigframes import dataframe -from bigframes.core import log_adapter +from bigframes.core import log_adapter, nodes import bigframes.exceptions as bfe import bigframes.session @@ -54,7 +54,7 @@ def _curate_df_doc(doc: Optional[str]): class StreamingBase: - sql: str + _appends_sql: str _session: bigframes.session.Session def to_bigtable( @@ -124,7 +124,7 @@ def to_bigtable( can be examined. """ return _to_bigtable( - self.sql, + self._appends_sql, instance=instance, table=table, service_account_email=service_account_email, @@ -181,7 +181,7 @@ def to_pubsub( can be examined. 
""" return _to_pubsub( - self.sql, + self._appends_sql, topic=topic, service_account_email=service_account_email, session=self._session, @@ -218,6 +218,19 @@ def __init__(self, df: dataframe.DataFrame, *, create_key=0): def _from_table_df(cls, df: dataframe.DataFrame) -> StreamingDataFrame: return cls(df, create_key=cls._create_key) + @property + def _original_table(self): + def traverse(node: nodes.BigFrameNode): + if isinstance(node, nodes.ReadTableNode): + return f"{node.source.table.project_id}.{node.source.table.dataset_id}.{node.source.table.table_id}" + for child in node.child_nodes: + original_table = traverse(child) + if original_table: + return original_table + return None + + return traverse(self._df._block._expr.node) + def __getitem__(self, *args, **kwargs): return _return_type_wrapper(self._df.__getitem__, StreamingDataFrame)( *args, **kwargs @@ -266,6 +279,17 @@ def sql(self): sql.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.sql)) + # Patch for the required APPENDS clause + @property + def _appends_sql(self): + sql_str = self.sql + original_table = self._original_table + assert original_table is not None + + appends_clause = f"APPENDS(TABLE `{original_table}`, NULL, NULL)" + sql_str = sql_str.replace(f"`{original_table}`", appends_clause) + return sql_str + @property def _session(self): return self._df._session diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index c17a1788df..d57ab1c8ac 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -209,7 +209,7 @@ name: bigframes.bigquery - items: - name: GeoSeries - uid: bigframes.geopandas + uid: bigframes.geopandas.GeoSeries name: bigframes.geopandas - items: - name: Overview diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index 90186b297d..6b0682bb1a 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -118,7 
+118,7 @@ "metadata": {}, "outputs": [], "source": [ - "#%pip install --upgrade" + "#%pip install --upgrade bigframes" ] }, { diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e7556043af..1db89a074a 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -44,8 +44,15 @@ def test_df_construct_copy(scalars_dfs): columns = ["int64_col", "string_col", "float64_col"] scalars_df, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame(scalars_df, columns=columns).to_pandas() - pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) + # Make the mapping from label to col_id non-trivial + bf_df = scalars_df.copy() + bf_df["int64_col"] = bf_df["int64_col"] / 2 + pd_df = scalars_pandas_df.copy() + pd_df["int64_col"] = pd_df["int64_col"] / 2 + + bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas() + + pd_result = pd.DataFrame(pd_df, columns=columns) pandas.testing.assert_frame_equal(bf_result, pd_result) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index e46d073056..4b4264e33c 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -763,7 +763,7 @@ def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc): "micros", ], ) -def test_to_timedelta_with_bf_series(session, unit): +def test_to_timedelta_with_bf_integer_series(session, unit): bf_series = bpd.Series([1, 2, 3], session=session) pd_series = pd.Series([1, 2, 3]) @@ -779,6 +779,42 @@ def test_to_timedelta_with_bf_series(session, unit): ) +def test_to_timedelta_with_bf_float_series_value_rounded_down(session): + bf_series = bpd.Series([1.2, 2.9], session=session) + + actual_result = ( + typing.cast(bpd.Series, bpd.to_timedelta(bf_series, "us")) + .to_pandas() + .astype("timedelta64[ns]") + ) + + expected_result = pd.Series([pd.Timedelta(1, "us"), pd.Timedelta(2, "us")]) + pd.testing.assert_series_equal( + actual_result, 
expected_result, check_index_type=False + ) + + +@pytest.mark.parametrize( + "input", + [ + pytest.param([1, 2, 3], id="list"), + pytest.param((1, 2, 3), id="tuple"), + pytest.param(pd.Series([1, 2, 3]), id="pandas-series"), + ], +) +def test_to_timedelta_with_list_like_input(session, input): + actual_result = ( + typing.cast(bpd.Series, bpd.to_timedelta(input, "s", session=session)) + .to_pandas() + .astype("timedelta64[ns]") + ) + + expected_result = pd.Series(pd.to_timedelta(input, "s")) + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + @pytest.mark.parametrize( "unit", ["Y", "M", "whatever"], diff --git a/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py b/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py index 2fd0e9186e..fce0643783 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py @@ -63,7 +63,6 @@ typecode.VARBINARY: dt.Binary, typecode.VARCHAR: dt.String, typecode.VARIANT: dt.JSON, - typecode.UNIQUEIDENTIFIER: dt.UUID, typecode.SET: partial(dt.Array, dt.string), ############################# # Unsupported sqlglot types # diff --git a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py index 652f04757b..a252f116dd 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py @@ -359,7 +359,7 @@ def wrap(node, _, **kwargs): return CTE(new) if node in ctes else new result = simplified.replace(wrap) - ctes = reversed([cte.parent for cte in result.find(CTE)]) + ctes = [cte.parent for cte in result.find(CTE, ordered=True)] return result, ctes diff --git a/third_party/bigframes_vendored/ibis/common/graph.py b/third_party/bigframes_vendored/ibis/common/graph.py index 1a3fc6c543..6e7995ec03 100644 --- a/third_party/bigframes_vendored/ibis/common/graph.py 
+++ b/third_party/bigframes_vendored/ibis/common/graph.py @@ -343,6 +343,7 @@ def find( finder: FinderLike, filter: Optional[FinderLike] = None, context: Optional[dict] = None, + ordered: bool = False, ) -> list[Node]: """Find all nodes matching a given pattern or type in the graph. @@ -360,6 +361,8 @@ def find( the given filter and stop otherwise. context Optional context to use if `finder` or `filter` is a pattern. + ordered + Emit nodes in topological order if `True`. Returns ------- @@ -369,6 +372,8 @@ def find( """ graph = Graph.from_bfs(self, filter=filter, context=context) finder = _coerce_finder(finder, context) + if ordered: + graph, _ = graph.toposort() return [node for node in graph.nodes() if finder(node)] @experimental From ba5beb397d578ee5438c8ab64cfa05ab10954281 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Feb 2025 18:59:08 +0000 Subject: [PATCH 16/75] preparing test files --- tests/data/ratings.jsonl | 0 tests/data/ratings_schema.json | 17 ++++++++++++++ tests/system/conftest.py | 14 ++++++++++++ tests/system/large/ml/test_decomposition.py | 25 +++++++++++++++++++++ 4 files changed, 56 insertions(+) create mode 100644 tests/data/ratings.jsonl create mode 100644 tests/data/ratings_schema.json diff --git a/tests/data/ratings.jsonl b/tests/data/ratings.jsonl new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/ratings_schema.json b/tests/data/ratings_schema.json new file mode 100644 index 0000000000..ca34a530ee --- /dev/null +++ b/tests/data/ratings_schema.json @@ -0,0 +1,17 @@ +[ + { + "mode": "NULLABLE", + "name": "user_id", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "item_id", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "ratings", + "type": "FLOAT" + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 29234bc4ef..e4bff8cdcc 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -305,6 +305,7 @@ def load_test_data_tables( ("repeated", 
"repeated_schema.json", "repeated.jsonl"), ("json", "json_schema.json", "json.jsonl"), ("penguins", "penguins_schema.json", "penguins.jsonl"), + ("ratings", "ratings_schema.json", "ratings.jsonl"), ("time_series", "time_series_schema.json", "time_series.jsonl"), ("hockey_players", "hockey_players.json", "hockey_players.jsonl"), ("matrix_2by3", "matrix_2by3.json", "matrix_2by3.jsonl"), @@ -401,6 +402,11 @@ def penguins_table_id(test_data_tables) -> str: return test_data_tables["penguins"] +@pytest.fixture(scope="session") +def ratings_table_id(test_data_tables) -> str: + return test_data_tables["ratings"] + + @pytest.fixture(scope="session") def urban_areas_table_id(test_data_tables) -> str: return test_data_tables["urban_areas"] @@ -743,6 +749,14 @@ def penguins_df_null_index( return unordered_session.read_gbq(penguins_table_id) +@pytest.fixture(scope="session") +def ratings_df_default_index( + ratings_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """DataFrame pointing at test data.""" + return session.read_gbq(ratings_table_id) + + @pytest.fixture(scope="session") def time_series_df_default_index( time_series_table_id: str, session: bigframes.Session diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 49aa985189..2544e8dba0 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -163,3 +163,28 @@ def test_decomposition_configure_fit_load_none_component( in reloaded_model._bqml_model.model_name ) assert reloaded_model.n_components == 7 + + +def test_decomposition_mf_configure_fit_load_none_component( + ratings_df_default_index, dataset_id +): + model = decomposition.MatrixFactorization( + num_factors=6, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model.fit(ratings_df_default_index) + + # save, load, check n_components. 
Here n_components is the column size of the training input. + # reloaded_model = model.to_gbq( + # f"{dataset_id}.temp_configured_pca_model", replace=True + # ) + # assert reloaded_model._bqml_model is not None + # assert ( + # f"{dataset_id}.temp_configured_pca_model" + # in reloaded_model._bqml_model.model_name + # ) + assert model.num_factors == 6 From 857783301e558c8af6b362d3e32a40ce9f7b5c15 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 13 Feb 2025 20:44:56 +0000 Subject: [PATCH 17/75] add test data --- tests/data/ratings.jsonl | 20 ++++++++++++++++++++ tests/system/large/ml/test_decomposition.py | 4 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/data/ratings.jsonl b/tests/data/ratings.jsonl index e69de29bb2..919b61c350 100644 --- a/tests/data/ratings.jsonl +++ b/tests/data/ratings.jsonl @@ -0,0 +1,20 @@ +{"user_id": 1, "item_id": 2, "ratings": 4.0} +{"user_id": 1, "item_id": 5, "ratings": 3.0} +{"user_id": 2, "item_id": 1, "ratings": 5.0} +{"user_id": 2, "item_id": 3, "ratings": 2.0} +{"user_id": 3, "item_id": 4, "ratings": 4.5} +{"user_id": 3, "item_id": 7, "ratings": 3.5} +{"user_id": 4, "item_id": 2, "ratings": 1.0} +{"user_id": 4, "item_id": 8, "ratings": 5.0} +{"user_id": 5, "item_id": 3, "ratings": 4.0} +{"user_id": 5, "item_id": 9, "ratings": 2.5} +{"user_id": 6, "item_id": 1, "ratings": 3.0} +{"user_id": 6, "item_id": 6, "ratings": 4.5} +{"user_id": 7, "item_id": 5, "ratings": 5.0} +{"user_id": 7, "item_id": 10, "ratings": 1.5} +{"user_id": 8, "item_id": 4, "ratings": 2.0} +{"user_id": 8, "item_id": 7, "ratings": 4.0} +{"user_id": 9, "item_id": 2, "ratings": 3.5} +{"user_id": 9, "item_id": 9, "ratings": 5.0} +{"user_id": 10, "item_id": 3, "ratings": 4.5} +{"user_id": 10, "item_id": 8, "ratings": 2.5} diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 2544e8dba0..d2320c570a 100644 --- a/tests/system/large/ml/test_decomposition.py +++ 
b/tests/system/large/ml/test_decomposition.py @@ -172,8 +172,8 @@ def test_decomposition_mf_configure_fit_load_none_component( num_factors=6, feedback_type="explicit", user_col="user_id", - item_col="item_col", - rating_col="rating_col", + item_col="item_id", + rating_col="ratings", l2_reg=9.83, ) model.fit(ratings_df_default_index) From 4b7b4dba0d043d93dc41186fde8cc9d597d224e4 Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 21 Feb 2025 01:40:07 +0000 Subject: [PATCH 18/75] new error: to_gbq column names need to be changed? --- demo.ipynb | 758 ++++++++++++++++++++ tests/data/ratings_schema.json | 2 +- tests/system/large/ml/test_decomposition.py | 35 +- 3 files changed, 782 insertions(+), 13 deletions(-) create mode 100644 demo.ipynb diff --git a/demo.ipynb b/demo.ipynb new file mode 100644 index 0000000000..93e6f121f9 --- /dev/null +++ b/demo.ipynb @@ -0,0 +1,758 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 39ca6c3f-1c37-4f8e-8252-33cf6abfa340 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 7dda7bc2-75b2-42b5-918b-41dd0540eb53 is DONE. 24.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 4b99d068-1e68-4a86-bd0b-52d40ef6a270 is DONE. 40.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_iditem_idrating
043549684.0
1362235215.0
255439202.0
344531755.0
455352354.0
5442210974.0
6311913564.0
7603712314.0
88511963.0
931114353.0
1054036485.0
11360127343.0
12465529494.0
13127430935.0
1415213504.0
1530724543.0
16331413304.0
17376227191.0
18168721693.0
1997030814.0
20126522485.0
2115021044.0
221945004.0
23352110883.0
24188935673.0
\n", + "

25 rows × 3 columns

\n", + "
[1000209 rows x 3 columns in total]" + ], + "text/plain": [ + " user_id item_id rating\n", + "0 4354 968 4.0\n", + "1 3622 3521 5.0\n", + "2 5543 920 2.0\n", + "3 445 3175 5.0\n", + "4 5535 235 4.0\n", + "5 4422 1097 4.0\n", + "6 3119 1356 4.0\n", + "7 6037 1231 4.0\n", + "8 851 196 3.0\n", + "9 3111 435 3.0\n", + "10 5403 648 5.0\n", + "11 3601 2734 3.0\n", + "12 4655 2949 4.0\n", + "13 1274 3093 5.0\n", + "14 1521 350 4.0\n", + "15 3072 454 3.0\n", + "16 3314 1330 4.0\n", + "17 3762 2719 1.0\n", + "18 1687 2169 3.0\n", + "19 970 3081 4.0\n", + "20 1265 2248 5.0\n", + "21 1502 104 4.0\n", + "22 194 500 4.0\n", + "23 3521 1088 3.0\n", + "24 1889 3567 3.0\n", + "...\n", + "\n", + "[1000209 rows x 3 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import bigframes.pandas as bpd\n", + "from bigframes.ml import decomposition\n", + "\n", + "bq_df = bpd.read_gbq('bqml_tutorial.ratings', columns=('user_id', 'item_id', 'rating'))\n", + "bq_df" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MatrixFactorization(item_col='item_col', l2_reg=9.83, num_factors=34,\n", + " rating_col='rating_col', user_col='user_id')" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = decomposition.MatrixFactorization(\n", + " num_factors=34,\n", + " feedback_type='explicit',\n", + " user_col='user_id',\n", + " item_col='item_col',\n", + " rating_col='rating_col',\n", + " l2_reg=9.83,\n", + ")\n", + "\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 50f616db-afae-40da-bc95-f724bb8a5c84 is DONE. 24.0 MB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d13d556d-e011-40a0-9da8-5c0918cf1ef1 is DONE. 537.2 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "MatrixFactorization(item_col='item_col', l2_reg=9.83, num_factors=34,\n", + " rating_col='rating_col', user_col='user_id')" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fitted = model.fit(bq_df.rename(columns={'rating': 'rating_col', 'item_id': 'item_col'}))\n", + "fitted\n", + "# scored = model.score(fitted)\n", + "\n", + "# scored" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 66684505-f14b-423b-8105-93521064036a is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 4ec28d78-f0c1-4456-8c08-60b6982ee52f is DONE. 48 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
00.4852820.3953410.0255350.3899060.6831990.683199
\n", + "

1 rows × 6 columns

\n", + "
[1 rows x 6 columns in total]" + ], + "text/plain": [ + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + "0 0.485282 0.395341 0.025535 \n", + "\n", + " median_absolute_error r2_score explained_variance \n", + "0 0.389906 0.683199 0.683199 \n", + "\n", + "[1 rows x 6 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scored = model.score(fitted)\n", + "\n", + "scored" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job e7dcfb81-70af-4d65-9c2a-b42591812d0e is DONE. 29.5 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job fcb53dd1-f9cb-4872-b7bf-3d2f0da89b00 is DONE. 40.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 10436512-dada-4dfc-a3ff-94b480a5e890 is DONE. 48.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
predicted_rating_coluser_iditem_colrating
03.34813143549684.0
15.22349362235215.0
21.82017355439202.0
34.70222844531755.0
43.20694955352354.0
54.690283442210974.0
63.944585311913564.0
74.275766603712314.0
83.4345798511963.0
91.82747331114353.0
104.13092854036485.0
113.231195360127343.0
123.750037465529494.0
133.858951127430935.0
143.3485215213504.0
152.95628430724543.0
163.831856331413304.0
170.805804376227191.0
183.65957168721693.0
193.0319797030814.0
203.384926126522485.0
214.17324315021044.0
223.9184351945004.0
232.451965352110883.0
242.982963188935673.0
\n", + "

25 rows × 4 columns

\n", + "
[1000209 rows x 4 columns in total]" + ], + "text/plain": [ + " predicted_rating_col user_id item_col rating\n", + "0 3.348131 4354 968 4.0\n", + "1 5.22349 3622 3521 5.0\n", + "2 1.820173 5543 920 2.0\n", + "3 4.702228 445 3175 5.0\n", + "4 3.206949 5535 235 4.0\n", + "5 4.690283 4422 1097 4.0\n", + "6 3.944585 3119 1356 4.0\n", + "7 4.275766 6037 1231 4.0\n", + "8 3.434579 851 196 3.0\n", + "9 1.827473 3111 435 3.0\n", + "10 4.130928 5403 648 5.0\n", + "11 3.231195 3601 2734 3.0\n", + "12 3.750037 4655 2949 4.0\n", + "13 3.858951 1274 3093 5.0\n", + "14 3.34852 1521 350 4.0\n", + "15 2.956284 3072 454 3.0\n", + "16 3.831856 3314 1330 4.0\n", + "17 0.805804 3762 2719 1.0\n", + "18 3.65957 1687 2169 3.0\n", + "19 3.03197 970 3081 4.0\n", + "20 3.384926 1265 2248 5.0\n", + "21 4.173243 1502 104 4.0\n", + "22 3.918435 194 500 4.0\n", + "23 2.451965 3521 1088 3.0\n", + "24 2.982963 1889 3567 3.0\n", + "...\n", + "\n", + "[1000209 rows x 4 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# predict_df = scored[['user_id']['item_col']]\n", + "# model.predict(predict_df)\n", + "model.predict(bq_df.rename(columns={'item_id': 'item_col'}))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/data/ratings_schema.json b/tests/data/ratings_schema.json index ca34a530ee..1867a8c801 100644 --- a/tests/data/ratings_schema.json +++ b/tests/data/ratings_schema.json @@ -7,7 +7,7 @@ { "mode": "NULLABLE", "name": "item_id", - "type": "STRING" + "type": "INT64" }, { "mode": "NULLABLE", diff --git 
a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index d2320c570a..36f5d83c75 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -165,8 +165,8 @@ def test_decomposition_configure_fit_load_none_component( assert reloaded_model.n_components == 7 -def test_decomposition_mf_configure_fit_load_none_component( - ratings_df_default_index, dataset_id +def test_decomposition_mf_configure_fit_load( + session, ratings_df_default_index, dataset_id ): model = decomposition.MatrixFactorization( num_factors=6, @@ -178,13 +178,24 @@ def test_decomposition_mf_configure_fit_load_none_component( ) model.fit(ratings_df_default_index) - # save, load, check n_components. Here n_components is the column size of the training input. - # reloaded_model = model.to_gbq( - # f"{dataset_id}.temp_configured_pca_model", replace=True - # ) - # assert reloaded_model._bqml_model is not None - # assert ( - # f"{dataset_id}.temp_configured_pca_model" - # in reloaded_model._bqml_model.model_name - # ) - assert model.num_factors == 6 + reloaded_model = model.to_gbq( + f"{dataset_id}.temp_configured_mf_model", replace=True + ) + + new_ratings = session.read_pandas( + pd.DataFrame( + { + "user_id": ["11", "12", "13"], + "item_id": [1, 2, 3], + "ratings": [1.0, 2.0, 3.0], + } + ) + ) + + reloaded_model.score(new_ratings) + + result = reloaded_model.predict( + new_ratings.rename(columns={"item_id": "item_col"}) + ).to_pandas() + + assert result is not None From 8d55eac81aec7888e6eefa08fd86ed921c3b115a Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 24 Feb 2025 21:51:40 +0000 Subject: [PATCH 19/75] Merge branch 'main' into b338873783-matrix-factorization --- CHANGELOG.md | 32 ++++ bigframes/core/compile/aggregate_compiler.py | 14 +- bigframes/core/compile/ibis_types.py | 17 ++- bigframes/core/compile/scalar_op_compiler.py | 5 + bigframes/core/rewrite/timedeltas.py | 55 ++++++- bigframes/core/schema.py | 6 
+- bigframes/dataframe.py | 13 +- bigframes/dtypes.py | 1 + bigframes/functions/_function_session.py | 6 + bigframes/functions/function.py | 12 +- bigframes/ml/metrics/_metrics.py | 7 +- bigframes/operations/__init__.py | 2 + bigframes/operations/aggregations.py | 25 ++- bigframes/operations/json_ops.py | 14 +- bigframes/operations/remote_function_ops.py | 40 ++--- bigframes/operations/timedelta_ops.py | 27 +++- bigframes/series.py | 22 ++- bigframes/version.py | 2 +- noxfile.py | 20 ++- samples/snippets/bigquery_modules_test.py | 69 +++++++++ ...ingle_timeseries_forecasting_model_test.py | 64 ++++++++ scripts/test_publish_api_coverage.py | 2 + setup.py | 11 +- testing/constraints-3.9.txt | 1 - tests/system/conftest.py | 5 + .../large/functions/test_remote_function.py | 4 + .../small/functions/test_remote_function.py | 143 ++++++++++++++++-- tests/system/small/ml/test_metrics.py | 14 +- .../small/operations/test_timedeltas.py | 46 ++++++ tests/system/small/test_pandas.py | 15 ++ tests/system/small/test_series.py | 2 + tests/unit/functions/test_remote_function.py | 6 + tests/unit/ml/test_api_primitives.py | 5 +- tests/unit/ml/test_compose.py | 4 +- tests/unit/ml/test_pipeline.py | 9 +- .../bigframes_vendored/pandas/core/frame.py | 4 +- .../bigframes_vendored/pandas/core/series.py | 4 +- .../sklearn/metrics/_ranking.py | 20 ++- .../bigframes_vendored/tpch/queries/q9.py | 14 +- third_party/bigframes_vendored/version.py | 2 +- 40 files changed, 635 insertions(+), 129 deletions(-) create mode 100644 samples/snippets/bigquery_modules_test.py create mode 100644 samples/snippets/limit_single_timeseries_forecasting_model_test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b301f85a6a..24a1d8cb62 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,38 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.38.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.37.0...v1.38.0) (2025-02-24) + + +### Features + +* (Preview) Support diff aggregation 
for timestamp series. ([#1405](https://github.com/googleapis/python-bigquery-dataframes/issues/1405)) ([abe48d6](https://github.com/googleapis/python-bigquery-dataframes/commit/abe48d6f13a954534460fa14c9337e1085d9fbb3)) +* Add `GeoSeries.from_wkt() `and `GeoSeries.to_wkt()` ([#1401](https://github.com/googleapis/python-bigquery-dataframes/issues/1401)) ([2993b28](https://github.com/googleapis/python-bigquery-dataframes/commit/2993b283966960430ad8482f40f177e276db2d64)) +* Support DF.__array__(copy=True) ([#1403](https://github.com/googleapis/python-bigquery-dataframes/issues/1403)) ([693ed8c](https://github.com/googleapis/python-bigquery-dataframes/commit/693ed8cfb1ecc3af161801225d3e9cda489c29dd)) +* Support routines with ARRAY return type in `read_gbq_function` ([#1412](https://github.com/googleapis/python-bigquery-dataframes/issues/1412)) ([4b60049](https://github.com/googleapis/python-bigquery-dataframes/commit/4b60049e8362bfb07c136d8b2eb02b984d71f084)) + + +### Bug Fixes + +* Calling to_timdelta() over timedeltas no longer changes their values ([#1411](https://github.com/googleapis/python-bigquery-dataframes/issues/1411)) ([650a190](https://github.com/googleapis/python-bigquery-dataframes/commit/650a1907fdf84897eb7aa288863ee27d938e0879)) +* Replace empty dict with None to avoid mutable default arguments ([#1416](https://github.com/googleapis/python-bigquery-dataframes/issues/1416)) ([fa4e3ad](https://github.com/googleapis/python-bigquery-dataframes/commit/fa4e3ad8bcd5db56fa26b26609cc7e58b1edf498)) + + +### Performance Improvements + +* Avoid redundant SQL casts ([#1399](https://github.com/googleapis/python-bigquery-dataframes/issues/1399)) ([6ee48d5](https://github.com/googleapis/python-bigquery-dataframes/commit/6ee48d5c16870f1caa99c3f658c2c1a0e14be749)) + + +### Dependencies + +* Remove scikit-learn and sqlalchemy as required dependencies ([#1296](https://github.com/googleapis/python-bigquery-dataframes/issues/1296)) 
([fd8bc89](https://github.com/googleapis/python-bigquery-dataframes/commit/fd8bc894bdbdf551ebbec1fb93832588371ae6af)) + + +### Documentation + +* Add samples using SQL methods via the `bigframes.bigquery` module ([#1358](https://github.com/googleapis/python-bigquery-dataframes/issues/1358)) ([f54e768](https://github.com/googleapis/python-bigquery-dataframes/commit/f54e7688fda6372c6decc9b61796b0272d803c79)) +* Add snippets for visualizing a time series and creating a time series model for the Limit forecasted values in time series model tutorial ([#1310](https://github.com/googleapis/python-bigquery-dataframes/issues/1310)) ([c6c9120](https://github.com/googleapis/python-bigquery-dataframes/commit/c6c9120e839647e5b3cb97f04a8d90cc8690b8a3)) + ## [1.37.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.36.0...v1.37.0) (2025-02-19) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 4ec0b270ed..a17b69815c 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -231,7 +231,11 @@ def _( column: ibis_types.NumericColumn, window=None, ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.quantile(op.q), window) + result = column.quantile(op.q) + if op.should_floor_result: + result = result.floor() # type:ignore + + return _apply_window_if_present(result, window) @compile_unary_agg.register @@ -242,7 +246,8 @@ def _( window=None, # order_by: typing.Sequence[ibis_types.Value] = [], ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.mean(), window) + result = column.mean().floor() if op.should_floor_result else column.mean() + return _apply_window_if_present(result, window) @compile_unary_agg.register @@ -306,10 +311,11 @@ def _( @numeric_op def _( op: agg_ops.StdOp, - x: ibis_types.Column, + x: ibis_types.NumericColumn, window=None, ) -> ibis_types.Value: - return 
_apply_window_if_present(cast(ibis_types.NumericColumn, x).std(), window) + result = x.std().floor() if op.should_floor_result else x.std() + return _apply_window_if_present(result, window) @compile_unary_agg.register diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 2dcc1b3c8a..c47c6cf07b 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -463,10 +463,19 @@ def ibis_array_output_type_from_python_type(t: type) -> ibis_dtypes.DataType: return python_type_to_ibis_type(t) -def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> ibis_dtypes.DataType: +def ibis_type_from_bigquery_type( + type_: bigquery.StandardSqlDataType, +) -> ibis_dtypes.DataType: """Convert bq type to ibis. Only to be used for remote functions, does not handle all types.""" - if tk not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: + if type_.type_kind not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: raise UnsupportedTypeError( - tk, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + type_.type_kind, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + ) + elif type_.type_kind == "ARRAY": + return ibis_dtypes.Array( + value_type=ibis_type_from_bigquery_type( + typing.cast(bigquery.StandardSqlDataType, type_.array_element_type) + ) ) - return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) + else: + return third_party_ibis_bqtypes.BigQueryType.to_ibis(type_.type_kind) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 923ec8c81d..7111406646 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1186,6 +1186,11 @@ def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp): ).floor() +@scalar_op_compiler.register_unary_op(ops.timedelta_floor_op) +def timedelta_floor_op_impl(x: ibis_types.NumericValue): + return x.floor() + + 
@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): ibis_node = getattr(op.func, "ibis_node", None) diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index dad474e5a1..e21e0b6bf2 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -70,6 +70,19 @@ def rewrite_timedelta_expressions(root: nodes.BigFrameNode) -> nodes.BigFrameNod root.skip_reproject_unsafe, ) + if isinstance(root, nodes.AggregateNode): + updated_aggregations = tuple( + (_rewrite_aggregation(agg, root.child.schema), col_id) + for agg, col_id in root.aggregations + ) + return nodes.AggregateNode( + root.child, + updated_aggregations, + root.by_column_ids, + root.order_by, + root.dropna, + ) + return root @@ -125,6 +138,9 @@ def _rewrite_op_expr( # but for timedeltas: int(timedelta) // float => int(timedelta) return _rewrite_floordiv_op(inputs[0], inputs[1]) + if isinstance(expr.op, ops.ToTimedeltaOp): + return _rewrite_to_timedelta_op(expr.op, inputs[0]) + return _TypedExpr.create_op_expr(expr.op, *inputs) @@ -154,9 +170,9 @@ def _rewrite_mul_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.mul_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) if dtypes.is_numeric(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -165,7 +181,7 @@ def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.div_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return 
_TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -174,28 +190,53 @@ def _rewrite_floordiv_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.floordiv_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result +def _rewrite_to_timedelta_op(op: ops.ToTimedeltaOp, arg: _TypedExpr): + if arg.dtype is dtypes.TIMEDELTA_DTYPE: + # Do nothing for values that are already timedeltas + return arg + + return _TypedExpr.create_op_expr(op, arg) + + @functools.cache def _rewrite_aggregation( aggregation: ex.Aggregation, schema: schema.ArraySchema ) -> ex.Aggregation: if not isinstance(aggregation, ex.UnaryAggregation): return aggregation - if not isinstance(aggregation.op, aggs.DiffOp): - return aggregation if isinstance(aggregation.arg, ex.DerefOp): input_type = schema.get_type(aggregation.arg.id.sql) else: input_type = aggregation.arg.dtype - if dtypes.is_datetime_like(input_type): + if isinstance(aggregation.op, aggs.DiffOp) and dtypes.is_datetime_like(input_type): return ex.UnaryAggregation( aggs.TimeSeriesDiffOp(aggregation.op.periods), aggregation.arg ) + if isinstance(aggregation.op, aggs.StdOp) and input_type is dtypes.TIMEDELTA_DTYPE: + return ex.UnaryAggregation( + aggs.StdOp(should_floor_result=True), aggregation.arg + ) + + if isinstance(aggregation.op, aggs.MeanOp) and input_type is dtypes.TIMEDELTA_DTYPE: + return ex.UnaryAggregation( + aggs.MeanOp(should_floor_result=True), aggregation.arg + ) + + if ( + isinstance(aggregation.op, aggs.QuantileOp) + and input_type is dtypes.TIMEDELTA_DTYPE + ): + return ex.UnaryAggregation( + aggs.QuantileOp(q=aggregation.op.q, should_floor_result=True), + aggregation.arg, + ) + return aggregation diff --git 
a/bigframes/core/schema.py b/bigframes/core/schema.py index e3808dfffd..c379db72be 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -41,8 +41,12 @@ class ArraySchema: def from_bq_table( cls, table: google.cloud.bigquery.Table, - column_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = {}, + column_type_overrides: typing.Optional[ + typing.Dict[str, bigframes.dtypes.Dtype] + ] = None, ): + if column_type_overrides is None: + column_type_overrides = {} items = tuple( SchemaItem(name, column_type_overrides.get(name, dtype)) for name, dtype in bigframes.dtypes.bf_type_from_type_kind( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c02b182ee3..caf1b62e07 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3705,7 +3705,9 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: + if copy is False: + raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) __array__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__array__) @@ -4086,9 +4088,12 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) result_series.name = None - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended + # to be an array, reconstruct the array from the string assuming it + # is a json serialized form of the array. 
+ if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index e4db904210..54b621a0f8 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -874,4 +874,5 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: "INT64", "INTEGER", "STRING", + "ARRAY", } diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index a0518978a3..93b5c4c596 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -501,6 +501,7 @@ def try_delattr(attr): try_delattr("bigframes_remote_function") try_delattr("input_dtypes") try_delattr("output_dtype") + try_delattr("bigframes_bigquery_function_output_dtype") try_delattr("is_row_processor") try_delattr("ibis_node") @@ -589,6 +590,11 @@ def try_delattr(attr): ibis_signature.output_type ) ) + func.bigframes_bigquery_function_output_dtype = ( + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_output_type_for_bqrf + ) + ) func.is_row_processor = is_row_processor func.ibis_node = node diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index ef2c81a953..c2809b96eb 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -56,8 +56,10 @@ class ReturnTypeMissingError(ValueError): # TODO: Move this to compile folder def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignature: if routine.return_type: - ibis_output_type = bigframes.core.compile.ibis_types.ibis_type_from_type_kind( - routine.return_type.type_kind + ibis_output_type = ( + bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( + routine.return_type + ) ) else: raise ReturnTypeMissingError @@ -82,8 +84,8 @@ def ibis_signature_from_routine(routine: bigquery.Routine) -> 
_utils.IbisSignatu return _utils.IbisSignature( parameter_names=[arg.name for arg in routine.arguments], input_types=[ - bigframes.core.compile.ibis_types.ibis_type_from_type_kind( - arg.data_type.type_kind + bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( + arg.data_type ) if arg.data_type else None @@ -233,6 +235,8 @@ def func(*bigframes_args, **bigframes_kwargs): else ibis_signature.output_type ) + func.bigframes_bigquery_function_output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_signature.output_type) # type: ignore + func.is_row_processor = is_row_processor # type: ignore func.ibis_node = node # type: ignore return func diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 90df6f9539..658818b261 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -25,7 +25,6 @@ import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression import numpy as np import pandas as pd -import sklearn.metrics as sklearn_metrics # type: ignore from bigframes.ml import utils import bigframes.pandas as bpd @@ -176,9 +175,9 @@ def auc( ) -> float: x_series, y_series = utils.batch_convert_to_series(x, y) - # TODO(b/286410053) Support ML exceptions and error handling. 
- auc = sklearn_metrics.auc(x_series.to_pandas(), y_series.to_pandas()) - return auc + x_pandas = x_series.to_pandas() + y_pandas = y_series.to_pandas() + return vendored_metrics_ranking.auc(x_pandas, y_pandas) auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e4e4bf7ef3..7e6f1f793c 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -184,6 +184,7 @@ from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op from bigframes.operations.timedelta_ops import ( + timedelta_floor_op, timestamp_add_op, timestamp_sub_op, ToTimedeltaOp, @@ -259,6 +260,7 @@ "second_op", "normalize_op", # Timedelta ops + "timedelta_floor_op", "timestamp_add_op", "timestamp_sub_op", "ToTimedeltaOp", diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index e9d102b42d..bf6016bb2e 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -142,13 +142,16 @@ class SumOp(UnaryAggregateOp): name: ClassVar[str] = "sum" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if not dtypes.is_numeric(input_types[0]): - raise TypeError(f"Type {input_types[0]} is not numeric") - if pd.api.types.is_bool_dtype(input_types[0]): - return dtypes.INT_DTYPE - else: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + + if dtypes.is_numeric(input_types[0]): + if pd.api.types.is_bool_dtype(input_types[0]): + return dtypes.INT_DTYPE return input_types[0] + raise TypeError(f"Type {input_types[0]} is not numeric or timedelta") + @dataclasses.dataclass(frozen=True) class MedianOp(UnaryAggregateOp): @@ -171,6 +174,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) class 
QuantileOp(UnaryAggregateOp): q: float + should_floor_result: bool = False @property def name(self): @@ -181,6 +185,8 @@ def order_independent(self) -> bool: return True def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -224,7 +230,11 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class MeanOp(UnaryAggregateOp): name: ClassVar[str] = "mean" + should_floor_result: bool = False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -262,7 +272,12 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class StdOp(UnaryAggregateOp): name: ClassVar[str] = "std" + should_floor_result: bool = False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + return signatures.FixedOutputType( dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" ).output_type(input_types[0]) diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 1daacf4e6b..c9ce633cae 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -31,7 +31,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." 
+ f" Received type: {input_type}" ) return input_type @@ -46,7 +46,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return pd.ArrowDtype( @@ -63,7 +63,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return pd.ArrowDtype( @@ -79,7 +79,7 @@ def output_type(self, *input_types): input_type = input_types[0] if input_type != dtypes.STRING_DTYPE: raise TypeError( - "Input type must be an valid JSON-formatted string type." + "Input type must be a valid JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.JSON_DTYPE @@ -93,7 +93,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE @@ -109,7 +109,7 @@ def output_type(self, *input_types): right_type = input_types[1] if not dtypes.is_json_like(left_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." 
+ f" Received type: {left_type}" ) if not dtypes.is_json_encoding_type(right_type): @@ -130,7 +130,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index 5b738c0bb5..8505fd1607 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -15,7 +15,6 @@ import dataclasses import typing -from bigframes import dtypes from bigframes.operations import base_ops @@ -31,17 +30,10 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. 
- return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -55,17 +47,10 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. - return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -79,14 +64,7 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. 
- return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 689966e21b..364154f728 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -36,7 +36,26 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) -class TimestampAdd(base_ops.BinaryOp): +class TimedeltaFloorOp(base_ops.UnaryOp): + """Floors the numeric value to the nearest integer and use it to represent a timedelta. + + This operator is only meant to be used during expression tree rewrites. Do not use it anywhere else! + """ + + name: typing.ClassVar[str] = "timedelta_floor" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + input_type = input_types[0] + if dtypes.is_numeric(input_type) or input_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"unsupported type: {input_type}") + + +timedelta_floor_op = TimedeltaFloorOp() + + +@dataclasses.dataclass(frozen=True) +class TimestampAddOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_add" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -57,10 +76,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_add_op = TimestampAdd() +timestamp_add_op = TimestampAddOp() -class TimestampSub(base_ops.BinaryOp): +class TimestampSubOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_sub" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -76,4 +95,4 @@ def output_type(self, *input_types: 
dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_sub_op = TimestampSub() +timestamp_sub_op = TimestampSubOp() diff --git a/bigframes/series.py b/bigframes/series.py index fe2d1aae0e..5a84dee32f 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1545,9 +1545,12 @@ def apply( ops.RemoteFunctionOp(func=func, apply_on_null=True) ) - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended to + # be an array, reconstruct the array from the string assuming it is a + # json serialized form of the array. + if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1585,9 +1588,12 @@ def combine( other, ops.BinaryRemoteFunctionOp(func=func) ) - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended to + # be an array, reconstruct the array from the string assuming it is a + # json serialized form of the array. 
+ if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1812,7 +1818,9 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: + if copy is False: + raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) __array__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__array__) diff --git a/bigframes/version.py b/bigframes/version.py index 27dfb23603..762deda9ff 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.37.0" +__version__ = "1.38.0" diff --git a/noxfile.py b/noxfile.py index b851bf160d..bffb6ebaa0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -72,7 +72,9 @@ UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = [] -UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]} +UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.12": ["polars", "scikit-learn"], +} # 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. 
For more information, search @@ -96,8 +98,13 @@ ] SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = [] SYSTEM_TEST_DEPENDENCIES: List[str] = [] -SYSTEM_TEST_EXTRAS: List[str] = ["tests"] -SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} +SYSTEM_TEST_EXTRAS: List[str] = [] +SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.9": ["tests"], + "3.10": ["tests"], + "3.12": ["tests", "scikit-learn"], + "3.13": ["tests"], +} LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -468,8 +475,7 @@ def cover(session): @nox.session(python=DEFAULT_PYTHON_VERSION) def docs(session): """Build the docs for this library.""" - - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -510,7 +516,7 @@ def docs(session): def docfx(session): """Build the docfx yaml files for this library.""" - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -652,6 +658,8 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( if match.group(1) not in already_installed ] + print(already_installed) + # We use --no-deps to ensure that pre-release versions aren't overwritten # by the version ranges in setup.py. session.install(*deps) diff --git a/samples/snippets/bigquery_modules_test.py b/samples/snippets/bigquery_modules_test.py new file mode 100644 index 0000000000..1a15790815 --- /dev/null +++ b/samples/snippets/bigquery_modules_test.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_examples() -> None: + # [START bigquery_dataframes_bigquery_methods_struct] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # Create a new STRUCT Series with subfields for each column in a DataFrames. + lengths = bbq.struct( + bq_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + lengths.peek() + # 146 {'culmen_length_mm': 51.1, 'culmen_depth_mm': ... + # 278 {'culmen_length_mm': 48.2, 'culmen_depth_mm': ... + # 337 {'culmen_length_mm': 36.4, 'culmen_depth_mm': ... + # 154 {'culmen_length_mm': 46.5, 'culmen_depth_mm': ... + # 185 {'culmen_length_mm': 50.1, 'culmen_depth_mm': ... + # dtype: struct[pyarrow] + # [END bigquery_dataframes_bigquery_methods_struct] + + # [START bigquery_dataframes_bigquery_methods_scalar] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + + # The sql_scalar function can be used to inject SQL syntax that is not supported + # or difficult to express with the bigframes.pandas APIs. 
+ bq_df = bpd.read_gbq(query_or_table) + shortest = bbq.sql_scalar( + "LEAST({0}, {1}, {2})", + columns=[ + bq_df["culmen_depth_mm"], + bq_df["culmen_length_mm"], + bq_df["flipper_length_mm"], + ], + ) + + shortest.peek() + # 0 + # 149 18.9 + # 33 16.3 + # 296 17.2 + # 287 17.0 + # 307 15.0 + # dtype: Float64 + # [END bigquery_dataframes_bigquery_methods_scalar] + assert bq_df is not None + assert lengths is not None + assert shortest is not None diff --git a/samples/snippets/limit_single_timeseries_forecasting_model_test.py b/samples/snippets/limit_single_timeseries_forecasting_model_test.py new file mode 100644 index 0000000000..6a9f14e383 --- /dev/null +++ b/samples/snippets/limit_single_timeseries_forecasting_model_test.py @@ -0,0 +1,64 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (t +# you may not use this file except in compliance wi +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in +# distributed under the License is distributed on a +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, eit +# See the License for the specific language governi +# limitations under the License. 
+ + +def test_limit_single_timeseries(random_model_id: str) -> None: + your_model_id = random_model_id + + # [START bigquery_dataframes_bqml_limit_forecast_visualize] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date"]).count() + + num_trips.plot.line() + # [END bigquery_dataframes_bqml_limit_forecast_visualize] + + # [START bigquery_dataframes_bqml_limit_forecast_create] + from bigframes.ml import forecasting + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "start_station_id": df["start_station_id"], + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date", "start_station_id"], as_index=False).count() + model = forecasting.ARIMAPlus() + + X = num_trips[["date"]] + y = num_trips[["num_trips"]] + id_col = num_trips[["start_station_id"]] + + model.fit(X, y, id_col=id_col) + + model.to_gbq( + your_model_id, # For example: "bqml_tutorial.nyc_citibike_arima_model", + replace=True, + ) + # [END bigquery_dataframes_bqml_limit_forecast_create] + assert df is not None + assert features is not None + assert num_trips is not None diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 034a266177..6dea10b608 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -19,6 +19,8 @@ from . 
import publish_api_coverage +pytest.importorskip("sklearn") + @pytest.fixture def api_coverage_df(): diff --git a/setup.py b/setup.py index 4386177a5e..1f6114b634 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,6 @@ "pyarrow >=10.0.1", "pydata-google-auth >=1.8.2", "requests >=2.27.1", - "scikit-learn >=1.2.2", - "sqlalchemy >=1.4,<3.0dev", "sqlglot >=23.6.3", "tabulate >=0.9", "ipywidgets >=7.7.1", @@ -77,8 +75,15 @@ "tests": [], # used for local engine, which is only needed for unit tests at present. "polars": ["polars >= 1.7.0"], + "scikit-learn": ["scikit-learn>=1.2.2"], # Packages required for basic development flow. - "dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"], + "dev": [ + "pytest", + "pytest-mock", + "pre-commit", + "nox", + "google-cloud-testutils", + ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 8b7ad892c0..30d5c1c3a7 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -20,7 +20,6 @@ pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 -sqlalchemy==1.4 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index e4bff8cdcc..f69f08b1ae 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -251,6 +251,11 @@ def table_id_unique(dataset_id: str): return f"{dataset_id}.{prefixer.create_prefix()}" +@pytest.fixture(scope="function") +def routine_id_unique(dataset_id: str): + return f"{dataset_id}.{prefixer.create_prefix()}" + + @pytest.fixture(scope="session") def scalars_schema(bigquery_client: bigquery.Client): # TODO(swast): Add missing scalar data types such as BIGNUMERIC. 
diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 54ba0549a0..7363e370bb 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -2193,6 +2193,10 @@ def foo(x, y, z): ) ) ) + assert ( + getattr(foo, "bigframes_bigquery_function_output_dtype") + == bigframes.dtypes.STRING_DTYPE + ) # Fails to apply on dataframe with incompatible number of columns with pytest.raises( diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 0dc8960f62..99a017c917 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -14,6 +14,7 @@ import inspect import re +import textwrap import google.api_core.exceptions from google.cloud import bigquery @@ -27,6 +28,7 @@ import bigframes.exceptions from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff +import bigframes.session._io.bigquery from tests.system.utils import assert_pandas_df_equal _prefixer = test_utils.prefixer.Prefixer("bigframes", "") @@ -632,7 +634,6 @@ def add_one(x): )(add_one) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_detects_invalid_function(session, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) with pytest.raises(ValueError) as e: @@ -705,21 +706,133 @@ def square1(x): assert_pandas_df_equal(s1_result.to_pandas(), s2_result.to_pandas()) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_runs_existing_udf(session): func = session.read_gbq_function("bqutil.fn.cw_lower_case_ascii_only") got = func("AURÉLIE") assert got == "aurÉlie" -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_runs_existing_udf_4_params(session): func = session.read_gbq_function("bqutil.fn.cw_instr4") got = func("TestStr123456Str", 
"Str", 1, 2) assert got == 14 -@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_unique): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING) + RETURNS ARRAY + AS ( + [x, x] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello") + assert got == ["hello", "hello"] + + # Test on a series, assert pandas parity + pd_s = pd.Series(["alpha", "beta", "gamma"]) + bf_s = session.read_pandas(pd_s) + pd_result = pd_s.apply(func) + bf_result = bf_s.apply(func) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_read_gbq_function_runs_existing_udf_2_params_array_output( + session, routine_id_unique +): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING, y STRING) + RETURNS ARRAY + AS ( + [x, y] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello", "world") + assert got == ["hello", "world"] + + # Test on series, assert pandas parity + pd_df = pd.DataFrame( + {"col0": ["alpha", "beta", "gamma"], "col1": ["delta", "theta", "phi"]} + ) + bf_df = session.read_pandas(pd_df) + pd_result = pd_df["col0"].combine(pd_df["col1"], func) + bf_result = bf_df["col0"].combine(bf_df["col1"], func) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_read_gbq_function_runs_existing_udf_4_params_array_output( + 
session, routine_id_unique +): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING, y BOOL, z INT64, w FLOAT64) + RETURNS ARRAY + AS ( + [x, CAST(y AS STRING), CAST(z AS STRING), CAST(w AS STRING)] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello", True, 1, 2.3) + assert got == ["hello", "true", "1", "2.3"] + + # Test on a dataframe, assert pandas parity + pd_df = pd.DataFrame( + { + "col0": ["alpha", "beta", "gamma"], + "col1": [True, False, True], + "col2": [1, 2, 3], + "col3": [4.5, 6, 7.75], + } + ) + bf_df = session.read_pandas(pd_df) + # Simulate the result directly, since the function cannot be applied + # directly on a pandas dataframe with axis=1, as this is a special type of + # function with multiple params supported only on bigframes dataframe. + pd_result = pd.Series( + [ + ["alpha", "true", "1", "4.5"], + ["beta", "false", "2", "6"], + ["gamma", "true", "3", "7.75"], + ] + ) + bf_result = bf_df.apply(func, axis=1) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( @@ -754,6 +867,10 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): assert square.bigframes_remote_function == str(routine.reference) assert square.input_dtypes == (bigframes.dtypes.INT_DTYPE,) assert square.output_dtype == bigframes.dtypes.INT_DTYPE + assert ( + square.bigframes_bigquery_function_output_dtype + == bigframes.dtypes.INT_DTYPE + ) src = {"x": [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]} @@ -772,7 +889,6 @@ def 
test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_requires_explicit_types( session, bigquery_client, dataset_id ): @@ -863,7 +979,6 @@ def test_read_gbq_function_requires_explicit_types( ), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_respects_python_output_type( request, session_fixture, bigquery_client, dataset_id, array_type, expected_data ): @@ -906,7 +1021,6 @@ def test_read_gbq_function_respects_python_output_type( pytest.param(list[str], id="list-str"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( session, bigquery_client, dataset_id, array_type ): @@ -945,7 +1059,6 @@ def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( pytest.param(list[str], id="list-str"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_supported_python_output_type( session, bigquery_client, dataset_id, array_type ): @@ -992,7 +1105,6 @@ def test_df_apply_scalar_func(session, scalars_dfs): ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_multiple_inputs_not_a_row_processor(session): with pytest.raises(ValueError) as context: # The remote function has two args, which cannot be row processed. 
Throw @@ -1214,20 +1326,19 @@ def should_mask(name: str) -> bool: repr(s.mask(should_mask, "REDACTED")) -@pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_application_repr(session, dataset_id, scalars_df_index): - gbq_function = f"{dataset_id}.should_mask" - +def test_read_gbq_function_application_repr( + session, routine_id_unique, scalars_df_index +): # This function deliberately has a param with name "name", this is to test # a specific ibis' internal handling of object names session.bqclient.query_and_wait( - f"CREATE OR REPLACE FUNCTION `{gbq_function}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" + f"CREATE OR REPLACE FUNCTION `{routine_id_unique}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" ) - routine = session.bqclient.get_routine(gbq_function) + routine = session.bqclient.get_routine(routine_id_unique) assert "name" in [arg.name for arg in routine.arguments] # read the function and apply to dataframe - should_mask = session.read_gbq_function(gbq_function) + should_mask = session.read_gbq_function(routine_id_unique) s = scalars_df_index["string_col"] diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index 81e1b2f77f..b80202bdbe 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd import pytest -import sklearn.metrics as sklearn_metrics # type: ignore import bigframes from bigframes.ml import metrics @@ -66,6 +65,7 @@ def test_r2_score_force_finite(session): def test_r2_score_ok_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -113,6 +113,7 @@ def test_accuracy_score_not_normailze(session): def test_accuracy_score_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = 
pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -203,6 +204,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): def test_roc_curve_binary_classification_prediction_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -294,6 +296,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): def test_roc_curve_binary_classification_decision_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") # Instead of operating on probabilities, assume a 70% decision threshold # has been applied, and operate on the final output y_score = [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45] @@ -420,6 +423,7 @@ def test_roc_auc_score_returns_expected(session): def test_roc_auc_score_returns_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -525,6 +529,7 @@ def test_confusion_matrix_column_index(session): def test_confusion_matrix_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 3, 3, 3, 4, 1], @@ -543,6 +548,7 @@ def test_confusion_matrix_matches_sklearn(session): def test_confusion_matrix_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -603,6 +609,7 @@ def test_recall_score(session): def test_recall_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -620,6 +627,7 @@ def test_recall_score_matches_sklearn(session): def test_recall_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { 
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -673,6 +681,7 @@ def test_precision_score(session): def test_precision_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -695,6 +704,7 @@ def test_precision_score_matches_sklearn(session): def test_precision_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -752,6 +762,7 @@ def test_f1_score(session): def test_f1_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -769,6 +780,7 @@ def test_f1_score_matches_sklearn(session): def test_f1_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 356000b3f6..723481b1d1 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -465,3 +465,49 @@ def test_timedelta_ordering(session): pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) + + +def test_timedelta_cumsum(temporal_dfs): + bf_df, pd_df = temporal_dfs + + actual_result = bf_df["timedelta_col_1"].cumsum().to_pandas() + + expected_result = pd_df["timedelta_col_1"].cumsum() + _assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize( + "agg_func", + [ + pytest.param(lambda x: x.min(), id="min"), + pytest.param(lambda x: x.max(), id="max"), + pytest.param(lambda x: x.sum(), id="sum"), + pytest.param(lambda x: x.mean(), id="mean"), + pytest.param(lambda x: x.median(), id="median"), + pytest.param(lambda x: x.quantile(0.5), id="quantile"), + 
pytest.param(lambda x: x.std(), id="std"), + ], +) +def test_timedelta_agg__timedelta_result(temporal_dfs, agg_func): + bf_df, pd_df = temporal_dfs + + actual_result = agg_func(bf_df["timedelta_col_1"]) + + expected_result = agg_func(pd_df["timedelta_col_1"]).floor("us") + assert actual_result == expected_result + + +@pytest.mark.parametrize( + "agg_func", + [ + pytest.param(lambda x: x.count(), id="count"), + pytest.param(lambda x: x.nunique(), id="nunique"), + ], +) +def test_timedelta_agg__int_result(temporal_dfs, agg_func): + bf_df, pd_df = temporal_dfs + + actual_result = agg_func(bf_df["timedelta_col_1"]) + + expected_result = agg_func(pd_df["timedelta_col_1"]) + assert actual_result == expected_result diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 4b4264e33c..da78432cdb 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -829,3 +829,18 @@ def test_to_timedelta_with_bf_series_invalid_unit(session, unit): @pytest.mark.parametrize("input", [1, 1.2, "1s"]) def test_to_timedelta_non_bf_series(input): assert bpd.to_timedelta(input) == pd.to_timedelta(input) + + +def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs): + bf_df, pd_df = scalars_dfs + bf_series = bpd.to_timedelta(bf_df["int64_too"], unit="us") + pd_series = pd.to_timedelta(pd_df["int64_too"], unit="us") + + actual_result = ( + bpd.to_timedelta(bf_series, unit="s").to_pandas().astype("timedelta64[ns]") + ) + + expected_result = pd.to_timedelta(pd_series, unit="s") + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 00f47c754e..2daa7dd825 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -641,6 +641,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict): ), ) def test_series_interpolate(method): + pytest.importorskip("scipy") + 
values = [None, 1, 2, None, None, 16, None] index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] pd_series = pd.Series(values, index) diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index 413a694680..d377fb4d49 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -66,6 +66,12 @@ def test_supported_types_correspond(): ibis_types_from_bigquery = { third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) for tk in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + # TODO(b/284515241): ARRAY is the only exception because it is supported + # as an output type of the BQ routine in the read_gbq_function path but + # not in the remote function path. Remove this handline once BQ remote + # functions supports ARRAY output and the bigframes remote functions + # utilizes that to support array output. + if tk != "ARRAY" } assert ibis_types_from_python == ibis_types_from_bigquery diff --git a/tests/unit/ml/test_api_primitives.py b/tests/unit/ml/test_api_primitives.py index 00a51ccfe9..dd2ceff143 100644 --- a/tests/unit/ml/test_api_primitives.py +++ b/tests/unit/ml/test_api_primitives.py @@ -13,8 +13,6 @@ # limitations under the License. 
import pytest -import sklearn.decomposition as sklearn_decomposition # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore import bigframes.ml.decomposition import bigframes.ml.linear_model @@ -35,8 +33,9 @@ def test_base_estimator_repr(): assert pca_estimator.__repr__() == "PCA(n_components=7)" -@pytest.mark.skipif(sklearn_linear_model is None, reason="requires sklearn") def test_base_estimator_repr_matches_sklearn(): + sklearn_decomposition = pytest.importorskip("sklearn.decomposition") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") estimator = bigframes.ml.linear_model.LinearRegression() sklearn_estimator = sklearn_linear_model.LinearRegression() assert estimator.__repr__() == sklearn_estimator.__repr__() diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 395296f3e4..450ce8d6ee 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -15,8 +15,6 @@ from google.cloud import bigquery import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, preprocessing from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer @@ -119,6 +117,8 @@ def test_columntransformer_repr(): def test_columntransformer_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_column_transformer = compose.ColumnTransformer( [ ( diff --git a/tests/unit/ml/test_pipeline.py b/tests/unit/ml/test_pipeline.py index ed5c621b1d..beebb9f282 100644 --- a/tests/unit/ml/test_pipeline.py +++ b/tests/unit/ml/test_pipeline.py @@ -13,10 +13,6 @@ # limitations under the License. 
import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore -import sklearn.pipeline as sklearn_pipeline # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing @@ -57,8 +53,11 @@ def test_pipeline_repr(): ) -@pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn") def test_pipeline_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") + sklearn_pipeline = pytest.importorskip("sklearn.pipeline") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_pl = pipeline.Pipeline( [ ( diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f5aa23d00b..e296dcb9f6 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -7179,7 +7179,7 @@ def __len__(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def __array__(self): + def __array__(self, dtype=None, copy: Optional[bool] = None): """ Returns the rows as NumPy array. @@ -7210,6 +7210,8 @@ def __array__(self): dtype (str or numpy.dtype, optional): The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy (bool or None, optional): + Whether to copy the data, False is not supported. 
Returns: numpy.ndarray: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 57f7dfbb79..5e6f546d09 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -5941,7 +5941,7 @@ def size(self) -> int: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: """ Returns the values as NumPy array. @@ -5965,6 +5965,8 @@ def __array__(self, dtype=None) -> numpy.ndarray: dtype (str or numpy.dtype, optional): The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy (bool or None, optional): + Whether to copy the data, False is not supported. Returns: numpy.ndarray: diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 7b97526de2..9262ffbd3d 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -16,6 +16,8 @@ # Michal Karbownik # License: BSD 3 clause +import numpy as np + from bigframes import constants @@ -60,7 +62,23 @@ def auc(x, y) -> float: Returns: float: Area Under the Curve. 
""" - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + if len(x) < 2: + raise ValueError( + f"At least 2 points are needed to compute area under curve, but x.shape = {len(x)}" + ) + + if x.is_monotonic_decreasing: + d = -1 + elif x.is_monotonic_increasing: + d = 1 + else: + raise ValueError(f"x is neither increasing nor decreasing : {x}.") + + if hasattr(np, "trapezoid"): + # new in numpy 2.0 + return d * np.trapezoid(y, x) + # np.trapz has been deprecated in 2.0 + return d * np.trapz(y, x) # type: ignore def roc_auc_score(y_true, y_score) -> float: diff --git a/third_party/bigframes_vendored/tpch/queries/q9.py b/third_party/bigframes_vendored/tpch/queries/q9.py index 6af33f7569..5c9ca1e9c3 100644 --- a/third_party/bigframes_vendored/tpch/queries/q9.py +++ b/third_party/bigframes_vendored/tpch/queries/q9.py @@ -33,13 +33,17 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) q_final = ( - part.merge(partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY") - .merge(supplier, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") - .merge( + part.merge( lineitem, - left_on=["P_PARTKEY", "PS_SUPPKEY"], - right_on=["L_PARTKEY", "L_SUPPKEY"], + left_on="P_PARTKEY", + right_on="L_PARTKEY", + ) + .merge( + partsupp, + left_on=["L_SUPPKEY", "L_PARTKEY"], + right_on=["PS_SUPPKEY", "PS_PARTKEY"], ) + .merge(supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY") .merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY") .merge(nation, left_on="S_NATIONKEY", right_on="N_NATIONKEY") ) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 27dfb23603..762deda9ff 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.37.0" +__version__ = "1.38.0" From faa4d6b29c94477da783268b0243112c7eae2d22 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 24 Feb 2025 22:06:49 +0000 Subject: [PATCH 20/75] Merge branch 'main' into b338873783-matrix-factorization --- CHANGELOG.md | 32 ++++ bigframes/core/compile/aggregate_compiler.py | 14 +- bigframes/core/compile/ibis_types.py | 17 ++- bigframes/core/compile/scalar_op_compiler.py | 5 + bigframes/core/rewrite/timedeltas.py | 55 ++++++- bigframes/core/schema.py | 6 +- bigframes/dataframe.py | 13 +- bigframes/dtypes.py | 1 + bigframes/functions/_function_session.py | 6 + bigframes/functions/function.py | 12 +- bigframes/ml/metrics/_metrics.py | 7 +- bigframes/operations/__init__.py | 2 + bigframes/operations/aggregations.py | 25 ++- bigframes/operations/json_ops.py | 14 +- bigframes/operations/remote_function_ops.py | 40 ++--- bigframes/operations/timedelta_ops.py | 27 +++- bigframes/series.py | 22 ++- bigframes/version.py | 2 +- noxfile.py | 20 ++- samples/snippets/bigquery_modules_test.py | 69 +++++++++ ...ingle_timeseries_forecasting_model_test.py | 64 ++++++++ scripts/test_publish_api_coverage.py | 2 + setup.py | 11 +- testing/constraints-3.9.txt | 1 - tests/system/conftest.py | 5 + .../large/functions/test_remote_function.py | 4 + .../small/functions/test_remote_function.py | 143 ++++++++++++++++-- tests/system/small/ml/test_metrics.py | 14 +- .../small/operations/test_timedeltas.py | 46 ++++++ tests/system/small/test_pandas.py | 15 ++ tests/system/small/test_series.py | 2 + tests/unit/functions/test_remote_function.py | 6 + tests/unit/ml/test_api_primitives.py | 5 +- tests/unit/ml/test_compose.py | 4 +- tests/unit/ml/test_pipeline.py | 9 +- .../bigframes_vendored/pandas/core/frame.py | 4 +- .../bigframes_vendored/pandas/core/series.py | 4 +- .../sklearn/metrics/_ranking.py | 20 ++- .../bigframes_vendored/tpch/queries/q9.py | 14 +- third_party/bigframes_vendored/version.py | 2 +- 40 files changed, 635 insertions(+), 129 
deletions(-) create mode 100644 samples/snippets/bigquery_modules_test.py create mode 100644 samples/snippets/limit_single_timeseries_forecasting_model_test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b301f85a6a..24a1d8cb62 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,38 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.38.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.37.0...v1.38.0) (2025-02-24) + + +### Features + +* (Preview) Support diff aggregation for timestamp series. ([#1405](https://github.com/googleapis/python-bigquery-dataframes/issues/1405)) ([abe48d6](https://github.com/googleapis/python-bigquery-dataframes/commit/abe48d6f13a954534460fa14c9337e1085d9fbb3)) +* Add `GeoSeries.from_wkt() `and `GeoSeries.to_wkt()` ([#1401](https://github.com/googleapis/python-bigquery-dataframes/issues/1401)) ([2993b28](https://github.com/googleapis/python-bigquery-dataframes/commit/2993b283966960430ad8482f40f177e276db2d64)) +* Support DF.__array__(copy=True) ([#1403](https://github.com/googleapis/python-bigquery-dataframes/issues/1403)) ([693ed8c](https://github.com/googleapis/python-bigquery-dataframes/commit/693ed8cfb1ecc3af161801225d3e9cda489c29dd)) +* Support routines with ARRAY return type in `read_gbq_function` ([#1412](https://github.com/googleapis/python-bigquery-dataframes/issues/1412)) ([4b60049](https://github.com/googleapis/python-bigquery-dataframes/commit/4b60049e8362bfb07c136d8b2eb02b984d71f084)) + + +### Bug Fixes + +* Calling to_timdelta() over timedeltas no longer changes their values ([#1411](https://github.com/googleapis/python-bigquery-dataframes/issues/1411)) ([650a190](https://github.com/googleapis/python-bigquery-dataframes/commit/650a1907fdf84897eb7aa288863ee27d938e0879)) +* Replace empty dict with None to avoid mutable default arguments ([#1416](https://github.com/googleapis/python-bigquery-dataframes/issues/1416)) 
([fa4e3ad](https://github.com/googleapis/python-bigquery-dataframes/commit/fa4e3ad8bcd5db56fa26b26609cc7e58b1edf498)) + + +### Performance Improvements + +* Avoid redundant SQL casts ([#1399](https://github.com/googleapis/python-bigquery-dataframes/issues/1399)) ([6ee48d5](https://github.com/googleapis/python-bigquery-dataframes/commit/6ee48d5c16870f1caa99c3f658c2c1a0e14be749)) + + +### Dependencies + +* Remove scikit-learn and sqlalchemy as required dependencies ([#1296](https://github.com/googleapis/python-bigquery-dataframes/issues/1296)) ([fd8bc89](https://github.com/googleapis/python-bigquery-dataframes/commit/fd8bc894bdbdf551ebbec1fb93832588371ae6af)) + + +### Documentation + +* Add samples using SQL methods via the `bigframes.bigquery` module ([#1358](https://github.com/googleapis/python-bigquery-dataframes/issues/1358)) ([f54e768](https://github.com/googleapis/python-bigquery-dataframes/commit/f54e7688fda6372c6decc9b61796b0272d803c79)) +* Add snippets for visualizing a time series and creating a time series model for the Limit forecasted values in time series model tutorial ([#1310](https://github.com/googleapis/python-bigquery-dataframes/issues/1310)) ([c6c9120](https://github.com/googleapis/python-bigquery-dataframes/commit/c6c9120e839647e5b3cb97f04a8d90cc8690b8a3)) + ## [1.37.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.36.0...v1.37.0) (2025-02-19) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 4ec0b270ed..a17b69815c 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -231,7 +231,11 @@ def _( column: ibis_types.NumericColumn, window=None, ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.quantile(op.q), window) + result = column.quantile(op.q) + if op.should_floor_result: + result = result.floor() # type:ignore + + return _apply_window_if_present(result, window) 
@compile_unary_agg.register @@ -242,7 +246,8 @@ def _( window=None, # order_by: typing.Sequence[ibis_types.Value] = [], ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.mean(), window) + result = column.mean().floor() if op.should_floor_result else column.mean() + return _apply_window_if_present(result, window) @compile_unary_agg.register @@ -306,10 +311,11 @@ def _( @numeric_op def _( op: agg_ops.StdOp, - x: ibis_types.Column, + x: ibis_types.NumericColumn, window=None, ) -> ibis_types.Value: - return _apply_window_if_present(cast(ibis_types.NumericColumn, x).std(), window) + result = x.std().floor() if op.should_floor_result else x.std() + return _apply_window_if_present(result, window) @compile_unary_agg.register diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 2dcc1b3c8a..c47c6cf07b 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -463,10 +463,19 @@ def ibis_array_output_type_from_python_type(t: type) -> ibis_dtypes.DataType: return python_type_to_ibis_type(t) -def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> ibis_dtypes.DataType: +def ibis_type_from_bigquery_type( + type_: bigquery.StandardSqlDataType, +) -> ibis_dtypes.DataType: """Convert bq type to ibis. 
Only to be used for remote functions, does not handle all types.""" - if tk not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: + if type_.type_kind not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: raise UnsupportedTypeError( - tk, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + type_.type_kind, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + ) + elif type_.type_kind == "ARRAY": + return ibis_dtypes.Array( + value_type=ibis_type_from_bigquery_type( + typing.cast(bigquery.StandardSqlDataType, type_.array_element_type) + ) ) - return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) + else: + return third_party_ibis_bqtypes.BigQueryType.to_ibis(type_.type_kind) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 923ec8c81d..7111406646 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1186,6 +1186,11 @@ def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp): ).floor() +@scalar_op_compiler.register_unary_op(ops.timedelta_floor_op) +def timedelta_floor_op_impl(x: ibis_types.NumericValue): + return x.floor() + + @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): ibis_node = getattr(op.func, "ibis_node", None) diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index dad474e5a1..e21e0b6bf2 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -70,6 +70,19 @@ def rewrite_timedelta_expressions(root: nodes.BigFrameNode) -> nodes.BigFrameNod root.skip_reproject_unsafe, ) + if isinstance(root, nodes.AggregateNode): + updated_aggregations = tuple( + (_rewrite_aggregation(agg, root.child.schema), col_id) + for agg, col_id in root.aggregations + ) + return nodes.AggregateNode( + root.child, + updated_aggregations, + root.by_column_ids, + 
root.order_by, + root.dropna, + ) + return root @@ -125,6 +138,9 @@ def _rewrite_op_expr( # but for timedeltas: int(timedelta) // float => int(timedelta) return _rewrite_floordiv_op(inputs[0], inputs[1]) + if isinstance(expr.op, ops.ToTimedeltaOp): + return _rewrite_to_timedelta_op(expr.op, inputs[0]) + return _TypedExpr.create_op_expr(expr.op, *inputs) @@ -154,9 +170,9 @@ def _rewrite_mul_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.mul_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) if dtypes.is_numeric(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -165,7 +181,7 @@ def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.div_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result @@ -174,28 +190,53 @@ def _rewrite_floordiv_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result = _TypedExpr.create_op_expr(ops.floordiv_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): - return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result) return result +def _rewrite_to_timedelta_op(op: ops.ToTimedeltaOp, arg: _TypedExpr): + if arg.dtype is dtypes.TIMEDELTA_DTYPE: + # Do nothing for values that are already timedeltas + return arg + + return _TypedExpr.create_op_expr(op, arg) + + @functools.cache def _rewrite_aggregation( aggregation: 
ex.Aggregation, schema: schema.ArraySchema ) -> ex.Aggregation: if not isinstance(aggregation, ex.UnaryAggregation): return aggregation - if not isinstance(aggregation.op, aggs.DiffOp): - return aggregation if isinstance(aggregation.arg, ex.DerefOp): input_type = schema.get_type(aggregation.arg.id.sql) else: input_type = aggregation.arg.dtype - if dtypes.is_datetime_like(input_type): + if isinstance(aggregation.op, aggs.DiffOp) and dtypes.is_datetime_like(input_type): return ex.UnaryAggregation( aggs.TimeSeriesDiffOp(aggregation.op.periods), aggregation.arg ) + if isinstance(aggregation.op, aggs.StdOp) and input_type is dtypes.TIMEDELTA_DTYPE: + return ex.UnaryAggregation( + aggs.StdOp(should_floor_result=True), aggregation.arg + ) + + if isinstance(aggregation.op, aggs.MeanOp) and input_type is dtypes.TIMEDELTA_DTYPE: + return ex.UnaryAggregation( + aggs.MeanOp(should_floor_result=True), aggregation.arg + ) + + if ( + isinstance(aggregation.op, aggs.QuantileOp) + and input_type is dtypes.TIMEDELTA_DTYPE + ): + return ex.UnaryAggregation( + aggs.QuantileOp(q=aggregation.op.q, should_floor_result=True), + aggregation.arg, + ) + return aggregation diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py index e3808dfffd..c379db72be 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -41,8 +41,12 @@ class ArraySchema: def from_bq_table( cls, table: google.cloud.bigquery.Table, - column_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = {}, + column_type_overrides: typing.Optional[ + typing.Dict[str, bigframes.dtypes.Dtype] + ] = None, ): + if column_type_overrides is None: + column_type_overrides = {} items = tuple( SchemaItem(name, column_type_overrides.get(name, dtype)) for name, dtype in bigframes.dtypes.bf_type_from_type_kind( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c02b182ee3..caf1b62e07 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3705,7 +3705,9 @@ def to_numpy( ) -> 
numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: + if copy is False: + raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) __array__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__array__) @@ -4086,9 +4088,12 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) result_series.name = None - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended + # to be an array, reconstruct the array from the string assuming it + # is a json serialized form of the array. + if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index e4db904210..54b621a0f8 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -874,4 +874,5 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: "INT64", "INTEGER", "STRING", + "ARRAY", } diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index a0518978a3..93b5c4c596 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -501,6 +501,7 @@ def try_delattr(attr): try_delattr("bigframes_remote_function") try_delattr("input_dtypes") try_delattr("output_dtype") + try_delattr("bigframes_bigquery_function_output_dtype") try_delattr("is_row_processor") try_delattr("ibis_node") @@ -589,6 +590,11 @@ def try_delattr(attr): ibis_signature.output_type ) ) + func.bigframes_bigquery_function_output_dtype = ( + 
bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_output_type_for_bqrf + ) + ) func.is_row_processor = is_row_processor func.ibis_node = node diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index ef2c81a953..c2809b96eb 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -56,8 +56,10 @@ class ReturnTypeMissingError(ValueError): # TODO: Move this to compile folder def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignature: if routine.return_type: - ibis_output_type = bigframes.core.compile.ibis_types.ibis_type_from_type_kind( - routine.return_type.type_kind + ibis_output_type = ( + bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( + routine.return_type + ) ) else: raise ReturnTypeMissingError @@ -82,8 +84,8 @@ def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignatu return _utils.IbisSignature( parameter_names=[arg.name for arg in routine.arguments], input_types=[ - bigframes.core.compile.ibis_types.ibis_type_from_type_kind( - arg.data_type.type_kind + bigframes.core.compile.ibis_types.ibis_type_from_bigquery_type( + arg.data_type ) if arg.data_type else None @@ -233,6 +235,8 @@ def func(*bigframes_args, **bigframes_kwargs): else ibis_signature.output_type ) + func.bigframes_bigquery_function_output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_signature.output_type) # type: ignore + func.is_row_processor = is_row_processor # type: ignore func.ibis_node = node # type: ignore return func diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 90df6f9539..658818b261 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -25,7 +25,6 @@ import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression import numpy as np import pandas as pd -import sklearn.metrics as sklearn_metrics # type: ignore from 
bigframes.ml import utils import bigframes.pandas as bpd @@ -176,9 +175,9 @@ def auc( ) -> float: x_series, y_series = utils.batch_convert_to_series(x, y) - # TODO(b/286410053) Support ML exceptions and error handling. - auc = sklearn_metrics.auc(x_series.to_pandas(), y_series.to_pandas()) - return auc + x_pandas = x_series.to_pandas() + y_pandas = y_series.to_pandas() + return vendored_metrics_ranking.auc(x_pandas, y_pandas) auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e4e4bf7ef3..7e6f1f793c 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -184,6 +184,7 @@ from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op from bigframes.operations.timedelta_ops import ( + timedelta_floor_op, timestamp_add_op, timestamp_sub_op, ToTimedeltaOp, @@ -259,6 +260,7 @@ "second_op", "normalize_op", # Timedelta ops + "timedelta_floor_op", "timestamp_add_op", "timestamp_sub_op", "ToTimedeltaOp", diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index e9d102b42d..bf6016bb2e 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -142,13 +142,16 @@ class SumOp(UnaryAggregateOp): name: ClassVar[str] = "sum" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if not dtypes.is_numeric(input_types[0]): - raise TypeError(f"Type {input_types[0]} is not numeric") - if pd.api.types.is_bool_dtype(input_types[0]): - return dtypes.INT_DTYPE - else: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + + if dtypes.is_numeric(input_types[0]): + if pd.api.types.is_bool_dtype(input_types[0]): + return dtypes.INT_DTYPE return input_types[0] + raise TypeError(f"Type {input_types[0]} is not numeric or timedelta") + 
@dataclasses.dataclass(frozen=True) class MedianOp(UnaryAggregateOp): @@ -171,6 +174,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) class QuantileOp(UnaryAggregateOp): q: float + should_floor_result: bool = False @property def name(self): @@ -181,6 +185,8 @@ def order_independent(self) -> bool: return True def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -224,7 +230,11 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class MeanOp(UnaryAggregateOp): name: ClassVar[str] = "mean" + should_floor_result: bool = False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -262,7 +272,12 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class StdOp(UnaryAggregateOp): name: ClassVar[str] = "std" + should_floor_result: bool = False + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + return signatures.FixedOutputType( dtypes.is_numeric, dtypes.FLOAT_DTYPE, "numeric" ).output_type(input_types[0]) diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 1daacf4e6b..c9ce633cae 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -31,7 +31,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." 
+ "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return input_type @@ -46,7 +46,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return pd.ArrowDtype( @@ -63,7 +63,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return pd.ArrowDtype( @@ -79,7 +79,7 @@ def output_type(self, *input_types): input_type = input_types[0] if input_type != dtypes.STRING_DTYPE: raise TypeError( - "Input type must be an valid JSON-formatted string type." + "Input type must be a valid JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.JSON_DTYPE @@ -93,7 +93,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE @@ -109,7 +109,7 @@ def output_type(self, *input_types): right_type = input_types[1] if not dtypes.is_json_like(left_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." 
+ f" Received type: {left_type}" ) if not dtypes.is_json_encoding_type(right_type): @@ -130,7 +130,7 @@ def output_type(self, *input_types): input_type = input_types[0] if not dtypes.is_json_like(input_type): raise TypeError( - "Input type must be an valid JSON object or JSON-formatted string type." + "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) return dtypes.STRING_DTYPE diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index 5b738c0bb5..8505fd1607 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -15,7 +15,6 @@ import dataclasses import typing -from bigframes import dtypes from bigframes.operations import base_ops @@ -31,17 +30,10 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. 
- return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -55,17 +47,10 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. - return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -79,14 +64,7 @@ def expensive(self) -> bool: def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - if hasattr(self.func, "output_dtype"): - if dtypes.is_array_like(self.func.output_dtype): - # TODO(b/284515241): remove this special handling to support - # array output types once BQ remote functions support ARRAY. - # Until then, use json serialized strings at the remote function - # level, and parse that to the intended output type at the - # bigframes level. 
- return dtypes.STRING_DTYPE - return self.func.output_dtype + if hasattr(self.func, "bigframes_bigquery_function_output_dtype"): + return self.func.bigframes_bigquery_function_output_dtype else: - raise AttributeError("output_dtype not defined") + raise AttributeError("bigframes_bigquery_function_output_dtype not defined") diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 689966e21b..364154f728 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -36,7 +36,26 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT @dataclasses.dataclass(frozen=True) -class TimestampAdd(base_ops.BinaryOp): +class TimedeltaFloorOp(base_ops.UnaryOp): + """Floors the numeric value to the nearest integer and use it to represent a timedelta. + + This operator is only meant to be used during expression tree rewrites. Do not use it anywhere else! + """ + + name: typing.ClassVar[str] = "timedelta_floor" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + input_type = input_types[0] + if dtypes.is_numeric(input_type) or input_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"unsupported type: {input_type}") + + +timedelta_floor_op = TimedeltaFloorOp() + + +@dataclasses.dataclass(frozen=True) +class TimestampAddOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_add" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -57,10 +76,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_add_op = TimestampAdd() +timestamp_add_op = TimestampAddOp() -class TimestampSub(base_ops.BinaryOp): +class TimestampSubOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_sub" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: @@ -76,4 +95,4 @@ def output_type(self, *input_types: 
dtypes.ExpressionType) -> dtypes.ExpressionT ) -timestamp_sub_op = TimestampSub() +timestamp_sub_op = TimestampSubOp() diff --git a/bigframes/series.py b/bigframes/series.py index fe2d1aae0e..5a84dee32f 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1545,9 +1545,12 @@ def apply( ops.RemoteFunctionOp(func=func, apply_on_null=True) ) - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended to + # be an array, reconstruct the array from the string assuming it is a + # json serialized form of the array. + if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1585,9 +1588,12 @@ def combine( other, ops.BinaryRemoteFunctionOp(func=func) ) - # if the output is an array, reconstruct it from the json serialized - # string form - if bigframes.dtypes.is_array_like(func.output_dtype): + # If the result type is string but the function output is intended to + # be an array, reconstruct the array from the string assuming it is a + # json serialized form of the array. 
+ if bigframes.dtypes.is_string_like( + result_series.dtype + ) and bigframes.dtypes.is_array_like(func.output_dtype): import bigframes.bigquery as bbq result_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1812,7 +1818,9 @@ def to_numpy( ) -> numpy.ndarray: return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: + if copy is False: + raise ValueError("Cannot convert to array without copy.") return self.to_numpy(dtype=dtype) __array__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__array__) diff --git a/bigframes/version.py b/bigframes/version.py index 27dfb23603..762deda9ff 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.37.0" +__version__ = "1.38.0" diff --git a/noxfile.py b/noxfile.py index b851bf160d..bffb6ebaa0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -72,7 +72,9 @@ UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = [] -UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]} +UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.12": ["polars", "scikit-learn"], +} # 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. 
For more information, search @@ -96,8 +98,13 @@ ] SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = [] SYSTEM_TEST_DEPENDENCIES: List[str] = [] -SYSTEM_TEST_EXTRAS: List[str] = ["tests"] -SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} +SYSTEM_TEST_EXTRAS: List[str] = [] +SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.9": ["tests"], + "3.10": ["tests"], + "3.12": ["tests", "scikit-learn"], + "3.13": ["tests"], +} LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -468,8 +475,7 @@ def cover(session): @nox.session(python=DEFAULT_PYTHON_VERSION) def docs(session): """Build the docs for this library.""" - - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -510,7 +516,7 @@ def docs(session): def docfx(session): """Build the docfx yaml files for this library.""" - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -652,6 +658,8 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( if match.group(1) not in already_installed ] + print(already_installed) + # We use --no-deps to ensure that pre-release versions aren't overwritten # by the version ranges in setup.py. session.install(*deps) diff --git a/samples/snippets/bigquery_modules_test.py b/samples/snippets/bigquery_modules_test.py new file mode 100644 index 0000000000..1a15790815 --- /dev/null +++ b/samples/snippets/bigquery_modules_test.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_examples() -> None: + # [START bigquery_dataframes_bigquery_methods_struct] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # Create a new STRUCT Series with subfields for each column in a DataFrames. + lengths = bbq.struct( + bq_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + lengths.peek() + # 146 {'culmen_length_mm': 51.1, 'culmen_depth_mm': ... + # 278 {'culmen_length_mm': 48.2, 'culmen_depth_mm': ... + # 337 {'culmen_length_mm': 36.4, 'culmen_depth_mm': ... + # 154 {'culmen_length_mm': 46.5, 'culmen_depth_mm': ... + # 185 {'culmen_length_mm': 50.1, 'culmen_depth_mm': ... + # dtype: struct[pyarrow] + # [END bigquery_dataframes_bigquery_methods_struct] + + # [START bigquery_dataframes_bigquery_methods_scalar] + import bigframes.bigquery as bbq + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + + # The sql_scalar function can be used to inject SQL syntax that is not supported + # or difficult to express with the bigframes.pandas APIs. 
+ bq_df = bpd.read_gbq(query_or_table) + shortest = bbq.sql_scalar( + "LEAST({0}, {1}, {2})", + columns=[ + bq_df["culmen_depth_mm"], + bq_df["culmen_length_mm"], + bq_df["flipper_length_mm"], + ], + ) + + shortest.peek() + # 0 + # 149 18.9 + # 33 16.3 + # 296 17.2 + # 287 17.0 + # 307 15.0 + # dtype: Float64 + # [END bigquery_dataframes_bigquery_methods_scalar] + assert bq_df is not None + assert lengths is not None + assert shortest is not None diff --git a/samples/snippets/limit_single_timeseries_forecasting_model_test.py b/samples/snippets/limit_single_timeseries_forecasting_model_test.py new file mode 100644 index 0000000000..6a9f14e383 --- /dev/null +++ b/samples/snippets/limit_single_timeseries_forecasting_model_test.py @@ -0,0 +1,64 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_limit_single_timeseries(random_model_id: str) -> None: + your_model_id = random_model_id + + # [START bigquery_dataframes_bqml_limit_forecast_visualize] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date"]).count() + + num_trips.plot.line() + # [END bigquery_dataframes_bqml_limit_forecast_visualize] + + # [START bigquery_dataframes_bqml_limit_forecast_create] + from bigframes.ml import forecasting + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") + + features = bpd.DataFrame( + { + "start_station_id": df["start_station_id"], + "num_trips": df.starttime, + "date": df["starttime"].dt.date, + } + ) + num_trips = features.groupby(["date", "start_station_id"], as_index=False).count() + model = forecasting.ARIMAPlus() + + X = num_trips[["date"]] + y = num_trips[["num_trips"]] + id_col = num_trips[["start_station_id"]] + + model.fit(X, y, id_col=id_col) + + model.to_gbq( + your_model_id, # For example: "bqml_tutorial.nyc_citibike_arima_model", + replace=True, + ) + # [END bigquery_dataframes_bqml_limit_forecast_create] + assert df is not None + assert features is not None + assert num_trips is not None diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 034a266177..6dea10b608 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -19,6 +19,8 @@ from . 
import publish_api_coverage +pytest.importorskip("sklearn") + @pytest.fixture def api_coverage_df(): diff --git a/setup.py b/setup.py index 4386177a5e..1f6114b634 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,6 @@ "pyarrow >=10.0.1", "pydata-google-auth >=1.8.2", "requests >=2.27.1", - "scikit-learn >=1.2.2", - "sqlalchemy >=1.4,<3.0dev", "sqlglot >=23.6.3", "tabulate >=0.9", "ipywidgets >=7.7.1", @@ -77,8 +75,15 @@ "tests": [], # used for local engine, which is only needed for unit tests at present. "polars": ["polars >= 1.7.0"], + "scikit-learn": ["scikit-learn>=1.2.2"], # Packages required for basic development flow. - "dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"], + "dev": [ + "pytest", + "pytest-mock", + "pre-commit", + "nox", + "google-cloud-testutils", + ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 8b7ad892c0..30d5c1c3a7 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -20,7 +20,6 @@ pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 -sqlalchemy==1.4 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index e4bff8cdcc..f69f08b1ae 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -251,6 +251,11 @@ def table_id_unique(dataset_id: str): return f"{dataset_id}.{prefixer.create_prefix()}" +@pytest.fixture(scope="function") +def routine_id_unique(dataset_id: str): + return f"{dataset_id}.{prefixer.create_prefix()}" + + @pytest.fixture(scope="session") def scalars_schema(bigquery_client: bigquery.Client): # TODO(swast): Add missing scalar data types such as BIGNUMERIC. 
diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 54ba0549a0..7363e370bb 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -2193,6 +2193,10 @@ def foo(x, y, z): ) ) ) + assert ( + getattr(foo, "bigframes_bigquery_function_output_dtype") + == bigframes.dtypes.STRING_DTYPE + ) # Fails to apply on dataframe with incompatible number of columns with pytest.raises( diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 0dc8960f62..99a017c917 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -14,6 +14,7 @@ import inspect import re +import textwrap import google.api_core.exceptions from google.cloud import bigquery @@ -27,6 +28,7 @@ import bigframes.exceptions from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff +import bigframes.session._io.bigquery from tests.system.utils import assert_pandas_df_equal _prefixer = test_utils.prefixer.Prefixer("bigframes", "") @@ -632,7 +634,6 @@ def add_one(x): )(add_one) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_detects_invalid_function(session, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) with pytest.raises(ValueError) as e: @@ -705,21 +706,133 @@ def square1(x): assert_pandas_df_equal(s1_result.to_pandas(), s2_result.to_pandas()) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_runs_existing_udf(session): func = session.read_gbq_function("bqutil.fn.cw_lower_case_ascii_only") got = func("AURÉLIE") assert got == "aurÉlie" -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_runs_existing_udf_4_params(session): func = session.read_gbq_function("bqutil.fn.cw_instr4") got = func("TestStr123456Str", 
"Str", 1, 2) assert got == 14 -@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_unique): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING) + RETURNS ARRAY + AS ( + [x, x] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello") + assert got == ["hello", "hello"] + + # Test on a series, assert pandas parity + pd_s = pd.Series(["alpha", "beta", "gamma"]) + bf_s = session.read_pandas(pd_s) + pd_result = pd_s.apply(func) + bf_result = bf_s.apply(func) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_read_gbq_function_runs_existing_udf_2_params_array_output( + session, routine_id_unique +): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING, y STRING) + RETURNS ARRAY + AS ( + [x, y] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello", "world") + assert got == ["hello", "world"] + + # Test on series, assert pandas parity + pd_df = pd.DataFrame( + {"col0": ["alpha", "beta", "gamma"], "col1": ["delta", "theta", "phi"]} + ) + bf_df = session.read_pandas(pd_df) + pd_result = pd_df["col0"].combine(pd_df["col1"], func) + bf_result = bf_df["col0"].combine(bf_df["col1"], func) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_read_gbq_function_runs_existing_udf_4_params_array_output( + 
session, routine_id_unique +): + bigframes.session._io.bigquery.start_query_with_client( + session.bqclient, + textwrap.dedent( + f""" + CREATE OR REPLACE FUNCTION `{routine_id_unique}`(x STRING, y BOOL, z INT64, w FLOAT64) + RETURNS ARRAY + AS ( + [x, CAST(y AS STRING), CAST(z AS STRING), CAST(w AS STRING)] + ) + """ + ), + job_config=bigquery.QueryJobConfig(), + ) + func = session.read_gbq_function(routine_id_unique) + + # Test on scalar value + got = func("hello", True, 1, 2.3) + assert got == ["hello", "true", "1", "2.3"] + + # Test on a dataframe, assert pandas parity + pd_df = pd.DataFrame( + { + "col0": ["alpha", "beta", "gamma"], + "col1": [True, False, True], + "col2": [1, 2, 3], + "col3": [4.5, 6, 7.75], + } + ) + bf_df = session.read_pandas(pd_df) + # Simulate the result directly, since the function cannot be applied + # directly on a pandas dataframe with axis=1, as this is a special type of + # function with multiple params supported only on bigframes dataframe. + pd_result = pd.Series( + [ + ["alpha", "true", "1", "4.5"], + ["beta", "false", "2", "6"], + ["gamma", "true", "3", "7.75"], + ] + ) + bf_result = bf_df.apply(func, axis=1) + assert bigframes.dtypes.is_array_string_like(bf_result.dtype) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( @@ -754,6 +867,10 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): assert square.bigframes_remote_function == str(routine.reference) assert square.input_dtypes == (bigframes.dtypes.INT_DTYPE,) assert square.output_dtype == bigframes.dtypes.INT_DTYPE + assert ( + square.bigframes_bigquery_function_output_dtype + == bigframes.dtypes.INT_DTYPE + ) src = {"x": [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]} @@ -772,7 +889,6 @@ def 
test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_requires_explicit_types( session, bigquery_client, dataset_id ): @@ -863,7 +979,6 @@ def test_read_gbq_function_requires_explicit_types( ), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_respects_python_output_type( request, session_fixture, bigquery_client, dataset_id, array_type, expected_data ): @@ -906,7 +1021,6 @@ def test_read_gbq_function_respects_python_output_type( pytest.param(list[str], id="list-str"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( session, bigquery_client, dataset_id, array_type ): @@ -945,7 +1059,6 @@ def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( pytest.param(list[str], id="list-str"), ], ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_supported_python_output_type( session, bigquery_client, dataset_id, array_type ): @@ -992,7 +1105,6 @@ def test_df_apply_scalar_func(session, scalars_dfs): ) -@pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_multiple_inputs_not_a_row_processor(session): with pytest.raises(ValueError) as context: # The remote function has two args, which cannot be row processed. 
Throw @@ -1214,20 +1326,19 @@ def should_mask(name: str) -> bool: repr(s.mask(should_mask, "REDACTED")) -@pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_application_repr(session, dataset_id, scalars_df_index): - gbq_function = f"{dataset_id}.should_mask" - +def test_read_gbq_function_application_repr( + session, routine_id_unique, scalars_df_index +): # This function deliberately has a param with name "name", this is to test # a specific ibis' internal handling of object names session.bqclient.query_and_wait( - f"CREATE OR REPLACE FUNCTION `{gbq_function}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" + f"CREATE OR REPLACE FUNCTION `{routine_id_unique}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)" ) - routine = session.bqclient.get_routine(gbq_function) + routine = session.bqclient.get_routine(routine_id_unique) assert "name" in [arg.name for arg in routine.arguments] # read the function and apply to dataframe - should_mask = session.read_gbq_function(gbq_function) + should_mask = session.read_gbq_function(routine_id_unique) s = scalars_df_index["string_col"] diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index 81e1b2f77f..b80202bdbe 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd import pytest -import sklearn.metrics as sklearn_metrics # type: ignore import bigframes from bigframes.ml import metrics @@ -66,6 +65,7 @@ def test_r2_score_force_finite(session): def test_r2_score_ok_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -113,6 +113,7 @@ def test_accuracy_score_not_normailze(session): def test_accuracy_score_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = 
pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -203,6 +204,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): def test_roc_curve_binary_classification_prediction_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -294,6 +296,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): def test_roc_curve_binary_classification_decision_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") # Instead of operating on probabilities, assume a 70% decision threshold # has been applied, and operate on the final output y_score = [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45] @@ -420,6 +423,7 @@ def test_roc_auc_score_returns_expected(session): def test_roc_auc_score_returns_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -525,6 +529,7 @@ def test_confusion_matrix_column_index(session): def test_confusion_matrix_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 3, 3, 3, 4, 1], @@ -543,6 +548,7 @@ def test_confusion_matrix_matches_sklearn(session): def test_confusion_matrix_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -603,6 +609,7 @@ def test_recall_score(session): def test_recall_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -620,6 +627,7 @@ def test_recall_score_matches_sklearn(session): def test_recall_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { 
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -673,6 +681,7 @@ def test_precision_score(session): def test_precision_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -695,6 +704,7 @@ def test_precision_score_matches_sklearn(session): def test_precision_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -752,6 +762,7 @@ def test_f1_score(session): def test_f1_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -769,6 +780,7 @@ def test_f1_score_matches_sklearn(session): def test_f1_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 356000b3f6..723481b1d1 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -465,3 +465,49 @@ def test_timedelta_ordering(session): pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) + + +def test_timedelta_cumsum(temporal_dfs): + bf_df, pd_df = temporal_dfs + + actual_result = bf_df["timedelta_col_1"].cumsum().to_pandas() + + expected_result = pd_df["timedelta_col_1"].cumsum() + _assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize( + "agg_func", + [ + pytest.param(lambda x: x.min(), id="min"), + pytest.param(lambda x: x.max(), id="max"), + pytest.param(lambda x: x.sum(), id="sum"), + pytest.param(lambda x: x.mean(), id="mean"), + pytest.param(lambda x: x.median(), id="median"), + pytest.param(lambda x: x.quantile(0.5), id="quantile"), + 
pytest.param(lambda x: x.std(), id="std"), + ], +) +def test_timedelta_agg__timedelta_result(temporal_dfs, agg_func): + bf_df, pd_df = temporal_dfs + + actual_result = agg_func(bf_df["timedelta_col_1"]) + + expected_result = agg_func(pd_df["timedelta_col_1"]).floor("us") + assert actual_result == expected_result + + +@pytest.mark.parametrize( + "agg_func", + [ + pytest.param(lambda x: x.count(), id="count"), + pytest.param(lambda x: x.nunique(), id="nunique"), + ], +) +def test_timedelta_agg__int_result(temporal_dfs, agg_func): + bf_df, pd_df = temporal_dfs + + actual_result = agg_func(bf_df["timedelta_col_1"]) + + expected_result = agg_func(pd_df["timedelta_col_1"]) + assert actual_result == expected_result diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 4b4264e33c..da78432cdb 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -829,3 +829,18 @@ def test_to_timedelta_with_bf_series_invalid_unit(session, unit): @pytest.mark.parametrize("input", [1, 1.2, "1s"]) def test_to_timedelta_non_bf_series(input): assert bpd.to_timedelta(input) == pd.to_timedelta(input) + + +def test_to_timedelta_on_timedelta_series__should_be_no_op(scalars_dfs): + bf_df, pd_df = scalars_dfs + bf_series = bpd.to_timedelta(bf_df["int64_too"], unit="us") + pd_series = pd.to_timedelta(pd_df["int64_too"], unit="us") + + actual_result = ( + bpd.to_timedelta(bf_series, unit="s").to_pandas().astype("timedelta64[ns]") + ) + + expected_result = pd.to_timedelta(pd_series, unit="s") + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 00f47c754e..2daa7dd825 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -641,6 +641,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict): ), ) def test_series_interpolate(method): + pytest.importorskip("scipy") + 
values = [None, 1, 2, None, None, 16, None] index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] pd_series = pd.Series(values, index) diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index 413a694680..d377fb4d49 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -66,6 +66,12 @@ def test_supported_types_correspond(): ibis_types_from_bigquery = { third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) for tk in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + # TODO(b/284515241): ARRAY is the only exception because it is supported + # as an output type of the BQ routine in the read_gbq_function path but + # not in the remote function path. Remove this handline once BQ remote + # functions supports ARRAY output and the bigframes remote functions + # utilizes that to support array output. + if tk != "ARRAY" } assert ibis_types_from_python == ibis_types_from_bigquery diff --git a/tests/unit/ml/test_api_primitives.py b/tests/unit/ml/test_api_primitives.py index 00a51ccfe9..dd2ceff143 100644 --- a/tests/unit/ml/test_api_primitives.py +++ b/tests/unit/ml/test_api_primitives.py @@ -13,8 +13,6 @@ # limitations under the License. 
import pytest -import sklearn.decomposition as sklearn_decomposition # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore import bigframes.ml.decomposition import bigframes.ml.linear_model @@ -35,8 +33,9 @@ def test_base_estimator_repr(): assert pca_estimator.__repr__() == "PCA(n_components=7)" -@pytest.mark.skipif(sklearn_linear_model is None, reason="requires sklearn") def test_base_estimator_repr_matches_sklearn(): + sklearn_decomposition = pytest.importorskip("sklearn.decomposition") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") estimator = bigframes.ml.linear_model.LinearRegression() sklearn_estimator = sklearn_linear_model.LinearRegression() assert estimator.__repr__() == sklearn_estimator.__repr__() diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 395296f3e4..450ce8d6ee 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -15,8 +15,6 @@ from google.cloud import bigquery import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, preprocessing from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer @@ -119,6 +117,8 @@ def test_columntransformer_repr(): def test_columntransformer_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_column_transformer = compose.ColumnTransformer( [ ( diff --git a/tests/unit/ml/test_pipeline.py b/tests/unit/ml/test_pipeline.py index ed5c621b1d..beebb9f282 100644 --- a/tests/unit/ml/test_pipeline.py +++ b/tests/unit/ml/test_pipeline.py @@ -13,10 +13,6 @@ # limitations under the License. 
import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore -import sklearn.pipeline as sklearn_pipeline # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing @@ -57,8 +53,11 @@ def test_pipeline_repr(): ) -@pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn") def test_pipeline_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") + sklearn_pipeline = pytest.importorskip("sklearn.pipeline") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_pl = pipeline.Pipeline( [ ( diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f5aa23d00b..e296dcb9f6 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -7179,7 +7179,7 @@ def __len__(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def __array__(self): + def __array__(self, dtype=None, copy: Optional[bool] = None): """ Returns the rows as NumPy array. @@ -7210,6 +7210,8 @@ def __array__(self): dtype (str or numpy.dtype, optional): The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy (bool or None, optional): + Whether to copy the data, False is not supported. 
Returns: numpy.ndarray: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 57f7dfbb79..5e6f546d09 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -5941,7 +5941,7 @@ def size(self) -> int: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def __array__(self, dtype=None) -> numpy.ndarray: + def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: """ Returns the values as NumPy array. @@ -5965,6 +5965,8 @@ def __array__(self, dtype=None) -> numpy.ndarray: dtype (str or numpy.dtype, optional): The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy (bool or None, optional): + Whether to copy the data, False is not supported. Returns: numpy.ndarray: diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 7b97526de2..9262ffbd3d 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -16,6 +16,8 @@ # Michal Karbownik # License: BSD 3 clause +import numpy as np + from bigframes import constants @@ -60,7 +62,23 @@ def auc(x, y) -> float: Returns: float: Area Under the Curve. 
""" - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + if len(x) < 2: + raise ValueError( + f"At least 2 points are needed to compute area under curve, but x.shape = {len(x)}" + ) + + if x.is_monotonic_decreasing: + d = -1 + elif x.is_monotonic_increasing: + d = 1 + else: + raise ValueError(f"x is neither increasing nor decreasing : {x}.") + + if hasattr(np, "trapezoid"): + # new in numpy 2.0 + return d * np.trapezoid(y, x) + # np.trapz has been deprecated in 2.0 + return d * np.trapz(y, x) # type: ignore def roc_auc_score(y_true, y_score) -> float: diff --git a/third_party/bigframes_vendored/tpch/queries/q9.py b/third_party/bigframes_vendored/tpch/queries/q9.py index 6af33f7569..5c9ca1e9c3 100644 --- a/third_party/bigframes_vendored/tpch/queries/q9.py +++ b/third_party/bigframes_vendored/tpch/queries/q9.py @@ -33,13 +33,17 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) q_final = ( - part.merge(partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY") - .merge(supplier, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") - .merge( + part.merge( lineitem, - left_on=["P_PARTKEY", "PS_SUPPKEY"], - right_on=["L_PARTKEY", "L_SUPPKEY"], + left_on="P_PARTKEY", + right_on="L_PARTKEY", + ) + .merge( + partsupp, + left_on=["L_SUPPKEY", "L_PARTKEY"], + right_on=["PS_SUPPKEY", "PS_PARTKEY"], ) + .merge(supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY") .merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY") .merge(nation, left_on="S_NATIONKEY", right_on="N_NATIONKEY") ) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 27dfb23603..762deda9ff 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.37.0" +__version__ = "1.38.0" From bef780818f4ac3fbabe956fadeff9c5a658d2308 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 24 Feb 2025 16:11:11 -0600 Subject: [PATCH 21/75] Delete demo.ipynb --- demo.ipynb | 758 ----------------------------------------------------- 1 file changed, 758 deletions(-) delete mode 100644 demo.ipynb diff --git a/demo.ipynb b/demo.ipynb deleted file mode 100644 index 93e6f121f9..0000000000 --- a/demo.ipynb +++ /dev/null @@ -1,758 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 39ca6c3f-1c37-4f8e-8252-33cf6abfa340 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 7dda7bc2-75b2-42b5-918b-41dd0540eb53 is DONE. 24.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4b99d068-1e68-4a86-bd0b-52d40ef6a270 is DONE. 40.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_iditem_idrating
043549684.0
1362235215.0
255439202.0
344531755.0
455352354.0
5442210974.0
6311913564.0
7603712314.0
88511963.0
931114353.0
1054036485.0
11360127343.0
12465529494.0
13127430935.0
1415213504.0
1530724543.0
16331413304.0
17376227191.0
18168721693.0
1997030814.0
20126522485.0
2115021044.0
221945004.0
23352110883.0
24188935673.0
\n", - "

25 rows × 3 columns

\n", - "
[1000209 rows x 3 columns in total]" - ], - "text/plain": [ - " user_id item_id rating\n", - "0 4354 968 4.0\n", - "1 3622 3521 5.0\n", - "2 5543 920 2.0\n", - "3 445 3175 5.0\n", - "4 5535 235 4.0\n", - "5 4422 1097 4.0\n", - "6 3119 1356 4.0\n", - "7 6037 1231 4.0\n", - "8 851 196 3.0\n", - "9 3111 435 3.0\n", - "10 5403 648 5.0\n", - "11 3601 2734 3.0\n", - "12 4655 2949 4.0\n", - "13 1274 3093 5.0\n", - "14 1521 350 4.0\n", - "15 3072 454 3.0\n", - "16 3314 1330 4.0\n", - "17 3762 2719 1.0\n", - "18 1687 2169 3.0\n", - "19 970 3081 4.0\n", - "20 1265 2248 5.0\n", - "21 1502 104 4.0\n", - "22 194 500 4.0\n", - "23 3521 1088 3.0\n", - "24 1889 3567 3.0\n", - "...\n", - "\n", - "[1000209 rows x 3 columns]" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import bigframes.pandas as bpd\n", - "from bigframes.ml import decomposition\n", - "\n", - "bq_df = bpd.read_gbq('bqml_tutorial.ratings', columns=('user_id', 'item_id', 'rating'))\n", - "bq_df" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MatrixFactorization(item_col='item_col', l2_reg=9.83, num_factors=34,\n", - " rating_col='rating_col', user_col='user_id')" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model = decomposition.MatrixFactorization(\n", - " num_factors=34,\n", - " feedback_type='explicit',\n", - " user_col='user_id',\n", - " item_col='item_col',\n", - " rating_col='rating_col',\n", - " l2_reg=9.83,\n", - ")\n", - "\n", - "model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 50f616db-afae-40da-bc95-f724bb8a5c84 is DONE. 24.0 MB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d13d556d-e011-40a0-9da8-5c0918cf1ef1 is DONE. 537.2 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "MatrixFactorization(item_col='item_col', l2_reg=9.83, num_factors=34,\n", - " rating_col='rating_col', user_col='user_id')" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fitted = model.fit(bq_df.rename(columns={'rating': 'rating_col', 'item_id': 'item_col'}))\n", - "fitted\n", - "# scored = model.score(fitted)\n", - "\n", - "# scored" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 66684505-f14b-423b-8105-93521064036a is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4ec28d78-f0c1-4456-8c08-60b6982ee52f is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
00.4852820.3953410.0255350.3899060.6831990.683199
\n", - "

1 rows × 6 columns

\n", - "
[1 rows x 6 columns in total]" - ], - "text/plain": [ - " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 0.485282 0.395341 0.025535 \n", - "\n", - " median_absolute_error r2_score explained_variance \n", - "0 0.389906 0.683199 0.683199 \n", - "\n", - "[1 rows x 6 columns]" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scored = model.score(fitted)\n", - "\n", - "scored" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job e7dcfb81-70af-4d65-9c2a-b42591812d0e is DONE. 29.5 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job fcb53dd1-f9cb-4872-b7bf-3d2f0da89b00 is DONE. 40.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 10436512-dada-4dfc-a3ff-94b480a5e890 is DONE. 48.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predicted_rating_coluser_iditem_colrating
03.34813143549684.0
15.22349362235215.0
21.82017355439202.0
34.70222844531755.0
43.20694955352354.0
54.690283442210974.0
63.944585311913564.0
74.275766603712314.0
83.4345798511963.0
91.82747331114353.0
104.13092854036485.0
113.231195360127343.0
123.750037465529494.0
133.858951127430935.0
143.3485215213504.0
152.95628430724543.0
163.831856331413304.0
170.805804376227191.0
183.65957168721693.0
193.0319797030814.0
203.384926126522485.0
214.17324315021044.0
223.9184351945004.0
232.451965352110883.0
242.982963188935673.0
\n", - "

25 rows × 4 columns

\n", - "
[1000209 rows x 4 columns in total]" - ], - "text/plain": [ - " predicted_rating_col user_id item_col rating\n", - "0 3.348131 4354 968 4.0\n", - "1 5.22349 3622 3521 5.0\n", - "2 1.820173 5543 920 2.0\n", - "3 4.702228 445 3175 5.0\n", - "4 3.206949 5535 235 4.0\n", - "5 4.690283 4422 1097 4.0\n", - "6 3.944585 3119 1356 4.0\n", - "7 4.275766 6037 1231 4.0\n", - "8 3.434579 851 196 3.0\n", - "9 1.827473 3111 435 3.0\n", - "10 4.130928 5403 648 5.0\n", - "11 3.231195 3601 2734 3.0\n", - "12 3.750037 4655 2949 4.0\n", - "13 3.858951 1274 3093 5.0\n", - "14 3.34852 1521 350 4.0\n", - "15 2.956284 3072 454 3.0\n", - "16 3.831856 3314 1330 4.0\n", - "17 0.805804 3762 2719 1.0\n", - "18 3.65957 1687 2169 3.0\n", - "19 3.03197 970 3081 4.0\n", - "20 3.384926 1265 2248 5.0\n", - "21 4.173243 1502 104 4.0\n", - "22 3.918435 194 500 4.0\n", - "23 2.451965 3521 1088 3.0\n", - "24 2.982963 1889 3567 3.0\n", - "...\n", - "\n", - "[1000209 rows x 4 columns]" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# predict_df = scored[['user_id']['item_col']]\n", - "# model.predict(predict_df)\n", - "model.predict(bq_df.rename(columns={'item_id': 'item_col'}))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.19" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 0dd033da3ea4e70e024f0cb3f0ad20db4e934a37 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 25 Feb 2025 22:10:24 +0000 Subject: [PATCH 22/75] passing system test --- bigframes/ml/decomposition.py | 10 +++++++++- tests/system/large/ml/test_decomposition.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git 
a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 1ea7d98177..065c1fdb5f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -28,7 +28,15 @@ import bigframes.pandas as bpd import bigframes.session -_BQML_PARAMS_MAPPING = {"svd_solver": "pcaSolver"} +_BQML_PARAMS_MAPPING = { + "svd_solver": "pcaSolver", + "feedback_type": "feedbackType", + "num_factors": "numFactors", + "user_col": "userColumn", + "item_col": "itemColumn", + # TODO: Add rating_col + "l2_reg": "l2Regularization", +} @log_adapter.class_logger diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 36f5d83c75..0a25187935 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -198,4 +198,15 @@ def test_decomposition_mf_configure_fit_load( new_ratings.rename(columns={"item_id": "item_col"}) ).to_pandas() + assert reloaded_model._bqml_model is not None + assert ( + f"{dataset_id}.temp_configured_mf_model" + in reloaded_model._bqml_model.model_name + ) assert result is not None + assert reloaded_model.feedback_type == "EXPLICIT" + assert reloaded_model.num_factors == 6 + assert reloaded_model.user_col == "user_id" + assert reloaded_model.item_col == "item_id" + assert reloaded_model.rating_col == "rating" + assert reloaded_model.l2_reg == 9.83 From 1f85b75449363f707190b9426264f8d5df3d6a94 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 26 Feb 2025 19:55:03 +0000 Subject: [PATCH 23/75] preparing to add unit tests --- tests/unit/ml/test_golden_sql.py | 54 +++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 97d1d2d7d1..31ee73d0ea 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -20,7 +20,7 @@ import pytest_mock import bigframes -from bigframes.ml import core, linear_model +from bigframes.ml import 
core, decomposition, linear_model import bigframes.pandas as bpd TEMP_MODEL_ID = bigquery.ModelReference.from_string( @@ -207,3 +207,55 @@ def test_logistic_regression_score(mock_session, bqml_model, mock_X, mock_y): mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_y_sql))" ) + + +def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X, mock_y): + model = decomposition.MatrixFactorization( # revise + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model._bqml_model_factory = bqml_model_factory + model.fit(mock_X, mock_y) + + mock_session._start_query_ml_ddl.assert_called_once_with( # revice + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='MATRIX_FACTORIZATION',\n " + ) + + +def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): + model = decomposition.MatrixFactorization( # revise + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model._bqml_model = bqml_model + model.predict(mock_X) # mock x requires item_col + + mock_session.read_gbq.assert_called_once_with( # revise + "SELECT * FROM ML.PREDICT(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", + index_col=["index_column_id"], + ) + + +def test_decomposition_mf_score(mock_session, bqml_model, mock_X, mock_y): + model = decomposition.MatrixFactorization( # revise + num_factors=34, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + model._bqml_model = bqml_model + model.score(mock_X, mock_y) + + mock_session.read_gbq.assert_called_once_with( # revise + "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_y_sql))" + ) From 
a45763946a8dbaca8488141b1cdb463b891f6f50 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 27 Feb 2025 20:30:03 +0000 Subject: [PATCH 24/75] 2 out of 3 (so far) passing unit tests --- tests/unit/ml/test_golden_sql.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 31ee73d0ea..a3ae3f50de 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -221,13 +221,13 @@ def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X, model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query_ml_ddl.assert_called_once_with( # revice - "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='MATRIX_FACTORIZATION',\n " + mock_session._start_query_ml_ddl.assert_called_once_with( + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_y_no_index_sql" ) def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): - model = decomposition.MatrixFactorization( # revise + model = decomposition.MatrixFactorization( num_factors=34, feedback_type="explicit", user_col="user_id", @@ -236,10 +236,10 @@ def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): l2_reg=9.83, ) model._bqml_model = bqml_model - model.predict(mock_X) # mock x requires item_col + model.predict(mock_X) - mock_session.read_gbq.assert_called_once_with( # revise - "SELECT * FROM ML.PREDICT(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", + mock_session.read_gbq.assert_called_once_with( + "SELECT * FROM ML.RECOMMEND(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", index_col=["index_column_id"], ) @@ -257,5 +257,5 @@ def 
test_decomposition_mf_score(mock_session, bqml_model, mock_X, mock_y): model.score(mock_X, mock_y) mock_session.read_gbq.assert_called_once_with( # revise - "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_y_sql))" + "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`)" ) From 512332e4969158009ead4656fc911e96d4c073e6 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 3 Mar 2025 16:27:55 +0000 Subject: [PATCH 25/75] attempted mocking --- tests/unit/ml/test_golden_sql.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index a3ae3f50de..09745a3bf4 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -219,6 +219,11 @@ def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X, l2_reg=9.83, ) model._bqml_model_factory = bqml_model_factory + mock_start_query_ml_ddl = mock.Mock( + return_value="CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_y_no_index_sql" + ) + mock_create_model = mock.PropertyMock(return_value=mock_start_query_ml_ddl) + type(model)._start_query_ml_ddl = mock_create_model model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( From 408e8073ac9cbacf85cc58c3ec89f36ad595ca33 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Mar 2025 20:08:13 +0000 Subject: [PATCH 26/75] fix tests --- bigframes/ml/decomposition.py | 5 +++++ tests/unit/ml/test_golden_sql.py | 19 +++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 065c1fdb5f..ea68fd5e6b 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -272,6 +272,11 @@ 
def _fit( y=None, transforms: Optional[List[str]] = None, ) -> MatrixFactorization: + if y is not None: + raise ValueError( + "Label column not supported for Matrix Factorization model but y was not `None`" + ) + (X,) = utils.batch_convert_to_dataframe(X) self._bqml_model = self._bqml_model_factory.create_model( diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 09745a3bf4..d1aae60744 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -79,6 +79,7 @@ def mock_X(mock_y, mock_session): ["index_column_id"], ["index_column_label"], ) + mock_X.reset_index(drop=True).cache().sql = "input_X_no_index_sql" mock_X.join(mock_y).sql = "input_X_y_sql" mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y) mock_X.join(mock_y)._to_sql_query.return_value = ( @@ -209,7 +210,7 @@ def test_logistic_regression_score(mock_session, bqml_model, mock_X, mock_y): ) -def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X, mock_y): +def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X): model = decomposition.MatrixFactorization( # revise num_factors=34, feedback_type="explicit", @@ -219,15 +220,13 @@ def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X, l2_reg=9.83, ) model._bqml_model_factory = bqml_model_factory - mock_start_query_ml_ddl = mock.Mock( - return_value="CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_y_no_index_sql" - ) + mock_start_query_ml_ddl = mock.Mock() mock_create_model = mock.PropertyMock(return_value=mock_start_query_ml_ddl) type(model)._start_query_ml_ddl = mock_create_model - model.fit(mock_X, mock_y) + model.fit(mock_X) mock_session._start_query_ml_ddl.assert_called_once_with( - "CREATE OR REPLACE MODEL 
`test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_y_no_index_sql" + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='matrix_factorization',\n feedback_type='explicit',\n user_col='user_id',\n item_col='item_col',\n rating_col='rating_col',\n l2_reg=9.83,\n num_factors=34)\nAS input_X_no_index_sql" ) @@ -249,8 +248,8 @@ def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): ) -def test_decomposition_mf_score(mock_session, bqml_model, mock_X, mock_y): - model = decomposition.MatrixFactorization( # revise +def test_decomposition_mf_score(mock_session, bqml_model, mock_X): + model = decomposition.MatrixFactorization( num_factors=34, feedback_type="explicit", user_col="user_id", @@ -259,8 +258,8 @@ def test_decomposition_mf_score(mock_session, bqml_model, mock_X, mock_y): l2_reg=9.83, ) model._bqml_model = bqml_model - model.score(mock_X, mock_y) + model.score(mock_X) - mock_session.read_gbq.assert_called_once_with( # revise + mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`)" ) From 19e423bbefea2376b7b1feddd03d0530d90edbf7 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Mar 2025 23:54:05 +0000 Subject: [PATCH 27/75] new test file for model creation unit tests --- tests/unit/ml/test_matrix_factorization.py | 70 ++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tests/unit/ml/test_matrix_factorization.py diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py new file mode 100644 index 0000000000..3e5a9b0034 --- /dev/null +++ b/tests/unit/ml/test_matrix_factorization.py @@ -0,0 +1,70 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you 
may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# import re + +# import pytest + +from bigframes.ml import decomposition + + +def test_decomposition_mf_num_factors(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + assert model.num_factors == 16 + + +# def test_decomposition_mf_num_factors_invalid_raises(): +# # with pytest.raises(TypeError): +# model = decomposition.MatrixFactorization( +# num_factors=0.5, +# feedback_type="explicit", +# user_col="user_id", +# item_col="item_col", +# rating_col="rating_col", +# l2_reg=9.83, +# ) +# # passing test -> should raise error? +# assert model.num_factors == 0.5 + + +def test_decomposition_mf_feedback_type(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="implicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + assert model.feedback_type == "implicit" + + +# def test_decomposition_mf_feedback_type_raises(): +# model = decomposition.MatrixFactorization( +# num_factors=16, +# feedback_type="implexpl", +# user_col="user_id", +# item_col="item_col", +# rating_col="rating_col", +# l2_reg=9.83, +# ) +# # passing test -> should raise error? 
+# assert model.feedback_type == "implexpl" From 5f1a19aa4b420a55b3b1da09f1663b195488c09e Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 6 Mar 2025 01:43:17 +0000 Subject: [PATCH 28/75] add unit tests for num_factors, user_col, and item_col --- bigframes/ml/decomposition.py | 12 +++ tests/unit/ml/test_matrix_factorization.py | 90 ++++++++++++++-------- 2 files changed, 72 insertions(+), 30 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index ea68fd5e6b..be78b1848b 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -227,8 +227,20 @@ def __init__( l2_reg: float = 1.0, ): self.feedback_type = feedback_type + + if type(num_factors) is not int: + raise (TypeError) + self.num_factors = num_factors + + if type(user_col) is not str: + raise (TypeError) + self.user_col = user_col + + if type(item_col) is not str: + raise (TypeError) + self.item_col = item_col self.rating_col = rating_col self.l2_reg = l2_reg diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 3e5a9b0034..a8d3c650fc 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -12,59 +12,89 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# import re # import pytest from bigframes.ml import decomposition -def test_decomposition_mf_num_factors(): +def test_decomposition_mf_model(): model = decomposition.MatrixFactorization( num_factors=16, - feedback_type="explicit", + feedback_type="implicit", user_col="user_id", item_col="item_col", rating_col="rating_col", l2_reg=9.83, ) assert model.num_factors == 16 + assert model.feedback_type == "implicit" + assert model.user_col == "user_id" + assert model.item_col == "item_col" + assert model.rating_col == "rating_col" + + +def test_decomposition_mf_feedback_type_explicit(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + assert model.feedback_type == "explicit" -# def test_decomposition_mf_num_factors_invalid_raises(): -# # with pytest.raises(TypeError): -# model = decomposition.MatrixFactorization( -# num_factors=0.5, -# feedback_type="explicit", -# user_col="user_id", -# item_col="item_col", -# rating_col="rating_col", -# l2_reg=9.83, -# ) -# # passing test -> should raise error? 
-# assert model.num_factors == 0.5 +# test_decomposition_mf_invalid_feedback_type_raises -def test_decomposition_mf_feedback_type(): +def test_decomposition_mf_num_factors_low(): model = decomposition.MatrixFactorization( - num_factors=16, - feedback_type="implicit", + num_factors=0, + feedback_type="explicit", user_col="user_id", item_col="item_col", rating_col="rating_col", l2_reg=9.83, ) - assert model.feedback_type == "implicit" + assert model.num_factors == 0 + + +# test_decomposition_mf_negative_num_factors_raises + +# def test_decomposition_mf_invalid_num_factors_raises(): +# num_factors = 0.5 +# with pytest.raises(TypeError): +# decomposition.MatrixFactorization( +# num_factors=num_factors, +# feedback_type="explicit", +# user_col="user_id", +# item_col="item_col", +# rating_col="rating_col", +# l2_reg=9.83, +# ) + + +# def test_decomposition_mf_invalid_user_col_raises(): +# with pytest.raises(TypeError): +# decomposition.MatrixFactorization( +# num_factors=16, +# feedback_type="explicit", +# user_col=123, +# item_col="item_col", +# rating_col="rating_col", +# l2_reg=9.83, +# ) -# def test_decomposition_mf_feedback_type_raises(): -# model = decomposition.MatrixFactorization( -# num_factors=16, -# feedback_type="implexpl", -# user_col="user_id", -# item_col="item_col", -# rating_col="rating_col", -# l2_reg=9.83, -# ) -# # passing test -> should raise error? 
-# assert model.feedback_type == "implexpl" +# def test_decomposition_mf_invalid_item_col_raises(): +# with pytest.raises(TypeError): +# decomposition.MatrixFactorization( +# num_factors=16, +# feedback_type="explicit", +# user_col="user_col", +# item_col=123, +# rating_col="rating_col", +# l2_reg=9.83, +# ) From 33f3069d90b19aec79b7429a34d4361d58047f62 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 6 Mar 2025 18:40:39 -0600 Subject: [PATCH 29/75] Update tests/unit/ml/test_matrix_factorization.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- tests/unit/ml/test_matrix_factorization.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index a8d3c650fc..1496f9a074 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -67,7 +67,10 @@ def test_decomposition_mf_num_factors_low(): # num_factors = 0.5 # with pytest.raises(TypeError): # decomposition.MatrixFactorization( -# num_factors=num_factors, +# # Intentionally pass in the wrong type. This will fail if the user is using +# # a type checker, but we can't assume that everyone is doing so, especially +# # not in notebook environments. 
+# num_factors=num_factors, # type: ignore # feedback_type="explicit", # user_col="user_id", # item_col="item_col", From 1ff6aaa8e40c3eea154f27e26af09099f833ff16 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 6 Mar 2025 18:40:50 -0600 Subject: [PATCH 30/75] Update tests/unit/ml/test_matrix_factorization.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- tests/unit/ml/test_matrix_factorization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 1496f9a074..1af86f7729 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -65,7 +65,7 @@ def test_decomposition_mf_num_factors_low(): # def test_decomposition_mf_invalid_num_factors_raises(): # num_factors = 0.5 -# with pytest.raises(TypeError): +# with pytest.raises(TypeError, match="num_factors"): # decomposition.MatrixFactorization( # # Intentionally pass in the wrong type. This will fail if the user is using # # a type checker, but we can't assume that everyone is doing so, especially From c84dd7ecbeea9f425145ab4157cfa4fb44a9bfe1 Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 7 Mar 2025 00:43:38 +0000 Subject: [PATCH 31/75] uncomment one test --- tests/unit/ml/test_matrix_factorization.py | 31 +++++++++++----------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 1af86f7729..9413fb036a 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -13,7 +13,7 @@ # limitations under the License. 
-# import pytest +import pytest from bigframes.ml import decomposition @@ -63,20 +63,21 @@ def test_decomposition_mf_num_factors_low(): # test_decomposition_mf_negative_num_factors_raises -# def test_decomposition_mf_invalid_num_factors_raises(): -# num_factors = 0.5 -# with pytest.raises(TypeError, match="num_factors"): -# decomposition.MatrixFactorization( -# # Intentionally pass in the wrong type. This will fail if the user is using -# # a type checker, but we can't assume that everyone is doing so, especially -# # not in notebook environments. -# num_factors=num_factors, # type: ignore -# feedback_type="explicit", -# user_col="user_id", -# item_col="item_col", -# rating_col="rating_col", -# l2_reg=9.83, -# ) + +def test_decomposition_mf_invalid_num_factors_raises(): + num_factors = 0.5 + with pytest.raises(TypeError, match="num_factors"): + decomposition.MatrixFactorization( + # Intentionally pass in the wrong type. This will fail if the user is using + # a type checker, but we can't assume that everyone is doing so, especially + # not in notebook environments. 
+ num_factors=num_factors, # type: ignore + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) # def test_decomposition_mf_invalid_user_col_raises(): From 3473037dfbec13ec251758babce00ddcc14c2966 Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 7 Mar 2025 01:05:46 +0000 Subject: [PATCH 32/75] uncomment test --- bigframes/ml/decomposition.py | 9 ++++-- tests/unit/ml/test_matrix_factorization.py | 37 ++++++++++++++-------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index be78b1848b..42fb4187d7 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -229,17 +229,22 @@ def __init__( self.feedback_type = feedback_type if type(num_factors) is not int: - raise (TypeError) + raise TypeError( + f"Expected num_factors to be INT64 but got {type(num_factors)}" + ) self.num_factors = num_factors if type(user_col) is not str: raise (TypeError) + # if user_col is not "user_id": + # raise ValueError(match="") + self.user_col = user_col if type(item_col) is not str: - raise (TypeError) + raise TypeError(f"Expected item_col to be STR but got {type(item_col)}") self.item_col = item_col self.rating_col = rating_col diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 9413fb036a..a410db524d 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -66,7 +66,9 @@ def test_decomposition_mf_num_factors_low(): def test_decomposition_mf_invalid_num_factors_raises(): num_factors = 0.5 - with pytest.raises(TypeError, match="num_factors"): + with pytest.raises( + TypeError, match=f"Expected num_factors to be INT64 but got {type(num_factors)}" + ): decomposition.MatrixFactorization( # Intentionally pass in the wrong type. 
This will fail if the user is using # a type checker, but we can't assume that everyone is doing so, especially @@ -81,24 +83,31 @@ def test_decomposition_mf_invalid_num_factors_raises(): # def test_decomposition_mf_invalid_user_col_raises(): -# with pytest.raises(TypeError): +# user_col = 123 +# with pytest.raises(TypeError, match="user_col"): # decomposition.MatrixFactorization( # num_factors=16, -# feedback_type="explicit", -# user_col=123, +# # Intentionally pass in the wrong type. This will fail if the user is using +# # a type checker, but we can't assume that everyone is doing so, especially +# # not in notebook environments. +# feedback_type="explicit", # type: ignore +# user_col=user_col, # item_col="item_col", # rating_col="rating_col", # l2_reg=9.83, # ) -# def test_decomposition_mf_invalid_item_col_raises(): -# with pytest.raises(TypeError): -# decomposition.MatrixFactorization( -# num_factors=16, -# feedback_type="explicit", -# user_col="user_col", -# item_col=123, -# rating_col="rating_col", -# l2_reg=9.83, -# ) +def test_decomposition_mf_invalid_item_col_raises(): + item_col = 123 + with pytest.raises( + TypeError, match=f"Expected item_col to be STR but got {type(item_col)}" + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_col", + item_col=item_col, # type: ignore + rating_col="rating_col", + l2_reg=9.83, + ) From b3809e561677be35e63e4d9c2149a7b151939e6e Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 7 Mar 2025 01:07:02 +0000 Subject: [PATCH 33/75] uncomment test --- bigframes/ml/decomposition.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 42fb4187d7..09bc402ecb 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -246,6 +246,9 @@ def __init__( if type(item_col) is not str: raise TypeError(f"Expected item_col to be STR but got {type(item_col)}") + # if item_col is not "item_col": + # 
raise ValueError(match=f"item_col") + self.item_col = item_col self.rating_col = rating_col self.l2_reg = l2_reg From 7e8a5b6aa4046295c2c7792776b173b9494d47ef Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 7 Mar 2025 01:09:35 +0000 Subject: [PATCH 34/75] uncomment test --- bigframes/ml/decomposition.py | 4 ++-- tests/unit/ml/test_matrix_factorization.py | 27 +++++++++++----------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 09bc402ecb..8a4f60962c 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -236,7 +236,7 @@ def __init__( self.num_factors = num_factors if type(user_col) is not str: - raise (TypeError) + raise TypeError(f"Expected item_col to be STR but got {type(user_col)}") # if user_col is not "user_id": # raise ValueError(match="") @@ -247,7 +247,7 @@ def __init__( raise TypeError(f"Expected item_col to be STR but got {type(item_col)}") # if item_col is not "item_col": - # raise ValueError(match=f"item_col") + # raise ValueError(match=f"") self.item_col = item_col self.rating_col = rating_col diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index a410db524d..67b45d8551 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -82,20 +82,19 @@ def test_decomposition_mf_invalid_num_factors_raises(): ) -# def test_decomposition_mf_invalid_user_col_raises(): -# user_col = 123 -# with pytest.raises(TypeError, match="user_col"): -# decomposition.MatrixFactorization( -# num_factors=16, -# # Intentionally pass in the wrong type. This will fail if the user is using -# # a type checker, but we can't assume that everyone is doing so, especially -# # not in notebook environments. 
-# feedback_type="explicit", # type: ignore -# user_col=user_col, -# item_col="item_col", -# rating_col="rating_col", -# l2_reg=9.83, -# ) +def test_decomposition_mf_invalid_user_col_raises(): + user_col = 123 + with pytest.raises( + TypeError, match=f"Expected item_col to be STR but got {type(user_col)}" + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col=user_col, # type: ignore + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) def test_decomposition_mf_invalid_item_col_raises(): From 8599d8848bb2dca03af1e77209cfe581267eb3e7 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 10 Mar 2025 16:24:34 +0000 Subject: [PATCH 35/75] nearly all tests --- bigframes/ml/decomposition.py | 33 ++++++-- tests/unit/ml/test_matrix_factorization.py | 92 ++++++++++++++++++++-- 2 files changed, 110 insertions(+), 15 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 8a4f60962c..81f1d619c8 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -230,27 +230,46 @@ def __init__( if type(num_factors) is not int: raise TypeError( - f"Expected num_factors to be INT64 but got {type(num_factors)}" + f"Expected num_factors to be INT64, but got {type(num_factors)}." + ) + + if num_factors < 0: + raise ValueError( + f"Expected num_factors to be a positive integer, but got {num_factors}." ) self.num_factors = num_factors if type(user_col) is not str: - raise TypeError(f"Expected item_col to be STR but got {type(user_col)}") + raise TypeError(f"Expected user_col to be STR, but got {type(user_col)}.") - # if user_col is not "user_id": - # raise ValueError(match="") + if user_col != "user_id": + raise ValueError( + f"Expected user_col column to be `user_id`, but got {user_col}." 
+ ) self.user_col = user_col if type(item_col) is not str: - raise TypeError(f"Expected item_col to be STR but got {type(item_col)}") + raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") - # if item_col is not "item_col": - # raise ValueError(match=f"") + if item_col != "item_col": + raise ValueError( + f"Expected item_col column to be `item_col`, but got {item_col}." + ) self.item_col = item_col + + if type(rating_col) is not str: + raise TypeError( + f"Expected rating_col to be STR, but got {type(rating_col)}." + ) + self.rating_col = rating_col + + if type(l2_reg) is not float: + raise TypeError(f"Expected l2_reg to be FLOAT, but got {type(l2_reg)}.") + self.l2_reg = l2_reg self._bqml_model: Optional[core.BqmlModel] = None self._bqml_model_factory = globals.bqml_model_factory() diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 67b45d8551..c7adf4f6ed 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -61,18 +61,32 @@ def test_decomposition_mf_num_factors_low(): assert model.num_factors == 0 -# test_decomposition_mf_negative_num_factors_raises +def test_decomposition_mf_negative_num_factors_raises(): + num_factors = -2 + with pytest.raises( + ValueError, + match=f"Expected num_factors to be a positive integer, but got {num_factors}.", + ): + decomposition.MatrixFactorization( + # Intentionally pass in the wrong type. This will fail if the user is using + # a type checker, but we can't assume that everyone is doing so, especially + # not in notebook environments. 
+ num_factors=num_factors, # type: ignore + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) def test_decomposition_mf_invalid_num_factors_raises(): num_factors = 0.5 with pytest.raises( - TypeError, match=f"Expected num_factors to be INT64 but got {type(num_factors)}" + TypeError, + match=f"Expected num_factors to be INT64, but got {type(num_factors)}.", ): decomposition.MatrixFactorization( - # Intentionally pass in the wrong type. This will fail if the user is using - # a type checker, but we can't assume that everyone is doing so, especially - # not in notebook environments. num_factors=num_factors, # type: ignore feedback_type="explicit", user_col="user_id", @@ -85,7 +99,23 @@ def test_decomposition_mf_invalid_num_factors_raises(): def test_decomposition_mf_invalid_user_col_raises(): user_col = 123 with pytest.raises( - TypeError, match=f"Expected item_col to be STR but got {type(user_col)}" + TypeError, match=f"Expected user_col to be STR, but got {type(user_col)}." + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col=user_col, # type: ignore + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) + + +def test_decomposition_mf_label_user_col_raises(): + user_col = "user_col" + with pytest.raises( + ValueError, + match=f"Expected user_col column to be `user_id`, but got {user_col}.", ): decomposition.MatrixFactorization( num_factors=16, @@ -100,13 +130,59 @@ def test_decomposition_mf_invalid_user_col_raises(): def test_decomposition_mf_invalid_item_col_raises(): item_col = 123 with pytest.raises( - TypeError, match=f"Expected item_col to be STR but got {type(item_col)}" + TypeError, match=f"Expected item_col to be STR, but got {type(item_col)}." 
): decomposition.MatrixFactorization( num_factors=16, feedback_type="explicit", - user_col="user_col", + user_col="user_id", item_col=item_col, # type: ignore rating_col="rating_col", l2_reg=9.83, ) + + +def test_decomposition_mf_label_item_col_raises(): + item_col = "item_id" + with pytest.raises( + ValueError, + match=f"Expected item_col column to be `item_col`, but got {item_col}.", + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col=item_col, # type: ignore + rating_col="rating_col", + l2_reg=9.83, + ) + + +def test_decomposition_mf_invalid_rating_col_raises(): + rating_col = 4 + with pytest.raises( + TypeError, match=f"Expected rating_col to be STR, but got {type(rating_col)}." + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col=rating_col, # type: ignore + l2_reg=9.83, + ) + + +def test_decomposition_mf_invalid_l2_reg_raises(): + l2_reg = "6.02" + with pytest.raises( + TypeError, match=f"Expected l2_reg to be FLOAT, but got {type(l2_reg)}." + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=l2_reg, # type: ignore + ) From 8ab88187f7cb53e5aeeb20fda5a327d54d9d04fa Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 10 Mar 2025 18:46:43 +0000 Subject: [PATCH 36/75] tests complete and passing --- bigframes/ml/decomposition.py | 6 ++++++ tests/unit/ml/test_matrix_factorization.py | 21 +++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 81f1d619c8..2128470b97 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -226,6 +226,12 @@ def __init__( # TODO: Add support for hyperparameter tuning. 
l2_reg: float = 1.0, ): + + if feedback_type not in ("explicit", "implicit"): + raise ValueError( + f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}" + ) + self.feedback_type = feedback_type if type(num_factors) is not int: diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index c7adf4f6ed..ca256a419d 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -46,7 +46,23 @@ def test_decomposition_mf_feedback_type_explicit(): assert model.feedback_type == "explicit" -# test_decomposition_mf_invalid_feedback_type_raises +def test_decomposition_mf_invalid_feedback_type_raises(): + feedback_type = "explimp" + with pytest.raises( + ValueError, + match=f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}", + ): + decomposition.MatrixFactorization( + # Intentionally pass in the wrong type. This will fail if the user is using + # a type checker, but we can't assume that everyone is doing so, especially + # not in notebook environments. + num_factors=16, + feedback_type=feedback_type, # type: ignore + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=9.83, + ) def test_decomposition_mf_num_factors_low(): @@ -68,9 +84,6 @@ def test_decomposition_mf_negative_num_factors_raises(): match=f"Expected num_factors to be a positive integer, but got {num_factors}.", ): decomposition.MatrixFactorization( - # Intentionally pass in the wrong type. This will fail if the user is using - # a type checker, but we can't assume that everyone is doing so, especially - # not in notebook environments. 
num_factors=num_factors, # type: ignore feedback_type="explicit", user_col="user_id", From b4d357865548022cc23402411d9f34f4cccd674b Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 10 Mar 2025 20:58:48 +0000 Subject: [PATCH 37/75] seeing if test causes kokoro failure --- tests/unit/ml/test_matrix_factorization.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index ca256a419d..63f5a58fbe 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -155,20 +155,20 @@ def test_decomposition_mf_invalid_item_col_raises(): ) -def test_decomposition_mf_label_item_col_raises(): - item_col = "item_id" - with pytest.raises( - ValueError, - match=f"Expected item_col column to be `item_col`, but got {item_col}.", - ): - decomposition.MatrixFactorization( - num_factors=16, - feedback_type="explicit", - user_col="user_id", - item_col=item_col, # type: ignore - rating_col="rating_col", - l2_reg=9.83, - ) +# def test_decomposition_mf_label_item_col_raises(): +# item_col = "item_id" +# with pytest.raises( +# ValueError, +# match=f"Expected item_col column to be `item_col`, but got {item_col}.", +# ): +# decomposition.MatrixFactorization( +# num_factors=16, +# feedback_type="explicit", +# user_col="user_id", +# item_col=item_col, # type: ignore +# rating_col="rating_col", +# l2_reg=9.83, +# ) def test_decomposition_mf_invalid_rating_col_raises(): From a63cb90d0099b54404db5d41e7c4b3c7b75b6d14 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 10 Mar 2025 21:11:01 +0000 Subject: [PATCH 38/75] uncomment test-kokoro still failing --- tests/unit/ml/test_matrix_factorization.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 63f5a58fbe..ca256a419d 100644 --- 
a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -155,20 +155,20 @@ def test_decomposition_mf_invalid_item_col_raises(): ) -# def test_decomposition_mf_label_item_col_raises(): -# item_col = "item_id" -# with pytest.raises( -# ValueError, -# match=f"Expected item_col column to be `item_col`, but got {item_col}.", -# ): -# decomposition.MatrixFactorization( -# num_factors=16, -# feedback_type="explicit", -# user_col="user_id", -# item_col=item_col, # type: ignore -# rating_col="rating_col", -# l2_reg=9.83, -# ) +def test_decomposition_mf_label_item_col_raises(): + item_col = "item_id" + with pytest.raises( + ValueError, + match=f"Expected item_col column to be `item_col`, but got {item_col}.", + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col=item_col, # type: ignore + rating_col="rating_col", + l2_reg=9.83, + ) def test_decomposition_mf_invalid_rating_col_raises(): From e69438d39e6e725dc5763cf2fecd6903753ca159 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 11 Mar 2025 16:42:58 +0000 Subject: [PATCH 39/75] remove comment --- tests/unit/ml/test_golden_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 18043426a9..32c2250ec2 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -213,7 +213,7 @@ def test_logistic_regression_score(mock_session, bqml_model, mock_X, mock_y): def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X): - model = decomposition.MatrixFactorization( # revise + model = decomposition.MatrixFactorization( num_factors=34, feedback_type="explicit", user_col="user_id", From 087953f0eac9ed29fa30c79e16a983196d2897db Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 11 Mar 2025 18:40:55 +0000 Subject: [PATCH 40/75] fix test --- tests/unit/ml/test_golden_sql.py | 3 --- 1 file changed, 
3 deletions(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 32c2250ec2..03695a20e4 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -222,9 +222,6 @@ def test_decomposition_mf_default_fit(bqml_model_factory, mock_session, mock_X): l2_reg=9.83, ) model._bqml_model_factory = bqml_model_factory - mock_start_query_ml_ddl = mock.Mock() - mock_create_model = mock.PropertyMock(return_value=mock_start_query_ml_ddl) - type(model)._start_query_ml_ddl = mock_create_model model.fit(mock_X) mock_session._start_query_ml_ddl.assert_called_once_with( From 8912663ba7b72abf9943789741be710174b83e55 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 13:42:33 +0000 Subject: [PATCH 41/75] test kokoro --- bigframes/ml/decomposition.py | 8 +++---- tests/unit/ml/test_matrix_factorization.py | 28 +++++++++++----------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 2128470b97..6b396709b6 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -259,10 +259,10 @@ def __init__( if type(item_col) is not str: raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") - if item_col != "item_col": - raise ValueError( - f"Expected item_col column to be `item_col`, but got {item_col}." - ) + # if item_col != "item_col": + # raise ValueError( + # f"Expected item_col column to be `item_col`, but got {item_col}." 
+ # ) self.item_col = item_col diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index ca256a419d..63f5a58fbe 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -155,20 +155,20 @@ def test_decomposition_mf_invalid_item_col_raises(): ) -def test_decomposition_mf_label_item_col_raises(): - item_col = "item_id" - with pytest.raises( - ValueError, - match=f"Expected item_col column to be `item_col`, but got {item_col}.", - ): - decomposition.MatrixFactorization( - num_factors=16, - feedback_type="explicit", - user_col="user_id", - item_col=item_col, # type: ignore - rating_col="rating_col", - l2_reg=9.83, - ) +# def test_decomposition_mf_label_item_col_raises(): +# item_col = "item_id" +# with pytest.raises( +# ValueError, +# match=f"Expected item_col column to be `item_col`, but got {item_col}.", +# ): +# decomposition.MatrixFactorization( +# num_factors=16, +# feedback_type="explicit", +# user_col="user_id", +# item_col=item_col, # type: ignore +# rating_col="rating_col", +# l2_reg=9.83, +# ) def test_decomposition_mf_invalid_rating_col_raises(): From 35a8c1822e880ef45e373d116ad92b2ea1de4f3e Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 15:04:53 +0000 Subject: [PATCH 42/75] test_decomposition.py failing and now feedback_type attr does not exist --- bigframes/ml/decomposition.py | 8 +++---- tests/unit/ml/test_matrix_factorization.py | 28 +++++++++++----------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 6b396709b6..2128470b97 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -259,10 +259,10 @@ def __init__( if type(item_col) is not str: raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") - # if item_col != "item_col": - # raise ValueError( - # f"Expected item_col column to be `item_col`, but got 
{item_col}." - # ) + if item_col != "item_col": + raise ValueError( + f"Expected item_col column to be `item_col`, but got {item_col}." + ) self.item_col = item_col diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 63f5a58fbe..ca256a419d 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -155,20 +155,20 @@ def test_decomposition_mf_invalid_item_col_raises(): ) -# def test_decomposition_mf_label_item_col_raises(): -# item_col = "item_id" -# with pytest.raises( -# ValueError, -# match=f"Expected item_col column to be `item_col`, but got {item_col}.", -# ): -# decomposition.MatrixFactorization( -# num_factors=16, -# feedback_type="explicit", -# user_col="user_id", -# item_col=item_col, # type: ignore -# rating_col="rating_col", -# l2_reg=9.83, -# ) +def test_decomposition_mf_label_item_col_raises(): + item_col = "item_id" + with pytest.raises( + ValueError, + match=f"Expected item_col column to be `item_col`, but got {item_col}.", + ): + decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col=item_col, # type: ignore + rating_col="rating_col", + l2_reg=9.83, + ) def test_decomposition_mf_invalid_rating_col_raises(): From ff58ff501888f54af73ffdd4b266bc0a4b0ab037 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 20:10:39 +0000 Subject: [PATCH 43/75] passing tests --- bigframes/ml/decomposition.py | 2 +- tests/system/large/ml/test_decomposition.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 2128470b97..04e1efc83e 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -227,7 +227,7 @@ def __init__( l2_reg: float = 1.0, ): - if feedback_type not in ("explicit", "implicit"): + if feedback_type not in ("explicit", "implicit", "EXPLICIT", "IMPLICIT"): raise ValueError( 
f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}" ) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 0a25187935..c9c3d7433e 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -172,11 +172,15 @@ def test_decomposition_mf_configure_fit_load( num_factors=6, feedback_type="explicit", user_col="user_id", - item_col="item_id", + item_col="item_col", rating_col="ratings", l2_reg=9.83, ) - model.fit(ratings_df_default_index) + model.fit( + ratings_df_default_index.rename( + columns={"rating": "rating_col", "item_id": "item_col"} + ) + ) reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_mf_model", replace=True @@ -207,6 +211,6 @@ def test_decomposition_mf_configure_fit_load( assert reloaded_model.feedback_type == "EXPLICIT" assert reloaded_model.num_factors == 6 assert reloaded_model.user_col == "user_id" - assert reloaded_model.item_col == "item_id" + assert reloaded_model.item_col == "item_col" assert reloaded_model.rating_col == "rating" assert reloaded_model.l2_reg == 9.83 From f0a6ba21d4b2a6a61e49b00de967a802c2b7101e Mon Sep 17 00:00:00 2001 From: rey-esp Date: Wed, 12 Mar 2025 15:24:32 -0500 Subject: [PATCH 44/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 04e1efc83e..216557b859 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -227,7 +227,8 @@ def __init__( l2_reg: float = 1.0, ): - if feedback_type not in ("explicit", "implicit", "EXPLICIT", "IMPLICIT"): + feedback_type = feedback_type.lower() + if feedback_type not in ("explicit", "implicit"): raise ValueError( f"Expected feedback_type to be 
`explicit` or `implicit`, but got {feedback_type}" ) From b586c5ce875364206749203af482795e9a6237b7 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Wed, 12 Mar 2025 15:44:15 -0500 Subject: [PATCH 45/75] Update tests/system/large/ml/test_decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- tests/system/large/ml/test_decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index c9c3d7433e..a2aeaabd9e 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -208,7 +208,7 @@ def test_decomposition_mf_configure_fit_load( in reloaded_model._bqml_model.model_name ) assert result is not None - assert reloaded_model.feedback_type == "EXPLICIT" + assert reloaded_model.feedback_type == "explicit" assert reloaded_model.num_factors == 6 assert reloaded_model.user_col == "user_id" assert reloaded_model.item_col == "item_col" From 565138aed2c24fba554eb508154f54018ed20fad Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 23:15:18 +0000 Subject: [PATCH 46/75] doc attempt - _mf.py example --- .../bigframes_vendored/sklearn/decomposition/_mf.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 2d9ec4e1a1..2d33a177b2 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -25,11 +25,13 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) - >>> model = MatrixFactorization(n_components=2, init='random', 
random_state=0) - >>> W = model.fit_transform(X) - >>> H = model.components_ + >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) + >>> W = model.fit(X) + >>> result = model.global_explain() Args: + feedback_type ('explicit' | 'implicit'): + Specifies the feedback type for the model. The feedback type determines the algorithm that is used during training. num_factors (int or auto, default auto): Specifies the number of latent factors to use. user_col (str): From c0ef08f09c46dad3c1f82589dad55857e0d0801d Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 13 Mar 2025 14:31:48 +0000 Subject: [PATCH 47/75] feedback_type case ignore --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 216557b859..183e266e30 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -227,7 +227,7 @@ def __init__( l2_reg: float = 1.0, ): - feedback_type = feedback_type.lower() + feedback_type = feedback_type.lower() # type: ignore if feedback_type not in ("explicit", "implicit"): raise ValueError( f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}" From 664de04d9fd76a19b1d2cb9c40e13a8748b4a1b1 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Tue, 18 Mar 2025 15:33:28 -0500 Subject: [PATCH 48/75] Update _mf.py - remove global_explain() --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 2d33a177b2..6bcae7a206 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -27,7 +27,6 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], 
[4, 1], [5, 0.8], [6, 1]]) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) >>> W = model.fit(X) - >>> result = model.global_explain() Args: feedback_type ('explicit' | 'implicit'): From 63e8e9c3b0fbf4a4c62130355eb021aff34868a4 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 18 Mar 2025 22:56:41 +0000 Subject: [PATCH 49/75] fit --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 2d33a177b2..64b77f7730 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,8 +26,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) - >>> W = model.fit(X) - >>> result = model.global_explain() + >>> W = model.fit(model.fit(X.rename(columns={'rating': 'rating_col', 'item_id': 'item_col'}))) Args: feedback_type ('explicit' | 'implicit'): From c2e9a5fd3325d9ee0cb3a0704ec515934fe2d800 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 18 Mar 2025 23:31:38 +0000 Subject: [PATCH 50/75] W --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 6bcae7a206..5abb6bb549 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,7 +26,7 @@ class 
MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) - >>> W = model.fit(X) + >>> W = model.fit(model.fit(X.rename(columns={0:'user_id', 1: 'item_col', 2: 'rating_col'}))) Args: feedback_type ('explicit' | 'implicit'): From 193b9c8c8aef13b991e2b8a031ac0d013344baa3 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 24 Mar 2025 17:54:42 +0000 Subject: [PATCH 51/75] fix docs (maybe) --- bigframes/ml/decomposition.py | 12 +++--------- .../bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 183e266e30..17f91683af 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -229,9 +229,7 @@ def __init__( feedback_type = feedback_type.lower() # type: ignore if feedback_type not in ("explicit", "implicit"): - raise ValueError( - f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}" - ) + raise ValueError("Expected feedback_type to be `explicit` or `implicit`.") self.feedback_type = feedback_type @@ -251,9 +249,7 @@ def __init__( raise TypeError(f"Expected user_col to be STR, but got {type(user_col)}.") if user_col != "user_id": - raise ValueError( - f"Expected user_col column to be `user_id`, but got {user_col}." - ) + raise ValueError("Expected user_col column to be `user_id`.") self.user_col = user_col @@ -261,9 +257,7 @@ def __init__( raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") if item_col != "item_col": - raise ValueError( - f"Expected item_col column to be `item_col`, but got {item_col}." 
- ) + raise ValueError("Expected item_col column to be `item_col`.") self.item_col = item_col diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 5abb6bb549..d8a1f0eb04 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,7 +26,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) - >>> W = model.fit(model.fit(X.rename(columns={0:'user_id', 1: 'item_col', 2: 'rating_col'}))) + >>> W = model.fit(X.rename(columns={0:'user_id', 2: 'rating_col', 1: 'item_col'})) Args: feedback_type ('explicit' | 'implicit'): From 5a547f82df503f2ab8f7dc32990d6fdc4a182e39 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 24 Mar 2025 13:09:13 -0500 Subject: [PATCH 52/75] Update test_matrix_factorization.py with updated error messages --- tests/unit/ml/test_matrix_factorization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index ca256a419d..bd1b61778d 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -50,7 +50,7 @@ def test_decomposition_mf_invalid_feedback_type_raises(): feedback_type = "explimp" with pytest.raises( ValueError, - match=f"Expected feedback_type to be `explicit` or `implicit`, but got {feedback_type}", + match=f"Expected feedback_type to be `explicit` or `implicit`.", ): decomposition.MatrixFactorization( # Intentionally pass in the wrong type. 
This will fail if the user is using @@ -128,7 +128,7 @@ def test_decomposition_mf_label_user_col_raises(): user_col = "user_col" with pytest.raises( ValueError, - match=f"Expected user_col column to be `user_id`, but got {user_col}.", + match=f"Expected user_col column to be `user_id`.", ): decomposition.MatrixFactorization( num_factors=16, @@ -159,7 +159,7 @@ def test_decomposition_mf_label_item_col_raises(): item_col = "item_id" with pytest.raises( ValueError, - match=f"Expected item_col column to be `item_col`, but got {item_col}.", + match=f"Expected item_col column to be `item_col`.", ): decomposition.MatrixFactorization( num_factors=16, From 23d8fc896876829c52d90ffd44fbf04187a2ba4b Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 24 Mar 2025 18:16:09 +0000 Subject: [PATCH 53/75] lint --- tests/unit/ml/test_matrix_factorization.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index bd1b61778d..e4e44d0f99 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -50,7 +50,7 @@ def test_decomposition_mf_invalid_feedback_type_raises(): feedback_type = "explimp" with pytest.raises( ValueError, - match=f"Expected feedback_type to be `explicit` or `implicit`.", + match="Expected feedback_type to be `explicit` or `implicit`.", ): decomposition.MatrixFactorization( # Intentionally pass in the wrong type. 
This will fail if the user is using @@ -128,7 +128,7 @@ def test_decomposition_mf_label_user_col_raises(): user_col = "user_col" with pytest.raises( ValueError, - match=f"Expected user_col column to be `user_id`.", + match="Expected user_col column to be `user_id`.", ): decomposition.MatrixFactorization( num_factors=16, @@ -143,7 +143,7 @@ def test_decomposition_mf_label_user_col_raises(): def test_decomposition_mf_invalid_item_col_raises(): item_col = 123 with pytest.raises( - TypeError, match=f"Expected item_col to be STR, but got {type(item_col)}." + TypeError, match="Expected item_col to be STR, but got {type(item_col)}." ): decomposition.MatrixFactorization( num_factors=16, @@ -159,7 +159,7 @@ def test_decomposition_mf_label_item_col_raises(): item_col = "item_id" with pytest.raises( ValueError, - match=f"Expected item_col column to be `item_col`.", + match="Expected item_col column to be `item_col`.", ): decomposition.MatrixFactorization( num_factors=16, From ed99ad7c67e5b42a7ea6c2ddf1b95c465fbdb170 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 24 Mar 2025 13:27:56 -0500 Subject: [PATCH 54/75] Update test_matrix_factorization.py - add 'f' --- tests/unit/ml/test_matrix_factorization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index e4e44d0f99..047bd367b8 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -143,7 +143,7 @@ def test_decomposition_mf_label_user_col_raises(): def test_decomposition_mf_invalid_item_col_raises(): item_col = 123 with pytest.raises( - TypeError, match="Expected item_col to be STR, but got {type(item_col)}." + TypeError, match=f"Expected item_col to be STR, but got {type(item_col)}." 
): decomposition.MatrixFactorization( num_factors=16, From e305950d917d92100bd5db5fb07b015e0a2605c9 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 24 Mar 2025 20:35:25 +0000 Subject: [PATCH 55/75] improve errors and update tests --- bigframes/ml/decomposition.py | 16 +++++++++------- tests/unit/ml/test_matrix_factorization.py | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 17f91683af..6768c2db25 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -233,9 +233,9 @@ def __init__( self.feedback_type = feedback_type - if type(num_factors) is not int: + if not isinstance(num_factors, int): raise TypeError( - f"Expected num_factors to be INT64, but got {type(num_factors)}." + f"Expected num_factors to be INT, but got {type(num_factors)}." ) if num_factors < 0: @@ -245,7 +245,7 @@ def __init__( self.num_factors = num_factors - if type(user_col) is not str: + if not isinstance(user_col, str): raise TypeError(f"Expected user_col to be STR, but got {type(user_col)}.") if user_col != "user_id": @@ -253,7 +253,7 @@ def __init__( self.user_col = user_col - if type(item_col) is not str: + if not isinstance(item_col, str): raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") if item_col != "item_col": @@ -261,15 +261,17 @@ def __init__( self.item_col = item_col - if type(rating_col) is not str: + if not isinstance(rating_col, str): raise TypeError( f"Expected rating_col to be STR, but got {type(rating_col)}." ) self.rating_col = rating_col - if type(l2_reg) is not float: - raise TypeError(f"Expected l2_reg to be FLOAT, but got {type(l2_reg)}.") + if not isinstance(l2_reg, (float, int)): + raise TypeError( + f"Expected l2_reg to be FLOAT or INT, but got {type(l2_reg)}." 
+ ) self.l2_reg = l2_reg self._bqml_model: Optional[core.BqmlModel] = None diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 047bd367b8..826681eace 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -25,7 +25,7 @@ def test_decomposition_mf_model(): user_col="user_id", item_col="item_col", rating_col="rating_col", - l2_reg=9.83, + l2_reg=9, ) assert model.num_factors == 16 assert model.feedback_type == "implicit" @@ -97,7 +97,7 @@ def test_decomposition_mf_invalid_num_factors_raises(): num_factors = 0.5 with pytest.raises( TypeError, - match=f"Expected num_factors to be INT64, but got {type(num_factors)}.", + match=f"Expected num_factors to be INT, but got {type(num_factors)}.", ): decomposition.MatrixFactorization( num_factors=num_factors, # type: ignore @@ -186,10 +186,22 @@ def test_decomposition_mf_invalid_rating_col_raises(): ) +def test_decomposition_mf_l2_reg(): + model = decomposition.MatrixFactorization( + num_factors=16, + feedback_type="explicit", + user_col="user_id", + item_col="item_col", + rating_col="rating_col", + l2_reg=6.02, # type: ignore + ) + assert model.l2_reg == 6.02 + + def test_decomposition_mf_invalid_l2_reg_raises(): l2_reg = "6.02" with pytest.raises( - TypeError, match=f"Expected l2_reg to be FLOAT, but got {type(l2_reg)}." + TypeError, match=f"Expected l2_reg to be FLOAT or INT, but got {type(l2_reg)}." 
): decomposition.MatrixFactorization( num_factors=16, From 32917e5c5cc4bf4c54bbb8c9a5b3edb12611f2cf Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 08:37:22 -0500 Subject: [PATCH 56/75] Update tests/system/large/ml/test_decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- tests/system/large/ml/test_decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index a2aeaabd9e..add05c9abe 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -212,5 +212,5 @@ def test_decomposition_mf_configure_fit_load( assert reloaded_model.num_factors == 6 assert reloaded_model.user_col == "user_id" assert reloaded_model.item_col == "item_col" - assert reloaded_model.rating_col == "rating" + assert reloaded_model.rating_col == "ratings" assert reloaded_model.l2_reg == 9.83 From e485d3b9a2dccce5b2f717a2f8ae2ec70d668242 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 08:39:09 -0500 Subject: [PATCH 57/75] Update bigframes/ml/decomposition.py - num_factors error messsage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 6768c2db25..5911e9285e 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -235,7 +235,7 @@ def __init__( if not isinstance(num_factors, int): raise TypeError( - f"Expected num_factors to be INT, but got {type(num_factors)}." + f"Expected num_factors to be an int, but got {type(num_factors)}." 
) if num_factors < 0: From 6a27083edfbf5edd51673e94360ac0de9ea4ff92 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 08:39:36 -0500 Subject: [PATCH 58/75] Update bigframes/ml/decomposition.py - user_col error message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 5911e9285e..864df46924 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -246,7 +246,7 @@ def __init__( self.num_factors = num_factors if not isinstance(user_col, str): - raise TypeError(f"Expected user_col to be STR, but got {type(user_col)}.") + raise TypeError(f"Expected user_col to be a str, but got {type(user_col)}.") if user_col != "user_id": raise ValueError("Expected user_col column to be `user_id`.") From 6e2d902d94bdc3ac2e1c20f9e25357b9eac86107 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 09:59:15 -0500 Subject: [PATCH 59/75] Update bigframes/ml/decomposition.py - rating_col error message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 864df46924..7ea0ccfd74 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -263,7 +263,7 @@ def __init__( if not isinstance(rating_col, str): raise TypeError( - f"Expected rating_col to be STR, but got {type(rating_col)}." + f"Expected rating_col to be a str, but got {type(rating_col)}." 
) self.rating_col = rating_col From b65c63789b2bfc2f16ca8467e4742ef140ad86cc Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 09:59:44 -0500 Subject: [PATCH 60/75] Update bigframes/ml/decomposition.py - l2_reg error msg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 7ea0ccfd74..59d2b0277f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -270,7 +270,7 @@ def __init__( if not isinstance(l2_reg, (float, int)): raise TypeError( - f"Expected l2_reg to be FLOAT or INT, but got {type(l2_reg)}." + f"Expected l2_reg to be a float or int, but got {type(l2_reg)}." ) self.l2_reg = l2_reg From 74ebe27a9dca3f140059de2d8917c7150e8cc7f8 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 27 Mar 2025 15:09:18 +0000 Subject: [PATCH 61/75] fix tests to match updated error messages --- tests/unit/ml/test_matrix_factorization.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 826681eace..6e4cf04b79 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -97,7 +97,7 @@ def test_decomposition_mf_invalid_num_factors_raises(): num_factors = 0.5 with pytest.raises( TypeError, - match=f"Expected num_factors to be INT, but got {type(num_factors)}.", + match=f"Expected num_factors to be an int, but got {type(num_factors)}.", ): decomposition.MatrixFactorization( num_factors=num_factors, # type: ignore @@ -112,7 +112,7 @@ def test_decomposition_mf_invalid_num_factors_raises(): def test_decomposition_mf_invalid_user_col_raises(): user_col = 123 with pytest.raises( - TypeError, match=f"Expected user_col to be STR, but got {type(user_col)}." 
+ TypeError, match=f"Expected user_col to be a str, but got {type(user_col)}." ): decomposition.MatrixFactorization( num_factors=16, @@ -174,7 +174,7 @@ def test_decomposition_mf_label_item_col_raises(): def test_decomposition_mf_invalid_rating_col_raises(): rating_col = 4 with pytest.raises( - TypeError, match=f"Expected rating_col to be STR, but got {type(rating_col)}." + TypeError, match=f"Expected rating_col to be a str, but got {type(rating_col)}." ): decomposition.MatrixFactorization( num_factors=16, @@ -201,7 +201,8 @@ def test_decomposition_mf_l2_reg(): def test_decomposition_mf_invalid_l2_reg_raises(): l2_reg = "6.02" with pytest.raises( - TypeError, match=f"Expected l2_reg to be FLOAT or INT, but got {type(l2_reg)}." + TypeError, + match=f"Expected l2_reg to be a float or int, but got {type(l2_reg)}.", ): decomposition.MatrixFactorization( num_factors=16, From 3f40763951cd7b67bda8b5c146545765cad653e8 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 10:52:17 -0500 Subject: [PATCH 62/75] Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py - docs df MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index d8a1f0eb04..865db974e2 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -24,7 +24,11 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization - >>> X = bpd.DataFrame([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> X = bpd.DataFrame({ + ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], + ... 
"column": [0, 1] * 6, + ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], + ... }) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) >>> W = model.fit(X.rename(columns={0:'user_id', 2: 'rating_col', 1: 'item_col'})) From 2cbc2e3f5839557969ef2ee0bf5e7317a92fe383 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 10:53:06 -0500 Subject: [PATCH 63/75] Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py - docs model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 865db974e2..1d84ebc374 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -29,7 +29,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): ... "column": [0, 1] * 6, ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], ... 
}) - >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='user_id', item_col='item_col', l2_reg=2.06) + >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) >>> W = model.fit(X.rename(columns={0:'user_id', 2: 'rating_col', 1: 'item_col'})) Args: From 0a5aefb5c9b33fb98bd414fe1d14cee9b031ede8 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Thu, 27 Mar 2025 10:53:33 -0500 Subject: [PATCH 64/75] Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py - docs fit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 1d84ebc374..c506a10671 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -30,7 +30,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], ... 
}) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) - >>> W = model.fit(X.rename(columns={0:'user_id', 2: 'rating_col', 1: 'item_col'})) + >>> W = model.fit(X) Args: feedback_type ('explicit' | 'implicit'): From 366e0ab755a9596aa934a03f586d46ee030e06b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 28 Mar 2025 11:25:57 -0500 Subject: [PATCH 65/75] Update third_party/bigframes_vendored/sklearn/decomposition/_mf.py --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index c506a10671..41a2693a14 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,7 +26,7 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], - ... "column": [0, 1] * 6, + ... "column": [0, 1] * 7, ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], ... 
}) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) From 56ee62399822a629484254c393041f01238bcd88 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 31 Mar 2025 15:01:00 +0000 Subject: [PATCH 66/75] remove errors and tests --- bigframes/ml/decomposition.py | 6 ---- tests/unit/ml/test_matrix_factorization.py | 32 ---------------------- 2 files changed, 38 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 59d2b0277f..e99853dfa3 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -248,17 +248,11 @@ def __init__( if not isinstance(user_col, str): raise TypeError(f"Expected user_col to be a str, but got {type(user_col)}.") - if user_col != "user_id": - raise ValueError("Expected user_col column to be `user_id`.") - self.user_col = user_col if not isinstance(item_col, str): raise TypeError(f"Expected item_col to be STR, but got {type(item_col)}.") - if item_col != "item_col": - raise ValueError("Expected item_col column to be `item_col`.") - self.item_col = item_col if not isinstance(rating_col, str): diff --git a/tests/unit/ml/test_matrix_factorization.py b/tests/unit/ml/test_matrix_factorization.py index 6e4cf04b79..92691ba9d4 100644 --- a/tests/unit/ml/test_matrix_factorization.py +++ b/tests/unit/ml/test_matrix_factorization.py @@ -124,22 +124,6 @@ def test_decomposition_mf_invalid_user_col_raises(): ) -def test_decomposition_mf_label_user_col_raises(): - user_col = "user_col" - with pytest.raises( - ValueError, - match="Expected user_col column to be `user_id`.", - ): - decomposition.MatrixFactorization( - num_factors=16, - feedback_type="explicit", - user_col=user_col, # type: ignore - item_col="item_col", - rating_col="rating_col", - l2_reg=9.83, - ) - - def test_decomposition_mf_invalid_item_col_raises(): item_col = 123 with pytest.raises( @@ -155,22 +139,6 @@ def 
test_decomposition_mf_invalid_item_col_raises(): ) -def test_decomposition_mf_label_item_col_raises(): - item_col = "item_id" - with pytest.raises( - ValueError, - match="Expected item_col column to be `item_col`.", - ): - decomposition.MatrixFactorization( - num_factors=16, - feedback_type="explicit", - user_col="user_id", - item_col=item_col, # type: ignore - rating_col="rating_col", - l2_reg=9.83, - ) - - def test_decomposition_mf_invalid_rating_col_raises(): rating_col = 4 with pytest.raises( From c9424183486d39c52ff462bb794cc9b40a23313b Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 31 Mar 2025 10:14:35 -0500 Subject: [PATCH 67/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index e99853dfa3..1488a8270e 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -260,7 +260,7 @@ def __init__( f"Expected rating_col to be a str, but got {type(rating_col)}." 
) - self.rating_col = rating_col + self._input_label_columns = [rating_col] if not isinstance(l2_reg, (float, int)): raise TypeError( From e0ef53e18627bafb70427d0d7af68f7ff1db6245 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 31 Mar 2025 10:14:56 -0500 Subject: [PATCH 68/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 1488a8270e..4f0ff94617 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -34,7 +34,7 @@ "num_factors": "numFactors", "user_col": "userColumn", "item_col": "itemColumn", - # TODO: Add rating_col + "_input_label_columns": "inputLabelColumns", "l2_reg": "l2Regularization", } From 5018182095b482b9e55c5af3d7bfb77af04f40dd Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 31 Mar 2025 10:15:09 -0500 Subject: [PATCH 69/75] Update bigframes/ml/decomposition.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/ml/decomposition.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 4f0ff94617..54c2fce9ff 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -271,6 +271,11 @@ def __init__( self._bqml_model: Optional[core.BqmlModel] = None self._bqml_model_factory = globals.bqml_model_factory() + @property + def rating_col(self) -> str: + """str: The rating column name. 
Defaults to 'rating'."""" + return self._input_label_columns[0] + @classmethod def _from_bq( cls, session: bigframes.session.Session, bq_model: bigquery.Model From f9397f19b91fbf153dd6f93440686bbeb80c6dd9 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 31 Mar 2025 20:21:23 +0000 Subject: [PATCH 70/75] passing system test --- bigframes/ml/decomposition.py | 2 +- tests/data/ratings.jsonl | 40 ++++++++++----------- tests/data/ratings_schema.json | 2 +- tests/system/large/ml/test_decomposition.py | 21 +++++------ tests/unit/ml/test_golden_sql.py | 2 +- 5 files changed, 31 insertions(+), 36 deletions(-) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 54c2fce9ff..ece950a5a2 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -273,7 +273,7 @@ def __init__( @property def rating_col(self) -> str: - """str: The rating column name. Defaults to 'rating'."""" + """str: The rating column name. Defaults to 'rating'.""" return self._input_label_columns[0] @classmethod diff --git a/tests/data/ratings.jsonl b/tests/data/ratings.jsonl index 919b61c350..b7cd350d08 100644 --- a/tests/data/ratings.jsonl +++ b/tests/data/ratings.jsonl @@ -1,20 +1,20 @@ -{"user_id": 1, "item_id": 2, "ratings": 4.0} -{"user_id": 1, "item_id": 5, "ratings": 3.0} -{"user_id": 2, "item_id": 1, "ratings": 5.0} -{"user_id": 2, "item_id": 3, "ratings": 2.0} -{"user_id": 3, "item_id": 4, "ratings": 4.5} -{"user_id": 3, "item_id": 7, "ratings": 3.5} -{"user_id": 4, "item_id": 2, "ratings": 1.0} -{"user_id": 4, "item_id": 8, "ratings": 5.0} -{"user_id": 5, "item_id": 3, "ratings": 4.0} -{"user_id": 5, "item_id": 9, "ratings": 2.5} -{"user_id": 6, "item_id": 1, "ratings": 3.0} -{"user_id": 6, "item_id": 6, "ratings": 4.5} -{"user_id": 7, "item_id": 5, "ratings": 5.0} -{"user_id": 7, "item_id": 10, "ratings": 1.5} -{"user_id": 8, "item_id": 4, "ratings": 2.0} -{"user_id": 8, "item_id": 7, "ratings": 4.0} -{"user_id": 9, "item_id": 2, "ratings": 3.5} 
-{"user_id": 9, "item_id": 9, "ratings": 5.0} -{"user_id": 10, "item_id": 3, "ratings": 4.5} -{"user_id": 10, "item_id": 8, "ratings": 2.5} +{"user_id": 1, "item_id": 2, "rating": 4.0} +{"user_id": 1, "item_id": 5, "rating": 3.0} +{"user_id": 2, "item_id": 1, "rating": 5.0} +{"user_id": 2, "item_id": 3, "rating": 2.0} +{"user_id": 3, "item_id": 4, "rating": 4.5} +{"user_id": 3, "item_id": 7, "rating": 3.5} +{"user_id": 4, "item_id": 2, "rating": 1.0} +{"user_id": 4, "item_id": 8, "rating": 5.0} +{"user_id": 5, "item_id": 3, "rating": 4.0} +{"user_id": 5, "item_id": 9, "rating": 2.5} +{"user_id": 6, "item_id": 1, "rating": 3.0} +{"user_id": 6, "item_id": 6, "rating": 4.5} +{"user_id": 7, "item_id": 5, "rating": 5.0} +{"user_id": 7, "item_id": 10, "rating": 1.5} +{"user_id": 8, "item_id": 4, "rating": 2.0} +{"user_id": 8, "item_id": 7, "rating": 4.0} +{"user_id": 9, "item_id": 2, "rating": 3.5} +{"user_id": 9, "item_id": 9, "rating": 5.0} +{"user_id": 10, "item_id": 3, "rating": 4.5} +{"user_id": 10, "item_id": 8, "rating": 2.5} diff --git a/tests/data/ratings_schema.json b/tests/data/ratings_schema.json index 1867a8c801..9fd0101ec8 100644 --- a/tests/data/ratings_schema.json +++ b/tests/data/ratings_schema.json @@ -11,7 +11,7 @@ }, { "mode": "NULLABLE", - "name": "ratings", + "name": "rating", "type": "FLOAT" } ] diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index add05c9abe..d1a5f9f2aa 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -172,15 +172,12 @@ def test_decomposition_mf_configure_fit_load( num_factors=6, feedback_type="explicit", user_col="user_id", - item_col="item_col", - rating_col="ratings", + item_col="item_id", + rating_col="rating", l2_reg=9.83, ) - model.fit( - ratings_df_default_index.rename( - columns={"rating": "rating_col", "item_id": "item_col"} - ) - ) + + model.fit(ratings_df_default_index) reloaded_model = model.to_gbq( 
f"{dataset_id}.temp_configured_mf_model", replace=True @@ -191,16 +188,14 @@ def test_decomposition_mf_configure_fit_load( { "user_id": ["11", "12", "13"], "item_id": [1, 2, 3], - "ratings": [1.0, 2.0, 3.0], + "rating": [1.0, 2.0, 3.0], } ) ) reloaded_model.score(new_ratings) - result = reloaded_model.predict( - new_ratings.rename(columns={"item_id": "item_col"}) - ).to_pandas() + result = reloaded_model.predict(new_ratings).to_pandas() assert reloaded_model._bqml_model is not None assert ( @@ -211,6 +206,6 @@ def test_decomposition_mf_configure_fit_load( assert reloaded_model.feedback_type == "explicit" assert reloaded_model.num_factors == 6 assert reloaded_model.user_col == "user_id" - assert reloaded_model.item_col == "item_col" - assert reloaded_model.rating_col == "ratings" + assert reloaded_model.item_col == "item_id" + assert reloaded_model.rating_col == "rating" assert reloaded_model.l2_reg == 9.83 diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 03695a20e4..083dc25661 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -243,7 +243,7 @@ def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.RECOMMEND(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", - index_col=["index_column_id"], + trial_id=["index_column_id"], ) From b43912073353fbc4f6328584b758fa8f7d56325d Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 31 Mar 2025 20:44:56 +0000 Subject: [PATCH 71/75] E AssertionError: expected call not found. 
E Expected: read_gbq('SELECT * FROM ML.RECOMMEND(MODEL ..,\n (input_X_sql))', trial_id=['index_column_id']) E Actual: read_gbq('SELECT * FROM ML.RECOMMEND(MODEL ..,\n (input_X_sql))', index_col=['index_column_id']) --- tests/unit/ml/test_golden_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 083dc25661..03695a20e4 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -243,7 +243,7 @@ def test_decomposition_mf_predict(mock_session, bqml_model, mock_X): mock_session.read_gbq.assert_called_once_with( "SELECT * FROM ML.RECOMMEND(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", - trial_id=["index_column_id"], + index_col=["index_column_id"], ) From 8a614c5f42696e2e423378441a33fe8f217369d4 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 1 Apr 2025 15:58:45 +0000 Subject: [PATCH 72/75] same # of elements in each --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 41a2693a14..79b90bd13e 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,8 +26,8 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], - ... "column": [0, 1] * 7, - ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], + ... "column": [0, 1, 1, 1, 0, 3, 1, 0, 2, 1, 1, 0, 2, 1], + ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1, 2, 1], ... 
}) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) >>> W = model.fit(X) From c2b47950850a4bfc8da04754652a8bce24a62343 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 1 Apr 2025 18:26:54 +0000 Subject: [PATCH 73/75] attempt --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 79b90bd13e..9d6283e756 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,8 +26,8 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], - ... "column": [0, 1, 1, 1, 0, 3, 1, 0, 2, 1, 1, 0, 2, 1], - ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1, 2, 1], + ... "column": [[0, 1], [1, 1], [0, 3], [1, 0], [2, 1], [1, 0], [2, 1]], + ... "value": [[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1], [2, 1]], ... 
}) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) >>> W = model.fit(X) From cf6e5be73b64392f1b628b4799ac644946d0445c Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 1 Apr 2025 18:40:17 +0000 Subject: [PATCH 74/75] doc fix --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 9d6283e756..302cf018ee 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -26,8 +26,8 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> from bigframes.ml.decomposition import MatrixFactorization >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], - ... "column": [[0, 1], [1, 1], [0, 3], [1, 0], [2, 1], [1, 0], [2, 1]], - ... "value": [[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1], [2, 1]], + ... "column": [0,1] * 7, + ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], ... 
}) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) >>> W = model.fit(X) From da230b497268c43539109cae7d5e03d95307c870 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 1 Apr 2025 18:45:28 +0000 Subject: [PATCH 75/75] doc fix --- third_party/bigframes_vendored/sklearn/decomposition/_mf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py index 302cf018ee..fb29cc8984 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_mf.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_mf.py @@ -24,10 +24,11 @@ class MatrixFactorization(BaseEstimator, metaclass=ABCMeta): >>> import bigframes.pandas as bpd >>> from bigframes.ml.decomposition import MatrixFactorization + >>> bpd.options.display.progress_bar = None >>> X = bpd.DataFrame({ ... "row": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], ... "column": [0,1] * 7, - ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1], + ... "value": [1, 1, 2, 1, 3, 1.2, 4, 1, 5, 0.8, 6, 1, 2, 3], ... }) >>> model = MatrixFactorization(feedback_type='explicit', num_factors=6, user_col='row', item_col='column', rating_col='value', l2_reg=2.06) >>> W = model.fit(X)