From 47118da7ddbee1c4fe7b221513c56bfedee7e5c9 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Apr 2024 01:27:04 +0000 Subject: [PATCH 1/8] feat: Add quantile statistic --- bigframes/core/block_transforms.py | 32 +++++++++++++ bigframes/core/blocks.py | 11 +++-- bigframes/core/compile/aggregate_compiler.py | 8 ++++ bigframes/core/groupby/__init__.py | 46 ++++++++++++++++--- bigframes/dataframe.py | 17 +++++++ bigframes/operations/aggregations.py | 12 +++++ bigframes/series.py | 19 ++++++-- tests/system/small/test_dataframe.py | 30 +++++++++++- tests/system/small/test_groupby.py | 35 ++++++++++++++ tests/system/small/test_series.py | 21 +++++++++ .../ibis/backends/bigquery/registry.py | 8 ++++ .../bigframes_vendored/pandas/core/frame.py | 43 ++++++++++++++++- .../pandas/core/groupby/__init__.py | 28 +++++++++++ .../bigframes_vendored/pandas/core/series.py | 31 +++++++++++++ 14 files changed, 324 insertions(+), 17 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index c789b2a69c..61ffdf9a7d 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -15,6 +15,7 @@ import functools import typing +from typing import Sequence import pandas as pd @@ -105,6 +106,37 @@ def indicate_duplicates( ) +def quantile( + block: blocks.Block, + columns: Sequence[str], + qs: Sequence[float], + grouping_column_ids: Sequence[str] = (), +) -> blocks.Block: + # TODO: handle windowing and more interpolation methods + window = core.WindowSpec( + grouping_keys=tuple(grouping_column_ids), + ) + quantile_cols = [] + labels = [] + for col in columns: + for q in qs: + label = block.col_id_to_label[col] + new_label = (*label, q) if isinstance(label, tuple) else (label, q) + labels.append(new_label) + block, quantile_col = block.apply_window_op( + col, + agg_ops.QuantileOp(q), + window_spec=window, + ) + quantile_cols.append(quantile_col) + block, results = block.aggregate( + grouping_column_ids, + tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols), + dropna=True, + ) + return block.select_columns(results).with_column_labels(labels) + + def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: supported_methods = [ "linear", diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 5b411e5416..7ae513028e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1498,12 +1498,15 @@ def stack(self, how="left", levels: int = 1): row_label_tuples = utils.index_as_tuples(row_labels) - if col_labels is not None: + if col_labels is None: + result_index: pd.Index = pd.Index([None]) + result_col_labels: Sequence[Tuple] = list([()]) + elif all(col_labels.isna()): + result_index = pd.Index([None]) + result_col_labels = utils.index_as_tuples(col_labels.drop_duplicates()) + else: result_index = col_labels.drop_duplicates().dropna(how="all") result_col_labels = utils.index_as_tuples(result_index) - else: - result_index = pd.Index([None]) - result_col_labels = list([()]) # Get matching columns unpivot_columns: List[Tuple[str, List[str]]] = [] diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index ae21243506..98d296c779 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -148,6 +148,14 @@ def _( return cast(ibis_types.NumericValue, value) +@compile_unary_agg.register +@numeric_op +def _( + op: agg_ops.QuantileOp, column: ibis_types.NumericColumn, window=None +) -> ibis_types.NumericValue: + return _apply_window_if_present(column.quantile(op.q), window) + + @compile_unary_agg.register @numeric_op def _( diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index e2b28553c6..4efbdaea7e 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -15,6 +15,7 @@ from __future__ import annotations import typing +from typing import Sequence, Union import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby import pandas as pd @@ -115,14 +116,26 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame: def median( self, numeric_only: bool = False, *, exact: bool = False ) -> df.DataFrame: - if exact: - raise NotImplementedError( - f"Only approximate median is supported. {constants.FEEDBACK_LINK}" - ) if not numeric_only: self._raise_on_non_numeric("median") + if exact: + return self.quantile(0.5) return self._aggregate_all(agg_ops.median_op, numeric_only=True) + def quantile(self, q=0.5) -> df.DataFrame: + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + self._block, + self._selected_cols, + qs=tuple(q) if multi_q else (q,), + grouping_column_ids=self._by_col_ids, + ) + result_df = df.DataFrame(result) + if multi_q: + return result_df.stack() + else: + return result_df.droplevel(-1, 1) + def min(self, numeric_only: bool = False, *args) -> df.DataFrame: return self._aggregate_all(agg_ops.min_op, numeric_only=numeric_only) @@ -466,8 +479,29 @@ def sum(self, *args) -> series.Series: def mean(self, *args) -> series.Series: return self._aggregate(agg_ops.mean_op) - def median(self, *args, **kwargs) -> series.Series: - return self._aggregate(agg_ops.mean_op) + def median( + self, + *args, + exact: bool = False, + **kwargs, + ) -> series.Series: + if exact: + return self.quantile(0.5) + else: + return self._aggregate(agg_ops.median_op) + + def quantile(self, q: Union[float, Sequence[float]] = 0.5) -> series.Series: + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + self._block, + (self._value_column,), + qs=tuple(q) if multi_q else (q,), # type: ignore + grouping_column_ids=self._by_col_ids, + ) + if multi_q: + return series.Series(result.stack()) + else: + return series.Series(result.stack()).droplevel(-1) def std(self, *args, **kwargs) -> series.Series: return self._aggregate(agg_ops.std_op) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 32f5a36f79..ad21185c5a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1920,6 +1920,23 @@ def median( block = frame._block.aggregate_all_and_stack(agg_ops.median_op) return bigframes.series.Series(block.select_column("values")) + def quantile(self, q): + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + self._block, self._block.value_columns, qs=tuple(q) if multi_q else (q,) + ) + if multi_q: + return DataFrame(result.stack()).droplevel(0) + else: + result_df = ( + DataFrame(result) + .stack(list(range(0, self.columns.nlevels))) + .droplevel(0) + ) + result = bigframes.series.Series(result_df._block) + result.name = q + return result + def std( self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False ) -> bigframes.series.Series: diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 36fa787644..45782d007b 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -109,6 +109,18 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return input_types[0] +@dataclasses.dataclass(frozen=True) +class QuantileOp(UnaryAggregateOp): + q: float + + @property + def name(self): + return f"{int(self.q*100)}%" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) + + @dataclasses.dataclass(frozen=True) class ApproxQuartilesOp(UnaryAggregateOp): quartile: int diff --git a/bigframes/series.py b/bigframes/series.py index f11511f969..a4053cec10 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -22,7 +22,7 @@ import os import textwrap import typing -from typing import Any, Literal, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union import bigframes_vendored.pandas.core.series as vendored_pandas_series import google.cloud.bigquery as bigquery @@ -918,10 +918,19 @@ def mean(self) -> float: def median(self, *, exact: bool = False) -> float: if exact: - raise NotImplementedError( - f"Only approximate median is supported. {constants.FEEDBACK_LINK}" - ) - return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) + return self.quantile(0.5) + else: + return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) + + def quantile(self, q: float): + qs = tuple(q) if utils.is_list_like(q) else (q,) + result = block_ops.quantile(self._block, (self._value_column,), qs=qs) + if utils.is_list_like(q): + result = result.stack() + result = result.drop_levels([result.index_columns[0]]) + return Series(result) + else: + return cast(float, Series(result).to_pandas().squeeze()) def sum(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.sum_op)) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e70764fcc0..7fef7a9dc7 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2504,7 +2504,10 @@ def test_df_melt_default(scalars_dfs): # Pandas produces int64 index, Bigframes produces Int64 (nullable) pd.testing.assert_frame_equal( - bf_result, pd_result, check_index_type=False, check_dtype=False + bf_result, + pd_result, + check_index_type=False, + check_dtype=False, ) @@ -3029,6 +3032,31 @@ def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): ) +def test_dataframe_aggregates_quantile_mono(scalars_df_index, scalars_pandas_df_index): + q = 0.45 + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df_index): + q = [0, 0.33, 0.67, 1.0] + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + pd_result.index = pd_result.index.astype("Float64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("op"), [ diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index ba79ba1ab1..7b36a06f49 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -65,6 +65,24 @@ def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index): assert ((pd_min <= bf_result_computed) & (bf_result_computed <= pd_max)).all().all() +@pytest.mark.parametrize( + ("q"), + [ + ([0.2, 0.4, 0.6, 0.8]), + (0.11), + ], +) +def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): + col_names = ["int64_too", "float64_col", "int64_col", "string_col"] + bf_result = ( + scalars_df_index[col_names].groupby("string_col").quantile(q) + ).to_pandas() + pd_result = scalars_pandas_df_index[col_names].groupby("string_col").quantile(q) + pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("operator"), [ @@ -389,3 +407,20 @@ def test_dataframe_groupby_nonnumeric_with_mean(): pd.testing.assert_frame_equal( pd_result, bf_result, check_index_type=False, check_dtype=False ) + + +@pytest.mark.parametrize( + ("q"), + [ + ([0.2, 0.4, 0.6, 0.8]), + (0.11), + ], +) +def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): + bf_result = ( + scalars_df_index.groupby("string_col")["int64_col"].quantile(q) + ).to_pandas() + pd_result = scalars_pandas_df_index.groupby("string_col")["int64_col"].quantile(q) + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 6e4a87df4f..b733c69fd7 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1320,6 +1320,27 @@ def test_median(scalars_dfs): assert pd_min < bf_result < pd_max +def test_median_exact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].median(exact=True) + pd_result = scalars_pandas_df[col_name].median() + assert math.isclose(pd_result, bf_result) + + +def test_series_quantile(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name] + + pd_result = pd_series.quantile([0.0, 0.4, 0.6, 1.0]) + bf_result = bf_series.quantile([0.0, 0.4, 0.6, 1.0]) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + def test_numeric_literal(scalars_dfs): scalars_df, _ = scalars_dfs col_name = "numeric_col" diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index 88826b31ce..fddeab19a2 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -3,6 +3,7 @@ import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops from ibis.backends.bigquery.registry import OPERATION_REGISTRY +import ibis.expr.operations.reductions as ibis_reductions def _approx_quantiles(translator, op: vendored_ibis_ops.ApproximateMultiQuantile): @@ -31,12 +32,19 @@ def _generate_array(translator, op: vendored_ibis_ops.GenerateArray): return f"GENERATE_ARRAY(0, {arg})" +def _quantile(translator, op: ibis_reductions.Quantile): + arg = translator.translate(op.arg) + quantile = translator.translate(op.quantile) + return f"PERCENTILE_CONT({arg}, {quantile})" + + patched_ops = { vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore + ibis_reductions.Quantile: _quantile, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 1fc80449d1..7221245cb4 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3786,13 +3786,54 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Default False. Include only float, int, boolean columns. exact (bool. default False): Default False. Get the exact median instead of an approximate - one. Note: ``exact=True`` not yet supported. + one. Returns: bigframes.series.Series: Series with the median of values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile( + self, + q=0.5, + ): + """ + Return values at the given quantile over requested axis. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df.quantile(.1) + a 1.3 + b 3.7 + Name: 0.1, dtype: Float64 + >>> df.quantile([.1, .5]) + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + + [2 rows x 2 columns] + + Args: + q (float or array-like, default 0.5 (50% quantile)): + Value between 0 <= q <= 1, the quantile(s) to compute. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + Returns: + Series or DataFrame: + If ``q`` is an array, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. + If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def var(self, axis=0, *, numeric_only: bool = False): """Return unbiased variance over requested axis. diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index ed4ca66f38..9b65fe475c 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -85,6 +85,34 @@ def median( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile(self, q=0.5): + """ + Return group values at the given quantile, a la numpy.percentile. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([ + ... ['a', 1], ['a', 2], ['a', 3], + ... ['b', 1], ['b', 3], ['b', 5] + ... ], columns=['key', 'val']) + >>> df.groupby('key').quantile() + val + key + a 2.0 + b 3.0 + + [2 rows x 1 columns] + + Args: + q (float or array-like, default 0.5 (50% quantile)): + Value(s) between 0 and 1 providing the quantile(s) to compute. + + Returns: + Series or DataFrame: Return type determined by caller of GroupBy object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def std( self, *, diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 192e19fa5a..aec20a5c08 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2757,6 +2757,37 @@ def median(self, *, exact: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile( + self, + q=0.5, + ) -> float | Series: + """ + Return value at the given quantile. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4]) + >>> s.quantile(.5) + 2.5 + >>> s.quantile([.25, .5, .75]) + 0.25 1.75 + 0.5 2.5 + 0.75 3.25 + dtype: Float64 + + Args: + q (float or array-like, default 0.5 (50% quantile)): + The quantile(s) to compute, which can lie in range: 0 <= q <= 1. + + Returns: + float or Series: + If ``q`` is an array, a Series will be returned where the + index is ``q`` and the values are the quantiles, otherwise + a float will be returned. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def prod(self): """Return the product of the values over the requested axis. From 77a982db3290511ba6a9861dc07771b0346a1893 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 15 Apr 2024 18:42:36 +0000 Subject: [PATCH 2/8] fix series.quantile type annotation --- bigframes/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/series.py b/bigframes/series.py index c8a8db8b1a..9d30bdbb1e 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -972,7 +972,7 @@ def median(self, *, exact: bool = False) -> float: else: return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) - def quantile(self, q: float): + def quantile(self, q: float) -> Union[Series, float]: qs = tuple(q) if utils.is_list_like(q) else (q,) result = block_ops.quantile(self._block, (self._value_column,), qs=qs) if utils.is_list_like(q): From 856115e6d80a485413af025f12ef882c7fb0d425 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 15 Apr 2024 22:05:25 +0000 Subject: [PATCH 3/8] add numeric_only param --- bigframes/core/groupby/__init__.py | 19 +++++++++++++++---- bigframes/dataframe.py | 19 ++++++++++++++----- .../bigframes_vendored/pandas/core/frame.py | 9 ++------- .../pandas/core/groupby/__init__.py | 4 +++- .../bigframes_vendored/pandas/core/series.py | 2 +- 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 4efbdaea7e..0f53342352 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -122,12 +122,21 @@ def median( return self.quantile(0.5) return self._aggregate_all(agg_ops.median_op, numeric_only=True) - def quantile(self, q=0.5) -> df.DataFrame: + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ) -> df.DataFrame: + if not numeric_only: + self._raise_on_non_numeric("quantile") + q_cols = tuple( + col + for col in self._selected_cols + if self._column_type(col) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + ) multi_q = utils.is_list_like(q) result = block_ops.quantile( self._block, - self._selected_cols, - qs=tuple(q) if multi_q else (q,), + q_cols, + qs=tuple(q) if multi_q else (q,), # type: ignore grouping_column_ids=self._by_col_ids, ) result_df = df.DataFrame(result) @@ -490,7 +499,9 @@ def median( else: return self._aggregate(agg_ops.median_op) - def quantile(self, q: Union[float, Sequence[float]] = 0.5) -> series.Series: + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ) -> series.Series: multi_q = utils.is_list_like(q) result = block_ops.quantile( self._block, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7a51ef9874..8a338aae61 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2009,20 +2009,29 @@ def median( frame = self._raise_on_non_numeric("median") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_stack(agg_ops.median_op) - return bigframes.series.Series(block.select_column("values")) + if exact: + return self.quantile() + else: + block = frame._block.aggregate_all_and_stack(agg_ops.median_op) + return bigframes.series.Series(block.select_column("values")) - def quantile(self, q): + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ): + if not numeric_only: + frame = self._raise_on_non_numeric("median") + else: + frame = self._drop_non_numeric() multi_q = utils.is_list_like(q) result = block_ops.quantile( - self._block, self._block.value_columns, qs=tuple(q) if multi_q else (q,) + frame._block, frame._block.value_columns, qs=tuple(q) if multi_q else (q,) # type: ignore ) if multi_q: return DataFrame(result.stack()).droplevel(0) else: result_df = ( DataFrame(result) - .stack(list(range(0, self.columns.nlevels))) + .stack(list(range(0, frame.columns.nlevels))) .droplevel(0) ) result = bigframes.series.Series(result_df._block) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 4a030d0841..0791021bc2 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4516,10 +4516,7 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def quantile( - self, - q=0.5, - ): + def quantile(self, q=0.5, *, numeric_only: bool = False): """ Return values at the given quantile over requested axis. @@ -4542,9 +4539,7 @@ def quantile( Args: q (float or array-like, default 0.5 (50% quantile)): Value between 0 <= q <= 1, the quantile(s) to compute. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - numeric_only : bool, default False + numeric_only (bool, default False): Include only `float`, `int` or `boolean` data. Returns: diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 9b65fe475c..6310d7e271 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -85,7 +85,7 @@ def median( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def quantile(self, q=0.5): + def quantile(self, q=0.5, *, numeric_only: bool = False): """ Return group values at the given quantile, a la numpy.percentile. @@ -107,6 +107,8 @@ def quantile(self, q=0.5): Args: q (float or array-like, default 0.5 (50% quantile)): Value(s) between 0 and 1 providing the quantile(s) to compute. + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. Returns: Series or DataFrame: Return type determined by caller of GroupBy object. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 919bd5986d..30341ed680 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3154,7 +3154,7 @@ def median(self, *, exact: bool = False): def quantile( self, q=0.5, - ) -> float | Series: + ): """ Return value at the given quantile. From 9983ff0e93c4c8582da13e9118fea1e79546bf22 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 15 Apr 2024 23:16:00 +0000 Subject: [PATCH 4/8] add limit and fix types --- bigframes/constants.py | 3 +++ bigframes/core/block_transforms.py | 2 ++ bigframes/dataframe.py | 6 +++--- bigframes/series.py | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/bigframes/constants.py b/bigframes/constants.py index 0751501085..c6d8f3acc2 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -92,3 +92,6 @@ LEP_ENABLED_BIGQUERY_LOCATIONS = frozenset( ALL_BIGQUERY_LOCATIONS - REP_ENABLED_BIGQUERY_LOCATIONS ) + +# BigQuery default is 10000, leave 100 for overhead +MAX_COLUMNS = 9900 diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 61ffdf9a7d..1eae73014c 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -118,6 +118,8 @@ def quantile( ) quantile_cols = [] labels = [] + if len(columns) * len(qs) > constants.MAX_COLUMNS: + raise NotImplementedError("Too many aggregates requested.") for col in columns: for q in qs: label = block.col_id_to_label[col] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 8a338aae61..953a89c34f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2034,9 +2034,9 @@ def quantile( .stack(list(range(0, frame.columns.nlevels))) .droplevel(0) ) - result = bigframes.series.Series(result_df._block) - result.name = q - return result + result_series = bigframes.series.Series(result_df._block) + result_series.name = q + return result_series def std( self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False diff --git a/bigframes/series.py b/bigframes/series.py index 9d30bdbb1e..febfdc3c6a 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -972,7 +972,7 @@ def median(self, *, exact: bool = False) -> float: else: return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) - def quantile(self, q: float) -> Union[Series, float]: + def quantile(self, q: Union[float, Sequence[float]]) -> Union[Series, float]: qs = tuple(q) if utils.is_list_like(q) else (q,) result = block_ops.quantile(self._block, (self._value_column,), qs=qs) if utils.is_list_like(q): From 04dcb9877cf0281b4056fba9fad41874fea78e8f Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Apr 2024 16:21:47 +0000 Subject: [PATCH 5/8] fix mypy --- bigframes/series.py | 2 +- .../bigframes_vendored/pandas/core/frame.py | 4 +++- .../bigframes_vendored/pandas/core/series.py | 15 ++++++++++++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index febfdc3c6a..0a8e398e08 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -972,7 +972,7 @@ def median(self, *, exact: bool = False) -> float: else: return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) - def quantile(self, q: Union[float, Sequence[float]]) -> Union[Series, float]: + def quantile(self, q: Union[float, Sequence[float]] = 0.5) -> Union[Series, float]: qs = tuple(q) if utils.is_list_like(q) else (q,) result = block_ops.quantile(self._block, (self._value_column,), qs=qs) if utils.is_list_like(q): diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 0791021bc2..e894900646 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4516,7 +4516,9 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def quantile(self, q=0.5, *, numeric_only: bool = False): + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ): """ Return values at the given quantile over requested axis. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 30341ed680..5e3b4c46ef 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3,7 +3,16 @@ """ from __future__ import annotations -from typing import Hashable, IO, Literal, Mapping, Optional, Sequence, TYPE_CHECKING +from typing import ( + Hashable, + IO, + Literal, + Mapping, + Optional, + Sequence, + TYPE_CHECKING, + Union, +) from bigframes_vendored.pandas.core.generic import NDFrame import numpy @@ -3153,8 +3162,8 @@ def median(self, *, exact: bool = False): def quantile( self, - q=0.5, - ): + q: Union[float, Sequence[float]] = 0.5, + ) -> Union[Series, float]: """ Return value at the given quantile. From d0b3fc38770865e10306e81f8d723ca487413780 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Apr 2024 17:34:51 +0000 Subject: [PATCH 6/8] actually fix mypy --- bigframes/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/series.py b/bigframes/series.py index 0a8e398e08..b834411bce 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -968,7 +968,7 @@ def mean(self) -> float: def median(self, *, exact: bool = False) -> float: if exact: - return self.quantile(0.5) + return typing.cast(float, self.quantile(0.5)) else: return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) From b76ad9bd75efbf072eb30b65c77016339eefc753 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Apr 2024 20:10:39 +0000 Subject: [PATCH 7/8] fix issue with multiindex isna not impl --- bigframes/core/blocks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 7ae513028e..f6850020df 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1501,7 +1501,9 @@ def stack(self, how="left", levels: int = 1): if col_labels is None: result_index: pd.Index = pd.Index([None]) result_col_labels: Sequence[Tuple] = list([()]) - elif all(col_labels.isna()): + elif (col_labels.nlevels == 1) and all( + col_labels.isna() + ): # isna not implemented for MultiIndex for newer pandas versions result_index = pd.Index([None]) result_col_labels = utils.index_as_tuples(col_labels.drop_duplicates()) else: From 677aaf2afcdf6277613ac446e1d9c0d780ec9e81 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Apr 2024 22:23:30 +0000 Subject: [PATCH 8/8] fix plot accessor doctests printing progress bar --- third_party/bigframes_vendored/pandas/plotting/_core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index 19f56965df..bf016357a6 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -11,6 +11,7 @@ class PlotAccessor: For Series: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") @@ -57,6 +58,7 @@ def hist( >>> import bigframes.pandas as bpd >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) @@ -93,6 +95,7 @@ def line( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 'one': [1, 2, 3, 4], @@ -160,6 +163,7 @@ def area( Draw an area plot based on basic business metrics: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 'sales': [3, 2, 3, 9, 10, 6],