From 5924338ff275f08baad0f95e6107b74f440e7007 Mon Sep 17 00:00:00 2001
From: Huan Chen
Date: Mon, 14 Apr 2025 23:43:45 +0000
Subject: [PATCH] feat: detect duplicate column/index names in read_gbq before
 send query.

---
 bigframes/session/loader.py        | 38 +++++++++++++++++++++++++
 tests/system/small/test_session.py | 45 ++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)

diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py
index 1b6e096ed0..2fa6e64baa 100644
--- a/bigframes/session/loader.py
+++ b/bigframes/session/loader.py
@@ -89,6 +89,42 @@ def _to_index_cols(
     return index_cols
 
 
+def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]) -> None:
+    """Reject duplicate names within or across 'index_col' and 'columns'.
+
+    Args:
+        index_cols: Names of the index columns. None is treated as empty.
+        columns: Names of the data columns. None is treated as empty.
+
+    Raises:
+        ValueError: If a name is repeated in ``index_cols``, repeated in
+            ``columns``, or present in both arguments.
+    """
+    index_cols_list = list(index_cols) if index_cols is not None else []
+    columns_list = list(columns) if columns is not None else []
+    set_index = set(index_cols_list)
+    set_columns = set(columns_list)
+
+    # A set drops repeats, so a set shorter than its list means duplicates.
+    if len(index_cols_list) > len(set_index):
+        raise ValueError(
+            "The 'index_col' argument contains duplicate names. "
+            "All column names specified in 'index_col' must be unique."
+        )
+
+    if len(columns_list) > len(set_columns):
+        raise ValueError(
+            "The 'columns' argument contains duplicate names. "
+            "All column names specified in 'columns' must be unique."
+        )
+
+    if not set_index.isdisjoint(set_columns):
+        raise ValueError(
+            "Found column names that exist in both 'index_col' and 'columns' arguments. "
+            "These arguments must specify distinct sets of columns."
+        )
+
+
 @dataclasses.dataclass
 class GbqDataLoader:
     """
@@ -328,6 +364,7 @@ def read_gbq_table(
             table=table,
             index_col=index_col,
         )
+        _check_column_duplicates(index_cols, columns)
 
         for key in index_cols:
             if key not in table_column_names:
@@ -569,6 +606,7 @@ def read_gbq_query(
         )
 
         index_cols = _to_index_cols(index_col)
+        _check_column_duplicates(index_cols, columns)
 
         filters_copy1, filters_copy2 = itertools.tee(filters)
         has_filters = len(list(filters_copy1)) != 0
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index 00cfe539b4..c969c4a588 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -1627,3 +1627,48 @@ def test_read_gbq_test(test_session: bigframes.Session):
     actual = test_session.read_gbq(table_id).to_pandas()
 
     assert actual.shape == (1, 1)
+
+
+@pytest.mark.parametrize(
+    ("query_or_table", "index_col", "columns", "error_match"),
+    [
+        pytest.param(
+            "{scalars_table_id}",
+            ("int64_col", "string_col", "int64_col"),
+            ("float64_col", "bool_col"),
+            "'index_col' argument contains duplicate names",
+            id="table_input_index_col_dup",
+        ),
+        pytest.param(
+            """SELECT int64_col, string_col, float64_col, bool_col
+               FROM `{scalars_table_id}`""",
+            ("int64_col",),
+            ("string_col", "float64_col", "string_col"),
+            "'columns' argument contains duplicate names",
+            id="query_input_columns_dup",
+        ),
+        pytest.param(
+            "{scalars_table_id}",
+            ("int64_col", "string_col"),
+            ("float64_col", "string_col", "bool_col"),
+            "exist in both 'index_col' and 'columns'",
+            id="table_input_cross_dup",
+        ),
+    ],
+)
+def test_read_gbq_duplicate_columns_xfail(
+    session: bigframes.Session,
+    scalars_table_id: str,
+    query_or_table: str,
+    index_col: tuple,
+    columns: tuple,
+    error_match: str,
+):
+    # Assert the specific ValueError from the duplicate check rather than
+    # marking the case xfail, so a passing run is reported as a pass.
+    with pytest.raises(ValueError, match=error_match):
+        session.read_gbq(
+            query_or_table.format(scalars_table_id=scalars_table_id),
+            index_col=index_col,
+            columns=columns,
+        )