From e971fdfdd2e021dd3c8582202cf588ba199dc0ac Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Jul 2024 01:44:21 +0000 Subject: [PATCH 1/2] fix: Fix 'sql' property for null index --- bigframes/dataframe.py | 4 +++- bigframes/session/__init__.py | 2 ++ bigframes/session/_io/bigquery/read_gbq_table.py | 4 ++++ tests/system/small/test_unordered.py | 9 +++++++++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 4dcc4414ed..1e23497a5c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -385,7 +385,9 @@ def _to_sql_query( @property def sql(self) -> str: """Compiles this DataFrame's expression tree to SQL.""" - include_index = self.index.name is not None or len(self.index.names) > 1 + include_index = self._has_index and ( + self.index.name is not None or len(self.index.names) > 1 + ) sql, _, _ = self._to_sql_query(include_index=include_index) return sql diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 10c0797873..cc788f9002 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -884,6 +884,8 @@ def _read_gbq_table( table=table, index_cols=index_cols, api_name=api_name, + # If not in strict ordering mode, don't go through overhead of scanning index column(s) to determine if unique + metadata_only=not self._strictly_ordered, ) schema = schemata.ArraySchema.from_bq_table(table) if columns: diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 879a8ba44c..03b26f9460 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -152,6 +152,7 @@ def are_index_cols_unique( table: bigquery.table.Table, index_cols: List[str], api_name: str, + metadata_only: bool = False, ) -> bool: if len(index_cols) == 0: return False @@ -161,6 +162,9 @@ def are_index_cols_unique( if (len(primary_keys) > 0) and primary_keys 
<= frozenset(index_cols): return True + if metadata_only: + # Sometimes not worth scanning data to check uniqueness + return False # TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring # table_expression only selects just index_cols. is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 4448ddc838..3f7e09fdd6 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -20,6 +20,15 @@ from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +def test_unordered_mode_sql_no_hash(unordered_session): + bf_df = unordered_session.read_gbq( + "bigquery-public-data.ethereum_blockchain.blocks" + ) + sql = bf_df.sql + assert "ORDER BY" not in sql.upper() + assert "farm_fingerprint" not in sql + + def test_unordered_mode_cache_aggregate(unordered_session): pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) df = bpd.DataFrame(pd_df, session=unordered_session) From a85c3193a85172e8d7c52d2012f4810d80f9d321 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 23 Jul 2024 17:57:33 +0000 Subject: [PATCH 2/2] make sql test case normalized --- tests/system/small/test_unordered.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 940e28a149..6d9171aeed 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -25,8 +25,8 @@ def test_unordered_mode_sql_no_hash(unordered_session): "bigquery-public-data.ethereum_blockchain.blocks" ) sql = bf_df.sql - assert "ORDER BY" not in sql.upper() - assert "farm_fingerprint" not in sql + assert "ORDER BY".casefold() not in sql.casefold() + assert "farm_fingerprint".casefold() not in sql.casefold() def test_unordered_mode_job_label(unordered_session):