From c6745da99567d3f68fa4f9db3bf4d66890eb2b11 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 29 Aug 2024 19:11:35 +0000 Subject: [PATCH 1/4] add tests --- tests/system/small/test_session.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 5b5db74ea6..6f3d464e53 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1036,6 +1036,26 @@ def test_read_csv_local_w_usecols(session, scalars_pandas_df_index, engine): assert len(df.columns) == 1 +@pytest.mark.parametrize( + "engine", + [ + pytest.param( + "bigquery", + id="bq_engine", + marks=pytest.mark.xfail( + raises=NotImplementedError, + ), + ), + pytest.param(None, id="default_engine"), + ], +) +def test_read_csv_others(session, scalars_pandas_df_index, engine): + # TODO: Update to `main` branch after merging https://github.com/googleapis/python-bigquery-dataframes/pull/938 + uri = "https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/main_chelsealin_addcsvfile/tests/data/people.csv" + df = session.read_csv(uri, engine=engine) + assert len(df.columns) == 3 + + @pytest.mark.parametrize( "engine", [ From 073c615692de979f2b939008d130ca41659f2b20 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 29 Aug 2024 19:32:08 +0000 Subject: [PATCH 2/4] feat: enable read_csv() to process other files --- bigframes/session/__init__.py | 6 ++++-- bigframes/session/loader.py | 8 +++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c91266b875..5c85552bba 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1008,10 +1008,12 @@ def _check_file_size(self, filepath: str): blob = bucket.blob(blob_name) blob.reload() file_size = blob.size - else: # local file path + elif os.path.exists(filepath): # local file path file_size = os.path.getsize(filepath) + else: + file_size = None - if file_size > max_size: + if file_size is not None and file_size > max_size: # Convert to GB file_size = round(file_size / (1024**3), 1) max_size = int(max_size / 1024**3) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index edfd57b965..924fddce12 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -18,6 +18,7 @@ import dataclasses import datetime import itertools +import os import typing from typing import Dict, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union @@ -421,11 +422,16 @@ def _read_bigquery_load_job( load_job = self._bqclient.load_table_from_uri( filepath_or_buffer, table, job_config=job_config ) - else: + elif os.path.exists(filepath_or_buffer): # local file path with open(filepath_or_buffer, "rb") as source_file: load_job = self._bqclient.load_table_from_file( source_file, table, job_config=job_config ) + else: + raise NotImplementedError( + f"BigQuery engine only supports a local file path or GCS path. " + f"{constants.FEEDBACK_LINK}" + ) else: load_job = self._bqclient.load_table_from_file( filepath_or_buffer, table, job_config=job_config From fa68d2d9e480d213c38b0984714abaeadf35545e Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Sun, 1 Sep 2024 01:17:36 +0000 Subject: [PATCH 3/4] update to main --- tests/system/small/test_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 6f3d464e53..3dce0963cb 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1051,7 +1051,7 @@ def test_read_csv_local_w_usecols(session, scalars_pandas_df_index, engine): ) def test_read_csv_others(session, scalars_pandas_df_index, engine): # TODO: Update to `main` branch after merging https://github.com/googleapis/python-bigquery-dataframes/pull/938 - uri = "https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/main_chelsealin_addcsvfile/tests/data/people.csv" + uri = "https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/main/tests/data/people.csv" df = session.read_csv(uri, engine=engine) assert len(df.columns) == 3 From 1726161bfdb5d68fc79e74fddcee1c94c636c075 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 3 Sep 2024 18:26:47 +0000 Subject: [PATCH 4/4] add docs --- tests/system/small/test_session.py | 3 +-- third_party/bigframes_vendored/pandas/io/parsers/readers.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 3dce0963cb..ed3e38e6f8 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1049,8 +1049,7 @@ def test_read_csv_local_w_usecols(session, scalars_pandas_df_index, engine): pytest.param(None, id="default_engine"), ], ) -def test_read_csv_others(session, scalars_pandas_df_index, engine): - # TODO: Update to `main` branch after merging https://github.com/googleapis/python-bigquery-dataframes/pull/938 +def test_read_csv_others(session, engine): uri = "https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/main/tests/data/people.csv" df = session.read_csv(uri, engine=engine) assert len(df.columns) == 3 diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index 248cf8e0fe..35b2a1982a 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -51,8 +51,7 @@ def read_csv( encoding: Optional[str] = None, **kwargs, ): - """Loads DataFrame from comma-separated values (csv) file locally or from - Cloud Storage. + """Loads data from a comma-separated values (csv) file into a DataFrame. The CSV file data will be persisted as a temporary BigQuery table, which can be automatically recycled after the Session is closed. @@ -60,7 +59,8 @@ def read_csv( .. note:: using `engine="bigquery"` will not guarantee the same ordering as the file. Instead, set a serialized index column as the index and sort by - that in the resulting DataFrame. + that in the resulting DataFrame. Only files stored on your local machine + or in Google Cloud Storage are supported. .. note:: For non-bigquery engine, data is inlined in the query SQL if it is