diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7a76d8f9..58f7ecaa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## [0.12.2-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.12.1-alpha...v0.12.2-alpha) (2024-02-02)
+
+
+### Bug Fixes
+
+* Reduce API polling for `Document.from_batch_process_operation()` ([#249](https://github.com/googleapis/python-documentai-toolbox/issues/249)) ([0677299](https://github.com/googleapis/python-documentai-toolbox/commit/0677299e6cb07e812b462c36775117956ad6256c))
+
 ## [0.12.1-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.12.0-alpha...v0.12.1-alpha) (2024-02-02)
diff --git a/google/cloud/documentai_toolbox/converters/converter.py b/google/cloud/documentai_toolbox/converters/converter.py
index ba3c9233..ffc7c138 100644
--- a/google/cloud/documentai_toolbox/converters/converter.py
+++ b/google/cloud/documentai_toolbox/converters/converter.py
@@ -64,7 +64,8 @@ def _get_base_ocr(
     client = documentai.DocumentProcessorServiceClient(
         client_options=ClientOptions(
             api_endpoint=f"{location}-documentai.googleapis.com"
-        )
+        ),
+        client_info=gcs_utilities._get_client_info(),
     )
 
     name = (
diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py
index 6952c7e2..1502adbf 100644
--- a/google/cloud/documentai_toolbox/version.py
+++ b/google/cloud/documentai_toolbox/version.py
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-__version__ = "0.12.1-alpha"
+__version__ = "0.12.2-alpha"
diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py
index 3de68230..6df97312 100644
--- a/google/cloud/documentai_toolbox/wrappers/document.py
+++ b/google/cloud/documentai_toolbox/wrappers/document.py
@@ -22,9 +22,9 @@
 import re
 from typing import Dict, List, Optional, Type, Union
 
-from google.api_core.client_options import ClientOptions
+from google.api_core.operation import from_gapic as operation_from_gapic
 from google.cloud.vision import AnnotateFileResponse
-from google.longrunning.operations_pb2 import GetOperationRequest, Operation
+from google.longrunning.operations_pb2 import GetOperationRequest
 
 from jinja2 import Environment, PackageLoader
 from pikepdf import Pdf
@@ -137,48 +137,57 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume
 
 
 def _get_batch_process_metadata(
-    location: str, operation_name: str
+    operation_name: str,
+    timeout: Optional[float] = None,
 ) -> documentai.BatchProcessMetadata:
     r"""Get `BatchProcessMetadata` from a `batch_process_documents()` long-running operation.
 
     Args:
-        location (str):
-            Required. The location of the processor used for `batch_process_documents()`.
-
         operation_name (str):
             Required. The fully qualified operation name for a `batch_process_documents()` operation.
+
+        timeout (float):
+            Optional. Default None. Time in seconds to wait for operation to complete.
+            If None, will wait indefinitely.
     Returns:
         documentai.BatchProcessMetadata:
             Metadata from batch process.
     """
     client = documentai.DocumentProcessorServiceClient(
-        client_options=ClientOptions(
-            api_endpoint=f"{location}-documentai.googleapis.com"
-        )
+        client_info=gcs_utilities._get_client_info(module="get_batch_process_metadata"),
     )
 
-    while True:
-        operation: Operation = client.get_operation(
-            request=GetOperationRequest(name=operation_name)
-        )
+    # Poll Operation until complete.
+    operation = operation_from_gapic(
+        operation=client.get_operation(
+            request=GetOperationRequest(name=operation_name),
+        ),
+        operations_client=client,
+        result_type=documentai.BatchProcessResponse,
+        metadata_type=documentai.BatchProcessMetadata,
+    )
+    operation.result(timeout=timeout)
 
-        if operation.done:
-            break
+    operation_pb = operation.operation
 
-    if not operation.metadata:
+    # Get Operation metadata.
+    if not operation_pb.metadata:
         raise ValueError(f"Operation does not contain metadata: {operation}")
 
     metadata_type = (
         "type.googleapis.com/google.cloud.documentai.v1.BatchProcessMetadata"
     )
 
-    if not operation.metadata.type_url or operation.metadata.type_url != metadata_type:
+    if (
+        not operation_pb.metadata.type_url
+        or operation_pb.metadata.type_url != metadata_type
+    ):
         raise ValueError(
-            f"Operation metadata type is not `{metadata_type}`. Type is `{operation.metadata.type_url}`."
+            f"Operation metadata type is not `{metadata_type}`. Type is `{operation_pb.metadata.type_url}`."
         )
 
     metadata: documentai.BatchProcessMetadata = (
-        documentai.BatchProcessMetadata.deserialize(operation.metadata.value)
+        documentai.BatchProcessMetadata.deserialize(operation_pb.metadata.value)
     )
 
     return metadata
@@ -518,7 +527,10 @@ def from_batch_process_metadata(
 
     @classmethod
     def from_batch_process_operation(
-        cls: Type["Document"], location: str, operation_name: str
+        cls: Type["Document"],
+        location: str,  # pylint: disable=unused-argument
+        operation_name: str,
+        timeout: Optional[float] = None,
     ) -> List["Document"]:
         r"""Loads Documents from Cloud Storage, using the operation name returned from `batch_process_documents()`.
 
@@ -533,19 +545,26 @@ def from_batch_process_operation(
 
         Args:
             location (str):
-                Required. The location of the processor used for `batch_process_documents()`.
+                Optional. The location of the processor used for `batch_process_documents()`.
+                Deprecated. Maintained for backwards compatibility.
             operation_name (str):
                 Required. The fully qualified operation name for a `batch_process_documents()` operation.
 
                 Format: `projects/{project}/locations/{location}/operations/{operation}`
+
+            timeout (float):
+                Optional. Default None. Time in seconds to wait for operation to complete.
+                If None, will wait indefinitely.
+
         Returns:
             List[Document]:
                 A list of wrapped documents from gcs. Each document corresponds to an input file.
""" return cls.from_batch_process_metadata( metadata=_get_batch_process_metadata( - location=location, operation_name=operation_name + operation_name=operation_name, + timeout=timeout, ) ) diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index c7c80a8d..31ac799e 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -226,9 +226,9 @@ def test_get_batch_process_metadata_with_valid_operation( mock_client.get_operation.return_value = mock_operation - location = "us" operation_name = "projects/123456/locations/us/operations/7890123" - document._get_batch_process_metadata(location, operation_name) + timeout = 1 + document._get_batch_process_metadata(operation_name, timeout=timeout) mock_client.get_operation.assert_called() mock_docai.BatchProcessMetadata.deserialize.assert_called() @@ -264,9 +264,8 @@ def test_get_batch_process_metadata_with_running_operation( mock_operation_finished, ] - location = "us" operation_name = "projects/123456/locations/us/operations/7890123" - document._get_batch_process_metadata(location, operation_name) + document._get_batch_process_metadata(operation_name) mock_client.get_operation.assert_called() mock_docai.BatchProcessMetadata.deserialize.assert_called() @@ -280,12 +279,11 @@ def test_get_batch_process_metadata_with_no_metadata(mock_docai): ): mock_client = mock_docai.DocumentProcessorServiceClient.return_value - location = "us" operation_name = "projects/123456/locations/us/operations/7890123" mock_operation = mock.Mock(done=True, metadata=None) mock_client.get_operation.return_value = mock_operation - document._get_batch_process_metadata(location, operation_name) + document._get_batch_process_metadata(operation_name) @mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai") @@ -296,7 +294,6 @@ def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai): ): mock_client = mock_docai.DocumentProcessorServiceClient.return_value - location = "us" operation_name = "projects/123456/locations/us/operations/7890123" mock_operation = mock.Mock( done=True, @@ -306,7 +303,7 @@ def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai): ) mock_client.get_operation.return_value = mock_operation - document._get_batch_process_metadata(location, operation_name) + document._get_batch_process_metadata(operation_name) def test_bigquery_column_name():