From b0e200d71b5bf765aa00d9ccff7fe87dbfee0d37 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Wed, 2 Jul 2025 09:01:46 -0700 Subject: [PATCH] chore: update instructions and remove unnecessary code --- .env | 2 -- README.md | 12 +++--------- quickstart.py | 31 ++++++++++++++++++++++--------- 3 files changed, 25 insertions(+), 20 deletions(-) delete mode 100644 .env diff --git a/.env b/.env deleted file mode 100644 index 335f306..0000000 --- a/.env +++ /dev/null @@ -1,2 +0,0 @@ -# Postgres database address for cocoindex -COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex diff --git a/README.md b/README.md index 02fa1b0..7ceaf22 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,9 @@ Video tutorial with detailed explanation: [Cocoindex Quickstart Video Guide](htt ## Prerequisite - [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. -- Install CocoIndex +- Install CocoIndex and other dependencies: ```bash -pip install -U cocoindex +pip install -U "cocoindex[embeddings]" "psycopg[binary,pool]" pgvector ``` - Make sure you have specify the database URL by environment variable: @@ -25,16 +25,10 @@ export COCOINDEX_DATABASE_URL="postgresql://cocoindex:cocoindex@localhost:5432/c ## Run -Setup index: - -```bash -cocoindex setup quickstart.py -``` - Update index: ```bash -cocoindex update quickstart.py +cocoindex update --setup quickstart.py ``` Run query: diff --git a/quickstart.py b/quickstart.py index 579082c..302b84f 100644 --- a/quickstart.py +++ b/quickstart.py @@ -1,8 +1,8 @@ import cocoindex -from dotenv import load_dotenv from psycopg_pool import ConnectionPool import os + @cocoindex.transform_flow() def text_to_embedding( text: cocoindex.DataSlice[str], @@ -17,11 +17,15 @@ def text_to_embedding( ) ) -@cocoindex.flow_def(name="TextEmbedding") -def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope): + +@cocoindex.flow_def(name="TextEmbeddingQuickStart") +def text_embedding_flow( + flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope +): # Add a data source to read files from a directory data_scope["documents"] = flow_builder.add_source( - cocoindex.sources.LocalFile(path="markdown_files")) + cocoindex.sources.LocalFile(path="markdown_files") + ) # Add a collector for data to be exported to the vector index doc_embeddings = data_scope.add_collector() @@ -31,7 +35,10 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind # Split the document into chunks, put into `chunks` field doc["chunks"] = doc["content"].transform( cocoindex.functions.SplitRecursively(), - language="javascript", chunk_size=300, chunk_overlap=100) + language="javascript", + chunk_size=300, + chunk_overlap=100, + ) # Transform data of each chunk with doc["chunks"].row() as chunk: @@ -39,8 +46,12 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind chunk["embedding"] = text_to_embedding(chunk["text"]) # Collect the chunk into the collector. - doc_embeddings.collect(filename=doc["filename"], location=chunk["location"], - text=chunk["text"], embedding=chunk["embedding"]) + doc_embeddings.collect( + filename=doc["filename"], + location=chunk["location"], + text=chunk["text"], + embedding=chunk["embedding"], + ) # Export collected data to a vector index. doc_embeddings.export( @@ -55,6 +66,7 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind ], ) + def search(pool: ConnectionPool, query: str, top_k: int = 5): # Get the table name, for the export target in the text_embedding_flow above. table_name = cocoindex.utils.get_target_storage_default_name( @@ -77,6 +89,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5): for row in cur.fetchall() ] + def _main(): # Initialize the database connection pool. pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL")) @@ -94,7 +107,7 @@ def _main(): print("---") print() + if __name__ == "__main__": - load_dotenv() cocoindex.init() - _main() \ No newline at end of file + _main()