From 0fcef54c480b5f15106dcde59c90fd68275b56e4 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Fri, 29 Aug 2025 15:12:45 -0700 Subject: [PATCH 1/2] simplify get started --- .env | 6 +++ README.md | 18 ++++---- main.py | 53 +++++++++++++++++++++++ quickstart.py | 113 -------------------------------------------------- 4 files changed, 68 insertions(+), 122 deletions(-) create mode 100644 .env create mode 100644 main.py delete mode 100644 quickstart.py diff --git a/.env b/.env new file mode 100644 index 0000000..335feb6 --- /dev/null +++ b/.env @@ -0,0 +1,6 @@ +# Postgres database address for cocoindex +COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex + +# Fallback to CPU for operations not supported by MPS on Mac. +# It's no-op for other platforms. +PYTORCH_ENABLE_MPS_FALLBACK=1 diff --git a/README.md b/README.md index 7ceaf22..05862a3 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Quickstart demo following the [Cocoindex Quickstart](https://cocoindex.io/docs/quickstart) guide. Super easy to get your RAG data pipeline running in ~50 lines of python 🚀. -⭐ Please give [Cocoindex on Github](https://github.com/cocoindex-io/cocoindex) a star to support us if you like our work. Thank you so much with a warm coconut hug 🥥🤗. [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) +⭐ Star [Cocoindex on Github](https://github.com/cocoindex-io/cocoindex) if you like it! [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) Video tutorial with detailed explanation: [Cocoindex Quickstart Video Guide](https://www.youtube.com/watch?v=dQw4w9WgXcQ) @@ -15,12 +15,7 @@ Video tutorial with detailed explanation: [Cocoindex Quickstart Video Guide](htt - Install CocoIndex and other dependencies: ```bash -pip install -U "cocoindex[embeddings]" "psycopg[binary,pool]" pgvector -``` - -- Make sure you have specify the database URL by environment variable: -``` -export COCOINDEX_DATABASE_URL="postgresql://cocoindex:cocoindex@localhost:5432/cocoindex" +pip install -U "cocoindex[embeddings]" ``` ## Run @@ -28,11 +23,16 @@ export COCOINDEX_DATABASE_URL="postgresql://cocoindex:cocoindex@localhost:5432/c Update index: ```bash -cocoindex update --setup quickstart.py +cocoindex update --setup main.py ``` Run query: ```bash -python quickstart.py +python main.py +``` + +## Run with CocoInsight +```bash +cocoindex server -ci main.py ``` \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..90f3651 --- /dev/null +++ b/main.py @@ -0,0 +1,53 @@ +import cocoindex + +@cocoindex.flow_def(name="TextEmbeddingQuickStart") +def text_embedding_flow( + flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope +): + # Add a data source to read files from a directory + data_scope["documents"] = flow_builder.add_source( + cocoindex.sources.LocalFile(path="markdown_files") + ) + + # Add a collector for data to be exported to the vector index + doc_embeddings = data_scope.add_collector() + + # Transform data of each document + with data_scope["documents"].row() as doc: + # Split the document into chunks, put into `chunks` field + doc["chunks"] = doc["content"].transform( + cocoindex.functions.SplitRecursively(), + language="javascript", + chunk_size=300, + chunk_overlap=100, + ) + + # Transform data of each chunk + with doc["chunks"].row() as chunk: + # Embed the chunk, put into `embedding` field (inlined transform) + chunk["embedding"] = chunk["text"].transform( + cocoindex.functions.SentenceTransformerEmbed( + model="sentence-transformers/all-MiniLM-L6-v2" + ) + ) + + # Collect the chunk into the collector. + doc_embeddings.collect( + filename=doc["filename"], + location=chunk["location"], + text=chunk["text"], + embedding=chunk["embedding"], + ) + + # Export collected data to a vector index. + doc_embeddings.export( + "doc_embeddings", + cocoindex.storages.Postgres(), + primary_key_fields=["filename", "location"], + vector_indexes=[ + cocoindex.VectorIndexDef( + field_name="embedding", + metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY, + ) + ], + ) diff --git a/quickstart.py b/quickstart.py deleted file mode 100644 index 302b84f..0000000 --- a/quickstart.py +++ /dev/null @@ -1,113 +0,0 @@ -import cocoindex -from psycopg_pool import ConnectionPool -import os - - -@cocoindex.transform_flow() -def text_to_embedding( - text: cocoindex.DataSlice[str], -) -> cocoindex.DataSlice[list[float]]: - """ - Embed the text using a SentenceTransformer model. - This is a shared logic between indexing and querying, so extract it as a function. - """ - return text.transform( - cocoindex.functions.SentenceTransformerEmbed( - model="sentence-transformers/all-MiniLM-L6-v2" - ) - ) - - -@cocoindex.flow_def(name="TextEmbeddingQuickStart") -def text_embedding_flow( - flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope -): - # Add a data source to read files from a directory - data_scope["documents"] = flow_builder.add_source( - cocoindex.sources.LocalFile(path="markdown_files") - ) - - # Add a collector for data to be exported to the vector index - doc_embeddings = data_scope.add_collector() - - # Transform data of each document - with data_scope["documents"].row() as doc: - # Split the document into chunks, put into `chunks` field - doc["chunks"] = doc["content"].transform( - cocoindex.functions.SplitRecursively(), - language="javascript", - chunk_size=300, - chunk_overlap=100, - ) - - # Transform data of each chunk - with doc["chunks"].row() as chunk: - # Embed the chunk, put into `embedding` field - chunk["embedding"] = text_to_embedding(chunk["text"]) - - # Collect the chunk into the collector. - doc_embeddings.collect( - filename=doc["filename"], - location=chunk["location"], - text=chunk["text"], - embedding=chunk["embedding"], - ) - - # Export collected data to a vector index. - doc_embeddings.export( - "doc_embeddings", - cocoindex.storages.Postgres(), - primary_key_fields=["filename", "location"], - vector_indexes=[ - cocoindex.VectorIndexDef( - field_name="embedding", - metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY, - ) - ], - ) - - -def search(pool: ConnectionPool, query: str, top_k: int = 5): - # Get the table name, for the export target in the text_embedding_flow above. - table_name = cocoindex.utils.get_target_storage_default_name( - text_embedding_flow, "doc_embeddings" - ) - # Evaluate the transform flow defined above with the input query, to get the embedding. - query_vector = text_to_embedding.eval(query) - # Run the query and get the results. - with pool.connection() as conn: - with conn.cursor() as cur: - cur.execute( - f""" - SELECT filename, text, embedding <=> %s::vector AS distance - FROM {table_name} ORDER BY distance LIMIT %s - """, - (query_vector, top_k), - ) - return [ - {"filename": row[0], "text": row[1], "score": 1.0 - row[2]} - for row in cur.fetchall() - ] - - -def _main(): - # Initialize the database connection pool. - pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL")) - # Run queries in a loop to demonstrate the query capabilities. - while True: - query = input("Enter search query (or Enter to quit): ") - if query == "": - break - # Run the query function with the database connection pool and the query. - results = search(pool, query) - print("\nSearch results:") - for result in results: - print(f"[{result['score']:.3f}] {result['filename']}") - print(f" {result['text']}") - print("---") - print() - - -if __name__ == "__main__": - cocoindex.init() - _main() From 4e9d530444142da16ec5d3c5760350aa45aeb664 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Fri, 29 Aug 2025 15:29:49 -0700 Subject: [PATCH 2/2] Delete .env --- .env | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 .env diff --git a/.env b/.env deleted file mode 100644 index 335feb6..0000000 --- a/.env +++ /dev/null @@ -1,6 +0,0 @@ -# Postgres database address for cocoindex -COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex - -# Fallback to CPU for operations not supported by MPS on Mac. -# It's no-op for other platforms. -PYTORCH_ENABLE_MPS_FALLBACK=1