class DocumentLabel:
    """A label attached to one chunk of a document.

    Instances are plain value holders that round-trip through JSON-compatible
    dictionaries via :meth:`to_dict` and :meth:`from_dict`.

    Args:
        chunk_id: Identifier of the labelled chunk.
        document_id: Identifier of the document the chunk belongs to.
        label: The label value itself.
        confidence: Optional confidence score for the label.
        metadata: Optional free-form metadata; a falsy value becomes ``{}``.
    """

    def __init__(
        self,
        chunk_id: str,
        document_id: str,
        label: str,
        confidence: Optional[float] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        self.chunk_id = chunk_id
        self.document_id = document_id
        self.label = label
        self.confidence = confidence
        # A fresh dict per instance; never share one default between labels.
        self.metadata = metadata or {}

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(chunk_id={self.chunk_id!r}, "
            f"document_id={self.document_id!r}, label={self.label!r}, "
            f"confidence={self.confidence!r}, metadata={self.metadata!r})"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Return the label as a dictionary with one key per attribute."""
        return {
            "chunk_id": self.chunk_id,
            "document_id": self.document_id,
            "label": self.label,
            "confidence": self.confidence,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentLabel":
        """Build a label from a dictionary produced by :meth:`to_dict`.

        Raises:
            KeyError: If ``chunk_id``, ``document_id`` or ``label`` is missing.
        """
        return cls(
            chunk_id=data["chunk_id"],
            document_id=data["document_id"],
            label=data["label"],
            confidence=data.get("confidence"),
            metadata=data.get("metadata", {}),
        )


# Key under which the JSON-encoded list of document labels is stored.
_DOCUMENT_LABELS_KEY = "document_labels"


def _readable_labels(feature: Any) -> Optional[Dict[str, str]]:
    """Return the first non-empty label mapping on ``feature``, else ``None``.

    ``feature.labels`` takes precedence over ``feature._labels``.
    """
    return (
        getattr(feature, "labels", None) or getattr(feature, "_labels", None) or None
    )


def _writable_labels(feature: Any) -> Optional[Dict[str, str]]:
    """Return the mapping new labels should be written to, else ``None``.

    An existing mapping is always reused, so previously stored entries are
    never discarded. ``feature._labels`` is only initialised when it is
    ``None``.
    """
    public = getattr(feature, "labels", None)
    if public is not None:
        return public
    if not hasattr(feature, "_labels"):
        return None
    if feature._labels is None:
        feature._labels = {}
    return feature._labels


def store_document_label(feature: "Feature", label: DocumentLabel) -> None:
    """Append ``label`` to the document labels stored on ``feature``.

    Labels are kept as a JSON-encoded list under the ``"document_labels"``
    key of the feature's label mapping. If the feature exposes neither a
    ``labels`` nor a ``_labels`` attribute there is nowhere to store the
    label, and the call is a no-op.
    """
    labels_dict = _writable_labels(feature)
    if labels_dict is None:
        return

    stored = json.loads(labels_dict.get(_DOCUMENT_LABELS_KEY, "[]"))
    stored.append(label.to_dict())
    labels_dict[_DOCUMENT_LABELS_KEY] = json.dumps(stored)


def get_document_labels(feature: "Feature") -> List[DocumentLabel]:
    """Return every document label stored on ``feature`` (possibly empty)."""
    labels_dict = _readable_labels(feature)
    if not labels_dict or _DOCUMENT_LABELS_KEY not in labels_dict:
        return []

    return [
        DocumentLabel.from_dict(entry)
        for entry in json.loads(labels_dict[_DOCUMENT_LABELS_KEY])
    ]


def remove_document_label(feature: "Feature", chunk_id: str, document_id: str) -> bool:
    """Remove all labels matching ``chunk_id`` and ``document_id``.

    Returns:
        ``True`` if at least one label was removed, ``False`` otherwise.
    """
    labels_dict = _readable_labels(feature)
    if not labels_dict or _DOCUMENT_LABELS_KEY not in labels_dict:
        return False

    stored = json.loads(labels_dict[_DOCUMENT_LABELS_KEY])
    target = (chunk_id, document_id)
    kept = [
        entry for entry in stored if (entry["chunk_id"], entry["document_id"]) != target
    ]

    if len(kept) == len(stored):
        return False

    labels_dict[_DOCUMENT_LABELS_KEY] = json.dumps(kept)
    return True
@app.post("/read-document")
async def read_document_endpoint(request: ReadDocumentRequest):
    """Return the UTF-8 text of a document under the server's working directory.

    Args:
        request: Carries ``file_path``, the client-supplied path to read.

    Returns:
        ``{"content": ..., "file_path": ...}`` on success, otherwise
        ``{"error": ...}`` describing why the file could not be read.
    """
    from pathlib import Path

    # SECURITY: ``file_path`` is untrusted client input. Resolve it (following
    # symlinks and ``..`` segments) and refuse anything that lands outside the
    # working directory, otherwise any caller could read any file the server
    # process can access (e.g. /etc/passwd, credentials, registry files).
    # NOTE(review): the working directory is used as the allowed root because
    # no configured document root is visible here - confirm the right root.
    # NOTE(review): no authorization check is visible on this route - confirm.
    base_dir = Path.cwd().resolve()
    try:
        target = (base_dir / request.file_path).resolve()
    except (OSError, RuntimeError, ValueError) as e:
        # e.g. embedded NUL byte or a symlink loop in the supplied path.
        return {"error": str(e)}

    if base_dir not in target.parents:
        return {"error": f"Access denied: {request.file_path}"}

    if not target.is_file():
        return {"error": f"File not found: {request.file_path}"}

    try:
        content = target.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        return {"error": str(e)}

    return {"content": content, "file_path": request.file_path}
"./components/NoProjectGuard"; @@ -145,6 +146,10 @@ const FeastUISansProvidersInner = ({ path="data-set/:datasetName/*" element={} /> + } + /> } /> } /> diff --git a/ui/src/custom-tabs/TabsRegistryContext.tsx b/ui/src/custom-tabs/TabsRegistryContext.tsx index d4cc3690a44..4152edd832d 100644 --- a/ui/src/custom-tabs/TabsRegistryContext.tsx +++ b/ui/src/custom-tabs/TabsRegistryContext.tsx @@ -289,6 +289,7 @@ export { useDataSourceCustomTabs, useEntityCustomTabs, useDatasetCustomTabs, + // Routes useRegularFeatureViewCustomTabRoutes, useOnDemandFeatureViewCustomTabRoutes, diff --git a/ui/src/custom-tabs/document-labeling-tab/DocumentLabelingTab.tsx b/ui/src/custom-tabs/document-labeling-tab/DocumentLabelingTab.tsx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ui/src/custom-tabs/document-labeling-tab/example-config.ts b/ui/src/custom-tabs/document-labeling-tab/example-config.ts new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ui/src/custom-tabs/document-labeling-tab/index.ts b/ui/src/custom-tabs/document-labeling-tab/index.ts new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ui/src/custom-tabs/document-labeling-tab/useDocumentLabelingQuery.tsx b/ui/src/custom-tabs/document-labeling-tab/useDocumentLabelingQuery.tsx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ui/src/custom-tabs/types.ts b/ui/src/custom-tabs/types.ts index 3a7bbdfd8e6..be8c19651a0 100644 --- a/ui/src/custom-tabs/types.ts +++ b/ui/src/custom-tabs/types.ts @@ -136,6 +136,20 @@ interface DatasetCustomTabRegistrationInterface }: DatasetCustomTabProps) => JSX.Element; } +// Type for Document Labeling Custom Tabs +interface DocumentLabelingCustomTabProps { + id: string | undefined; + feastObjectQuery: RegularFeatureViewQueryReturnType; +} +interface DocumentLabelingCustomTabRegistrationInterface + extends CustomTabRegistrationInterface { + Component: ({ + id, + feastObjectQuery, + ...args + }: DocumentLabelingCustomTabProps) => JSX.Element; 
+} + export type { CustomTabRegistrationInterface, RegularFeatureViewQueryReturnType, @@ -157,4 +171,6 @@ export type { FeatureCustomTabProps, DatasetCustomTabRegistrationInterface, DatasetCustomTabProps, + DocumentLabelingCustomTabRegistrationInterface, + DocumentLabelingCustomTabProps, }; diff --git a/ui/src/example-feast-ui-config.ts b/ui/src/example-feast-ui-config.ts new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ui/src/pages/Sidebar.tsx b/ui/src/pages/Sidebar.tsx index 57599139a44..afc6c43acb4 100644 --- a/ui/src/pages/Sidebar.tsx +++ b/ui/src/pages/Sidebar.tsx @@ -131,6 +131,15 @@ const SideNav = () => { renderItem: (props) => , isSelected: useMatchSubpath(`${baseUrl}/data-set`), }, + { + name: "Document Labeling", + id: htmlIdGenerator("documentLabeling")(), + icon: , + renderItem: (props) => ( + + ), + isSelected: useMatchSubpath(`${baseUrl}/document-labeling`), + }, { name: "Permissions", id: htmlIdGenerator("permissions")(), diff --git a/ui/src/pages/document-labeling/DocumentLabelingPage.tsx b/ui/src/pages/document-labeling/DocumentLabelingPage.tsx new file mode 100644 index 00000000000..9ec4c090a6f --- /dev/null +++ b/ui/src/pages/document-labeling/DocumentLabelingPage.tsx @@ -0,0 +1,407 @@ +import React, { useState } from "react"; +import { + EuiPage, + EuiPageBody, + EuiPageSection, + EuiPageHeader, + EuiTitle, + EuiSpacer, + EuiFlexGroup, + EuiFlexItem, + EuiButton, + EuiFieldText, + EuiFormRow, + EuiPanel, + EuiText, + EuiCallOut, + EuiLoadingSpinner, + EuiButtonGroup, + EuiCode, +} from "@elastic/eui"; + +interface DocumentContent { + content: string; + file_path: string; +} + +interface TextSelection { + text: string; + start: number; + end: number; +} + +interface DocumentLabel { + text: string; + start: number; + end: number; + label: string; + timestamp: number; +} + +const DocumentLabelingPage = () => { + const [filePath, setFilePath] = useState("./src/test-document.txt"); + const [selectedText, setSelectedText] = 
useState(null); + const [labelingMode, setLabelingMode] = useState("relevant"); + const [labels, setLabels] = useState([]); + const [isLoading, setIsLoading] = useState(false); + const [documentContent, setDocumentContent] = + useState(null); + const [error, setError] = useState(null); + + const loadDocument = async () => { + if (!filePath) return; + + setIsLoading(true); + setError(null); + + try { + if (filePath === "./src/test-document.txt") { + const testContent = `This is a sample document for testing the document labeling functionality in Feast UI. + +The document contains multiple paragraphs and sections that can be used to test the text highlighting and labeling features. + +This paragraph discusses machine learning and artificial intelligence concepts. It covers topics like neural networks, deep learning, and natural language processing. Users should be able to select and label relevant portions of this text for RAG retrieval systems. + +Another section focuses on data engineering and ETL pipelines. This content explains how to process large datasets and build scalable data infrastructure. The labeling system should allow users to mark this as relevant or irrelevant for their specific use cases. + +The final paragraph contains information about feature stores and real-time machine learning systems. This text can be used to test the highlighting functionality and ensure that labels are properly stored and displayed in the user interface.`; + + setDocumentContent({ + content: testContent, + file_path: filePath, + }); + } else { + throw new Error( + "Document not found. Please use the test document path: ./src/test-document.txt", + ); + } + } catch (err) { + setError( + err instanceof Error + ? 
// Capture the user's current text selection as a pending highlight.
//
// The stored `text` always equals `content.slice(start, end)`: offsets are
// computed for the trimmed selection, so the renderer (which slices the
// document by offsets but prints `text`) no longer drops the whitespace
// surrounding a selection.
const handleTextSelection = () => {
  const selection = window.getSelection();
  // getRangeAt(0) throws when the selection holds no range.
  if (!selection || selection.rangeCount === 0 || !documentContent) {
    return;
  }

  const rangeText = selection.getRangeAt(0).toString();
  const trimmedText = rangeText.trim();
  if (!trimmedText) {
    return;
  }

  const textContent = documentContent.content;
  // NOTE(review): indexOf resolves to the FIRST occurrence of the selected
  // text, so selecting a repeated phrase labels the earliest match. A precise
  // fix needs DOM offsets relative to the document container - TODO.
  const rawStart = textContent.indexOf(rangeText);
  if (rawStart === -1) {
    return;
  }

  // The trimmed text begins right after the leading whitespace of the range.
  const start = rawStart + rangeText.indexOf(trimmedText);
  const end = start + trimmedText.length;

  setSelectedText({
    text: trimmedText,
    start,
    end,
  });
};
let highlightColor = "#d4edda"; + let borderColor = "#c3e6cb"; + + if (highlight.label === "temp-selection") { + highlightColor = "#add8e6"; + borderColor = "#87ceeb"; + } else if (highlight.label === "irrelevant") { + highlightColor = "#f8d7da"; + borderColor = "#f5c6cb"; + } + + result.push( + + {highlight.text} + , + ); + + lastIndex = highlight.end; + }); + + result.push(content.slice(lastIndex)); + return result; + }; + + const labelingOptions = [ + { + id: "relevant", + label: "Relevant", + }, + { + id: "irrelevant", + label: "Irrelevant", + }, + ]; + + return ( + + + + + Document Labeling for RAG + + + + + + + + Load a document file and highlight text chunks to label them as + relevant or irrelevant for RAG retrieval. This helps improve the + quality of your retrieval system by providing human feedback. + + + + + + + + + setFilePath(e.target.value)} + /> + + + + + + Load Document + + + + + + + + {isLoading && ( + + + + + + Loading document... + + + )} + + {error && ( + + {error} + + )} + + {documentContent && ( + <> + + + + Labeling mode: + + + + setLabelingMode(id)} + buttonSize="s" + /> + + + + Label Selected Text + + + + + + + {selectedText && ( + + {selectedText.text} + + )} + + + + + + Document Content + + + + + {renderDocumentWithHighlights(documentContent.content)} + + + + + {labels.length > 0 && ( + <> + + + + Labels ({labels.length}) + + + {labels.map((label, index) => ( + + + + {label.label} + + + + + "{label.text.substring(0, 100)} + {label.text.length > 100 ? "..." 
: ""}" + + + + handleRemoveLabel(index)} + > + Remove + + + + ))} + + > + )} + > + )} + + + + + ); +}; + +export default DocumentLabelingPage; diff --git a/ui/src/pages/document-labeling/index.ts b/ui/src/pages/document-labeling/index.ts new file mode 100644 index 00000000000..f3f4012b362 --- /dev/null +++ b/ui/src/pages/document-labeling/index.ts @@ -0,0 +1 @@ +export { default } from "./DocumentLabelingPage"; diff --git a/ui/src/test-document.txt b/ui/src/test-document.txt new file mode 100644 index 00000000000..9a25d0c3d95 --- /dev/null +++ b/ui/src/test-document.txt @@ -0,0 +1,9 @@ +This is a sample document for testing the document labeling functionality in Feast UI. + +The document contains multiple paragraphs and sections that can be used to test the text highlighting and labeling features. + +This paragraph discusses machine learning and artificial intelligence concepts. It covers topics like neural networks, deep learning, and natural language processing. Users should be able to select and label relevant portions of this text for RAG retrieval systems. + +Another section focuses on data engineering and ETL pipelines. This content explains how to process large datasets and build scalable data infrastructure. The labeling system should allow users to mark this as relevant or irrelevant for their specific use cases. + +The final paragraph contains information about feature stores and real-time machine learning systems. This text can be used to test the highlighting functionality and ensure that labels are properly stored and displayed in the user interface. diff --git a/ui/src/utils/custom-tabs/DocumentLabelingCustomTabLoadingWrapper.tsx b/ui/src/utils/custom-tabs/DocumentLabelingCustomTabLoadingWrapper.tsx new file mode 100644 index 00000000000..e69de29bb2d
+ Load a document file and highlight text chunks to label them as + relevant or irrelevant for RAG retrieval. This helps improve the + quality of your retrieval system by providing human feedback. +
{error}