From 6a247dc70ca5e797fa4e01121d7458fbc6bbceb9 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Wed, 17 Sep 2025 13:16:16 +0200 Subject: [PATCH 1/2] feat(experiments): add experiment runner (#604) Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- .release-it.json | 1 - eslint.config.mjs | 2 +- package.json | 8 +- packages/client/package.json | 1 + packages/client/src/LangfuseClient.ts | 60 +- packages/client/src/dataset/index.ts | 237 +++- .../src/experiment/ExperimentManager.ts | 723 +++++++++++ packages/client/src/experiment/adapters.ts | 94 ++ packages/client/src/experiment/types.ts | 382 ++++++ packages/client/src/index.ts | 3 + packages/client/src/score/index.ts | 4 + pnpm-lock.yaml | 357 ++++++ tests/e2e/experiments.e2e.test.ts | 1072 +++++++++++++++++ tests/e2e/openai.e2e.test.ts | 2 +- tests/e2e/tracing.e2e.test.ts | 7 +- 15 files changed, 2904 insertions(+), 49 deletions(-) create mode 100644 packages/client/src/experiment/ExperimentManager.ts create mode 100644 packages/client/src/experiment/adapters.ts create mode 100644 packages/client/src/experiment/types.ts create mode 100644 tests/e2e/experiments.e2e.test.ts diff --git a/.release-it.json b/.release-it.json index 67ee0972..f8febbca 100644 --- a/.release-it.json +++ b/.release-it.json @@ -1,6 +1,5 @@ { "git": { - "requireBranch": "main", "requireCleanWorkingDir": true, "requireUpstream": true, "addUntrackedFiles": false, diff --git a/eslint.config.mjs b/eslint.config.mjs index 3a18a8e3..7c025182 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -35,7 +35,7 @@ export default [ ignoreRestSiblings: true, }, ], - "@typescript-eslint/no-explicit-any": "warn", + "@typescript-eslint/no-explicit-any": "off", "@typescript-eslint/no-unnecessary-type-constraint": "error", "prettier/prettier": "error", "no-redeclare": "off", diff --git a/package.json b/package.json index 6abb5e71..086d155c 100644 --- a/package.json +++ b/package.json @@ -2,6 +2,8 @@ "name": "langfuse-js", "version": "4.0.1", "description": "Langfuse JavaScript / TypeScript SDK", + "author": "Langfuse", + "license": "MIT", "private": true, "type": "module", "scripts": { @@ -25,8 +27,6 @@ "nuke": "pnpm clean && rm -rf node_modules && rm -rf packages/*/node_modules && pnpm install", "prepare": "husky" }, - "author": "Langfuse", - "license": "MIT", "devDependencies": { "@ai-sdk/anthropic": "^2", "@ai-sdk/openai": "^2", @@ -46,6 +46,7 @@ "@typescript-eslint/eslint-plugin": "^8.36.0", "@typescript-eslint/parser": "^8.39.0", "ai": "^5", + "autoevals": "^0.0.131", "dotenv": "^17.2.0", "eslint": "^9.32.0", "eslint-config-prettier": "^10.1.8", @@ -67,5 +68,8 @@ }, "engines": { "node": ">=20" + }, + "resolutions": { + "ml-spectra-processing": "14.14.0" } } diff --git a/packages/client/package.json b/packages/client/package.json index 56192e06..3a525d63 100644 --- a/packages/client/package.json +++ b/packages/client/package.json @@ -29,6 +29,7 @@ ], "dependencies": { "@langfuse/core": "workspace:^", + "@langfuse/tracing": "workspace:^", "mustache": "^4.2.0" }, "peerDependencies": { diff --git a/packages/client/src/LangfuseClient.ts b/packages/client/src/LangfuseClient.ts index d817bd42..4750b45a 100644 --- a/packages/client/src/LangfuseClient.ts +++ b/packages/client/src/LangfuseClient.ts @@ -6,6 +6,7 @@ import { } from "@langfuse/core"; import { DatasetManager } from "./dataset/index.js"; +import { ExperimentManager } from "./experiment/ExperimentManager.js"; import { MediaManager } 
from "./media/index.js"; import { PromptManager } from "./prompt/index.js"; import { ScoreManager } from "./score/index.js"; @@ -106,6 +107,62 @@ export class LangfuseClient { */ public media: MediaManager; + /** + * Manager for running experiments on datasets and data items. + * + * The experiment manager provides comprehensive functionality for: + * - Running tasks on datasets or custom data arrays + * - Evaluating outputs with custom or pre-built evaluators + * - Tracking experiment runs with automatic tracing + * - Generating formatted result summaries + * - Integrating with AutoEvals library evaluators + * + * @example Basic experiment execution + * ```typescript + * const langfuse = new LangfuseClient(); + * + * const result = await langfuse.experiment.run({ + * name: "Model Evaluation", + * description: "Testing model performance on Q&A tasks", + * data: [ + * { input: "What is 2+2?", expectedOutput: "4" }, + * { input: "What is the capital of France?", expectedOutput: "Paris" } + * ], + * task: async ({ input }) => { + * // Your model/task implementation + * const response = await myModel.generate(input); + * return response; + * }, + * evaluators: [ + * async ({ output, expectedOutput }) => ({ + * name: "exact_match", + * value: output.trim().toLowerCase() === expectedOutput.toLowerCase() ? 1 : 0 + * }) + * ] + * }); + * + * console.log(await result.format()); + * ``` + * + * @example Using with datasets + * ```typescript + * const dataset = await langfuse.dataset.get("my-test-dataset"); + * const result = await dataset.runExperiment({ + * name: "Production Readiness Test", + * task: myTask, + * evaluators: [accuracyEvaluator, latencyEvaluator], + * runEvaluators: [overallQualityEvaluator] + * }); + * ``` + * + * @see {@link ExperimentManager} for detailed API documentation + * @see {@link ExperimentParams} for configuration options + * @see {@link ExperimentResult} for result structure + * @public + * @since 4.0.0 + */ + public experiment: ExperimentManager; + private baseUrl: string; private projectId: string | null = null; @@ -236,9 +293,10 @@ export class LangfuseClient { }); this.prompt = new PromptManager({ apiClient: this.api }); - this.dataset = new DatasetManager({ apiClient: this.api }); + this.dataset = new DatasetManager({ langfuseClient: this }); this.score = new ScoreManager({ apiClient: this.api }); this.media = new MediaManager({ apiClient: this.api }); + this.experiment = new ExperimentManager({ langfuseClient: this }); // Keep v3 compat by exposing old interface this.getPrompt = this.prompt.get.bind(this.prompt); // keep correct this context for cache access diff --git a/packages/client/src/dataset/index.ts b/packages/client/src/dataset/index.ts index e9127978..dbe9cab4 100644 --- a/packages/client/src/dataset/index.ts +++ b/packages/client/src/dataset/index.ts @@ -1,21 +1,124 @@ -import { - LangfuseAPIClient, - Dataset, - DatasetRunItem, - DatasetItem, -} from "@langfuse/core"; +import { Dataset, DatasetRunItem, DatasetItem } from "@langfuse/core"; import { Span } from "@opentelemetry/api"; +import { ExperimentResult, ExperimentParams } from "../experiment/types.js"; +import { LangfuseClient } from "../LangfuseClient.js"; + +/** + * Function type for running experiments on Langfuse datasets. + * + * This function type is attached to fetched datasets to enable convenient + * experiment execution directly on dataset objects. 
+ * + * @param params - Experiment parameters excluding data (since data comes from the dataset) + * @returns Promise resolving to experiment results + * + * @example + * ```typescript + * const dataset = await langfuse.dataset.get("my-dataset"); + * const result = await dataset.runExperiment({ + * name: "Model Evaluation", + * runName: "Model Evaluation Run 1", // optional + * task: myTask, + * evaluators: [myEvaluator] + * }); + * ``` + * + * @public + * @since 4.0.0 + */ +export type RunExperimentOnDataset = ( + params: Omit>, "data">, +) => Promise>>; + +/** + * Enhanced dataset object with additional methods for linking and experiments. + * + * This type extends the base Dataset with functionality for: + * - Linking dataset items to traces/observations + * - Running experiments directly on the dataset + * + * @example Working with a fetched dataset + * ```typescript + * const dataset = await langfuse.dataset.get("my-evaluation-dataset"); + * + * // Access dataset metadata + * console.log(dataset.name, dataset.description); + * + * // Work with individual items + * for (const item of dataset.items) { + * console.log(item.input, item.expectedOutput); + * + * // Link item to a trace + * await item.link(myObservation, "experiment-run-1"); + * } + * + * // Run experiments on the entire dataset + * const result = await dataset.runExperiment({ + * name: "Model Comparison", + * task: myTask, + * evaluators: [accuracyEvaluator] + * }); + * ``` + * + * @public + * @since 4.0.0 + */ +export type FetchedDataset = Dataset & { + /** Dataset items with additional linking functionality */ + items: (DatasetItem & { link: LinkDatasetItemFunction })[]; + /** Function to run experiments directly on this dataset */ + runExperiment: RunExperimentOnDataset; +}; + /** * Function type for linking dataset items to OpenTelemetry spans. - * This allows dataset items to be associated with specific traces for experiment tracking. * - * @param obj - Object containing the OpenTelemetry span - * @param runName - Name of the dataset run - * @param runArgs - Optional arguments for the dataset run + * This function creates a connection between a dataset item and a trace/observation, + * enabling tracking of which dataset items were used in which experiments or runs. + * This is essential for creating dataset runs and tracking experiment lineage. 
+ * + * @param obj - Object containing the OpenTelemetry span to link to + * @param obj.otelSpan - The OpenTelemetry span from a Langfuse observation + * @param runName - Name of the experiment run for grouping related items + * @param runArgs - Optional configuration for the dataset run + * @param runArgs.description - Description of the experiment run + * @param runArgs.metadata - Additional metadata to attach to the run * @returns Promise that resolves to the created dataset run item * + * @example Basic linking + * ```typescript + * const dataset = await langfuse.dataset.get("my-dataset"); + * const span = startObservation("my-task", { input: "test" }); + * span.update({ output: "result" }); + * span.end(); + * + * // Link the dataset item to this execution + * await dataset.items[0].link( + * { otelSpan: span.otelSpan }, + * "experiment-run-1" + * ); + * ``` + * + * @example Linking with metadata + * ```typescript + * await dataset.items[0].link( + * { otelSpan: span.otelSpan }, + * "model-comparison-v2", + * { + * description: "Comparing GPT-4 vs Claude performance", + * metadata: { + * modelVersion: "gpt-4-1106-preview", + * temperature: 0.7, + * timestamp: new Date().toISOString() + * } + * } + * ); + * ``` + * + * @see {@link https://langfuse.com/docs/datasets} Langfuse datasets documentation * @public + * @since 4.0.0 */ export type LinkDatasetItemFunction = ( obj: { otelSpan: Span }, @@ -37,7 +140,7 @@ export type LinkDatasetItemFunction = ( * @public */ export class DatasetManager { - private apiClient: LangfuseAPIClient; + private langfuseClient: LangfuseClient; /** * Creates a new DatasetManager instance. @@ -45,56 +148,100 @@ export class DatasetManager { * @param params - Configuration object containing the API client * @internal */ - constructor(params: { apiClient: LangfuseAPIClient }) { - this.apiClient = params.apiClient; + constructor(params: { langfuseClient: LangfuseClient }) { + this.langfuseClient = params.langfuseClient; } /** - * Retrieves a dataset by name along with all its items. + * Retrieves a dataset by name with all its items and experiment functionality. * - * This method automatically handles pagination to fetch all dataset items - * and enhances each item with a `link` function for easy experiment tracking. + * This method fetches a dataset and all its associated items, with support + * for automatic pagination to handle large datasets efficiently. The returned + * dataset object includes enhanced functionality for linking items to traces + * and running experiments directly on the dataset. 
* * @param name - The name of the dataset to retrieve - * @param options - Optional configuration for fetching + * @param options - Optional configuration for data fetching * @param options.fetchItemsPageSize - Number of items to fetch per page (default: 50) + * @returns Promise resolving to enhanced dataset with items, linking, and experiment capabilities + * + * @example Basic dataset retrieval + * ```typescript + * const dataset = await langfuse.dataset.get("my-evaluation-dataset"); + * console.log(`Dataset ${dataset.name} has ${dataset.items.length} items`); * - * @returns Promise that resolves to the dataset with enhanced items + * // Access dataset properties + * console.log(dataset.description); + * console.log(dataset.metadata); + * ``` * - * @example + * @example Working with dataset items * ```typescript - * const dataset = await langfuse.dataset.get("my-dataset"); + * const dataset = await langfuse.dataset.get("qa-dataset"); * * for (const item of dataset.items) { - * // Use the item data for your experiment - * const result = await processItem(item.input); + * console.log("Question:", item.input); + * console.log("Expected Answer:", item.expectedOutput); * - * // Link the result to the dataset item - * await item.link( - * { otelSpan: currentSpan }, - * "experiment-run-1", - * { description: "Testing new model" } - * ); + * // Each item has a link function for connecting to traces + * // await item.link(span, "experiment-name"); * } * ``` + * + * @example Running experiments on datasets + * ```typescript + * const dataset = await langfuse.dataset.get("benchmark-dataset"); + * + * const result = await dataset.runExperiment({ + * name: "GPT-4 Benchmark", + * runName: "GPT-4 Benchmark v1.2", // optional exact run name + * description: "Evaluating GPT-4 on our benchmark tasks", + * task: async ({ input }) => { + * const response = await openai.chat.completions.create({ + * model: "gpt-4", + * messages: [{ role: "user", content: input }] + * }); + * return response.choices[0].message.content; + * }, + * evaluators: [ + * async ({ output, expectedOutput }) => ({ + * name: "exact_match", + * value: output === expectedOutput ? 1 : 0 + * }) + * ] + * }); + * + * console.log(await result.format()); + * ``` + * + * @example Handling large datasets + * ```typescript + * // For very large datasets, use smaller page sizes + * const largeDataset = await langfuse.dataset.get( + * "large-dataset", + * { fetchItemsPageSize: 100 } + * ); + * ``` + * + * @throws {Error} If the dataset does not exist or cannot be accessed + * @see {@link FetchedDataset} for the complete return type specification + * @see {@link RunExperimentOnDataset} for experiment execution details + * @public + * @since 4.0.0 */ async get( name: string, options?: { fetchItemsPageSize: number; }, - ): Promise< - Dataset & { - items: (DatasetItem & { link: LinkDatasetItemFunction })[]; - } - > { - const dataset = await this.apiClient.datasets.get(name); + ): Promise { + const dataset = await this.langfuseClient.api.datasets.get(name); const items: DatasetItem[] = []; let page = 1; while (true) { - const itemsResponse = await this.apiClient.datasetItems.list({ + const itemsResponse = await this.langfuseClient.api.datasetItems.list({ datasetName: name, limit: options?.fetchItemsPageSize ?? 
50, page, @@ -109,12 +256,22 @@ export class DatasetManager { page++; } + const itemsWithLinkMethod = items.map((item) => ({ + ...item, + link: this.createDatasetItemLinkFunction(item), + })); + + const runExperiment: RunExperimentOnDataset = (params) => { + return this.langfuseClient.experiment.run({ + data: items, + ...params, + }); + }; + const returnDataset = { ...dataset, - items: items.map((item) => ({ - ...item, - link: this.createDatasetItemLinkFunction(item), - })), + items: itemsWithLinkMethod, + runExperiment, }; return returnDataset; @@ -138,7 +295,7 @@ export class DatasetManager { metadata?: any; }, ): Promise => { - return await this.apiClient.datasetRunItems.create({ + return await this.langfuseClient.api.datasetRunItems.create({ runName, datasetItemId: item.id, traceId: obj.otelSpan.spanContext().traceId, diff --git a/packages/client/src/experiment/ExperimentManager.ts b/packages/client/src/experiment/ExperimentManager.ts new file mode 100644 index 00000000..fa146a69 --- /dev/null +++ b/packages/client/src/experiment/ExperimentManager.ts @@ -0,0 +1,723 @@ +import { DatasetItem, getGlobalLogger } from "@langfuse/core"; +import { startActiveObservation } from "@langfuse/tracing"; +import { ProxyTracerProvider, trace } from "@opentelemetry/api"; + +import { LangfuseClient } from "../LangfuseClient.js"; + +import { + ExperimentParams, + ExperimentResult, + ExperimentTask, + ExperimentItem, + ExperimentItemResult, + Evaluator, + Evaluation, +} from "./types.js"; + +/** + * Manages the execution and evaluation of experiments on datasets. + * + * The ExperimentManager provides a comprehensive framework for running experiments + * that test models or tasks against datasets, with support for automatic evaluation, + * scoring. + * + * @example Basic experiment usage + * ```typescript + * const langfuse = new LangfuseClient(); + * + * const result = await langfuse.experiment.run({ + * name: "Capital Cities Test", + * description: "Testing model knowledge of world capitals", + * data: [ + * { input: "France", expectedOutput: "Paris" }, + * { input: "Germany", expectedOutput: "Berlin" } + * ], + * task: async ({ input }) => { + * const response = await openai.chat.completions.create({ + * model: "gpt-4", + * messages: [{ role: "user", content: `What is the capital of ${input}?` }] + * }); + * return response.choices[0].message.content; + * }, + * evaluators: [ + * async ({ input, output, expectedOutput }) => ({ + * name: "exact_match", + * value: output === expectedOutput ? 1 : 0 + * }) + * ] + * }); + * + * console.log(await result.format()); + * ``` + * + * @example Using with Langfuse datasets + * ```typescript + * const dataset = await langfuse.dataset.get("my-dataset"); + * + * const result = await dataset.runExperiment({ + * name: "Model Comparison", + * task: myTask, + * evaluators: [myEvaluator], + * runEvaluators: [averageScoreEvaluator] + * }); + * ``` + * + * @public + */ +export class ExperimentManager { + private langfuseClient: LangfuseClient; + + /** + * Creates a new ExperimentManager instance. + * + * @param params - Configuration object + * @param params.langfuseClient - The Langfuse client instance for API communication + * @internal + */ + constructor(params: { langfuseClient: LangfuseClient }) { + this.langfuseClient = params.langfuseClient; + } + + /** + * Gets the global logger instance for experiment-related logging. 
+ * + * @returns The global logger instance + * @internal + */ + get logger() { + return getGlobalLogger(); + } + + /** + * Executes an experiment by running a task on each data item and evaluating the results. + * + * This method orchestrates the complete experiment lifecycle: + * 1. Executes the task function on each data item with proper tracing + * 2. Runs item-level evaluators on each task output + * 3. Executes run-level evaluators on the complete result set + * 4. Links results to dataset runs (for Langfuse datasets) + * 5. Stores all scores and traces in Langfuse + * + * @param config - The experiment configuration + * @param config.name - Human-readable name for the experiment + * @param config.runName - Optional exact name for the experiment run (defaults to name + timestamp) + * @param config.description - Optional description of the experiment's purpose + * @param config.metadata - Optional metadata to attach to the experiment run + * @param config.data - Array of data items to process (ExperimentItem[] or DatasetItem[]) + * @param config.task - Function that processes each data item and returns output + * @param config.evaluators - Optional array of functions to evaluate each item's output + * @param config.runEvaluators - Optional array of functions to evaluate the entire run + * @param config.maxConcurrency - Maximum number of concurrent task executions (default: Infinity) + * + * @returns Promise that resolves to experiment results including: + * - runName: The experiment run name (either provided or generated) + * - itemResults: Results for each processed data item + * - runEvaluations: Results from run-level evaluators + * - datasetRunId: ID of the dataset run (if using Langfuse datasets) + * - format: Function to format results for display + * + * @throws {Error} When task execution fails and cannot be handled gracefully + * @throws {Error} When required evaluators fail critically + * + * @example Simple experiment + * ```typescript + * const result = await langfuse.experiment.run({ + * name: "Translation Quality Test", + * data: [ + * { input: "Hello world", expectedOutput: "Hola mundo" }, + * { input: "Good morning", expectedOutput: "Buenos dรญas" } + * ], + * task: async ({ input }) => translateText(input, 'es'), + * evaluators: [ + * async ({ output, expectedOutput }) => ({ + * name: "bleu_score", + * value: calculateBleuScore(output, expectedOutput) + * }) + * ] + * }); + * ``` + * + * @example Experiment with concurrency control + * ```typescript + * const result = await langfuse.experiment.run({ + * name: "Large Scale Evaluation", + * data: largeBatchOfItems, + * task: expensiveModelCall, + * maxConcurrency: 5, // Process max 5 items simultaneously + * evaluators: [myEvaluator], + * runEvaluators: [ + * async ({ itemResults }) => ({ + * name: "average_score", + * value: itemResults.reduce((acc, r) => acc + r.evaluations[0].value, 0) / itemResults.length + * }) + * ] + * }); + * ``` + * + * @see {@link ExperimentParams} for detailed parameter documentation + * @see {@link ExperimentResult} for detailed return value documentation + * @see {@link Evaluator} for evaluator function specifications + * @see {@link RunEvaluator} for run evaluator function specifications + * + * @public + */ + async run< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, + >( + config: ExperimentParams, + ): Promise> { + const { + data, + evaluators, + task, + name, + runName: providedRunName, + description, + metadata, + maxConcurrency: batchSize = Infinity, + 
runEvaluators, + } = config; + + const runName = this.createExperimentRunName({ + name, + runName: providedRunName, + }); + + if (!this.isOtelRegistered()) { + this.logger.warn( + "OpenTelemetry has not been set up. Traces will not be sent to Langfuse.See our docs on how to set up OpenTelemetry: https://langfuse.com/docs/observability/sdk/typescript/setup#tracing-setup", + ); + } + + const itemResults: ExperimentItemResult[] = + []; + + for (let i = 0; i < data.length; i += batchSize) { + const batch = data.slice(i, i + batchSize); + + const promises: Promise< + ExperimentItemResult + >[] = batch.map(async (item) => { + return this.runItem({ + item, + evaluators, + task, + experimentName: name, + experimentRunName: runName, + experimentDescription: description, + experimentMetadata: metadata, + }); + }); + + const settledResults = await Promise.allSettled(promises); + const results = settledResults.reduce( + (acc, settledResult) => { + if (settledResult.status === "fulfilled") { + acc.push(settledResult.value); + } else { + const errorMessage = + settledResult.reason instanceof Error + ? settledResult.reason.message + : String(settledResult.reason); + this.logger.error( + `Task failed with error: ${errorMessage}. Skipping item.`, + ); + } + return acc; + }, + [] as ExperimentItemResult[], + ); + + itemResults.push(...results); + } + + // Get dataset run URL + const datasetRunId = + itemResults.length > 0 ? itemResults[0].datasetRunId : undefined; + + let datasetRunUrl = undefined; + if (datasetRunId && data.length > 0 && "datasetId" in data[0]) { + const datasetId = data[0].datasetId; + const projectUrl = (await this.langfuseClient.getTraceUrl("mock")).split( + "/traces", + )[0]; + + datasetRunUrl = `${projectUrl}/datasets/${datasetId}/runs/${datasetRunId}`; + } + + // Execute run evaluators + let runEvaluations: Evaluation[] = []; + if (runEvaluators && runEvaluators?.length > 0) { + const promises = runEvaluators.map(async (runEvaluator) => { + return runEvaluator({ itemResults }) + .then((result) => { + // Handle both single evaluation and array of evaluations + return Array.isArray(result) ? result : [result]; + }) + .catch((err) => { + this.logger.error("Run evaluator failed with error ", err); + + throw err; + }); + }); + + runEvaluations = (await Promise.allSettled(promises)).reduce( + (acc, settledPromise) => { + if (settledPromise.status === "fulfilled") { + acc.push(...settledPromise.value); + } + + return acc; + }, + [] as Evaluation[], + ); + + if (datasetRunId) { + runEvaluations.forEach((runEval) => + this.langfuseClient.score.create({ datasetRunId, ...runEval }), + ); + } + } + + await this.langfuseClient.score.flush(); + + return { + runName, + itemResults, + datasetRunId, + datasetRunUrl, + runEvaluations, + format: async (options?: { includeItemResults?: boolean }) => + await this.prettyPrintResults({ + datasetRunUrl, + itemResults, + originalData: data, + runEvaluations, + name: config.name, + runName, + description: config.description, + includeItemResults: options?.includeItemResults ?? false, + }), + }; + } + + /** + * Executes the task and evaluators for a single data item. + * + * This method handles the complete processing pipeline for one data item: + * 1. Executes the task within a traced observation span + * 2. Links the result to a dataset run (if applicable) + * 3. Runs all item-level evaluators on the output + * 4. Stores evaluation scores in Langfuse + * 5. 
Handles errors gracefully by continuing with remaining evaluators + * + * @param params - Parameters for item execution + * @param params.experimentName - Name of the parent experiment + * @param params.experimentRunName - Run name for the parent experiment + * @param params.experimentDescription - Description of the parent experiment + * @param params.experimentMetadata - Metadata for the parent experiment + * @param params.item - The data item to process + * @param params.task - The task function to execute + * @param params.evaluators - Optional evaluators to run on the output + * + * @returns Promise resolving to the item result with output, evaluations, and trace info + * + * @throws {Error} When task execution fails (propagated from task function) + * + * @internal + */ + private async runItem< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, + >(params: { + experimentName: ExperimentParams["name"]; + experimentRunName: string; + experimentDescription: ExperimentParams< + Input, + ExpectedOutput, + Metadata + >["description"]; + experimentMetadata: ExperimentParams< + Input, + ExpectedOutput, + Metadata + >["metadata"]; + item: ExperimentParams["data"][0]; + task: ExperimentTask; + evaluators?: Evaluator[]; + }): Promise> { + const { item, evaluators = [], task, experimentMetadata = {} } = params; + + const { output, traceId, observationId } = await startActiveObservation( + "experiment-item-run", + async (span) => { + const output = await task(item); + + span.update({ + input: item.input, + output, + metadata: { + experiment_name: params.experimentName, + experiment_run_name: params.experimentRunName, + ...experimentMetadata, + ...(item.metadata ?? {}), + ...("id" in item && "datasetId" in item + ? { + dataset_id: item["datasetId"], + dataset_item_id: item["id"], + } + : {}), + }, + }); + + return { output, traceId: span.traceId, observationId: span.id }; + }, + ); + + let datasetRunId: string | undefined = undefined; + + if ("id" in item) { + await this.langfuseClient.api.datasetRunItems + .create({ + runName: params.experimentRunName, + runDescription: params.experimentDescription, + metadata: params.experimentMetadata, + datasetItemId: item.id, + traceId, + observationId, + }) + .then((result) => { + datasetRunId = result.datasetRunId; + }) + .catch((err) => + this.logger.error("Linking dataset run item failed", err), + ); + } + + const evalPromises: Promise[] = evaluators.map( + async (evaluator) => { + const params = { + input: item.input as any, + expectedOutput: item.expectedOutput as any, + output, + }; + + return evaluator(params) + .then((result) => { + // Handle both single evaluation and array of evaluations + return Array.isArray(result) ? result : [result]; + }) + .catch((err) => { + this.logger.error( + `Evaluator '${evaluator.name}' failed for params \n\n${JSON.stringify(params)}\n\n with error: ${err}`, + ); + + throw err; + }); + }, + ); + + const evals = (await Promise.allSettled(evalPromises)).reduce( + (acc, promiseResult) => { + if (promiseResult.status === "fulfilled") { + acc.push(...promiseResult.value.flat()); + } + + return acc; + }, + [] as Evaluation[], + ); + + for (const ev of evals) { + this.langfuseClient.score.create({ + traceId, + name: ev.name, + comment: ev.comment, + value: ev.value, + metadata: ev.metadata, + dataType: ev.dataType, + }); + } + + return { + output, + evaluations: evals, + traceId, + datasetRunId, + item, + }; + } + + /** + * Formats experiment results into a human-readable string representation. 
+ * + * Creates a comprehensive, nicely formatted summary of the experiment including: + * - Individual item results with inputs, outputs, expected values, and scores + * - Dataset item and trace links (when available) + * - Experiment overview with aggregate statistics + * - Average scores across all evaluations + * - Run-level evaluation results + * - Links to dataset runs in the Langfuse UI + * + * @param params - Formatting parameters + * @param params.datasetRunUrl - Optional URL to the dataset run in Langfuse UI + * @param params.itemResults - Results from processing each data item + * @param params.originalData - The original input data items + * @param params.runEvaluations - Results from run-level evaluators + * @param params.name - Name of the experiment + * @param params.description - Optional description of the experiment + * @param params.includeItemResults - Whether to include individual item details (default: false) + * + * @returns Promise resolving to formatted string representation + * + * @example Output format + * ``` + * 1. Item 1: + * Input: What is the capital of France? + * Expected: Paris + * Actual: Paris + * Scores: + * โ€ข exact_match: 1.000 + * โ€ข similarity: 0.95 + * ๐Ÿ’ญ Very close match with expected output + * + * Dataset Item: + * https://cloud.langfuse.com/project/123/datasets/456/items/789 + * + * Trace: + * https://cloud.langfuse.com/project/123/traces/abc123 + * + * โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + * ๐Ÿ“Š Translation Quality Test - Testing model accuracy + * 2 items + * Evaluations: + * โ€ข exact_match + * โ€ข similarity + * + * Average Scores: + * โ€ข exact_match: 0.850 + * โ€ข similarity: 0.923 + * + * Run Evaluations: + * โ€ข overall_quality: 0.887 + * ๐Ÿ’ญ Good performance with room for improvement + * + * ๐Ÿ”— Dataset Run: + * https://cloud.langfuse.com/project/123/datasets/456/runs/def456 + * ``` + * + * @internal + */ + private async prettyPrintResults< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, + >(params: { + datasetRunUrl?: string; + itemResults: ExperimentItemResult[]; + originalData: + | ExperimentItem[] + | DatasetItem[]; + runEvaluations: Evaluation[]; + name: string; + runName: string; + description?: string; + includeItemResults?: boolean; + }): Promise { + const { + itemResults, + originalData, + runEvaluations, + name, + runName, + description, + includeItemResults = false, + } = params; + + if (itemResults.length === 0) { + return "No experiment results to display."; + } + + let output = ""; + + // Individual results + if (includeItemResults) { + for (let index = 0; index < itemResults.length; index++) { + const result = itemResults[index]; + const originalItem = originalData[index]; + + output += `\n${index + 1}. Item ${index + 1}:\n`; + + // Input, expected, and actual on separate lines + if (originalItem?.input !== undefined) { + output += ` Input: ${this.formatValue(originalItem.input)}\n`; + } + + const expectedOutput = + originalItem?.expectedOutput ?? result.expectedOutput ?? null; + output += ` Expected: ${expectedOutput !== null ? this.formatValue(expectedOutput) : "null"}\n`; + output += ` Actual: ${this.formatValue(result.output)}\n`; + + // Scores on separate lines + if (result.evaluations.length > 0) { + output += ` Scores:\n`; + result.evaluations.forEach((evaluation) => { + const score = + typeof evaluation.value === "number" + ? 
evaluation.value.toFixed(3) + : evaluation.value; + output += ` โ€ข ${evaluation.name}: ${score}`; + if (evaluation.comment) { + output += `\n ๐Ÿ’ญ ${evaluation.comment}`; + } + output += "\n"; + }); + } + + // Dataset item link on separate line + if ( + originalItem && + "id" in originalItem && + "datasetId" in originalItem + ) { + const projectUrl = ( + await this.langfuseClient.getTraceUrl("mock") + ).split("/traces")[0]; + const datasetItemUrl = `${projectUrl}/datasets/${originalItem.datasetId}/items/${originalItem.id}`; + output += `\n Dataset Item:\n ${datasetItemUrl}\n`; + } + + // Trace link on separate line + if (result.traceId) { + const traceUrl = await this.langfuseClient.getTraceUrl( + result.traceId, + ); + output += `\n Trace:\n ${traceUrl}\n`; + } + } + } else { + output += `Individual Results: Hidden (${itemResults.length} items)\n`; + output += "๐Ÿ’ก Call format({ includeItemResults: true }) to view them\n"; + } + + // Experiment Overview + const totalItems = itemResults.length; + const evaluationNames = new Set( + itemResults.flatMap((r) => r.evaluations.map((e) => e.name)), + ); + + output += `\n${"โ”€".repeat(50)}\n`; + output += `๐Ÿงช Experiment: ${name}`; + output += `\n๐Ÿ“‹ Run name: ${runName}`; + if (description) { + output += ` - ${description}`; + } + + output += `\n${totalItems} items`; + + if (evaluationNames.size > 0) { + output += `\nEvaluations:`; + Array.from(evaluationNames).forEach((evalName) => { + output += `\n โ€ข ${evalName}`; + }); + output += "\n"; + } + + // Average scores in bulleted list + if (evaluationNames.size > 0) { + output += `\nAverage Scores:`; + for (const evalName of evaluationNames) { + const scores = itemResults + .flatMap((r) => r.evaluations) + .filter((e) => e.name === evalName && typeof e.value === "number") + .map((e) => e.value as number); + + if (scores.length > 0) { + const avg = scores.reduce((a, b) => a + b, 0) / scores.length; + output += `\n โ€ข ${evalName}: ${avg.toFixed(3)}`; + } + } + output += "\n"; + } + + // Run evaluations + if (runEvaluations.length > 0) { + output += `\nRun Evaluations:`; + runEvaluations.forEach((runEval) => { + const score = + typeof runEval.value === "number" + ? runEval.value.toFixed(3) + : runEval.value; + output += `\n โ€ข ${runEval.name}: ${score}`; + if (runEval.comment) { + output += `\n ๐Ÿ’ญ ${runEval.comment}`; + } + }); + output += "\n"; + } + + if (params.datasetRunUrl) { + output += `\n๐Ÿ”— Dataset Run:\n ${params.datasetRunUrl}`; + } + + return output; + } + + /** + * Formats a value for display in pretty-printed output. + * + * Handles different value types appropriately: + * - Strings: Truncates long strings to 50 characters with "..." + * - Objects/Arrays: Converts to JSON string representation + * - Primitives: Uses toString() representation + * + * @param value - The value to format + * @returns Formatted string representation suitable for display + * + * @internal + */ + private formatValue(value: any): string { + if (typeof value === "string") { + return value.length > 50 ? `${value.substring(0, 47)}...` : value; + } + return JSON.stringify(value); + } + + private isOtelRegistered(): boolean { + let tracerProvider = trace.getTracerProvider(); + + if (tracerProvider instanceof ProxyTracerProvider) { + tracerProvider = tracerProvider.getDelegate(); + } + + return tracerProvider.constructor.name !== "NoopTracerProvider"; + } + + /** + * Creates an experiment run name based on provided parameters. + * + * If runName is provided, returns it directly. 
Otherwise, generates + * a name by combining the experiment name with an ISO timestamp. + * + * @param params - Parameters for run name creation + * @param params.name - The experiment name + * @param params.runName - Optional provided run name + * @returns The final run name to use + * + * @internal + */ + private createExperimentRunName(params: { + name: string; + runName?: string; + }): string { + if (params.runName) { + return params.runName; + } + + const isoTimestamp = new Date().toISOString(); + return `${params.name} - ${isoTimestamp}`; + } +} diff --git a/packages/client/src/experiment/adapters.ts b/packages/client/src/experiment/adapters.ts new file mode 100644 index 00000000..a8a87c88 --- /dev/null +++ b/packages/client/src/experiment/adapters.ts @@ -0,0 +1,94 @@ +import { Evaluator } from "./types.js"; + +/** + * Converts an AutoEvals evaluator to a Langfuse-compatible evaluator function. + * + * This adapter function bridges the gap between AutoEvals library evaluators + * and Langfuse experiment evaluators, handling parameter mapping and result + * formatting automatically. + * + * AutoEvals evaluators expect `input`, `output`, and `expected` parameters, + * while Langfuse evaluators use `input`, `output`, and `expectedOutput`. + * This function handles the parameter name mapping. + * + * @template E - Type of the AutoEvals evaluator function + * @param autoevalEvaluator - The AutoEvals evaluator function to convert + * @param params - Optional additional parameters to pass to the AutoEvals evaluator + * @returns A Langfuse-compatible evaluator function + * + * @example Basic usage with AutoEvals + * ```typescript + * import { Factuality, Levenshtein } from 'autoevals'; + * import { createEvaluatorFromAutoevals } from '@langfuse/client'; + * + * const factualityEvaluator = createEvaluatorFromAutoevals(Factuality); + * const levenshteinEvaluator = createEvaluatorFromAutoevals(Levenshtein); + * + * await langfuse.experiment.run({ + * name: "AutoEvals Integration Test", + * data: myDataset, + * task: myTask, + * evaluators: [factualityEvaluator, levenshteinEvaluator] + * }); + * ``` + * + * @example Using with additional parameters + * ```typescript + * import { Factuality } from 'autoevals'; + * + * const factualityEvaluator = createEvaluatorFromAutoevals( + * Factuality, + * { model: 'gpt-4o' } // Additional params for AutoEvals + * ); + * + * await langfuse.experiment.run({ + * name: "Factuality Test", + * data: myDataset, + * task: myTask, + * evaluators: [factualityEvaluator] + * }); + * ``` + * + * @see {@link https://github.com/braintrustdata/autoevals} AutoEvals library documentation + * @see {@link Evaluator} for Langfuse evaluator specifications + * + * @public + * @since 4.0.0 + */ +export function createEvaluatorFromAutoevals<E extends (...args: any[]) => any>( + autoevalEvaluator: E, + params?: Params<E>, +): Evaluator { + const langfuseEvaluator: Evaluator = async (langfuseEvaluatorParams) => { + const score = await autoevalEvaluator({ + ...(params ?? {}), + input: langfuseEvaluatorParams.input, + output: langfuseEvaluatorParams.output, + expected: langfuseEvaluatorParams.expectedOutput, + }); + + return { + name: score.name, + value: score.score ?? 0, + metadata: score.metadata, + }; + }; + + return langfuseEvaluator; +} + +/** + * Utility type to extract parameter types from AutoEvals evaluator functions. 
+ * + * This type helper extracts the parameter type from an AutoEvals evaluator + * and omits the standard parameters (input, output, expected) that are + * handled by the adapter, leaving only the additional configuration parameters. + * + * @template E - The AutoEvals evaluator function type + * @internal + */ +type Params<E> = Parameters< + E extends (...args: any[]) => any ? E : never +>[0] extends infer P + ? Omit<P, "input" | "output" | "expected"> + : never; diff --git a/packages/client/src/experiment/types.ts b/packages/client/src/experiment/types.ts new file mode 100644 index 00000000..d2f7a29d --- /dev/null +++ b/packages/client/src/experiment/types.ts @@ -0,0 +1,382 @@ +import { DatasetItem, ScoreBody } from "@langfuse/core"; + +export type ExperimentItem< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = + | { + /** + * The input data to pass to the task function. + * + * Can be any type - string, object, array, etc. This data will be passed + * to your task function as the `input` parameter. Structure it according + * to your task's requirements. + */ + input?: Input; + + /** + * The expected output for evaluation purposes. + * + * Optional ground truth or reference output for this input. + * Used by evaluators to assess task performance. If not provided, + * only evaluators that don't require expected output can be used. + */ + expectedOutput?: ExpectedOutput; + + /** + * Optional metadata to attach to the experiment item. + * + * Store additional context, tags, or custom data related to this specific item. + * This metadata will be available in traces and can be used for filtering, + * analysis, or custom evaluator logic. + */ + metadata?: Metadata; + } + | DatasetItem; + +/** + * Parameters passed to an experiment task function. + * + * Can be either an ExperimentItem (for custom datasets) or a DatasetItem + * (for Langfuse datasets). The task function should handle both types. + * + * @public + * @since 4.1.0 + */ +export type ExperimentTaskParams< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = ExperimentItem<Input, ExpectedOutput, Metadata>; + +/** + * Function type for experiment tasks that process input data and return output. + * + * The task function is the core component being tested in an experiment. + * It receives either an ExperimentItem or DatasetItem and produces output + * that will be evaluated. + * + * @param params - Either an ExperimentItem or DatasetItem containing input and metadata + * @returns Promise resolving to the task's output (any type) + * + * @example Task handling both item types + * ```typescript + * const universalTask: ExperimentTask = async (item) => { + * // Works with both ExperimentItem and DatasetItem + * const input = item.input; + * const metadata = item.metadata; + * + * const response = await openai.chat.completions.create({ + * model: "gpt-4", + * messages: [{ role: "user", content: input }] + * }); + * + * return response.choices[0].message.content; + * }; + * ``` + * + * @public + * @since 4.1.0 + */ +export type ExperimentTask< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = ( + params: ExperimentTaskParams<Input, ExpectedOutput, Metadata>, +) => Promise<any>; + +export type Evaluation = Pick< + ScoreBody, + "name" | "value" | "comment" | "metadata" | "dataType" +>; + +export type EvaluatorParams< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = { + /** + * The original input data passed to the task. + * + * This is the same input that was provided to the task function. 
+ * Use this for context-aware evaluations or input-output relationship analysis. + */ + input: Input; + + /** + * The output produced by the task. + * + * This is the actual result returned by your task function. + * This is the primary value to evaluate against expectations. + */ + output: any; + + /** + * The expected output for comparison (optional). + * + * This is the ground truth or expected result for the given input. + * May not be available for all evaluation scenarios. + */ + expectedOutput?: ExpectedOutput; + + /** + * Optional metadata about the evaluation context. + * + * Contains additional information from the experiment item or dataset item + * that may be useful for evaluation logic, such as tags, categories, + * or other contextual data. + */ + metadata?: Metadata; +}; +export type Evaluator< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = ( + params: EvaluatorParams<Input, ExpectedOutput, Metadata>, +) => Promise<Evaluation | Evaluation[]>; + +export type RunEvaluatorParams< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = { + /** + * Results from all processed experiment items. + * + * Each item contains the input, output, evaluations, and metadata from + * processing a single data item. Use this for aggregate analysis, + * statistical calculations, and cross-item comparisons. + */ + itemResults: ExperimentItemResult<Input, ExpectedOutput, Metadata>[]; +}; +export type RunEvaluator< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = ( + params: RunEvaluatorParams<Input, ExpectedOutput, Metadata>, +) => Promise<Evaluation | Evaluation[]>; + +export type ExperimentParams< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = { + /** + * Human-readable name for the experiment. + * + * This name will appear in Langfuse UI and experiment results. + * Choose a descriptive name that identifies the experiment's purpose. + */ + name: string; + + /** + * Optional exact name for the experiment run. + * + * If provided, this will be used as the exact dataset run name if the data + * contains Langfuse dataset items. If not provided, this will default to + * the experiment name appended with an ISO timestamp. + */ + runName?: string; + + /** + * Optional description explaining the experiment's purpose. + * + * Provide context about what you're testing, methodology, or goals. + * This helps with experiment tracking and result interpretation. + */ + description?: string; + + /** + * Optional metadata to attach to the experiment run. + * + * Store additional context like model versions, hyperparameters, + * or any other relevant information for analysis and comparison. + */ + metadata?: Record<string, any>; + + /** + * Array of data items to process. + * + * Can be either custom ExperimentItem[] or DatasetItem[] from Langfuse. + * Each item should contain input data and optionally expected output. + */ + data: ExperimentItem<Input, ExpectedOutput, Metadata>[]; + + /** + * The task function to execute on each data item. + * + * This function receives input data and produces output that will be evaluated. + * It should encapsulate the model or system being tested. + */ + task: ExperimentTask<Input, ExpectedOutput, Metadata>; + + /** + * Optional array of evaluator functions to assess each item's output. + * + * Each evaluator receives input, output, and expected output (if available) + * and returns evaluation results. Multiple evaluators enable comprehensive assessment. + */ + evaluators?: Evaluator<Input, ExpectedOutput, Metadata>[]; + + /** + * Optional array of run-level evaluators to assess the entire experiment. 
+ * + * These evaluators receive all item results and can perform aggregate analysis + * like calculating averages, detecting patterns, or statistical analysis. + */ + runEvaluators?: RunEvaluator<Input, ExpectedOutput, Metadata>[]; + + /** + * Maximum number of concurrent task executions (default: Infinity). + * + * Controls parallelism to manage resource usage and API rate limits. + * Set lower values for expensive operations or rate-limited services. + */ + maxConcurrency?: number; +}; + +export type ExperimentItemResult< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = Pick< + ExperimentItem<Input, ExpectedOutput, Metadata>, + "input" | "expectedOutput" +> & { + /** + * The original experiment or dataset item that was processed. + * + * Contains the complete original item data including input, expected output, + * metadata, and any additional fields. Useful for accessing item-specific + * context or metadata in result analysis. + */ + item: ExperimentItem<Input, ExpectedOutput, Metadata>; + /** + * The actual output produced by the task. + * + * This is the result returned by your task function for this specific input. + * It will be passed to evaluators for assessment against expected outputs. + */ + output: any; + + /** + * Results from all evaluators that ran on this item. + * + * Contains evaluation scores, comments, and metadata from each evaluator + * that successfully processed this item. Failed evaluators are excluded. + */ + evaluations: Evaluation[]; + + /** + * Langfuse trace ID for this item's execution (for debugging and analysis). + * + * Use this ID to view detailed execution traces in the Langfuse UI, + * including timing, inputs, outputs, and any nested observations. + */ + traceId?: string; + + /** + * Dataset run ID if this item was part of a Langfuse dataset. + * + * Present only when running experiments on Langfuse datasets. + * Links this item result to a specific dataset run for tracking and comparison. + */ + datasetRunId?: string; +}; + +/** + * Complete result of an experiment execution. + * + * Contains all results from processing the experiment data, + * including individual item results, run-level evaluations, + * and utilities for result visualization. + * + * @example Using experiment results + * ```typescript + * const result = await langfuse.experiment.run(config); + * + * // Access individual results + * console.log(`Processed ${result.itemResults.length} items`); + * + * // Check run-level evaluations + * const avgScore = result.runEvaluations.find(e => e.name === 'average_score'); + * console.log(`Average score: ${avgScore?.value}`); + * + * // Print formatted results + * console.log(await result.format()); + * + * // Print summary with individual item results + * console.log(await result.format({ includeItemResults: true })); + * + * // Link to dataset run (if available) + * if (result.datasetRunUrl) { + * console.log(`View in Langfuse: dataset run ${result.datasetRunUrl}`); + * } + * ``` + * + * @public + */ +export type ExperimentResult< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = { + /** + * The experiment run name. + * + * This is equal to the dataset run name if the experiment was run on a Langfuse dataset. + * Either the provided runName parameter or the generated name (experiment name + timestamp). + */ + runName: string; + + /** + * ID of the dataset run in Langfuse (only for experiments on Langfuse datasets). + * + * Present only when running experiments on Langfuse datasets. 
+ * Use this ID to access the dataset run via the Langfuse API or UI + * for detailed analysis and comparison with other runs. + */ + datasetRunId?: string; + + /** + * URL to the dataset run in the Langfuse UI (only for experiments on Langfuse datasets). + * + * Direct link to view the complete dataset run in the Langfuse web interface, + * including all experiment results, traces, and analytics. Provides easy access + * to detailed analysis and visualization of the experiment. + */ + datasetRunUrl?: string; + + /** + * Results from processing each individual data item. + * + * Contains the complete results for every item in your experiment data, + * including inputs, outputs, evaluations, and trace information. + * Use this for detailed analysis of individual item performance. + */ + itemResults: ExperimentItemResult[]; + + /** + * Results from run-level evaluators that assessed the entire experiment. + * + * Contains aggregate evaluations that analyze the complete experiment, + * such as average scores, statistical measures, or overall quality assessments. + */ + runEvaluations: Evaluation[]; + + /** + * Function to format experiment results in a human-readable format. + * + * Generates a comprehensive, nicely formatted summary including individual results, + * aggregate statistics, evaluation scores, and links to traces and dataset runs. + * + * @param options - Formatting options + * @param options.includeItemResults - Whether to include individual item details (default: false) + * @returns Promise resolving to formatted string representation + */ + format: (options?: { includeItemResults?: boolean }) => Promise; +}; diff --git a/packages/client/src/index.ts b/packages/client/src/index.ts index 850c7476..d55bc403 100644 --- a/packages/client/src/index.ts +++ b/packages/client/src/index.ts @@ -3,3 +3,6 @@ export * from "./prompt/index.js"; export * from "./score/index.js"; export * from "./dataset/index.js"; export * from "./media/index.js"; +export * from "./experiment/ExperimentManager.js"; +export * from "./experiment/adapters.js"; +export * from "./experiment/types.js"; diff --git a/packages/client/src/score/index.ts b/packages/client/src/score/index.ts index 3aee8867..92c8800b 100644 --- a/packages/client/src/score/index.ts +++ b/packages/client/src/score/index.ts @@ -93,6 +93,10 @@ export class ScoreManager { } this.eventQueue.push(scoreIngestionEvent); + this.logger.debug( + "Added score event to queue:\n", + JSON.stringify(scoreIngestionEvent, null, 2), + ); if (this.eventQueue.length >= this.flushAtCount) { this.flushPromise = this.flush(); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1aa42445..25cded9f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -4,6 +4,9 @@ settings: autoInstallPeers: true excludeLinksFromLockfile: false +overrides: + ml-spectra-processing: 14.14.0 + importers: .: @@ -62,6 +65,9 @@ importers: ai: specifier: ^5 version: 5.0.23(zod@3.25.76) + autoevals: + specifier: ^0.0.131 + version: 0.0.131 dotenv: specifier: ^17.2.0 version: 17.2.0 @@ -122,6 +128,9 @@ importers: '@langfuse/core': specifier: workspace:^ version: link:../core + '@langfuse/tracing': + specifier: workspace:^ + version: link:../tracing '@opentelemetry/api': specifier: ^1.9.0 version: 1.9.0 @@ -1106,6 +1115,12 @@ packages: '@types/mustache@4.2.6': resolution: {integrity: sha512-t+8/QWTAhOFlrF1IVZqKnMRJi84EgkIK5Kh0p2JV4OLywUvCwJPFxbJAl7XAow7DVIHsF+xW9f1MVzg0L6Szjw==} + '@types/node-fetch@2.6.13': + resolution: {integrity: 
sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==} + + '@types/node@18.19.124': + resolution: {integrity: sha512-hY4YWZFLs3ku6D2Gqo3RchTd9VRCcrjqp/I0mmohYeUVA5Y8eCXKJEasHxLAJVZRJuQogfd1GiJ9lgogBgKeuQ==} + '@types/node@20.19.11': resolution: {integrity: sha512-uug3FEEGv0r+jrecvUUpbY8lLisvIjg6AAic6a2bSP5OEOLeJsDSnvhCDov7ipFFMXS3orMpzlmi0ZcuGkBbow==} @@ -1255,6 +1270,10 @@ packages: '@vitest/utils@3.2.4': resolution: {integrity: sha512-fB2V0JFrQSMsCo9HiSq3Ezpdv4iYaXRG1Sx8edX3MwxfyNn83mKiGzOcH+Fkxt4MHxr3y42fQi1oeAInqgX2QA==} + abort-controller@3.0.0: + resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==} + engines: {node: '>=6.5'} + acorn-import-attributes@1.9.5: resolution: {integrity: sha512-n02Vykv5uA3eHGM/Z2dQrcD56kL8TyDb2p1+0P83PClMnC/nc+anbQRhIOWnSq4Ke/KvDPrY3C9hDtC/A3eHnQ==} peerDependencies: @@ -1277,6 +1296,10 @@ packages: resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} engines: {node: '>= 14'} + agentkeepalive@4.6.0: + resolution: {integrity: sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==} + engines: {node: '>= 8.0.0'} + ai@5.0.23: resolution: {integrity: sha512-1zUF0o1zRI7UmSd8u5CKc2iHNhv21tM95Oka81c0CF77GnTbq5RvrAqVuLI+gMyKcIgs99yxA+xc5hJXvh6V+w==} engines: {node: '>=18'} @@ -1286,6 +1309,9 @@ packages: ajv@6.12.6: resolution: {integrity: sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==} + ajv@8.17.1: + resolution: {integrity: sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==} + ansi-escapes@4.3.2: resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==} engines: {node: '>=8'} @@ -1358,6 +1384,12 @@ packages: async-retry@1.3.3: resolution: {integrity: sha512-wfr/jstw9xNi/0teMHrRW7dsz3Lt5ARhYNZ2ewpadnhaIp5mbALhOAP+EAdsC7t4Z6wqsDVv9+W6gm1Dk9mEyw==} + asynckit@0.4.0: + resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==} + + autoevals@0.0.131: + resolution: {integrity: sha512-F+3lraja+Ms7n1M2cpWl65N7AYx4sPocRW454H5HlSGabYMfuFOUxw8IXmEYDkQ38BxtZ0Wd5ZAQj9RF59YJWw==} + available-typed-arrays@1.0.7: resolution: {integrity: sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ==} engines: {node: '>= 0.4'} @@ -1375,6 +1407,9 @@ packages: before-after-hook@3.0.2: resolution: {integrity: sha512-Nik3Sc0ncrMK4UUdXQmAnRtzmNQTAAXmXIopizwZ1W1t8QmfJj+zL4OA2I7XPTPW5z5TDqv4hRo/JzouDJnX3A==} + binary-search@1.3.6: + resolution: {integrity: sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA==} + boolbase@1.0.0: resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} @@ -1459,6 +1494,9 @@ packages: resolution: {integrity: sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==} engines: {node: '>=20.18.1'} + cheminfo-types@1.8.1: + resolution: {integrity: sha512-FRcpVkox+cRovffgqNdDFQ1eUav+i/Vq/CUd1hcfEl2bevntFlzznL+jE8g4twl6ElB7gZjCko6pYpXyMn+6dA==} + chokidar@4.0.3: resolution: {integrity: sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==} engines: {node: '>= 14.16.0'} @@ -1496,6 +1534,10 @@ packages: color-name@1.1.4: 
resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==} + combined-stream@1.0.8: + resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==} + engines: {node: '>= 0.8'} + commander@4.1.1: resolution: {integrity: sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==} engines: {node: '>= 6'} @@ -1503,6 +1545,15 @@ packages: compare-func@2.0.0: resolution: {integrity: sha512-zHig5N+tPWARooBnb0Zx1MFcdfpyJrfTJ3Y5L+IFvUm8rM74hHz66z0gw0x4tijh5CorKkKUCnW82R2vmpeCRA==} + compute-cosine-similarity@1.1.0: + resolution: {integrity: sha512-FXhNx0ILLjGi9Z9+lglLzM12+0uoTnYkHm7GiadXDAr0HGVLm25OivUS1B/LPkbzzvlcXz/1EvWg9ZYyJSdhTw==} + + compute-dot@1.1.0: + resolution: {integrity: sha512-L5Ocet4DdMrXboss13K59OK23GXjiSia7+7Ukc7q4Bl+RVpIXK2W9IHMbWDZkh+JUEvJAwOKRaJDiFUa1LTnJg==} + + compute-l2norm@1.1.0: + resolution: {integrity: sha512-6EHh1Elj90eU28SXi+h2PLnTQvZmkkHWySpoFz+WOlVNLz3DQoC4ISUHSV9n5jMxPHtKGJ01F4uu2PsXBB8sSg==} + concat-map@0.0.1: resolution: {integrity: sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==} @@ -1672,6 +1723,10 @@ packages: resolution: {integrity: sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==} engines: {node: '>= 14'} + delayed-stream@1.0.0: + resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==} + engines: {node: '>=0.4.0'} + destr@2.0.5: resolution: {integrity: sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA==} @@ -1892,6 +1947,10 @@ packages: resolution: {integrity: sha512-e3x3FBvGzeCIHhF+zhK8FZA2vC5uFn6b4HJjegUbIWrDb4mJ7JjTGMJY9VGIbRVpmSwHopNiaJibhjIr+HfLug==} engines: {node: '>=6.0.0'} + event-target-shim@5.0.1: + resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==} + engines: {node: '>=6'} + eventemitter3@4.0.7: resolution: {integrity: sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==} @@ -1933,6 +1992,9 @@ packages: fast-levenshtein@2.0.6: resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==} + fast-uri@3.1.0: + resolution: {integrity: sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==} + fastq@1.19.1: resolution: {integrity: sha512-GwLTyxkCXjXbxqIhTsMI2Nui8huMPtnxg7krajPJAjnEG/iiOS7i+zCtWGZR9G0NBKbXKh6X9m9UIsYX/N6vvQ==} @@ -1944,6 +2006,9 @@ packages: picomatch: optional: true + fft.js@4.0.4: + resolution: {integrity: sha512-f9c00hphOgeQTlDyavwTtu6RiK8AIFjD6+jvXkNkpeQ7rirK3uFWVpalkoS4LAwbdX7mfZ8aoBfFVQX1Re/8aw==} + file-entry-cache@8.0.0: resolution: {integrity: sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==} engines: {node: '>=16.0.0'} @@ -1978,6 +2043,17 @@ packages: resolution: {integrity: sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw==} engines: {node: '>=14'} + form-data-encoder@1.7.2: + resolution: {integrity: sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==} + + form-data@4.0.4: + resolution: {integrity: sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow==} + engines: {node: '>= 6'} + + 
formdata-node@4.4.1: + resolution: {integrity: sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==} + engines: {node: '>= 12.20'} + fsevents@2.3.3: resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==} engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} @@ -2126,6 +2202,9 @@ packages: resolution: {integrity: sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ==} engines: {node: '>=16.17.0'} + humanize-ms@1.2.1: + resolution: {integrity: sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==} + husky@9.1.7: resolution: {integrity: sha512-5gs5ytaNjBrh5Ow3zrvdUUY+0VxIuWVL4i9irt6friV+BqdCfmV11CQTWMiBYWHbXhco+J1kHfTOUkePhCDvMA==} engines: {node: '>=18'} @@ -2178,6 +2257,10 @@ packages: '@types/node': optional: true + install@0.13.0: + resolution: {integrity: sha512-zDml/jzr2PKU9I8J/xyZBQn8rPCAY//UOYNmR01XwNwyfhEWObo2SWfSl1+0tm1u6PhxLwDnfsT/6jB7OUxqFA==} + engines: {node: '>= 0.10'} + internal-slot@1.1.0: resolution: {integrity: sha512-4gd7VpWNQNB4UKKCFFVcp1AVv+FMOgs9NKzjHKusc8jTMhd5eL1NqQqOpE0KzMds804/yHlglp3uxgluOqAPLw==} engines: {node: '>= 0.4'} @@ -2186,6 +2269,9 @@ packages: resolution: {integrity: sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==} engines: {node: '>= 12'} + is-any-array@2.0.1: + resolution: {integrity: sha512-UtilS7hLRu++wb/WBAw9bNuP1Eg04Ivn1vERJck8zJthEvXCBEBpGR/33u/xLKWEQf95803oalHrVDptcAvFdQ==} + is-array-buffer@3.0.5: resolution: {integrity: sha512-DDfANUiiG2wC1qawP66qlTugJeL5HyzMpfr8lLK+jMQirGzNod0B12cFB/9q838Ru27sBwfw78/rdoU7RERz6A==} engines: {node: '>= 0.4'} @@ -2348,6 +2434,10 @@ packages: resolution: {integrity: sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw==} engines: {node: '>=10'} + js-levenshtein@1.1.6: + resolution: {integrity: sha512-X2BB11YZtrRqY4EnQcLX5Rh373zbK4alC1FW7D7MBhL2gtcC17cTnr6DmfHZeS0s2rTHjUTMMHfG7gO8SSdw+g==} + engines: {node: '>=0.10.0'} + js-tiktoken@1.0.20: resolution: {integrity: sha512-Xlaqhhs8VfCd6Sh7a1cFkZHQbYTLCwVJJWiHVxBYzLPxW0XsoxBy1hitmjkdIjD3Aon5BXLHFwU5O8WUx6HH+A==} @@ -2370,6 +2460,9 @@ packages: json-schema-traverse@0.4.1: resolution: {integrity: sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==} + json-schema-traverse@1.0.0: + resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==} + json-schema@0.4.0: resolution: {integrity: sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==} @@ -2408,6 +2501,9 @@ packages: resolution: {integrity: sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==} engines: {node: '>=14'} + linear-sum-assignment@1.0.7: + resolution: {integrity: sha512-jfLoSGwZNyjfY8eK4ayhjfcIu3BfWvP6sWieYzYI3AWldwXVoWEz1gtrQL10v/8YltYLBunqNjeVFXPMUs+MJg==} + lines-and-columns@1.2.4: resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} @@ -2508,10 +2604,18 @@ packages: resolution: {integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==} engines: {node: '>=8.6'} + mime-db@1.52.0: + resolution: {integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==} + engines: {node: '>= 0.6'} + 
mime-db@1.54.0: resolution: {integrity: sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==} engines: {node: '>= 0.6'} + mime-types@2.1.35: + resolution: {integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==} + engines: {node: '>= 0.6'} + mime-types@3.0.1: resolution: {integrity: sha512-xRc4oEhT6eaBpU1XF7AjpOFD+xQmXNB5OVKwp4tqCuBpHLS/ZbBDrc07mYTDqVMg6PfxUjjNp85O6Cd2Z/5HWA==} engines: {node: '>= 0.6'} @@ -2538,6 +2642,24 @@ packages: resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==} engines: {node: '>=16 || 14 >=14.17'} + ml-array-max@1.2.4: + resolution: {integrity: sha512-BlEeg80jI0tW6WaPyGxf5Sa4sqvcyY6lbSn5Vcv44lp1I2GR6AWojfUvLnGTNsIXrZ8uqWmo8VcG1WpkI2ONMQ==} + + ml-array-min@1.2.3: + resolution: {integrity: sha512-VcZ5f3VZ1iihtrGvgfh/q0XlMobG6GQ8FsNyQXD3T+IlstDv85g8kfV0xUG1QPRO/t21aukaJowDzMTc7j5V6Q==} + + ml-array-rescale@1.3.7: + resolution: {integrity: sha512-48NGChTouvEo9KBctDfHC3udWnQKNKEWN0ziELvY3KG25GR5cA8K8wNVzracsqSW1QEkAXjTNx+ycgAv06/1mQ==} + + ml-matrix@6.12.1: + resolution: {integrity: sha512-TJ+8eOFdp+INvzR4zAuwBQJznDUfktMtOB6g/hUcGh3rcyjxbz4Te57Pgri8Q9bhSQ7Zys4IYOGhFdnlgeB6Lw==} + + ml-spectra-processing@14.14.0: + resolution: {integrity: sha512-3+nQBRQwO4e5SwsuF/PJXN+mJOptBLZxaT2l/aqRy45lKHrkauA7qdXLhCZF/VLEXJr3TOEaFlZT2fDfnJBcrA==} + + ml-xsadd@3.0.1: + resolution: {integrity: sha512-Fz2q6dwgzGM8wYKGArTUTZDGa4lQFA2Vi6orjGeTVRy22ZnQFKlJuwS9n8NRviqz1KHAHAzdKJwbnYhdo38uYg==} + mlly@1.7.4: resolution: {integrity: sha512-qmdSIPC4bDJXgZTCR7XosJiNKySV7O215tsPtDN9iEO/7q/76b/ijtgRu/+epFXSJhijtTCCGp3DWS549P3xKw==} @@ -2582,9 +2704,23 @@ packages: resolution: {integrity: sha512-NHDDGYudnvRutt/VhKFlX26IotXe1w0cmkDm6JGquh5bz/bDTw0LufSmH/GxTjEdpHEO+bVKFTwdrcGa/9XlKQ==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + node-domexception@1.0.0: + resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} + engines: {node: '>=10.5.0'} + deprecated: Use your platform's native DOMException instead + node-fetch-native@1.6.7: resolution: {integrity: sha512-g9yhqoedzIUm0nTnTqAQvueMPVOuIY16bqgAJJC8XOOubYFNwz6IER9qs0Gq2Xd0+CecCKFjtdDTMA4u4xG06Q==} + node-fetch@2.7.0: + resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} + engines: {node: 4.x || >=6.0.0} + peerDependencies: + encoding: ^0.1.0 + peerDependenciesMeta: + encoding: + optional: true + normalize-package-data@6.0.2: resolution: {integrity: sha512-V6gygoYb/5EmNI+MEGrWkC+e6+Rr7mTmfHrxDbLzxQogBkgzo76rkok0Am6thgSF7Mv2nLOajAJj5vDJZEFn7g==} engines: {node: ^16.14.0 || >=18.0.0} @@ -2644,6 +2780,18 @@ packages: resolution: {integrity: sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA==} engines: {node: '>=18'} + openai@4.104.0: + resolution: {integrity: sha512-p99EFNsA/yX6UhVO93f5kJsDRLAg+CTA2RBqdHK4RtK8u5IJw32Hyb2dTGKbnnFmnuoBv5r7Z2CURI9sGZpSuA==} + hasBin: true + peerDependencies: + ws: ^8.18.0 + zod: ^3.23.8 + peerDependenciesMeta: + ws: + optional: true + zod: + optional: true + openai@5.10.2: resolution: {integrity: sha512-n+vi74LzHtvlKcDPn9aApgELGiu5CwhaLG40zxLTlFQdoSJCLACORIPC2uVQ3JEYAbqapM+XyRKFy2Thej7bIw==} hasBin: true @@ -2893,6 +3041,10 @@ packages: resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} engines: {node: 
'>=0.10.0'} + require-from-string@2.0.2: + resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==} + engines: {node: '>=0.10.0'} + require-in-the-middle@7.5.2: resolution: {integrity: sha512-gAZ+kLqBdHarXB64XpAe2VCjB7rIRv+mU8tfRWziHRJ5umKsIHN2tLLv6EtMw7WCdP19S0ERVMldNvxYCHnhSQ==} engines: {node: '>=8.6.0'} @@ -3178,6 +3330,9 @@ packages: resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==} engines: {node: '>=8.0'} + tr46@0.0.3: + resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} + tr46@1.0.1: resolution: {integrity: sha512-dTpowEjclQ7Kgx5SdBkqRzVhERQXov8/l9Ft9dVM9fmg0W0KQSVaXX9T4i6twCPNtYiZM53lpSSUAwJbFPOHxA==} @@ -3286,6 +3441,9 @@ packages: resolution: {integrity: sha512-nWJ91DjeOkej/TA8pXQ3myruKpKEYgqvpw9lz4OPHj/NWFNluYrjbz9j01CJ8yKQd2g4jFoOkINCTW2I5LEEyw==} engines: {node: '>= 0.4'} + undici-types@5.26.5: + resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} + undici-types@6.21.0: resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==} @@ -3331,6 +3489,12 @@ packages: validate-npm-package-license@3.0.4: resolution: {integrity: sha512-DpKm2Ui/xN7/HQKCtpZxoRWBhZ9Z0kqtygG8XCgNQ8ZlDnxuQmWhj566j8fN4Cu3/JmbhsDo7fcAJq4s9h27Ew==} + validate.io-array@1.0.6: + resolution: {integrity: sha512-DeOy7CnPEziggrOO5CZhVKJw6S3Yi7e9e65R1Nl/RTN1vTQKnzjfvks0/8kQ40FP/dsjRAOd4hxmJ7uLa6vxkg==} + + validate.io-function@1.0.2: + resolution: {integrity: sha512-LlFybRJEriSuBnUhQyG5bwglhh50EpTL2ul23MPIuR1odjO7XaMLFV8vHGwp7AZciFxtYOeiSCT5st+XSPONiQ==} + vite-node@3.2.4: resolution: {integrity: sha512-EbKSKh+bh1E1IFxeO0pg1n4dvoOTt0UDiXMd/qn++r98+jPO1xtJilvXldeuQ8giIB5IkpjCgMleHMNEsGH6pg==} engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} @@ -3404,6 +3568,13 @@ packages: jsdom: optional: true + web-streams-polyfill@4.0.0-beta.3: + resolution: {integrity: sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==} + engines: {node: '>= 14'} + + webidl-conversions@3.0.1: + resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==} + webidl-conversions@4.0.2: resolution: {integrity: sha512-YQ+BmxuTgd6UXZW3+ICGfyqRyHXVlD5GtQr5+qjiNW7bF0cqrzX500HVXPBOvgXb5YnzDd+h0zqyv61KUD7+Sg==} @@ -3419,6 +3590,9 @@ packages: resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==} engines: {node: '>=18'} + whatwg-url@5.0.0: + resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==} + whatwg-url@7.1.0: resolution: {integrity: sha512-WUu7Rg1DroM7oQvGWfOiAK21n74Gg+T4elXEQYkOhtyLeWiJFoOGLXPKI/9gzIie9CtwVLm8wtw6YJdKyxSjeg==} @@ -4402,6 +4576,15 @@ snapshots: '@types/mustache@4.2.6': {} + '@types/node-fetch@2.6.13': + dependencies: + '@types/node': 24.3.0 + form-data: 4.0.4 + + '@types/node@18.19.124': + dependencies: + undici-types: 5.26.5 + '@types/node@20.19.11': dependencies: undici-types: 6.21.0 @@ -4605,6 +4788,10 @@ snapshots: loupe: 3.2.0 tinyrainbow: 2.0.0 + abort-controller@3.0.0: + dependencies: + event-target-shim: 5.0.1 + acorn-import-attributes@1.9.5(acorn@8.15.0): dependencies: acorn: 8.15.0 @@ -4619,6 +4806,10 @@ snapshots: agent-base@7.1.4: {} + agentkeepalive@4.6.0: + 
dependencies: + humanize-ms: 1.2.1 + ai@5.0.23(zod@3.25.76): dependencies: '@ai-sdk/gateway': 1.0.12(zod@3.25.76) @@ -4634,6 +4825,13 @@ snapshots: json-schema-traverse: 0.4.1 uri-js: 4.4.1 + ajv@8.17.1: + dependencies: + fast-deep-equal: 3.1.3 + fast-uri: 3.1.0 + json-schema-traverse: 1.0.0 + require-from-string: 2.0.2 + ansi-escapes@4.3.2: dependencies: type-fest: 0.21.3 @@ -4718,6 +4916,23 @@ snapshots: dependencies: retry: 0.13.1 + asynckit@0.4.0: {} + + autoevals@0.0.131: + dependencies: + ajv: 8.17.1 + compute-cosine-similarity: 1.1.0 + js-levenshtein: 1.1.6 + js-yaml: 4.1.0 + linear-sum-assignment: 1.0.7 + mustache: 4.2.0 + openai: 4.104.0(zod@3.25.76) + zod: 3.25.76 + zod-to-json-schema: 3.24.6(zod@3.25.76) + transitivePeerDependencies: + - encoding + - ws + available-typed-arrays@1.0.7: dependencies: possible-typed-array-names: 1.1.0 @@ -4730,6 +4945,8 @@ snapshots: before-after-hook@3.0.2: {} + binary-search@1.3.6: {} + boolbase@1.0.0: {} brace-expansion@1.1.12: @@ -4838,6 +5055,8 @@ snapshots: undici: 7.13.0 whatwg-mimetype: 4.0.0 + cheminfo-types@1.8.1: {} + chokidar@4.0.3: dependencies: readdirp: 4.1.2 @@ -4870,6 +5089,10 @@ snapshots: color-name@1.1.4: {} + combined-stream@1.0.8: + dependencies: + delayed-stream: 1.0.0 + commander@4.1.1: {} compare-func@2.0.0: @@ -4877,6 +5100,23 @@ snapshots: array-ify: 1.0.0 dot-prop: 5.3.0 + compute-cosine-similarity@1.1.0: + dependencies: + compute-dot: 1.1.0 + compute-l2norm: 1.1.0 + validate.io-array: 1.0.6 + validate.io-function: 1.0.2 + + compute-dot@1.1.0: + dependencies: + validate.io-array: 1.0.6 + validate.io-function: 1.0.2 + + compute-l2norm@1.1.0: + dependencies: + validate.io-array: 1.0.6 + validate.io-function: 1.0.2 + concat-map@0.0.1: {} concat-stream@2.0.0: @@ -5053,6 +5293,8 @@ snapshots: escodegen: 2.1.0 esprima: 4.0.1 + delayed-stream@1.0.0: {} + destr@2.0.5: {} detect-indent@7.0.1: {} @@ -5371,6 +5613,8 @@ snapshots: eta@3.5.0: {} + event-target-shim@5.0.1: {} + eventemitter3@4.0.7: {} eventsource-parser@3.0.5: {} @@ -5415,6 +5659,8 @@ snapshots: fast-levenshtein@2.0.6: {} + fast-uri@3.1.0: {} + fastq@1.19.1: dependencies: reusify: 1.1.0 @@ -5427,6 +5673,8 @@ snapshots: optionalDependencies: picomatch: 4.0.3 + fft.js@4.0.4: {} + file-entry-cache@8.0.0: dependencies: flat-cache: 4.0.1 @@ -5464,6 +5712,21 @@ snapshots: cross-spawn: 7.0.6 signal-exit: 4.1.0 + form-data-encoder@1.7.2: {} + + form-data@4.0.4: + dependencies: + asynckit: 0.4.0 + combined-stream: 1.0.8 + es-set-tostringtag: 2.1.0 + hasown: 2.0.2 + mime-types: 2.1.35 + + formdata-node@4.4.1: + dependencies: + node-domexception: 1.0.0 + web-streams-polyfill: 4.0.0-beta.3 + fsevents@2.3.3: optional: true @@ -5648,6 +5911,10 @@ snapshots: human-signals@5.0.0: {} + humanize-ms@1.2.1: + dependencies: + ms: 2.1.3 + husky@9.1.7: {} iconv-lite@0.4.24: @@ -5694,6 +5961,8 @@ snapshots: optionalDependencies: '@types/node': 24.3.0 + install@0.13.0: {} + internal-slot@1.1.0: dependencies: es-errors: 1.3.0 @@ -5705,6 +5974,8 @@ snapshots: jsbn: 1.1.0 sprintf-js: 1.1.3 + is-any-array@2.0.1: {} + is-array-buffer@3.0.5: dependencies: call-bind: 1.0.8 @@ -5860,6 +6131,8 @@ snapshots: joycon@3.1.1: {} + js-levenshtein@1.1.6: {} + js-tiktoken@1.0.20: dependencies: base64-js: 1.5.1 @@ -5878,6 +6151,8 @@ snapshots: json-schema-traverse@0.4.1: {} + json-schema-traverse@1.0.0: {} + json-schema@0.4.0: {} json-stable-stringify-without-jsonify@1.0.1: {} @@ -5912,6 +6187,13 @@ snapshots: lilconfig@3.1.3: {} + linear-sum-assignment@1.0.7: + dependencies: + cheminfo-types: 1.8.1 + install: 
0.13.0 + ml-matrix: 6.12.1 + ml-spectra-processing: 14.14.0 + lines-and-columns@1.2.4: {} linkify-it@5.0.0: @@ -5996,8 +6278,14 @@ snapshots: braces: 3.0.3 picomatch: 2.3.1 + mime-db@1.52.0: {} + mime-db@1.54.0: {} + mime-types@2.1.35: + dependencies: + mime-db: 1.52.0 + mime-types@3.0.1: dependencies: mime-db: 1.54.0 @@ -6018,6 +6306,36 @@ snapshots: minipass@7.1.2: {} + ml-array-max@1.2.4: + dependencies: + is-any-array: 2.0.1 + + ml-array-min@1.2.3: + dependencies: + is-any-array: 2.0.1 + + ml-array-rescale@1.3.7: + dependencies: + is-any-array: 2.0.1 + ml-array-max: 1.2.4 + ml-array-min: 1.2.3 + + ml-matrix@6.12.1: + dependencies: + is-any-array: 2.0.1 + ml-array-rescale: 1.3.7 + + ml-spectra-processing@14.14.0: + dependencies: + binary-search: 1.3.6 + cheminfo-types: 1.8.1 + fft.js: 4.0.4 + is-any-array: 2.0.1 + ml-matrix: 6.12.1 + ml-xsadd: 3.0.1 + + ml-xsadd@3.0.1: {} + mlly@1.7.4: dependencies: acorn: 8.15.0 @@ -6053,8 +6371,14 @@ snapshots: dependencies: type-fest: 2.19.0 + node-domexception@1.0.0: {} + node-fetch-native@1.6.7: {} + node-fetch@2.7.0: + dependencies: + whatwg-url: 5.0.0 + normalize-package-data@6.0.2: dependencies: hosted-git-info: 7.0.2 @@ -6129,6 +6453,20 @@ snapshots: is-inside-container: 1.0.0 wsl-utils: 0.1.0 + openai@4.104.0(zod@3.25.76): + dependencies: + '@types/node': 18.19.124 + '@types/node-fetch': 2.6.13 + abort-controller: 3.0.0 + agentkeepalive: 4.6.0 + form-data-encoder: 1.7.2 + formdata-node: 4.4.1 + node-fetch: 2.7.0 + optionalDependencies: + zod: 3.25.76 + transitivePeerDependencies: + - encoding + openai@5.10.2(zod@3.25.76): optionalDependencies: zod: 3.25.76 @@ -6429,6 +6767,8 @@ snapshots: require-directory@2.1.1: {} + require-from-string@2.0.2: {} + require-in-the-middle@7.5.2: dependencies: debug: 4.4.1 @@ -6751,6 +7091,8 @@ snapshots: dependencies: is-number: 7.0.0 + tr46@0.0.3: {} + tr46@1.0.1: dependencies: punycode: 2.3.1 @@ -6877,6 +7219,8 @@ snapshots: has-symbols: 1.1.0 which-boxed-primitive: 1.1.1 + undici-types@5.26.5: {} + undici-types@6.21.0: {} undici-types@7.10.0: {} @@ -6908,6 +7252,10 @@ snapshots: spdx-correct: 3.2.0 spdx-expression-parse: 3.0.1 + validate.io-array@1.0.6: {} + + validate.io-function@1.0.2: {} + vite-node@3.2.4(@types/node@24.3.0)(jiti@2.5.1)(tsx@4.20.3)(yaml@2.8.0): dependencies: cac: 6.7.14 @@ -6986,6 +7334,10 @@ snapshots: - tsx - yaml + web-streams-polyfill@4.0.0-beta.3: {} + + webidl-conversions@3.0.1: {} + webidl-conversions@4.0.2: {} whatwg-encoding@3.1.1: @@ -6996,6 +7348,11 @@ snapshots: whatwg-mimetype@4.0.0: {} + whatwg-url@5.0.0: + dependencies: + tr46: 0.0.3 + webidl-conversions: 3.0.1 + whatwg-url@7.1.0: dependencies: lodash.sortby: 4.7.0 diff --git a/tests/e2e/experiments.e2e.test.ts b/tests/e2e/experiments.e2e.test.ts new file mode 100644 index 00000000..b31c42c5 --- /dev/null +++ b/tests/e2e/experiments.e2e.test.ts @@ -0,0 +1,1072 @@ +import { + Evaluator, + ExperimentTask, + LangfuseClient, + RunEvaluator, + createEvaluatorFromAutoevals, +} from "@langfuse/client"; +import { observeOpenAI } from "@langfuse/openai"; +import { Factuality, Levenshtein } from "autoevals"; +import { nanoid } from "nanoid"; +import OpenAI from "openai"; +import { describe, it, afterEach, beforeEach, expect } from "vitest"; + +import { + setupServerTestEnvironment, + teardownServerTestEnvironment, + waitForServerIngestion, + type ServerTestEnvironment, +} from "./helpers/serverSetup.js"; + +describe("Langfuse Datasets E2E", () => { + let langfuse: LangfuseClient; + let testEnv: ServerTestEnvironment; + + const 
dataset = [ + { + input: "Germany", + expectedOutput: "Berlin", + }, + { + input: "France", + expectedOutput: "Paris", + }, + { + input: "Spain", + expectedOutput: "Madrid", + }, + ]; + + const task: ExperimentTask = async (params) => { + const client = observeOpenAI(new OpenAI()); + + const response = await client.chat.completions.create({ + model: "gpt-4.1", + messages: [ + { + role: "user", + content: `What is the capital of ${params.input}? Be cheeky sometimes in your answer and give the unofficial one. Respond in one word.`, + }, + ], + }); + + return response.choices[0].message.content; + }; + + const factualityEvaluator: Evaluator = async (params) => { + const response = await new OpenAI().chat.completions.parse({ + model: "gpt-4.1", + messages: [ + { + role: "user", + content: `Rate the correctness of this sentence: The capital of ${params.input} is ${params.output}`, + }, + ], + response_format: { + type: "json_schema", + json_schema: { + name: "score", + description: + "score between 0 to 1 where 0 is false and 1 is correct.", + schema: { + $schema: "http://json-schema.org/draft-04/schema#", + type: "object", + properties: { + score: { + type: "integer", + }, + reasoning: { + type: "string", + }, + }, + required: ["score", "reasoning"], + }, + }, + }, + }); + + const parsed = JSON.parse(response.choices[0].message.content!); + + return [ + { + name: "manual-factuality", + value: parsed.score, + comment: parsed.reasoning, + metadata: { reasoning: parsed.reasoning }, + }, + ]; + }; + + const levenshteinAverageRunEvaluator: RunEvaluator = async ({ + itemResults, + }) => { + const average = itemResults + .map((result) => + result.evaluations.filter((e) => e.name === "Levenshtein"), + ) + .flat() + .reduce((acc, curr, _, array) => { + return acc + (curr.value as number) / array.length; + }, 0); + + return { + name: "levenshtein-average", + value: average, + }; + }; + + beforeEach(async () => { + testEnv = await setupServerTestEnvironment(); + langfuse = new LangfuseClient(); + }); + + afterEach(async () => { + await teardownServerTestEnvironment(testEnv); + await langfuse.flush(); + }); + + it("should run an experiment on local dataset", async () => { + const result = await langfuse.experiment.run({ + name: "Euro capitals", + description: "Country capital experiment", + data: dataset, + task, + evaluators: [ + createEvaluatorFromAutoevals(Factuality), + createEvaluatorFromAutoevals(Levenshtein), + factualityEvaluator, + ], + runEvaluators: [levenshteinAverageRunEvaluator], + }); + + console.log(await result.format()); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + // Validate basic result structure + expect(result.itemResults).toHaveLength(3); + expect(result.runEvaluations).toHaveLength(1); + expect(result.runEvaluations[0]).toMatchObject({ + name: "levenshtein-average", + value: expect.any(Number), + }); + // Should have generated runName (experiment name + timestamp) + expect(result.runName).toMatch( + /^Euro capitals - \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/, + ); + // No datasetRunId for local datasets + expect(result.datasetRunId).toBeUndefined(); + + // Validate item results structure + result.itemResults.forEach((itemResult, index) => { + expect(itemResult).toMatchObject({ + output: expect.any(String), + evaluations: expect.arrayContaining([ + expect.objectContaining({ + name: "Factuality", + value: expect.any(Number), + }), + expect.objectContaining({ + name: "Levenshtein", + value: expect.any(Number), + }), + 
expect.objectContaining({ + name: "manual-factuality", + value: expect.any(Number), + }), + ]), + traceId: expect.any(String), + }); + + // Should have 3 evaluations per item + expect(itemResult.evaluations).toHaveLength(3); + // No datasetRunId for local datasets + expect(itemResult.datasetRunId).toBeUndefined(); + }); + }); + + it("should run an experiment on a langfuse dataset", async () => { + // create remote dataset + const datasetName = "euro-capitals-" + nanoid(); + await langfuse.api.datasets.create({ + name: datasetName, + description: "Collection of euro countries and capitals", + }); + + // create remote dataset items + await Promise.all( + dataset.map((item) => + langfuse.api.datasetItems.create({ datasetName, ...item }), + ), + ); + + const fetchedDataset = await langfuse.dataset.get(datasetName); + + const experimentName = "Euro capitals on LF dataset"; + const result = await fetchedDataset.runExperiment({ + name: experimentName, + description: "Country capital experiment", + task, + evaluators: [ + createEvaluatorFromAutoevals(Factuality), + createEvaluatorFromAutoevals(Levenshtein), + factualityEvaluator, + ], + runEvaluators: [levenshteinAverageRunEvaluator], + }); + + console.log(await result.format()); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + // Validate basic result structure + expect(result.itemResults).toHaveLength(3); + expect(result.runEvaluations).toHaveLength(1); + expect(result.runEvaluations[0]).toMatchObject({ + name: "levenshtein-average", + value: expect.any(Number), + }); + // Should have generated runName (experiment name + timestamp) + expect(result.runName).toMatch( + /^Euro capitals on LF dataset - \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/, + ); + expect(result.datasetRunId).toBeDefined(); + + // Validate item results structure + result.itemResults.forEach((itemResult, index) => { + expect(itemResult).toMatchObject({ + output: expect.any(String), + evaluations: expect.arrayContaining([ + expect.objectContaining({ + name: "Factuality", + value: expect.any(Number), + }), + expect.objectContaining({ + name: "Levenshtein", + value: expect.any(Number), + }), + expect.objectContaining({ + name: "manual-factuality", + value: expect.any(Number), + }), + ]), + traceId: expect.any(String), + datasetRunId: expect.any(String), + }); + + // Should have 3 evaluations per item + expect(itemResult.evaluations).toHaveLength(3); + }); + + // Fetch dataset run from API and validate against database + const datasetRun = await langfuse.api.datasets.getRun( + datasetName, + result.runName, + ); + + expect(datasetRun).toBeDefined(); + expect(datasetRun).toMatchObject({ + name: result.runName, + description: "Country capital experiment", + datasetId: fetchedDataset.id, + datasetName: datasetName, + }); + + // Validate dataset run items + expect(datasetRun.datasetRunItems).toHaveLength(3); + + // Each run item should correspond to one of our experiment results + result.itemResults.forEach((itemResult) => { + const correspondingRunItem = datasetRun.datasetRunItems.find( + (runItem) => runItem.traceId === itemResult.traceId, + ); + + expect(correspondingRunItem).toBeDefined(); + expect(correspondingRunItem).toMatchObject({ + traceId: itemResult.traceId, + datasetItemId: expect.any(String), + }); + }); + + // Validate that traces contain the expected scores + // Each trace should have 3 item-level evaluations + 1 run-level evaluation + const expectedTraceIds = result.itemResults.map((r) => r.traceId); + 
expect(expectedTraceIds).toHaveLength(3); + expectedTraceIds.forEach((traceId) => { + expect(traceId).toMatch(/^[a-f0-9]{32}$/); + }); + }); + + it("should support custom runName parameter", async () => { + // create remote dataset + const datasetName = "custom-run-name-test-" + nanoid(); + await langfuse.api.datasets.create({ + name: datasetName, + description: "Test custom run names", + }); + + // create remote dataset items + await Promise.all( + dataset + .slice(0, 2) + .map((item) => + langfuse.api.datasetItems.create({ datasetName, ...item }), + ), + ); + + const fetchedDataset = await langfuse.dataset.get(datasetName); + + const customRunName = "Custom Run Name " + nanoid(); + const result = await fetchedDataset.runExperiment({ + name: "Test Experiment", + runName: customRunName, + description: "Testing custom run name", + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + // Should use the custom run name exactly + expect(result.runName).toBe(customRunName); + expect(result.datasetRunId).toBeDefined(); + + // Fetch dataset run and verify it has the custom name + const datasetRun = await langfuse.api.datasets.getRun( + datasetName, + customRunName, + ); + + expect(datasetRun).toBeDefined(); + expect(datasetRun).toMatchObject({ + name: customRunName, + description: "Testing custom run name", + datasetId: fetchedDataset.id, + datasetName: datasetName, + }); + }); + + it("should support custom runName with local datasets", async () => { + const customRunName = "Local Custom Run " + nanoid(); + const result = await langfuse.experiment.run({ + name: "Local Test Experiment", + runName: customRunName, + description: "Testing custom run name with local data", + data: dataset.slice(0, 2), + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Should use the custom run name exactly + expect(result.runName).toBe(customRunName); + expect(result.itemResults).toHaveLength(2); + // No dataset run for local datasets + expect(result.datasetRunId).toBeUndefined(); + }); + + // Error Handling Tests + describe("Error Handling", () => { + it("should handle evaluator failures gracefully", async () => { + const failingEvaluator: Evaluator = async () => { + throw new Error("Evaluator failed"); + }; + + const result = await langfuse.experiment.run({ + name: "Error test", + description: "Test evaluator error handling", + data: dataset.slice(0, 1), // Just one item + task, + evaluators: [ + createEvaluatorFromAutoevals(Factuality), // This should work + failingEvaluator, // This should fail + ], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Should still complete the experiment + expect(result.itemResults).toHaveLength(1); + expect(result.itemResults[0].evaluations).toHaveLength(1); // Only the working evaluator + expect(result.itemResults[0].evaluations[0].name).toBe("Factuality"); + }); + + it("should handle task failures gracefully", async () => { + const failingTask: ExperimentTask = async () => { + throw new Error("Task failed"); + }; + + // The experiment should handle the task failure gracefully by skipping the failed item + const result = await langfuse.experiment.run({ + name: "Task error test", + description: "Test task error handling", + data: dataset.slice(0, 1), + task: failingTask, + evaluators: 
[createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Should complete experiment but skip the failed item + expect(result.itemResults).toHaveLength(0); + expect(result.runEvaluations).toHaveLength(0); + }); + + it("should handle mixed task success and failures", async () => { + const mixedTask: ExperimentTask = async ({ input }) => { + if (input === "Germany") { + throw new Error("Task failed for Germany"); + } + return `Capital of ${input}`; + }; + + const result = await langfuse.experiment.run({ + name: "Mixed task results test", + description: "Test mixed success/failure handling", + data: dataset.slice(0, 2), // Germany and France + task: mixedTask, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Should complete experiment with only successful items + expect(result.itemResults).toHaveLength(1); // Only France should succeed + expect(result.itemResults[0].output).toContain("France"); + expect(result.itemResults[0].evaluations).toHaveLength(1); + }); + + it("should handle run evaluator failures", async () => { + const failingRunEvaluator: RunEvaluator = async () => { + throw new Error("Run evaluator failed"); + }; + + const result = await langfuse.experiment.run({ + name: "Run evaluator error test", + description: "Test run evaluator error handling", + data: dataset.slice(0, 1), + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + runEvaluators: [failingRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Should complete experiment but run evaluations should be empty + expect(result.itemResults).toHaveLength(1); + expect(result.runEvaluations).toHaveLength(0); + }); + }); + + // Edge Cases Tests + describe("Edge Cases", () => { + it("should handle empty dataset", async () => { + const result = await langfuse.experiment.run({ + name: "Empty dataset test", + description: "Test empty dataset handling", + data: [], + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + runEvaluators: [levenshteinAverageRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(500); + + expect(result.itemResults).toHaveLength(0); + expect(result.runEvaluations).toHaveLength(1); // Run evaluators will still execute with empty data + expect(await result.format()).toContain("No experiment results"); + }); + + it("should handle dataset with missing fields", async () => { + const incompleteDataset = [ + { input: "Germany" }, // Missing expectedOutput + { expectedOutput: "Paris" }, // Missing input + { input: "Spain", expectedOutput: "Madrid" }, // Complete + ]; + + const result = await langfuse.experiment.run({ + name: "Incomplete data test", + description: "Test incomplete dataset handling", + data: incompleteDataset, + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(3); + // Should handle missing fields gracefully + result.itemResults.forEach((item) => { + expect(item.traceId).toBeDefined(); + expect(item.output).toBeDefined(); + }); + }); + + it("should handle very large dataset", async () => { + // Create a larger dataset for performance testing + const largeDataset = Array.from({ length: 20 }, (_, i) => ({ + input: `Country ${i}`, + 
expectedOutput: `Capital ${i}`, + })); + + const result = await langfuse.experiment.run({ + name: "Large dataset test", + description: "Test performance with larger dataset", + data: largeDataset, + task: async ({ input }) => `Output for ${input}`, + evaluators: [ + async () => ({ + name: "simple-eval", + value: Math.random(), + }), + ], + maxConcurrency: 5, // Test concurrency limit + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(3000); + + expect(result.itemResults).toHaveLength(20); + result.itemResults.forEach((item) => { + expect(item.evaluations).toHaveLength(1); + expect(item.traceId).toBeDefined(); + }); + }, 30000); + }); + + // New Features Tests + describe("New Features", () => { + it("should support evaluators returning single evaluation", async () => { + const singleEvaluationEvaluator: Evaluator = async ({ + input, + output, + }) => { + // Return single evaluation instead of array + return { + name: "single-eval", + value: input === "Germany" ? 1 : 0, + comment: `Single evaluation for ${input}`, + }; + }; + + const result = await langfuse.experiment.run({ + name: "Single evaluation test", + description: "Test single evaluation return", + data: dataset.slice(0, 2), + task, + evaluators: [singleEvaluationEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(2); + result.itemResults.forEach((item) => { + expect(item.evaluations).toHaveLength(1); + expect(item.evaluations[0]).toMatchObject({ + name: "single-eval", + value: expect.any(Number), + comment: expect.stringContaining("Single evaluation for"), + }); + }); + }); + + it("should support run evaluators returning single evaluation", async () => { + const singleRunEvaluator: RunEvaluator = async ({ itemResults }) => { + // Return single evaluation instead of array + return { + name: "single-run-eval", + value: itemResults.length, + comment: `Processed ${itemResults.length} items`, + }; + }; + + const result = await langfuse.experiment.run({ + name: "Single run evaluation test", + description: "Test single run evaluation return", + data: dataset.slice(0, 2), + task, + runEvaluators: [singleRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.runEvaluations).toHaveLength(1); + expect(result.runEvaluations[0]).toMatchObject({ + name: "single-run-eval", + value: 2, + comment: "Processed 2 items", + }); + }); + + it("should support format with includeItemResults option", async () => { + const result = await langfuse.experiment.run({ + name: "Format options test", + description: "Test format options", + data: dataset, + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + runEvaluators: [levenshteinAverageRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Test with includeItemResults: false (default) + const compactOutput = await result.format(); + expect(compactOutput).toContain("Individual Results: Hidden"); + expect(compactOutput).toContain( + "Call format({ includeItemResults: true })", + ); + expect(compactOutput).toContain("Format options test"); // Should still show summary + + // Test with includeItemResults: true + const fullOutput = await result.format({ includeItemResults: true }); + expect(fullOutput).toContain("1. Item 1:"); + expect(fullOutput).toContain("2. Item 2:"); + expect(fullOutput).toContain("3. 
Item 3:"); + + // Test default behavior (should be same as false) + const defaultOutput = await result.format(); + expect(defaultOutput).toEqual(compactOutput); + }); + }); + + // Concurrency and Performance Tests + describe("Concurrency and Performance", () => { + it("should respect maxConcurrency parameter", async () => { + let concurrentCount = 0; + let maxConcurrentReached = 0; + + const slowTask: ExperimentTask = async ({ input }) => { + concurrentCount++; + maxConcurrentReached = Math.max(maxConcurrentReached, concurrentCount); + + // Simulate slow operation + await new Promise((resolve) => setTimeout(resolve, 100)); + + concurrentCount--; + return `Processed ${input}`; + }; + + const testData = Array.from({ length: 10 }, (_, i) => ({ + input: `Item ${i}`, + expectedOutput: `Expected ${i}`, + })); + + const result = await langfuse.experiment.run({ + name: "Concurrency test", + description: "Test maxConcurrency parameter", + data: testData, + task: slowTask, + maxConcurrency: 3, + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + expect(result.itemResults).toHaveLength(10); + expect(maxConcurrentReached).toBeLessThanOrEqual(3); + }, 15000); + + it("should handle evaluators with different execution times", async () => { + const fastEvaluator: Evaluator = async () => ({ + name: "fast-eval", + value: 1, + }); + + const slowEvaluator: Evaluator = async () => { + await new Promise((resolve) => setTimeout(resolve, 200)); + return { + name: "slow-eval", + value: 0.5, + }; + }; + + const start = Date.now(); + const result = await langfuse.experiment.run({ + name: "Mixed speed evaluators test", + description: "Test evaluators with different execution times", + data: dataset.slice(0, 2), + task, + evaluators: [fastEvaluator, slowEvaluator], + }); + const duration = Date.now() - start; + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(2); + result.itemResults.forEach((item) => { + expect(item.evaluations).toHaveLength(2); + expect(item.evaluations.map((e) => e.name)).toContain("fast-eval"); + expect(item.evaluations.map((e) => e.name)).toContain("slow-eval"); + }); + + // Should complete in reasonable time (parallel execution) + expect(duration).toBeLessThan(2000); // Should be much faster than 400ms * 2 items sequentially + }, 10000); + }); + + // Data Persistence and API Integration Tests + describe("Data Persistence and API Integration", () => { + it("should persist scores correctly", async () => { + const datasetName = "score-persistence-test-" + nanoid(); + await langfuse.api.datasets.create({ name: datasetName }); + + const testItem = { + input: "Test input", + expectedOutput: "Test output", + }; + + const createdItem = await langfuse.api.datasetItems.create({ + datasetName, + ...testItem, + }); + + const fetchedDataset = await langfuse.dataset.get(datasetName); + + const testEvaluator: Evaluator = async () => ({ + name: "persistence-test-eval", + value: 0.85, + comment: "Test evaluation for persistence", + }); + + const testRunEvaluator: RunEvaluator = async () => ({ + name: "persistence-test-run-eval", + value: 0.9, + comment: "Test run evaluation for persistence", + }); + + const result = await fetchedDataset.runExperiment({ + name: "Score persistence test", + description: "Test score persistence", + task, + evaluators: [testEvaluator], + runEvaluators: [testRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(3000); + + // 
Validate scores are persisted + const datasetRun = await langfuse.api.datasets.getRun( + datasetName, + result.runName, + ); + + expect(datasetRun).toBeDefined(); + expect(datasetRun.datasetRunItems).toHaveLength(1); + + // Validate item-level scores are linked to traces + const runItem = datasetRun.datasetRunItems[0]; + expect(runItem.traceId).toBe(result.itemResults[0].traceId); + }); + + it("should handle multiple experiments on same dataset", async () => { + const datasetName = "multi-experiment-test-" + nanoid(); + await langfuse.api.datasets.create({ name: datasetName }); + + await Promise.all( + dataset + .slice(0, 2) + .map((item) => + langfuse.api.datasetItems.create({ datasetName, ...item }), + ), + ); + + const fetchedDataset = await langfuse.dataset.get(datasetName); + + // Run first experiment + const result1 = await fetchedDataset.runExperiment({ + name: "Experiment 1", + description: "First experiment", + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + // Run second experiment + const result2 = await fetchedDataset.runExperiment({ + name: "Experiment 2", + description: "Second experiment", + task, + evaluators: [createEvaluatorFromAutoevals(Levenshtein)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + // Both experiments should have different run IDs + expect(result1.datasetRunId).toBeDefined(); + expect(result2.datasetRunId).toBeDefined(); + expect(result1.datasetRunId).not.toBe(result2.datasetRunId); + + // Validate both runs exist in database + const run1 = await langfuse.api.datasets.getRun( + datasetName, + result1.runName, + ); + const run2 = await langfuse.api.datasets.getRun( + datasetName, + result2.runName, + ); + + expect(run1).toBeDefined(); + expect(run2).toBeDefined(); + expect(run1.id).not.toBe(run2.id); + }); + + it("should preserve dataset run metadata", async () => { + const datasetName = "metadata-test-" + nanoid(); + await langfuse.api.datasets.create({ name: datasetName }); + + await langfuse.api.datasetItems.create({ + datasetName, + input: "Test", + expectedOutput: "Test output", + }); + + const fetchedDataset = await langfuse.dataset.get(datasetName); + + const result = await fetchedDataset.runExperiment({ + name: "Metadata test experiment", + description: "Testing metadata preservation", + metadata: { testKey: "testValue", experimentVersion: "1.0" }, + task, + evaluators: [ + async () => ({ + name: "test-eval", + value: 1, + metadata: { evaluatorVersion: "2.0" }, + }), + ], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + const datasetRun = await langfuse.api.datasets.getRun( + datasetName, + result.runName, + ); + + expect(datasetRun).toMatchObject({ + name: result.runName, + description: "Testing metadata preservation", + metadata: { testKey: "testValue", experimentVersion: "1.0" }, + }); + }); + }); + + // Different Evaluator Configurations Tests + describe("Different Evaluator Configurations", () => { + it("should work with no evaluators", async () => { + const result = await langfuse.experiment.run({ + name: "No evaluators test", + description: "Test experiment with no evaluators", + data: dataset.slice(0, 2), + task, + evaluators: [], // No evaluators + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(2); + result.itemResults.forEach((item) => { + 
expect(item.evaluations).toHaveLength(0); + expect(item.traceId).toBeDefined(); + expect(item.output).toBeDefined(); + }); + expect(result.runEvaluations).toHaveLength(0); + }); + + it("should work with only run evaluators", async () => { + const onlyRunEvaluator: RunEvaluator = async ({ itemResults }) => ({ + name: "run-only-eval", + value: itemResults.length * 10, + comment: `Run-level evaluation of ${itemResults.length} items`, + }); + + const result = await langfuse.experiment.run({ + name: "Only run evaluators test", + description: "Test with only run evaluators", + data: dataset.slice(0, 3), + task, + evaluators: [], // No item evaluators + runEvaluators: [onlyRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(3); + result.itemResults.forEach((item) => { + expect(item.evaluations).toHaveLength(0); // No item evaluations + expect(item.traceId).toBeDefined(); + }); + + expect(result.runEvaluations).toHaveLength(1); + expect(result.runEvaluations[0]).toMatchObject({ + name: "run-only-eval", + value: 30, // 3 items * 10 + }); + }); + + it("should handle mix of sync and async evaluators", async () => { + const asyncEvaluator: Evaluator = async ({ input }) => { + await new Promise((resolve) => setTimeout(resolve, 50)); + return { + name: "async-eval", + value: input.length / 10, + }; + }; + + // Simulated sync evaluator (still returns Promise per type signature) + const syncEvaluator: Evaluator = async ({ input }) => { + return { + name: "sync-eval", + value: input === "Germany" ? 1 : 0, + }; + }; + + const result = await langfuse.experiment.run({ + name: "Mixed sync/async evaluators test", + description: "Test mix of sync and async evaluators", + data: dataset.slice(0, 2), + task, + evaluators: [asyncEvaluator, syncEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(2); + result.itemResults.forEach((item) => { + expect(item.evaluations).toHaveLength(2); + const evalNames = item.evaluations.map((e) => e.name); + expect(evalNames).toContain("async-eval"); + expect(evalNames).toContain("sync-eval"); + }); + }); + + it("should handle evaluators returning different data types", async () => { + const numberEvaluator: Evaluator = async () => ({ + name: "number-eval", + value: 42, + }); + + const stringEvaluator: Evaluator = async () => ({ + name: "string-eval", + value: "excellent", + }); + + const booleanEvaluator: Evaluator = async () => ({ + name: "boolean-eval", + value: true, + dataType: "BOOLEAN", + }); + + const result = await langfuse.experiment.run({ + name: "Different data types test", + description: "Test evaluators with different return value types", + data: dataset.slice(0, 1), + task, + evaluators: [numberEvaluator, stringEvaluator, booleanEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(1); + const evaluations = result.itemResults[0].evaluations; + expect(evaluations).toHaveLength(3); + + const numberEval = evaluations.find((e) => e.name === "number-eval"); + const stringEval = evaluations.find((e) => e.name === "string-eval"); + const booleanEval = evaluations.find((e) => e.name === "boolean-eval"); + + expect(numberEval?.value).toBe(42); + expect(stringEval?.value).toBe("excellent"); + expect(booleanEval?.value).toBe(true); + }); + + it("should handle complex evaluator metadata and comments", 
async () => { + const complexEvaluator: Evaluator = async ({ + input, + output, + expectedOutput, + }) => [ + { + name: "detailed-eval", + value: 0.85, + comment: `Detailed analysis: input="${input}", output="${output}", expected="${expectedOutput}"`, + metadata: { + inputLength: input?.length || 0, + outputLength: output?.length || 0, + timestamp: new Date().toISOString(), + evaluatorVersion: "1.2.3", + }, + dataType: "NUMERIC" as const, + }, + { + name: "secondary-eval", + value: input === expectedOutput ? "perfect" : "imperfect", + comment: "Secondary evaluation result", + metadata: { secondary: true }, + }, + ]; + + const result = await langfuse.experiment.run({ + name: "Complex evaluator test", + description: "Test evaluators with complex metadata", + data: dataset.slice(0, 1), + task, + evaluators: [complexEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(1); + const evaluations = result.itemResults[0].evaluations; + expect(evaluations).toHaveLength(2); + + const detailedEval = evaluations.find((e) => e.name === "detailed-eval"); + expect(detailedEval).toMatchObject({ + name: "detailed-eval", + value: 0.85, + comment: expect.stringContaining("Detailed analysis"), + metadata: expect.objectContaining({ + inputLength: expect.any(Number), + evaluatorVersion: "1.2.3", + }), + dataType: "NUMERIC", + }); + + const secondaryEval = evaluations.find( + (e) => e.name === "secondary-eval", + ); + expect(secondaryEval).toMatchObject({ + name: "secondary-eval", + value: expect.any(String), + metadata: { secondary: true }, + }); + }); + }); +}); diff --git a/tests/e2e/openai.e2e.test.ts b/tests/e2e/openai.e2e.test.ts index fed19fef..67fb571b 100644 --- a/tests/e2e/openai.e2e.test.ts +++ b/tests/e2e/openai.e2e.test.ts @@ -148,7 +148,7 @@ describe("OpenAI integration E2E tests", () => { expect(content).toBeDefined(); await testEnv.spanProcessor.forceFlush(); - await waitForServerIngestion(2000); + await waitForServerIngestion(4000); const traces = await langfuseClient.api.trace.list({ name: generationName, diff --git a/tests/e2e/tracing.e2e.test.ts b/tests/e2e/tracing.e2e.test.ts index b098ffbe..5ca0851b 100644 --- a/tests/e2e/tracing.e2e.test.ts +++ b/tests/e2e/tracing.e2e.test.ts @@ -1,17 +1,18 @@ -import { describe, it, beforeEach, afterEach } from "vitest"; import { startObservation, startActiveObservation, observe, } from "@langfuse/tracing"; +import { nanoid } from "nanoid"; +import { describe, it, beforeEach, afterEach } from "vitest"; + +import { ServerAssertions } from "./helpers/serverAssertions.js"; import { setupServerTestEnvironment, teardownServerTestEnvironment, waitForServerIngestion, type ServerTestEnvironment, } from "./helpers/serverSetup.js"; -import { ServerAssertions } from "./helpers/serverAssertions.js"; -import { nanoid } from "nanoid"; describe("Server Export E2E Tests", () => { let testEnv: ServerTestEnvironment; From a7940a5ed74c3d763891891e8592d0152b0cedf3 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Wed, 17 Sep 2025 13:16:58 +0200 Subject: [PATCH 2/2] chore: release v4.1.0 --- CHANGELOG.md | 6 ++++++ package.json | 2 +- packages/client/package.json | 2 +- packages/core/package.json | 2 +- packages/langchain/package.json | 2 +- packages/openai/package.json | 2 +- packages/otel/package.json | 2 +- packages/tracing/package.json | 2 +- 8 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md 
index 908b4b2d..905979fa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## [4.1.0](https://github.com/langfuse/langfuse-js/compare/v4.0.1...v4.1.0) (2025-09-17)
+
+### ✨ Features
+
+* **experiments:** add experiment runner ([#604](https://github.com/langfuse/langfuse-js/issues/604)) ([6a247dc](https://github.com/langfuse/langfuse-js/commit/6a247dc70ca5e797fa4e01121d7458fbc6bbceb9))
+
 ## [4.0.1](https://github.com/langfuse/langfuse-js/compare/v4.0.0...v4.0.1) (2025-09-11)
 
 ### 🐛 Bug Fixes
diff --git a/package.json b/package.json
index 086d155c..12bdcfde 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "langfuse-js",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "description": "Langfuse JavaScript / TypeScript SDK",
   "author": "Langfuse",
   "license": "MIT",
diff --git a/packages/client/package.json b/packages/client/package.json
index 3a525d63..469ad079 100644
--- a/packages/client/package.json
+++ b/packages/client/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/client",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "description": "Langfuse API client for universal JavaScript environments",
   "type": "module",
   "sideEffects": false,
diff --git a/packages/core/package.json b/packages/core/package.json
index f28728da..5c7953e5 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/core",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "description": "Core functions and utilities for Langfuse packages",
   "type": "module",
   "sideEffects": false,
diff --git a/packages/langchain/package.json b/packages/langchain/package.json
index 30f7959e..17141d58 100644
--- a/packages/langchain/package.json
+++ b/packages/langchain/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/langchain",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "description": "Langfuse integration for LangChain",
   "type": "module",
   "sideEffects": false,
diff --git a/packages/openai/package.json b/packages/openai/package.json
index a9cfb46d..9f3d7392 100644
--- a/packages/openai/package.json
+++ b/packages/openai/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/openai",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "description": "Langfuse integration for OpenAI SDK",
   "type": "module",
   "sideEffects": false,
diff --git a/packages/otel/package.json b/packages/otel/package.json
index 751fda64..4468ce68 100644
--- a/packages/otel/package.json
+++ b/packages/otel/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/otel",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "author": "Langfuse",
   "license": "MIT",
   "engines": {
diff --git a/packages/tracing/package.json b/packages/tracing/package.json
index a5fa6497..06c095d7 100644
--- a/packages/tracing/package.json
+++ b/packages/tracing/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/tracing",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "author": "Langfuse",
   "license": "MIT",
   "engines": {