From 6a247dc70ca5e797fa4e01121d7458fbc6bbceb9 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Wed, 17 Sep 2025 13:16:16 +0200 Subject: [PATCH 1/2] feat(experiments): add experiment runner (#604) Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- .release-it.json | 1 - eslint.config.mjs | 2 +- package.json | 8 +- packages/client/package.json | 1 + packages/client/src/LangfuseClient.ts | 60 +- packages/client/src/dataset/index.ts | 237 +++- .../src/experiment/ExperimentManager.ts | 723 +++++++++++ packages/client/src/experiment/adapters.ts | 94 ++ packages/client/src/experiment/types.ts | 382 ++++++ packages/client/src/index.ts | 3 + packages/client/src/score/index.ts | 4 + pnpm-lock.yaml | 357 ++++++ tests/e2e/experiments.e2e.test.ts | 1072 +++++++++++++++++ tests/e2e/openai.e2e.test.ts | 2 +- tests/e2e/tracing.e2e.test.ts | 7 +- 15 files changed, 2904 insertions(+), 49 deletions(-) create mode 100644 packages/client/src/experiment/ExperimentManager.ts create mode 100644 packages/client/src/experiment/adapters.ts create mode 100644 packages/client/src/experiment/types.ts create mode 100644 tests/e2e/experiments.e2e.test.ts diff --git a/.release-it.json b/.release-it.json index 67ee0972..f8febbca 100644 --- a/.release-it.json +++ b/.release-it.json @@ -1,6 +1,5 @@ { "git": { - "requireBranch": "main", "requireCleanWorkingDir": true, "requireUpstream": true, "addUntrackedFiles": false, diff --git a/eslint.config.mjs b/eslint.config.mjs index 3a18a8e3..7c025182 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -35,7 +35,7 @@ export default [ ignoreRestSiblings: true, }, ], - "@typescript-eslint/no-explicit-any": "warn", + "@typescript-eslint/no-explicit-any": "off", "@typescript-eslint/no-unnecessary-type-constraint": "error", "prettier/prettier": "error", "no-redeclare": "off", diff --git a/package.json b/package.json index 6abb5e71..086d155c 100644 --- a/package.json +++ b/package.json @@ -2,6 +2,8 @@ "name": "langfuse-js", "version": "4.0.1", "description": "Langfuse JavaScript / TypeScript SDK", + "author": "Langfuse", + "license": "MIT", "private": true, "type": "module", "scripts": { @@ -25,8 +27,6 @@ "nuke": "pnpm clean && rm -rf node_modules && rm -rf packages/*/node_modules && pnpm install", "prepare": "husky" }, - "author": "Langfuse", - "license": "MIT", "devDependencies": { "@ai-sdk/anthropic": "^2", "@ai-sdk/openai": "^2", @@ -46,6 +46,7 @@ "@typescript-eslint/eslint-plugin": "^8.36.0", "@typescript-eslint/parser": "^8.39.0", "ai": "^5", + "autoevals": "^0.0.131", "dotenv": "^17.2.0", "eslint": "^9.32.0", "eslint-config-prettier": "^10.1.8", @@ -67,5 +68,8 @@ }, "engines": { "node": ">=20" + }, + "resolutions": { + "ml-spectra-processing": "14.14.0" } } diff --git a/packages/client/package.json b/packages/client/package.json index 56192e06..3a525d63 100644 --- a/packages/client/package.json +++ b/packages/client/package.json @@ -29,6 +29,7 @@ ], "dependencies": { "@langfuse/core": "workspace:^", + "@langfuse/tracing": "workspace:^", "mustache": "^4.2.0" }, "peerDependencies": { diff --git a/packages/client/src/LangfuseClient.ts b/packages/client/src/LangfuseClient.ts index d817bd42..4750b45a 100644 --- a/packages/client/src/LangfuseClient.ts +++ b/packages/client/src/LangfuseClient.ts @@ -6,6 +6,7 @@ import { } from "@langfuse/core"; import { DatasetManager } from "./dataset/index.js"; +import { ExperimentManager } from "./experiment/ExperimentManager.js"; import { MediaManager } 
from "./media/index.js"; import { PromptManager } from "./prompt/index.js"; import { ScoreManager } from "./score/index.js"; @@ -106,6 +107,62 @@ export class LangfuseClient { */ public media: MediaManager; + /** + * Manager for running experiments on datasets and data items. + * + * The experiment manager provides comprehensive functionality for: + * - Running tasks on datasets or custom data arrays + * - Evaluating outputs with custom or pre-built evaluators + * - Tracking experiment runs with automatic tracing + * - Generating formatted result summaries + * - Integrating with AutoEvals library evaluators + * + * @example Basic experiment execution + * ```typescript + * const langfuse = new LangfuseClient(); + * + * const result = await langfuse.experiment.run({ + * name: "Model Evaluation", + * description: "Testing model performance on Q&A tasks", + * data: [ + * { input: "What is 2+2?", expectedOutput: "4" }, + * { input: "What is the capital of France?", expectedOutput: "Paris" } + * ], + * task: async ({ input }) => { + * // Your model/task implementation + * const response = await myModel.generate(input); + * return response; + * }, + * evaluators: [ + * async ({ output, expectedOutput }) => ({ + * name: "exact_match", + * value: output.trim().toLowerCase() === expectedOutput.toLowerCase() ? 1 : 0 + * }) + * ] + * }); + * + * console.log(await result.format()); + * ``` + * + * @example Using with datasets + * ```typescript + * const dataset = await langfuse.dataset.get("my-test-dataset"); + * const result = await dataset.runExperiment({ + * name: "Production Readiness Test", + * task: myTask, + * evaluators: [accuracyEvaluator, latencyEvaluator], + * runEvaluators: [overallQualityEvaluator] + * }); + * ``` + * + * @see {@link ExperimentManager} for detailed API documentation + * @see {@link ExperimentParams} for configuration options + * @see {@link ExperimentResult} for result structure + * @public + * @since 4.0.0 + */ + public experiment: ExperimentManager; + private baseUrl: string; private projectId: string | null = null; @@ -236,9 +293,10 @@ export class LangfuseClient { }); this.prompt = new PromptManager({ apiClient: this.api }); - this.dataset = new DatasetManager({ apiClient: this.api }); + this.dataset = new DatasetManager({ langfuseClient: this }); this.score = new ScoreManager({ apiClient: this.api }); this.media = new MediaManager({ apiClient: this.api }); + this.experiment = new ExperimentManager({ langfuseClient: this }); // Keep v3 compat by exposing old interface this.getPrompt = this.prompt.get.bind(this.prompt); // keep correct this context for cache access diff --git a/packages/client/src/dataset/index.ts b/packages/client/src/dataset/index.ts index e9127978..dbe9cab4 100644 --- a/packages/client/src/dataset/index.ts +++ b/packages/client/src/dataset/index.ts @@ -1,21 +1,124 @@ -import { - LangfuseAPIClient, - Dataset, - DatasetRunItem, - DatasetItem, -} from "@langfuse/core"; +import { Dataset, DatasetRunItem, DatasetItem } from "@langfuse/core"; import { Span } from "@opentelemetry/api"; +import { ExperimentResult, ExperimentParams } from "../experiment/types.js"; +import { LangfuseClient } from "../LangfuseClient.js"; + +/** + * Function type for running experiments on Langfuse datasets. + * + * This function type is attached to fetched datasets to enable convenient + * experiment execution directly on dataset objects. 
+ * + * @param params - Experiment parameters excluding data (since data comes from the dataset) + * @returns Promise resolving to experiment results + * + * @example + * ```typescript + * const dataset = await langfuse.dataset.get("my-dataset"); + * const result = await dataset.runExperiment({ + * name: "Model Evaluation", + * runName: "Model Evaluation Run 1", // optional + * task: myTask, + * evaluators: [myEvaluator] + * }); + * ``` + * + * @public + * @since 4.0.0 + */ +export type RunExperimentOnDataset = ( + params: Omit>, "data">, +) => Promise>>; + +/** + * Enhanced dataset object with additional methods for linking and experiments. + * + * This type extends the base Dataset with functionality for: + * - Linking dataset items to traces/observations + * - Running experiments directly on the dataset + * + * @example Working with a fetched dataset + * ```typescript + * const dataset = await langfuse.dataset.get("my-evaluation-dataset"); + * + * // Access dataset metadata + * console.log(dataset.name, dataset.description); + * + * // Work with individual items + * for (const item of dataset.items) { + * console.log(item.input, item.expectedOutput); + * + * // Link item to a trace + * await item.link(myObservation, "experiment-run-1"); + * } + * + * // Run experiments on the entire dataset + * const result = await dataset.runExperiment({ + * name: "Model Comparison", + * task: myTask, + * evaluators: [accuracyEvaluator] + * }); + * ``` + * + * @public + * @since 4.0.0 + */ +export type FetchedDataset = Dataset & { + /** Dataset items with additional linking functionality */ + items: (DatasetItem & { link: LinkDatasetItemFunction })[]; + /** Function to run experiments directly on this dataset */ + runExperiment: RunExperimentOnDataset; +}; + /** * Function type for linking dataset items to OpenTelemetry spans. - * This allows dataset items to be associated with specific traces for experiment tracking. * - * @param obj - Object containing the OpenTelemetry span - * @param runName - Name of the dataset run - * @param runArgs - Optional arguments for the dataset run + * This function creates a connection between a dataset item and a trace/observation, + * enabling tracking of which dataset items were used in which experiments or runs. + * This is essential for creating dataset runs and tracking experiment lineage. 
+ * + * @param obj - Object containing the OpenTelemetry span to link to + * @param obj.otelSpan - The OpenTelemetry span from a Langfuse observation + * @param runName - Name of the experiment run for grouping related items + * @param runArgs - Optional configuration for the dataset run + * @param runArgs.description - Description of the experiment run + * @param runArgs.metadata - Additional metadata to attach to the run * @returns Promise that resolves to the created dataset run item * + * @example Basic linking + * ```typescript + * const dataset = await langfuse.dataset.get("my-dataset"); + * const span = startObservation("my-task", { input: "test" }); + * span.update({ output: "result" }); + * span.end(); + * + * // Link the dataset item to this execution + * await dataset.items[0].link( + * { otelSpan: span.otelSpan }, + * "experiment-run-1" + * ); + * ``` + * + * @example Linking with metadata + * ```typescript + * await dataset.items[0].link( + * { otelSpan: span.otelSpan }, + * "model-comparison-v2", + * { + * description: "Comparing GPT-4 vs Claude performance", + * metadata: { + * modelVersion: "gpt-4-1106-preview", + * temperature: 0.7, + * timestamp: new Date().toISOString() + * } + * } + * ); + * ``` + * + * @see {@link https://langfuse.com/docs/datasets} Langfuse datasets documentation * @public + * @since 4.0.0 */ export type LinkDatasetItemFunction = ( obj: { otelSpan: Span }, @@ -37,7 +140,7 @@ export type LinkDatasetItemFunction = ( * @public */ export class DatasetManager { - private apiClient: LangfuseAPIClient; + private langfuseClient: LangfuseClient; /** * Creates a new DatasetManager instance. @@ -45,56 +148,100 @@ export class DatasetManager { * @param params - Configuration object containing the API client * @internal */ - constructor(params: { apiClient: LangfuseAPIClient }) { - this.apiClient = params.apiClient; + constructor(params: { langfuseClient: LangfuseClient }) { + this.langfuseClient = params.langfuseClient; } /** - * Retrieves a dataset by name along with all its items. + * Retrieves a dataset by name with all its items and experiment functionality. * - * This method automatically handles pagination to fetch all dataset items - * and enhances each item with a `link` function for easy experiment tracking. + * This method fetches a dataset and all its associated items, with support + * for automatic pagination to handle large datasets efficiently. The returned + * dataset object includes enhanced functionality for linking items to traces + * and running experiments directly on the dataset. 
* * @param name - The name of the dataset to retrieve - * @param options - Optional configuration for fetching + * @param options - Optional configuration for data fetching * @param options.fetchItemsPageSize - Number of items to fetch per page (default: 50) + * @returns Promise resolving to enhanced dataset with items, linking, and experiment capabilities + * + * @example Basic dataset retrieval + * ```typescript + * const dataset = await langfuse.dataset.get("my-evaluation-dataset"); + * console.log(`Dataset ${dataset.name} has ${dataset.items.length} items`); * - * @returns Promise that resolves to the dataset with enhanced items + * // Access dataset properties + * console.log(dataset.description); + * console.log(dataset.metadata); + * ``` * - * @example + * @example Working with dataset items * ```typescript - * const dataset = await langfuse.dataset.get("my-dataset"); + * const dataset = await langfuse.dataset.get("qa-dataset"); * * for (const item of dataset.items) { - * // Use the item data for your experiment - * const result = await processItem(item.input); + * console.log("Question:", item.input); + * console.log("Expected Answer:", item.expectedOutput); * - * // Link the result to the dataset item - * await item.link( - * { otelSpan: currentSpan }, - * "experiment-run-1", - * { description: "Testing new model" } - * ); + * // Each item has a link function for connecting to traces + * // await item.link(span, "experiment-name"); * } * ``` + * + * @example Running experiments on datasets + * ```typescript + * const dataset = await langfuse.dataset.get("benchmark-dataset"); + * + * const result = await dataset.runExperiment({ + * name: "GPT-4 Benchmark", + * runName: "GPT-4 Benchmark v1.2", // optional exact run name + * description: "Evaluating GPT-4 on our benchmark tasks", + * task: async ({ input }) => { + * const response = await openai.chat.completions.create({ + * model: "gpt-4", + * messages: [{ role: "user", content: input }] + * }); + * return response.choices[0].message.content; + * }, + * evaluators: [ + * async ({ output, expectedOutput }) => ({ + * name: "exact_match", + * value: output === expectedOutput ? 1 : 0 + * }) + * ] + * }); + * + * console.log(await result.format()); + * ``` + * + * @example Handling large datasets + * ```typescript + * // For very large datasets, use smaller page sizes + * const largeDataset = await langfuse.dataset.get( + * "large-dataset", + * { fetchItemsPageSize: 100 } + * ); + * ``` + * + * @throws {Error} If the dataset does not exist or cannot be accessed + * @see {@link FetchedDataset} for the complete return type specification + * @see {@link RunExperimentOnDataset} for experiment execution details + * @public + * @since 4.0.0 */ async get( name: string, options?: { fetchItemsPageSize: number; }, - ): Promise< - Dataset & { - items: (DatasetItem & { link: LinkDatasetItemFunction })[]; - } - > { - const dataset = await this.apiClient.datasets.get(name); + ): Promise { + const dataset = await this.langfuseClient.api.datasets.get(name); const items: DatasetItem[] = []; let page = 1; while (true) { - const itemsResponse = await this.apiClient.datasetItems.list({ + const itemsResponse = await this.langfuseClient.api.datasetItems.list({ datasetName: name, limit: options?.fetchItemsPageSize ?? 
50, page, @@ -109,12 +256,22 @@ export class DatasetManager { page++; } + const itemsWithLinkMethod = items.map((item) => ({ + ...item, + link: this.createDatasetItemLinkFunction(item), + })); + + const runExperiment: RunExperimentOnDataset = (params) => { + return this.langfuseClient.experiment.run({ + data: items, + ...params, + }); + }; + const returnDataset = { ...dataset, - items: items.map((item) => ({ - ...item, - link: this.createDatasetItemLinkFunction(item), - })), + items: itemsWithLinkMethod, + runExperiment, }; return returnDataset; @@ -138,7 +295,7 @@ export class DatasetManager { metadata?: any; }, ): Promise => { - return await this.apiClient.datasetRunItems.create({ + return await this.langfuseClient.api.datasetRunItems.create({ runName, datasetItemId: item.id, traceId: obj.otelSpan.spanContext().traceId, diff --git a/packages/client/src/experiment/ExperimentManager.ts b/packages/client/src/experiment/ExperimentManager.ts new file mode 100644 index 00000000..fa146a69 --- /dev/null +++ b/packages/client/src/experiment/ExperimentManager.ts @@ -0,0 +1,723 @@ +import { DatasetItem, getGlobalLogger } from "@langfuse/core"; +import { startActiveObservation } from "@langfuse/tracing"; +import { ProxyTracerProvider, trace } from "@opentelemetry/api"; + +import { LangfuseClient } from "../LangfuseClient.js"; + +import { + ExperimentParams, + ExperimentResult, + ExperimentTask, + ExperimentItem, + ExperimentItemResult, + Evaluator, + Evaluation, +} from "./types.js"; + +/** + * Manages the execution and evaluation of experiments on datasets. + * + * The ExperimentManager provides a comprehensive framework for running experiments + * that test models or tasks against datasets, with support for automatic evaluation, + * scoring. + * + * @example Basic experiment usage + * ```typescript + * const langfuse = new LangfuseClient(); + * + * const result = await langfuse.experiment.run({ + * name: "Capital Cities Test", + * description: "Testing model knowledge of world capitals", + * data: [ + * { input: "France", expectedOutput: "Paris" }, + * { input: "Germany", expectedOutput: "Berlin" } + * ], + * task: async ({ input }) => { + * const response = await openai.chat.completions.create({ + * model: "gpt-4", + * messages: [{ role: "user", content: `What is the capital of ${input}?` }] + * }); + * return response.choices[0].message.content; + * }, + * evaluators: [ + * async ({ input, output, expectedOutput }) => ({ + * name: "exact_match", + * value: output === expectedOutput ? 1 : 0 + * }) + * ] + * }); + * + * console.log(await result.format()); + * ``` + * + * @example Using with Langfuse datasets + * ```typescript + * const dataset = await langfuse.dataset.get("my-dataset"); + * + * const result = await dataset.runExperiment({ + * name: "Model Comparison", + * task: myTask, + * evaluators: [myEvaluator], + * runEvaluators: [averageScoreEvaluator] + * }); + * ``` + * + * @public + */ +export class ExperimentManager { + private langfuseClient: LangfuseClient; + + /** + * Creates a new ExperimentManager instance. + * + * @param params - Configuration object + * @param params.langfuseClient - The Langfuse client instance for API communication + * @internal + */ + constructor(params: { langfuseClient: LangfuseClient }) { + this.langfuseClient = params.langfuseClient; + } + + /** + * Gets the global logger instance for experiment-related logging. 
+ * + * @returns The global logger instance + * @internal + */ + get logger() { + return getGlobalLogger(); + } + + /** + * Executes an experiment by running a task on each data item and evaluating the results. + * + * This method orchestrates the complete experiment lifecycle: + * 1. Executes the task function on each data item with proper tracing + * 2. Runs item-level evaluators on each task output + * 3. Executes run-level evaluators on the complete result set + * 4. Links results to dataset runs (for Langfuse datasets) + * 5. Stores all scores and traces in Langfuse + * + * @param config - The experiment configuration + * @param config.name - Human-readable name for the experiment + * @param config.runName - Optional exact name for the experiment run (defaults to name + timestamp) + * @param config.description - Optional description of the experiment's purpose + * @param config.metadata - Optional metadata to attach to the experiment run + * @param config.data - Array of data items to process (ExperimentItem[] or DatasetItem[]) + * @param config.task - Function that processes each data item and returns output + * @param config.evaluators - Optional array of functions to evaluate each item's output + * @param config.runEvaluators - Optional array of functions to evaluate the entire run + * @param config.maxConcurrency - Maximum number of concurrent task executions (default: Infinity) + * + * @returns Promise that resolves to experiment results including: + * - runName: The experiment run name (either provided or generated) + * - itemResults: Results for each processed data item + * - runEvaluations: Results from run-level evaluators + * - datasetRunId: ID of the dataset run (if using Langfuse datasets) + * - format: Function to format results for display + * + * @throws {Error} When task execution fails and cannot be handled gracefully + * @throws {Error} When required evaluators fail critically + * + * @example Simple experiment + * ```typescript + * const result = await langfuse.experiment.run({ + * name: "Translation Quality Test", + * data: [ + * { input: "Hello world", expectedOutput: "Hola mundo" }, + * { input: "Good morning", expectedOutput: "Buenos dรญas" } + * ], + * task: async ({ input }) => translateText(input, 'es'), + * evaluators: [ + * async ({ output, expectedOutput }) => ({ + * name: "bleu_score", + * value: calculateBleuScore(output, expectedOutput) + * }) + * ] + * }); + * ``` + * + * @example Experiment with concurrency control + * ```typescript + * const result = await langfuse.experiment.run({ + * name: "Large Scale Evaluation", + * data: largeBatchOfItems, + * task: expensiveModelCall, + * maxConcurrency: 5, // Process max 5 items simultaneously + * evaluators: [myEvaluator], + * runEvaluators: [ + * async ({ itemResults }) => ({ + * name: "average_score", + * value: itemResults.reduce((acc, r) => acc + r.evaluations[0].value, 0) / itemResults.length + * }) + * ] + * }); + * ``` + * + * @see {@link ExperimentParams} for detailed parameter documentation + * @see {@link ExperimentResult} for detailed return value documentation + * @see {@link Evaluator} for evaluator function specifications + * @see {@link RunEvaluator} for run evaluator function specifications + * + * @public + */ + async run< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, + >( + config: ExperimentParams, + ): Promise> { + const { + data, + evaluators, + task, + name, + runName: providedRunName, + description, + metadata, + maxConcurrency: batchSize = Infinity, + 
runEvaluators, + } = config; + + const runName = this.createExperimentRunName({ + name, + runName: providedRunName, + }); + + if (!this.isOtelRegistered()) { + this.logger.warn( + "OpenTelemetry has not been set up. Traces will not be sent to Langfuse.See our docs on how to set up OpenTelemetry: https://langfuse.com/docs/observability/sdk/typescript/setup#tracing-setup", + ); + } + + const itemResults: ExperimentItemResult[] = + []; + + for (let i = 0; i < data.length; i += batchSize) { + const batch = data.slice(i, i + batchSize); + + const promises: Promise< + ExperimentItemResult + >[] = batch.map(async (item) => { + return this.runItem({ + item, + evaluators, + task, + experimentName: name, + experimentRunName: runName, + experimentDescription: description, + experimentMetadata: metadata, + }); + }); + + const settledResults = await Promise.allSettled(promises); + const results = settledResults.reduce( + (acc, settledResult) => { + if (settledResult.status === "fulfilled") { + acc.push(settledResult.value); + } else { + const errorMessage = + settledResult.reason instanceof Error + ? settledResult.reason.message + : String(settledResult.reason); + this.logger.error( + `Task failed with error: ${errorMessage}. Skipping item.`, + ); + } + return acc; + }, + [] as ExperimentItemResult[], + ); + + itemResults.push(...results); + } + + // Get dataset run URL + const datasetRunId = + itemResults.length > 0 ? itemResults[0].datasetRunId : undefined; + + let datasetRunUrl = undefined; + if (datasetRunId && data.length > 0 && "datasetId" in data[0]) { + const datasetId = data[0].datasetId; + const projectUrl = (await this.langfuseClient.getTraceUrl("mock")).split( + "/traces", + )[0]; + + datasetRunUrl = `${projectUrl}/datasets/${datasetId}/runs/${datasetRunId}`; + } + + // Execute run evaluators + let runEvaluations: Evaluation[] = []; + if (runEvaluators && runEvaluators?.length > 0) { + const promises = runEvaluators.map(async (runEvaluator) => { + return runEvaluator({ itemResults }) + .then((result) => { + // Handle both single evaluation and array of evaluations + return Array.isArray(result) ? result : [result]; + }) + .catch((err) => { + this.logger.error("Run evaluator failed with error ", err); + + throw err; + }); + }); + + runEvaluations = (await Promise.allSettled(promises)).reduce( + (acc, settledPromise) => { + if (settledPromise.status === "fulfilled") { + acc.push(...settledPromise.value); + } + + return acc; + }, + [] as Evaluation[], + ); + + if (datasetRunId) { + runEvaluations.forEach((runEval) => + this.langfuseClient.score.create({ datasetRunId, ...runEval }), + ); + } + } + + await this.langfuseClient.score.flush(); + + return { + runName, + itemResults, + datasetRunId, + datasetRunUrl, + runEvaluations, + format: async (options?: { includeItemResults?: boolean }) => + await this.prettyPrintResults({ + datasetRunUrl, + itemResults, + originalData: data, + runEvaluations, + name: config.name, + runName, + description: config.description, + includeItemResults: options?.includeItemResults ?? false, + }), + }; + } + + /** + * Executes the task and evaluators for a single data item. + * + * This method handles the complete processing pipeline for one data item: + * 1. Executes the task within a traced observation span + * 2. Links the result to a dataset run (if applicable) + * 3. Runs all item-level evaluators on the output + * 4. Stores evaluation scores in Langfuse + * 5. 
Handles errors gracefully by continuing with remaining evaluators + * + * @param params - Parameters for item execution + * @param params.experimentName - Name of the parent experiment + * @param params.experimentRunName - Run name for the parent experiment + * @param params.experimentDescription - Description of the parent experiment + * @param params.experimentMetadata - Metadata for the parent experiment + * @param params.item - The data item to process + * @param params.task - The task function to execute + * @param params.evaluators - Optional evaluators to run on the output + * + * @returns Promise resolving to the item result with output, evaluations, and trace info + * + * @throws {Error} When task execution fails (propagated from task function) + * + * @internal + */ + private async runItem< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, + >(params: { + experimentName: ExperimentParams["name"]; + experimentRunName: string; + experimentDescription: ExperimentParams< + Input, + ExpectedOutput, + Metadata + >["description"]; + experimentMetadata: ExperimentParams< + Input, + ExpectedOutput, + Metadata + >["metadata"]; + item: ExperimentParams["data"][0]; + task: ExperimentTask; + evaluators?: Evaluator[]; + }): Promise> { + const { item, evaluators = [], task, experimentMetadata = {} } = params; + + const { output, traceId, observationId } = await startActiveObservation( + "experiment-item-run", + async (span) => { + const output = await task(item); + + span.update({ + input: item.input, + output, + metadata: { + experiment_name: params.experimentName, + experiment_run_name: params.experimentRunName, + ...experimentMetadata, + ...(item.metadata ?? {}), + ...("id" in item && "datasetId" in item + ? { + dataset_id: item["datasetId"], + dataset_item_id: item["id"], + } + : {}), + }, + }); + + return { output, traceId: span.traceId, observationId: span.id }; + }, + ); + + let datasetRunId: string | undefined = undefined; + + if ("id" in item) { + await this.langfuseClient.api.datasetRunItems + .create({ + runName: params.experimentRunName, + runDescription: params.experimentDescription, + metadata: params.experimentMetadata, + datasetItemId: item.id, + traceId, + observationId, + }) + .then((result) => { + datasetRunId = result.datasetRunId; + }) + .catch((err) => + this.logger.error("Linking dataset run item failed", err), + ); + } + + const evalPromises: Promise[] = evaluators.map( + async (evaluator) => { + const params = { + input: item.input as any, + expectedOutput: item.expectedOutput as any, + output, + }; + + return evaluator(params) + .then((result) => { + // Handle both single evaluation and array of evaluations + return Array.isArray(result) ? result : [result]; + }) + .catch((err) => { + this.logger.error( + `Evaluator '${evaluator.name}' failed for params \n\n${JSON.stringify(params)}\n\n with error: ${err}`, + ); + + throw err; + }); + }, + ); + + const evals = (await Promise.allSettled(evalPromises)).reduce( + (acc, promiseResult) => { + if (promiseResult.status === "fulfilled") { + acc.push(...promiseResult.value.flat()); + } + + return acc; + }, + [] as Evaluation[], + ); + + for (const ev of evals) { + this.langfuseClient.score.create({ + traceId, + name: ev.name, + comment: ev.comment, + value: ev.value, + metadata: ev.metadata, + dataType: ev.dataType, + }); + } + + return { + output, + evaluations: evals, + traceId, + datasetRunId, + item, + }; + } + + /** + * Formats experiment results into a human-readable string representation. 
+ * + * Creates a comprehensive, nicely formatted summary of the experiment including: + * - Individual item results with inputs, outputs, expected values, and scores + * - Dataset item and trace links (when available) + * - Experiment overview with aggregate statistics + * - Average scores across all evaluations + * - Run-level evaluation results + * - Links to dataset runs in the Langfuse UI + * + * @param params - Formatting parameters + * @param params.datasetRunUrl - Optional URL to the dataset run in Langfuse UI + * @param params.itemResults - Results from processing each data item + * @param params.originalData - The original input data items + * @param params.runEvaluations - Results from run-level evaluators + * @param params.name - Name of the experiment + * @param params.description - Optional description of the experiment + * @param params.includeItemResults - Whether to include individual item details (default: false) + * + * @returns Promise resolving to formatted string representation + * + * @example Output format + * ``` + * 1. Item 1: + * Input: What is the capital of France? + * Expected: Paris + * Actual: Paris + * Scores: + * โ€ข exact_match: 1.000 + * โ€ข similarity: 0.95 + * ๐Ÿ’ญ Very close match with expected output + * + * Dataset Item: + * https://cloud.langfuse.com/project/123/datasets/456/items/789 + * + * Trace: + * https://cloud.langfuse.com/project/123/traces/abc123 + * + * โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + * ๐Ÿ“Š Translation Quality Test - Testing model accuracy + * 2 items + * Evaluations: + * โ€ข exact_match + * โ€ข similarity + * + * Average Scores: + * โ€ข exact_match: 0.850 + * โ€ข similarity: 0.923 + * + * Run Evaluations: + * โ€ข overall_quality: 0.887 + * ๐Ÿ’ญ Good performance with room for improvement + * + * ๐Ÿ”— Dataset Run: + * https://cloud.langfuse.com/project/123/datasets/456/runs/def456 + * ``` + * + * @internal + */ + private async prettyPrintResults< + Input = any, + ExpectedOutput = any, + Metadata extends Record = Record, + >(params: { + datasetRunUrl?: string; + itemResults: ExperimentItemResult[]; + originalData: + | ExperimentItem[] + | DatasetItem[]; + runEvaluations: Evaluation[]; + name: string; + runName: string; + description?: string; + includeItemResults?: boolean; + }): Promise { + const { + itemResults, + originalData, + runEvaluations, + name, + runName, + description, + includeItemResults = false, + } = params; + + if (itemResults.length === 0) { + return "No experiment results to display."; + } + + let output = ""; + + // Individual results + if (includeItemResults) { + for (let index = 0; index < itemResults.length; index++) { + const result = itemResults[index]; + const originalItem = originalData[index]; + + output += `\n${index + 1}. Item ${index + 1}:\n`; + + // Input, expected, and actual on separate lines + if (originalItem?.input !== undefined) { + output += ` Input: ${this.formatValue(originalItem.input)}\n`; + } + + const expectedOutput = + originalItem?.expectedOutput ?? result.expectedOutput ?? null; + output += ` Expected: ${expectedOutput !== null ? this.formatValue(expectedOutput) : "null"}\n`; + output += ` Actual: ${this.formatValue(result.output)}\n`; + + // Scores on separate lines + if (result.evaluations.length > 0) { + output += ` Scores:\n`; + result.evaluations.forEach((evaluation) => { + const score = + typeof evaluation.value === "number" + ? 
evaluation.value.toFixed(3) + : evaluation.value; + output += ` โ€ข ${evaluation.name}: ${score}`; + if (evaluation.comment) { + output += `\n ๐Ÿ’ญ ${evaluation.comment}`; + } + output += "\n"; + }); + } + + // Dataset item link on separate line + if ( + originalItem && + "id" in originalItem && + "datasetId" in originalItem + ) { + const projectUrl = ( + await this.langfuseClient.getTraceUrl("mock") + ).split("/traces")[0]; + const datasetItemUrl = `${projectUrl}/datasets/${originalItem.datasetId}/items/${originalItem.id}`; + output += `\n Dataset Item:\n ${datasetItemUrl}\n`; + } + + // Trace link on separate line + if (result.traceId) { + const traceUrl = await this.langfuseClient.getTraceUrl( + result.traceId, + ); + output += `\n Trace:\n ${traceUrl}\n`; + } + } + } else { + output += `Individual Results: Hidden (${itemResults.length} items)\n`; + output += "๐Ÿ’ก Call format({ includeItemResults: true }) to view them\n"; + } + + // Experiment Overview + const totalItems = itemResults.length; + const evaluationNames = new Set( + itemResults.flatMap((r) => r.evaluations.map((e) => e.name)), + ); + + output += `\n${"โ”€".repeat(50)}\n`; + output += `๐Ÿงช Experiment: ${name}`; + output += `\n๐Ÿ“‹ Run name: ${runName}`; + if (description) { + output += ` - ${description}`; + } + + output += `\n${totalItems} items`; + + if (evaluationNames.size > 0) { + output += `\nEvaluations:`; + Array.from(evaluationNames).forEach((evalName) => { + output += `\n โ€ข ${evalName}`; + }); + output += "\n"; + } + + // Average scores in bulleted list + if (evaluationNames.size > 0) { + output += `\nAverage Scores:`; + for (const evalName of evaluationNames) { + const scores = itemResults + .flatMap((r) => r.evaluations) + .filter((e) => e.name === evalName && typeof e.value === "number") + .map((e) => e.value as number); + + if (scores.length > 0) { + const avg = scores.reduce((a, b) => a + b, 0) / scores.length; + output += `\n โ€ข ${evalName}: ${avg.toFixed(3)}`; + } + } + output += "\n"; + } + + // Run evaluations + if (runEvaluations.length > 0) { + output += `\nRun Evaluations:`; + runEvaluations.forEach((runEval) => { + const score = + typeof runEval.value === "number" + ? runEval.value.toFixed(3) + : runEval.value; + output += `\n โ€ข ${runEval.name}: ${score}`; + if (runEval.comment) { + output += `\n ๐Ÿ’ญ ${runEval.comment}`; + } + }); + output += "\n"; + } + + if (params.datasetRunUrl) { + output += `\n๐Ÿ”— Dataset Run:\n ${params.datasetRunUrl}`; + } + + return output; + } + + /** + * Formats a value for display in pretty-printed output. + * + * Handles different value types appropriately: + * - Strings: Truncates long strings to 50 characters with "..." + * - Objects/Arrays: Converts to JSON string representation + * - Primitives: Uses toString() representation + * + * @param value - The value to format + * @returns Formatted string representation suitable for display + * + * @internal + */ + private formatValue(value: any): string { + if (typeof value === "string") { + return value.length > 50 ? `${value.substring(0, 47)}...` : value; + } + return JSON.stringify(value); + } + + private isOtelRegistered(): boolean { + let tracerProvider = trace.getTracerProvider(); + + if (tracerProvider instanceof ProxyTracerProvider) { + tracerProvider = tracerProvider.getDelegate(); + } + + return tracerProvider.constructor.name !== "NoopTracerProvider"; + } + + /** + * Creates an experiment run name based on provided parameters. + * + * If runName is provided, returns it directly. 
Otherwise, generates + * a name by combining the experiment name with an ISO timestamp. + * + * @param params - Parameters for run name creation + * @param params.name - The experiment name + * @param params.runName - Optional provided run name + * @returns The final run name to use + * + * @internal + */ + private createExperimentRunName(params: { + name: string; + runName?: string; + }): string { + if (params.runName) { + return params.runName; + } + + const isoTimestamp = new Date().toISOString(); + return `${params.name} - ${isoTimestamp}`; + } +} diff --git a/packages/client/src/experiment/adapters.ts b/packages/client/src/experiment/adapters.ts new file mode 100644 index 00000000..a8a87c88 --- /dev/null +++ b/packages/client/src/experiment/adapters.ts @@ -0,0 +1,94 @@ +import { Evaluator } from "./types.js"; + +/** + * Converts an AutoEvals evaluator to a Langfuse-compatible evaluator function. + * + * This adapter function bridges the gap between AutoEvals library evaluators + * and Langfuse experiment evaluators, handling parameter mapping and result + * formatting automatically. + * + * AutoEvals evaluators expect `input`, `output`, and `expected` parameters, + * while Langfuse evaluators use `input`, `output`, and `expectedOutput`. + * This function handles the parameter name mapping. + * + * @template E - Type of the AutoEvals evaluator function + * @param autoevalEvaluator - The AutoEvals evaluator function to convert + * @param params - Optional additional parameters to pass to the AutoEvals evaluator + * @returns A Langfuse-compatible evaluator function + * + * @example Basic usage with AutoEvals + * ```typescript + * import { Factuality, Levenshtein } from 'autoevals'; + * import { createEvaluatorFromAutoevals } from '@langfuse/client'; + * + * const factualityEvaluator = createEvaluatorFromAutoevals(Factuality); + * const levenshteinEvaluator = createEvaluatorFromAutoevals(Levenshtein); + * + * await langfuse.experiment.run({ + * name: "AutoEvals Integration Test", + * data: myDataset, + * task: myTask, + * evaluators: [factualityEvaluator, levenshteinEvaluator] + * }); + * ``` + * + * @example Using with additional parameters + * ```typescript + * import { Factuality } from 'autoevals'; + * + * const factualityEvaluator = createEvaluatorFromAutoevals( + * Factuality, + * { model: 'gpt-4o' } // Additional params for AutoEvals + * ); + * + * await langfuse.experiment.run({ + * name: "Factuality Test", + * data: myDataset, + * task: myTask, + * evaluators: [factualityEvaluator] + * }); + * ``` + * + * @see {@link https://github.com/braintrustdata/autoevals} AutoEvals library documentation + * @see {@link Evaluator} for Langfuse evaluator specifications + * + * @public + * @since 4.0.0 + */ +export function createEvaluatorFromAutoevals<E extends (...args: any[]) => any>( + autoevalEvaluator: E, + params?: Params<E>, +): Evaluator { + const langfuseEvaluator: Evaluator = async (langfuseEvaluatorParams) => { + const score = await autoevalEvaluator({ + ...(params ?? {}), + input: langfuseEvaluatorParams.input, + output: langfuseEvaluatorParams.output, + expected: langfuseEvaluatorParams.expectedOutput, + }); + + return { + name: score.name, + value: score.score ?? 0, + metadata: score.metadata, + }; + }; + + return langfuseEvaluator; +} + +/** + * Utility type to extract parameter types from AutoEvals evaluator functions. 
+ * + * This type helper extracts the parameter type from an AutoEvals evaluator + * and omits the standard parameters (input, output, expected) that are + * handled by the adapter, leaving only the additional configuration parameters. + * + * @template E - The AutoEvals evaluator function type + * @internal + */ +type Params<E> = Parameters< + E extends (...args: any[]) => any ? E : never +>[0] extends infer P + ? Omit<P, "input" | "output" | "expected"> + : never; diff --git a/packages/client/src/experiment/types.ts b/packages/client/src/experiment/types.ts new file mode 100644 index 00000000..d2f7a29d --- /dev/null +++ b/packages/client/src/experiment/types.ts @@ -0,0 +1,382 @@ +import { DatasetItem, ScoreBody } from "@langfuse/core"; + +export type ExperimentItem< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = + | { + /** + * The input data to pass to the task function. + * + * Can be any type - string, object, array, etc. This data will be passed + * to your task function as the `input` parameter. Structure it according + * to your task's requirements. + */ + input?: Input; + + /** + * The expected output for evaluation purposes. + * + * Optional ground truth or reference output for this input. + * Used by evaluators to assess task performance. If not provided, + * only evaluators that don't require expected output can be used. + */ + expectedOutput?: ExpectedOutput; + + /** + * Optional metadata to attach to the experiment item. + * + * Store additional context, tags, or custom data related to this specific item. + * This metadata will be available in traces and can be used for filtering, + * analysis, or custom evaluator logic. + */ + metadata?: Metadata; + } + | DatasetItem; + +/** + * Parameters passed to an experiment task function. + * + * Can be either an ExperimentItem (for custom datasets) or a DatasetItem + * (for Langfuse datasets). The task function should handle both types. + * + * @public + * @since 4.1.0 + */ +export type ExperimentTaskParams< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = ExperimentItem<Input, ExpectedOutput, Metadata>; + +/** + * Function type for experiment tasks that process input data and return output. + * + * The task function is the core component being tested in an experiment. + * It receives either an ExperimentItem or DatasetItem and produces output + * that will be evaluated. + * + * @param params - Either an ExperimentItem or DatasetItem containing input and metadata + * @returns Promise resolving to the task's output (any type) + * + * @example Task handling both item types + * ```typescript + * const universalTask: ExperimentTask = async (item) => { + * // Works with both ExperimentItem and DatasetItem + * const input = item.input; + * const metadata = item.metadata; + * + * const response = await openai.chat.completions.create({ + * model: "gpt-4", + * messages: [{ role: "user", content: input }] + * }); + * + * return response.choices[0].message.content; + * }; + * ``` + * + * @public + * @since 4.1.0 + */ +export type ExperimentTask< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = ( + params: ExperimentTaskParams<Input, ExpectedOutput, Metadata>, +) => Promise<any>; + +export type Evaluation = Pick< + ScoreBody, + "name" | "value" | "comment" | "metadata" | "dataType" +>; + +export type EvaluatorParams< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = { + /** + * The original input data passed to the task. + * + * This is the same input that was provided to the task function. 
+ * Use this for context-aware evaluations or input-output relationship analysis. + */ + input: Input; + + /** + * The output produced by the task. + * + * This is the actual result returned by your task function. + * This is the primary value to evaluate against expectations. + */ + output: any; + + /** + * The expected output for comparison (optional). + * + * This is the ground truth or expected result for the given input. + * May not be available for all evaluation scenarios. + */ + expectedOutput?: ExpectedOutput; + + /** + * Optional metadata about the evaluation context. + * + * Contains additional information from the experiment item or dataset item + * that may be useful for evaluation logic, such as tags, categories, + * or other contextual data. + */ + metadata?: Metadata; +}; +export type Evaluator< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = ( + params: EvaluatorParams<Input, ExpectedOutput, Metadata>, +) => Promise<Evaluation | Evaluation[]>; + +export type RunEvaluatorParams< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = { + /** + * Results from all processed experiment items. + * + * Each item contains the input, output, evaluations, and metadata from + * processing a single data item. Use this for aggregate analysis, + * statistical calculations, and cross-item comparisons. + */ + itemResults: ExperimentItemResult<Input, ExpectedOutput, Metadata>[]; +}; +export type RunEvaluator< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = ( + params: RunEvaluatorParams<Input, ExpectedOutput, Metadata>, +) => Promise<Evaluation | Evaluation[]>; + +export type ExperimentParams< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = { + /** + * Human-readable name for the experiment. + * + * This name will appear in Langfuse UI and experiment results. + * Choose a descriptive name that identifies the experiment's purpose. + */ + name: string; + + /** + * Optional exact name for the experiment run. + * + * If provided, this will be used as the exact dataset run name if the data + * contains Langfuse dataset items. If not provided, this will default to + * the experiment name appended with an ISO timestamp. + */ + runName?: string; + + /** + * Optional description explaining the experiment's purpose. + * + * Provide context about what you're testing, methodology, or goals. + * This helps with experiment tracking and result interpretation. + */ + description?: string; + + /** + * Optional metadata to attach to the experiment run. + * + * Store additional context like model versions, hyperparameters, + * or any other relevant information for analysis and comparison. + */ + metadata?: Record<string, any>; + + /** + * Array of data items to process. + * + * Can be either custom ExperimentItem[] or DatasetItem[] from Langfuse. + * Each item should contain input data and optionally expected output. + */ + data: ExperimentItem<Input, ExpectedOutput, Metadata>[]; + + /** + * The task function to execute on each data item. + * + * This function receives input data and produces output that will be evaluated. + * It should encapsulate the model or system being tested. + */ + task: ExperimentTask<Input, ExpectedOutput, Metadata>; + + /** + * Optional array of evaluator functions to assess each item's output. + * + * Each evaluator receives input, output, and expected output (if available) + * and returns evaluation results. Multiple evaluators enable comprehensive assessment. + */ + evaluators?: Evaluator<Input, ExpectedOutput, Metadata>[]; + + /** + * Optional array of run-level evaluators to assess the entire experiment. 
+ * + * These evaluators receive all item results and can perform aggregate analysis + * like calculating averages, detecting patterns, or statistical analysis. + */ + runEvaluators?: RunEvaluator<Input, ExpectedOutput, Metadata>[]; + + /** + * Maximum number of concurrent task executions (default: Infinity). + * + * Controls parallelism to manage resource usage and API rate limits. + * Set lower values for expensive operations or rate-limited services. + */ + maxConcurrency?: number; +}; + +export type ExperimentItemResult< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = Pick< + ExperimentItem<Input, ExpectedOutput, Metadata>, + "input" | "expectedOutput" +> & { + /** + * The original experiment or dataset item that was processed. + * + * Contains the complete original item data including input, expected output, + * metadata, and any additional fields. Useful for accessing item-specific + * context or metadata in result analysis. + */ + item: ExperimentItem<Input, ExpectedOutput, Metadata>; + /** + * The actual output produced by the task. + * + * This is the result returned by your task function for this specific input. + * It will be passed to evaluators for assessment against expected outputs. + */ + output: any; + + /** + * Results from all evaluators that ran on this item. + * + * Contains evaluation scores, comments, and metadata from each evaluator + * that successfully processed this item. Failed evaluators are excluded. + */ + evaluations: Evaluation[]; + + /** + * Langfuse trace ID for this item's execution (for debugging and analysis). + * + * Use this ID to view detailed execution traces in the Langfuse UI, + * including timing, inputs, outputs, and any nested observations. + */ + traceId?: string; + + /** + * Dataset run ID if this item was part of a Langfuse dataset. + * + * Present only when running experiments on Langfuse datasets. + * Links this item result to a specific dataset run for tracking and comparison. + */ + datasetRunId?: string; +}; + +/** + * Complete result of an experiment execution. + * + * Contains all results from processing the experiment data, + * including individual item results, run-level evaluations, + * and utilities for result visualization. + * + * @example Using experiment results + * ```typescript + * const result = await langfuse.experiment.run(config); + * + * // Access individual results + * console.log(`Processed ${result.itemResults.length} items`); + * + * // Check run-level evaluations + * const avgScore = result.runEvaluations.find(e => e.name === 'average_score'); + * console.log(`Average score: ${avgScore?.value}`); + * + * // Print formatted results + * console.log(await result.format()); + * + * // Print summary with individual item results + * console.log(await result.format({ includeItemResults: true })); + * + * // Link to dataset run (if available) + * if (result.datasetRunUrl) { + * console.log(`View in Langfuse: dataset run ${result.datasetRunUrl}`); + * } + * ``` + * + * @public + */ +export type ExperimentResult< + Input = any, + ExpectedOutput = any, + Metadata extends Record<string, any> = Record<string, any>, +> = { + /** + * The experiment run name. + * + * This is equal to the dataset run name if the experiment was run on a Langfuse dataset. + * Either the provided runName parameter or the generated name (experiment name + timestamp). + */ + runName: string; + + /** + * ID of the dataset run in Langfuse (only for experiments on Langfuse datasets). + * + * Present only when running experiments on Langfuse datasets. 
+ * Use this ID to access the dataset run via the Langfuse API or UI + * for detailed analysis and comparison with other runs. + */ + datasetRunId?: string; + + /** + * URL to the dataset run in the Langfuse UI (only for experiments on Langfuse datasets). + * + * Direct link to view the complete dataset run in the Langfuse web interface, + * including all experiment results, traces, and analytics. Provides easy access + * to detailed analysis and visualization of the experiment. + */ + datasetRunUrl?: string; + + /** + * Results from processing each individual data item. + * + * Contains the complete results for every item in your experiment data, + * including inputs, outputs, evaluations, and trace information. + * Use this for detailed analysis of individual item performance. + */ + itemResults: ExperimentItemResult[]; + + /** + * Results from run-level evaluators that assessed the entire experiment. + * + * Contains aggregate evaluations that analyze the complete experiment, + * such as average scores, statistical measures, or overall quality assessments. + */ + runEvaluations: Evaluation[]; + + /** + * Function to format experiment results in a human-readable format. + * + * Generates a comprehensive, nicely formatted summary including individual results, + * aggregate statistics, evaluation scores, and links to traces and dataset runs. + * + * @param options - Formatting options + * @param options.includeItemResults - Whether to include individual item details (default: false) + * @returns Promise resolving to formatted string representation + */ + format: (options?: { includeItemResults?: boolean }) => Promise; +}; diff --git a/packages/client/src/index.ts b/packages/client/src/index.ts index 850c7476..d55bc403 100644 --- a/packages/client/src/index.ts +++ b/packages/client/src/index.ts @@ -3,3 +3,6 @@ export * from "./prompt/index.js"; export * from "./score/index.js"; export * from "./dataset/index.js"; export * from "./media/index.js"; +export * from "./experiment/ExperimentManager.js"; +export * from "./experiment/adapters.js"; +export * from "./experiment/types.js"; diff --git a/packages/client/src/score/index.ts b/packages/client/src/score/index.ts index 3aee8867..92c8800b 100644 --- a/packages/client/src/score/index.ts +++ b/packages/client/src/score/index.ts @@ -93,6 +93,10 @@ export class ScoreManager { } this.eventQueue.push(scoreIngestionEvent); + this.logger.debug( + "Added score event to queue:\n", + JSON.stringify(scoreIngestionEvent, null, 2), + ); if (this.eventQueue.length >= this.flushAtCount) { this.flushPromise = this.flush(); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1aa42445..25cded9f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -4,6 +4,9 @@ settings: autoInstallPeers: true excludeLinksFromLockfile: false +overrides: + ml-spectra-processing: 14.14.0 + importers: .: @@ -62,6 +65,9 @@ importers: ai: specifier: ^5 version: 5.0.23(zod@3.25.76) + autoevals: + specifier: ^0.0.131 + version: 0.0.131 dotenv: specifier: ^17.2.0 version: 17.2.0 @@ -122,6 +128,9 @@ importers: '@langfuse/core': specifier: workspace:^ version: link:../core + '@langfuse/tracing': + specifier: workspace:^ + version: link:../tracing '@opentelemetry/api': specifier: ^1.9.0 version: 1.9.0 @@ -1106,6 +1115,12 @@ packages: '@types/mustache@4.2.6': resolution: {integrity: sha512-t+8/QWTAhOFlrF1IVZqKnMRJi84EgkIK5Kh0p2JV4OLywUvCwJPFxbJAl7XAow7DVIHsF+xW9f1MVzg0L6Szjw==} + '@types/node-fetch@2.6.13': + resolution: {integrity: 
sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==} + + '@types/node@18.19.124': + resolution: {integrity: sha512-hY4YWZFLs3ku6D2Gqo3RchTd9VRCcrjqp/I0mmohYeUVA5Y8eCXKJEasHxLAJVZRJuQogfd1GiJ9lgogBgKeuQ==} + '@types/node@20.19.11': resolution: {integrity: sha512-uug3FEEGv0r+jrecvUUpbY8lLisvIjg6AAic6a2bSP5OEOLeJsDSnvhCDov7ipFFMXS3orMpzlmi0ZcuGkBbow==} @@ -1255,6 +1270,10 @@ packages: '@vitest/utils@3.2.4': resolution: {integrity: sha512-fB2V0JFrQSMsCo9HiSq3Ezpdv4iYaXRG1Sx8edX3MwxfyNn83mKiGzOcH+Fkxt4MHxr3y42fQi1oeAInqgX2QA==} + abort-controller@3.0.0: + resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==} + engines: {node: '>=6.5'} + acorn-import-attributes@1.9.5: resolution: {integrity: sha512-n02Vykv5uA3eHGM/Z2dQrcD56kL8TyDb2p1+0P83PClMnC/nc+anbQRhIOWnSq4Ke/KvDPrY3C9hDtC/A3eHnQ==} peerDependencies: @@ -1277,6 +1296,10 @@ packages: resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} engines: {node: '>= 14'} + agentkeepalive@4.6.0: + resolution: {integrity: sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==} + engines: {node: '>= 8.0.0'} + ai@5.0.23: resolution: {integrity: sha512-1zUF0o1zRI7UmSd8u5CKc2iHNhv21tM95Oka81c0CF77GnTbq5RvrAqVuLI+gMyKcIgs99yxA+xc5hJXvh6V+w==} engines: {node: '>=18'} @@ -1286,6 +1309,9 @@ packages: ajv@6.12.6: resolution: {integrity: sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==} + ajv@8.17.1: + resolution: {integrity: sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==} + ansi-escapes@4.3.2: resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==} engines: {node: '>=8'} @@ -1358,6 +1384,12 @@ packages: async-retry@1.3.3: resolution: {integrity: sha512-wfr/jstw9xNi/0teMHrRW7dsz3Lt5ARhYNZ2ewpadnhaIp5mbALhOAP+EAdsC7t4Z6wqsDVv9+W6gm1Dk9mEyw==} + asynckit@0.4.0: + resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==} + + autoevals@0.0.131: + resolution: {integrity: sha512-F+3lraja+Ms7n1M2cpWl65N7AYx4sPocRW454H5HlSGabYMfuFOUxw8IXmEYDkQ38BxtZ0Wd5ZAQj9RF59YJWw==} + available-typed-arrays@1.0.7: resolution: {integrity: sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ==} engines: {node: '>= 0.4'} @@ -1375,6 +1407,9 @@ packages: before-after-hook@3.0.2: resolution: {integrity: sha512-Nik3Sc0ncrMK4UUdXQmAnRtzmNQTAAXmXIopizwZ1W1t8QmfJj+zL4OA2I7XPTPW5z5TDqv4hRo/JzouDJnX3A==} + binary-search@1.3.6: + resolution: {integrity: sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA==} + boolbase@1.0.0: resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} @@ -1459,6 +1494,9 @@ packages: resolution: {integrity: sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==} engines: {node: '>=20.18.1'} + cheminfo-types@1.8.1: + resolution: {integrity: sha512-FRcpVkox+cRovffgqNdDFQ1eUav+i/Vq/CUd1hcfEl2bevntFlzznL+jE8g4twl6ElB7gZjCko6pYpXyMn+6dA==} + chokidar@4.0.3: resolution: {integrity: sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==} engines: {node: '>= 14.16.0'} @@ -1496,6 +1534,10 @@ packages: color-name@1.1.4: 
resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==} + combined-stream@1.0.8: + resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==} + engines: {node: '>= 0.8'} + commander@4.1.1: resolution: {integrity: sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==} engines: {node: '>= 6'} @@ -1503,6 +1545,15 @@ packages: compare-func@2.0.0: resolution: {integrity: sha512-zHig5N+tPWARooBnb0Zx1MFcdfpyJrfTJ3Y5L+IFvUm8rM74hHz66z0gw0x4tijh5CorKkKUCnW82R2vmpeCRA==} + compute-cosine-similarity@1.1.0: + resolution: {integrity: sha512-FXhNx0ILLjGi9Z9+lglLzM12+0uoTnYkHm7GiadXDAr0HGVLm25OivUS1B/LPkbzzvlcXz/1EvWg9ZYyJSdhTw==} + + compute-dot@1.1.0: + resolution: {integrity: sha512-L5Ocet4DdMrXboss13K59OK23GXjiSia7+7Ukc7q4Bl+RVpIXK2W9IHMbWDZkh+JUEvJAwOKRaJDiFUa1LTnJg==} + + compute-l2norm@1.1.0: + resolution: {integrity: sha512-6EHh1Elj90eU28SXi+h2PLnTQvZmkkHWySpoFz+WOlVNLz3DQoC4ISUHSV9n5jMxPHtKGJ01F4uu2PsXBB8sSg==} + concat-map@0.0.1: resolution: {integrity: sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==} @@ -1672,6 +1723,10 @@ packages: resolution: {integrity: sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==} engines: {node: '>= 14'} + delayed-stream@1.0.0: + resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==} + engines: {node: '>=0.4.0'} + destr@2.0.5: resolution: {integrity: sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA==} @@ -1892,6 +1947,10 @@ packages: resolution: {integrity: sha512-e3x3FBvGzeCIHhF+zhK8FZA2vC5uFn6b4HJjegUbIWrDb4mJ7JjTGMJY9VGIbRVpmSwHopNiaJibhjIr+HfLug==} engines: {node: '>=6.0.0'} + event-target-shim@5.0.1: + resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==} + engines: {node: '>=6'} + eventemitter3@4.0.7: resolution: {integrity: sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==} @@ -1933,6 +1992,9 @@ packages: fast-levenshtein@2.0.6: resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==} + fast-uri@3.1.0: + resolution: {integrity: sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==} + fastq@1.19.1: resolution: {integrity: sha512-GwLTyxkCXjXbxqIhTsMI2Nui8huMPtnxg7krajPJAjnEG/iiOS7i+zCtWGZR9G0NBKbXKh6X9m9UIsYX/N6vvQ==} @@ -1944,6 +2006,9 @@ packages: picomatch: optional: true + fft.js@4.0.4: + resolution: {integrity: sha512-f9c00hphOgeQTlDyavwTtu6RiK8AIFjD6+jvXkNkpeQ7rirK3uFWVpalkoS4LAwbdX7mfZ8aoBfFVQX1Re/8aw==} + file-entry-cache@8.0.0: resolution: {integrity: sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==} engines: {node: '>=16.0.0'} @@ -1978,6 +2043,17 @@ packages: resolution: {integrity: sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw==} engines: {node: '>=14'} + form-data-encoder@1.7.2: + resolution: {integrity: sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==} + + form-data@4.0.4: + resolution: {integrity: sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow==} + engines: {node: '>= 6'} + + 
formdata-node@4.4.1: + resolution: {integrity: sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==} + engines: {node: '>= 12.20'} + fsevents@2.3.3: resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==} engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} @@ -2126,6 +2202,9 @@ packages: resolution: {integrity: sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ==} engines: {node: '>=16.17.0'} + humanize-ms@1.2.1: + resolution: {integrity: sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==} + husky@9.1.7: resolution: {integrity: sha512-5gs5ytaNjBrh5Ow3zrvdUUY+0VxIuWVL4i9irt6friV+BqdCfmV11CQTWMiBYWHbXhco+J1kHfTOUkePhCDvMA==} engines: {node: '>=18'} @@ -2178,6 +2257,10 @@ packages: '@types/node': optional: true + install@0.13.0: + resolution: {integrity: sha512-zDml/jzr2PKU9I8J/xyZBQn8rPCAY//UOYNmR01XwNwyfhEWObo2SWfSl1+0tm1u6PhxLwDnfsT/6jB7OUxqFA==} + engines: {node: '>= 0.10'} + internal-slot@1.1.0: resolution: {integrity: sha512-4gd7VpWNQNB4UKKCFFVcp1AVv+FMOgs9NKzjHKusc8jTMhd5eL1NqQqOpE0KzMds804/yHlglp3uxgluOqAPLw==} engines: {node: '>= 0.4'} @@ -2186,6 +2269,9 @@ packages: resolution: {integrity: sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==} engines: {node: '>= 12'} + is-any-array@2.0.1: + resolution: {integrity: sha512-UtilS7hLRu++wb/WBAw9bNuP1Eg04Ivn1vERJck8zJthEvXCBEBpGR/33u/xLKWEQf95803oalHrVDptcAvFdQ==} + is-array-buffer@3.0.5: resolution: {integrity: sha512-DDfANUiiG2wC1qawP66qlTugJeL5HyzMpfr8lLK+jMQirGzNod0B12cFB/9q838Ru27sBwfw78/rdoU7RERz6A==} engines: {node: '>= 0.4'} @@ -2348,6 +2434,10 @@ packages: resolution: {integrity: sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw==} engines: {node: '>=10'} + js-levenshtein@1.1.6: + resolution: {integrity: sha512-X2BB11YZtrRqY4EnQcLX5Rh373zbK4alC1FW7D7MBhL2gtcC17cTnr6DmfHZeS0s2rTHjUTMMHfG7gO8SSdw+g==} + engines: {node: '>=0.10.0'} + js-tiktoken@1.0.20: resolution: {integrity: sha512-Xlaqhhs8VfCd6Sh7a1cFkZHQbYTLCwVJJWiHVxBYzLPxW0XsoxBy1hitmjkdIjD3Aon5BXLHFwU5O8WUx6HH+A==} @@ -2370,6 +2460,9 @@ packages: json-schema-traverse@0.4.1: resolution: {integrity: sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==} + json-schema-traverse@1.0.0: + resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==} + json-schema@0.4.0: resolution: {integrity: sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==} @@ -2408,6 +2501,9 @@ packages: resolution: {integrity: sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==} engines: {node: '>=14'} + linear-sum-assignment@1.0.7: + resolution: {integrity: sha512-jfLoSGwZNyjfY8eK4ayhjfcIu3BfWvP6sWieYzYI3AWldwXVoWEz1gtrQL10v/8YltYLBunqNjeVFXPMUs+MJg==} + lines-and-columns@1.2.4: resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} @@ -2508,10 +2604,18 @@ packages: resolution: {integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==} engines: {node: '>=8.6'} + mime-db@1.52.0: + resolution: {integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==} + engines: {node: '>= 0.6'} + 
mime-db@1.54.0: resolution: {integrity: sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==} engines: {node: '>= 0.6'} + mime-types@2.1.35: + resolution: {integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==} + engines: {node: '>= 0.6'} + mime-types@3.0.1: resolution: {integrity: sha512-xRc4oEhT6eaBpU1XF7AjpOFD+xQmXNB5OVKwp4tqCuBpHLS/ZbBDrc07mYTDqVMg6PfxUjjNp85O6Cd2Z/5HWA==} engines: {node: '>= 0.6'} @@ -2538,6 +2642,24 @@ packages: resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==} engines: {node: '>=16 || 14 >=14.17'} + ml-array-max@1.2.4: + resolution: {integrity: sha512-BlEeg80jI0tW6WaPyGxf5Sa4sqvcyY6lbSn5Vcv44lp1I2GR6AWojfUvLnGTNsIXrZ8uqWmo8VcG1WpkI2ONMQ==} + + ml-array-min@1.2.3: + resolution: {integrity: sha512-VcZ5f3VZ1iihtrGvgfh/q0XlMobG6GQ8FsNyQXD3T+IlstDv85g8kfV0xUG1QPRO/t21aukaJowDzMTc7j5V6Q==} + + ml-array-rescale@1.3.7: + resolution: {integrity: sha512-48NGChTouvEo9KBctDfHC3udWnQKNKEWN0ziELvY3KG25GR5cA8K8wNVzracsqSW1QEkAXjTNx+ycgAv06/1mQ==} + + ml-matrix@6.12.1: + resolution: {integrity: sha512-TJ+8eOFdp+INvzR4zAuwBQJznDUfktMtOB6g/hUcGh3rcyjxbz4Te57Pgri8Q9bhSQ7Zys4IYOGhFdnlgeB6Lw==} + + ml-spectra-processing@14.14.0: + resolution: {integrity: sha512-3+nQBRQwO4e5SwsuF/PJXN+mJOptBLZxaT2l/aqRy45lKHrkauA7qdXLhCZF/VLEXJr3TOEaFlZT2fDfnJBcrA==} + + ml-xsadd@3.0.1: + resolution: {integrity: sha512-Fz2q6dwgzGM8wYKGArTUTZDGa4lQFA2Vi6orjGeTVRy22ZnQFKlJuwS9n8NRviqz1KHAHAzdKJwbnYhdo38uYg==} + mlly@1.7.4: resolution: {integrity: sha512-qmdSIPC4bDJXgZTCR7XosJiNKySV7O215tsPtDN9iEO/7q/76b/ijtgRu/+epFXSJhijtTCCGp3DWS549P3xKw==} @@ -2582,9 +2704,23 @@ packages: resolution: {integrity: sha512-NHDDGYudnvRutt/VhKFlX26IotXe1w0cmkDm6JGquh5bz/bDTw0LufSmH/GxTjEdpHEO+bVKFTwdrcGa/9XlKQ==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + node-domexception@1.0.0: + resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} + engines: {node: '>=10.5.0'} + deprecated: Use your platform's native DOMException instead + node-fetch-native@1.6.7: resolution: {integrity: sha512-g9yhqoedzIUm0nTnTqAQvueMPVOuIY16bqgAJJC8XOOubYFNwz6IER9qs0Gq2Xd0+CecCKFjtdDTMA4u4xG06Q==} + node-fetch@2.7.0: + resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} + engines: {node: 4.x || >=6.0.0} + peerDependencies: + encoding: ^0.1.0 + peerDependenciesMeta: + encoding: + optional: true + normalize-package-data@6.0.2: resolution: {integrity: sha512-V6gygoYb/5EmNI+MEGrWkC+e6+Rr7mTmfHrxDbLzxQogBkgzo76rkok0Am6thgSF7Mv2nLOajAJj5vDJZEFn7g==} engines: {node: ^16.14.0 || >=18.0.0} @@ -2644,6 +2780,18 @@ packages: resolution: {integrity: sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA==} engines: {node: '>=18'} + openai@4.104.0: + resolution: {integrity: sha512-p99EFNsA/yX6UhVO93f5kJsDRLAg+CTA2RBqdHK4RtK8u5IJw32Hyb2dTGKbnnFmnuoBv5r7Z2CURI9sGZpSuA==} + hasBin: true + peerDependencies: + ws: ^8.18.0 + zod: ^3.23.8 + peerDependenciesMeta: + ws: + optional: true + zod: + optional: true + openai@5.10.2: resolution: {integrity: sha512-n+vi74LzHtvlKcDPn9aApgELGiu5CwhaLG40zxLTlFQdoSJCLACORIPC2uVQ3JEYAbqapM+XyRKFy2Thej7bIw==} hasBin: true @@ -2893,6 +3041,10 @@ packages: resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} engines: {node: 
'>=0.10.0'} + require-from-string@2.0.2: + resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==} + engines: {node: '>=0.10.0'} + require-in-the-middle@7.5.2: resolution: {integrity: sha512-gAZ+kLqBdHarXB64XpAe2VCjB7rIRv+mU8tfRWziHRJ5umKsIHN2tLLv6EtMw7WCdP19S0ERVMldNvxYCHnhSQ==} engines: {node: '>=8.6.0'} @@ -3178,6 +3330,9 @@ packages: resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==} engines: {node: '>=8.0'} + tr46@0.0.3: + resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} + tr46@1.0.1: resolution: {integrity: sha512-dTpowEjclQ7Kgx5SdBkqRzVhERQXov8/l9Ft9dVM9fmg0W0KQSVaXX9T4i6twCPNtYiZM53lpSSUAwJbFPOHxA==} @@ -3286,6 +3441,9 @@ packages: resolution: {integrity: sha512-nWJ91DjeOkej/TA8pXQ3myruKpKEYgqvpw9lz4OPHj/NWFNluYrjbz9j01CJ8yKQd2g4jFoOkINCTW2I5LEEyw==} engines: {node: '>= 0.4'} + undici-types@5.26.5: + resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} + undici-types@6.21.0: resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==} @@ -3331,6 +3489,12 @@ packages: validate-npm-package-license@3.0.4: resolution: {integrity: sha512-DpKm2Ui/xN7/HQKCtpZxoRWBhZ9Z0kqtygG8XCgNQ8ZlDnxuQmWhj566j8fN4Cu3/JmbhsDo7fcAJq4s9h27Ew==} + validate.io-array@1.0.6: + resolution: {integrity: sha512-DeOy7CnPEziggrOO5CZhVKJw6S3Yi7e9e65R1Nl/RTN1vTQKnzjfvks0/8kQ40FP/dsjRAOd4hxmJ7uLa6vxkg==} + + validate.io-function@1.0.2: + resolution: {integrity: sha512-LlFybRJEriSuBnUhQyG5bwglhh50EpTL2ul23MPIuR1odjO7XaMLFV8vHGwp7AZciFxtYOeiSCT5st+XSPONiQ==} + vite-node@3.2.4: resolution: {integrity: sha512-EbKSKh+bh1E1IFxeO0pg1n4dvoOTt0UDiXMd/qn++r98+jPO1xtJilvXldeuQ8giIB5IkpjCgMleHMNEsGH6pg==} engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} @@ -3404,6 +3568,13 @@ packages: jsdom: optional: true + web-streams-polyfill@4.0.0-beta.3: + resolution: {integrity: sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==} + engines: {node: '>= 14'} + + webidl-conversions@3.0.1: + resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==} + webidl-conversions@4.0.2: resolution: {integrity: sha512-YQ+BmxuTgd6UXZW3+ICGfyqRyHXVlD5GtQr5+qjiNW7bF0cqrzX500HVXPBOvgXb5YnzDd+h0zqyv61KUD7+Sg==} @@ -3419,6 +3590,9 @@ packages: resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==} engines: {node: '>=18'} + whatwg-url@5.0.0: + resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==} + whatwg-url@7.1.0: resolution: {integrity: sha512-WUu7Rg1DroM7oQvGWfOiAK21n74Gg+T4elXEQYkOhtyLeWiJFoOGLXPKI/9gzIie9CtwVLm8wtw6YJdKyxSjeg==} @@ -4402,6 +4576,15 @@ snapshots: '@types/mustache@4.2.6': {} + '@types/node-fetch@2.6.13': + dependencies: + '@types/node': 24.3.0 + form-data: 4.0.4 + + '@types/node@18.19.124': + dependencies: + undici-types: 5.26.5 + '@types/node@20.19.11': dependencies: undici-types: 6.21.0 @@ -4605,6 +4788,10 @@ snapshots: loupe: 3.2.0 tinyrainbow: 2.0.0 + abort-controller@3.0.0: + dependencies: + event-target-shim: 5.0.1 + acorn-import-attributes@1.9.5(acorn@8.15.0): dependencies: acorn: 8.15.0 @@ -4619,6 +4806,10 @@ snapshots: agent-base@7.1.4: {} + agentkeepalive@4.6.0: + 
dependencies: + humanize-ms: 1.2.1 + ai@5.0.23(zod@3.25.76): dependencies: '@ai-sdk/gateway': 1.0.12(zod@3.25.76) @@ -4634,6 +4825,13 @@ snapshots: json-schema-traverse: 0.4.1 uri-js: 4.4.1 + ajv@8.17.1: + dependencies: + fast-deep-equal: 3.1.3 + fast-uri: 3.1.0 + json-schema-traverse: 1.0.0 + require-from-string: 2.0.2 + ansi-escapes@4.3.2: dependencies: type-fest: 0.21.3 @@ -4718,6 +4916,23 @@ snapshots: dependencies: retry: 0.13.1 + asynckit@0.4.0: {} + + autoevals@0.0.131: + dependencies: + ajv: 8.17.1 + compute-cosine-similarity: 1.1.0 + js-levenshtein: 1.1.6 + js-yaml: 4.1.0 + linear-sum-assignment: 1.0.7 + mustache: 4.2.0 + openai: 4.104.0(zod@3.25.76) + zod: 3.25.76 + zod-to-json-schema: 3.24.6(zod@3.25.76) + transitivePeerDependencies: + - encoding + - ws + available-typed-arrays@1.0.7: dependencies: possible-typed-array-names: 1.1.0 @@ -4730,6 +4945,8 @@ snapshots: before-after-hook@3.0.2: {} + binary-search@1.3.6: {} + boolbase@1.0.0: {} brace-expansion@1.1.12: @@ -4838,6 +5055,8 @@ snapshots: undici: 7.13.0 whatwg-mimetype: 4.0.0 + cheminfo-types@1.8.1: {} + chokidar@4.0.3: dependencies: readdirp: 4.1.2 @@ -4870,6 +5089,10 @@ snapshots: color-name@1.1.4: {} + combined-stream@1.0.8: + dependencies: + delayed-stream: 1.0.0 + commander@4.1.1: {} compare-func@2.0.0: @@ -4877,6 +5100,23 @@ snapshots: array-ify: 1.0.0 dot-prop: 5.3.0 + compute-cosine-similarity@1.1.0: + dependencies: + compute-dot: 1.1.0 + compute-l2norm: 1.1.0 + validate.io-array: 1.0.6 + validate.io-function: 1.0.2 + + compute-dot@1.1.0: + dependencies: + validate.io-array: 1.0.6 + validate.io-function: 1.0.2 + + compute-l2norm@1.1.0: + dependencies: + validate.io-array: 1.0.6 + validate.io-function: 1.0.2 + concat-map@0.0.1: {} concat-stream@2.0.0: @@ -5053,6 +5293,8 @@ snapshots: escodegen: 2.1.0 esprima: 4.0.1 + delayed-stream@1.0.0: {} + destr@2.0.5: {} detect-indent@7.0.1: {} @@ -5371,6 +5613,8 @@ snapshots: eta@3.5.0: {} + event-target-shim@5.0.1: {} + eventemitter3@4.0.7: {} eventsource-parser@3.0.5: {} @@ -5415,6 +5659,8 @@ snapshots: fast-levenshtein@2.0.6: {} + fast-uri@3.1.0: {} + fastq@1.19.1: dependencies: reusify: 1.1.0 @@ -5427,6 +5673,8 @@ snapshots: optionalDependencies: picomatch: 4.0.3 + fft.js@4.0.4: {} + file-entry-cache@8.0.0: dependencies: flat-cache: 4.0.1 @@ -5464,6 +5712,21 @@ snapshots: cross-spawn: 7.0.6 signal-exit: 4.1.0 + form-data-encoder@1.7.2: {} + + form-data@4.0.4: + dependencies: + asynckit: 0.4.0 + combined-stream: 1.0.8 + es-set-tostringtag: 2.1.0 + hasown: 2.0.2 + mime-types: 2.1.35 + + formdata-node@4.4.1: + dependencies: + node-domexception: 1.0.0 + web-streams-polyfill: 4.0.0-beta.3 + fsevents@2.3.3: optional: true @@ -5648,6 +5911,10 @@ snapshots: human-signals@5.0.0: {} + humanize-ms@1.2.1: + dependencies: + ms: 2.1.3 + husky@9.1.7: {} iconv-lite@0.4.24: @@ -5694,6 +5961,8 @@ snapshots: optionalDependencies: '@types/node': 24.3.0 + install@0.13.0: {} + internal-slot@1.1.0: dependencies: es-errors: 1.3.0 @@ -5705,6 +5974,8 @@ snapshots: jsbn: 1.1.0 sprintf-js: 1.1.3 + is-any-array@2.0.1: {} + is-array-buffer@3.0.5: dependencies: call-bind: 1.0.8 @@ -5860,6 +6131,8 @@ snapshots: joycon@3.1.1: {} + js-levenshtein@1.1.6: {} + js-tiktoken@1.0.20: dependencies: base64-js: 1.5.1 @@ -5878,6 +6151,8 @@ snapshots: json-schema-traverse@0.4.1: {} + json-schema-traverse@1.0.0: {} + json-schema@0.4.0: {} json-stable-stringify-without-jsonify@1.0.1: {} @@ -5912,6 +6187,13 @@ snapshots: lilconfig@3.1.3: {} + linear-sum-assignment@1.0.7: + dependencies: + cheminfo-types: 1.8.1 + install: 
0.13.0 + ml-matrix: 6.12.1 + ml-spectra-processing: 14.14.0 + lines-and-columns@1.2.4: {} linkify-it@5.0.0: @@ -5996,8 +6278,14 @@ snapshots: braces: 3.0.3 picomatch: 2.3.1 + mime-db@1.52.0: {} + mime-db@1.54.0: {} + mime-types@2.1.35: + dependencies: + mime-db: 1.52.0 + mime-types@3.0.1: dependencies: mime-db: 1.54.0 @@ -6018,6 +6306,36 @@ snapshots: minipass@7.1.2: {} + ml-array-max@1.2.4: + dependencies: + is-any-array: 2.0.1 + + ml-array-min@1.2.3: + dependencies: + is-any-array: 2.0.1 + + ml-array-rescale@1.3.7: + dependencies: + is-any-array: 2.0.1 + ml-array-max: 1.2.4 + ml-array-min: 1.2.3 + + ml-matrix@6.12.1: + dependencies: + is-any-array: 2.0.1 + ml-array-rescale: 1.3.7 + + ml-spectra-processing@14.14.0: + dependencies: + binary-search: 1.3.6 + cheminfo-types: 1.8.1 + fft.js: 4.0.4 + is-any-array: 2.0.1 + ml-matrix: 6.12.1 + ml-xsadd: 3.0.1 + + ml-xsadd@3.0.1: {} + mlly@1.7.4: dependencies: acorn: 8.15.0 @@ -6053,8 +6371,14 @@ snapshots: dependencies: type-fest: 2.19.0 + node-domexception@1.0.0: {} + node-fetch-native@1.6.7: {} + node-fetch@2.7.0: + dependencies: + whatwg-url: 5.0.0 + normalize-package-data@6.0.2: dependencies: hosted-git-info: 7.0.2 @@ -6129,6 +6453,20 @@ snapshots: is-inside-container: 1.0.0 wsl-utils: 0.1.0 + openai@4.104.0(zod@3.25.76): + dependencies: + '@types/node': 18.19.124 + '@types/node-fetch': 2.6.13 + abort-controller: 3.0.0 + agentkeepalive: 4.6.0 + form-data-encoder: 1.7.2 + formdata-node: 4.4.1 + node-fetch: 2.7.0 + optionalDependencies: + zod: 3.25.76 + transitivePeerDependencies: + - encoding + openai@5.10.2(zod@3.25.76): optionalDependencies: zod: 3.25.76 @@ -6429,6 +6767,8 @@ snapshots: require-directory@2.1.1: {} + require-from-string@2.0.2: {} + require-in-the-middle@7.5.2: dependencies: debug: 4.4.1 @@ -6751,6 +7091,8 @@ snapshots: dependencies: is-number: 7.0.0 + tr46@0.0.3: {} + tr46@1.0.1: dependencies: punycode: 2.3.1 @@ -6877,6 +7219,8 @@ snapshots: has-symbols: 1.1.0 which-boxed-primitive: 1.1.1 + undici-types@5.26.5: {} + undici-types@6.21.0: {} undici-types@7.10.0: {} @@ -6908,6 +7252,10 @@ snapshots: spdx-correct: 3.2.0 spdx-expression-parse: 3.0.1 + validate.io-array@1.0.6: {} + + validate.io-function@1.0.2: {} + vite-node@3.2.4(@types/node@24.3.0)(jiti@2.5.1)(tsx@4.20.3)(yaml@2.8.0): dependencies: cac: 6.7.14 @@ -6986,6 +7334,10 @@ snapshots: - tsx - yaml + web-streams-polyfill@4.0.0-beta.3: {} + + webidl-conversions@3.0.1: {} + webidl-conversions@4.0.2: {} whatwg-encoding@3.1.1: @@ -6996,6 +7348,11 @@ snapshots: whatwg-mimetype@4.0.0: {} + whatwg-url@5.0.0: + dependencies: + tr46: 0.0.3 + webidl-conversions: 3.0.1 + whatwg-url@7.1.0: dependencies: lodash.sortby: 4.7.0 diff --git a/tests/e2e/experiments.e2e.test.ts b/tests/e2e/experiments.e2e.test.ts new file mode 100644 index 00000000..b31c42c5 --- /dev/null +++ b/tests/e2e/experiments.e2e.test.ts @@ -0,0 +1,1072 @@ +import { + Evaluator, + ExperimentTask, + LangfuseClient, + RunEvaluator, + createEvaluatorFromAutoevals, +} from "@langfuse/client"; +import { observeOpenAI } from "@langfuse/openai"; +import { Factuality, Levenshtein } from "autoevals"; +import { nanoid } from "nanoid"; +import OpenAI from "openai"; +import { describe, it, afterEach, beforeEach, expect } from "vitest"; + +import { + setupServerTestEnvironment, + teardownServerTestEnvironment, + waitForServerIngestion, + type ServerTestEnvironment, +} from "./helpers/serverSetup.js"; + +describe("Langfuse Datasets E2E", () => { + let langfuse: LangfuseClient; + let testEnv: ServerTestEnvironment; + + const 
dataset = [ + { + input: "Germany", + expectedOutput: "Berlin", + }, + { + input: "France", + expectedOutput: "Paris", + }, + { + input: "Spain", + expectedOutput: "Madrid", + }, + ]; + + const task: ExperimentTask = async (params) => { + const client = observeOpenAI(new OpenAI()); + + const response = await client.chat.completions.create({ + model: "gpt-4.1", + messages: [ + { + role: "user", + content: `What is the capital of ${params.input}? Be cheeky sometimes in your answer and give the unofficial one. Respond in one word.`, + }, + ], + }); + + return response.choices[0].message.content; + }; + + const factualityEvaluator: Evaluator = async (params) => { + const response = await new OpenAI().chat.completions.parse({ + model: "gpt-4.1", + messages: [ + { + role: "user", + content: `Rate the correctness of this sentence: The capital of ${params.input} is ${params.output}`, + }, + ], + response_format: { + type: "json_schema", + json_schema: { + name: "score", + description: + "score between 0 to 1 where 0 is false and 1 is correct.", + schema: { + $schema: "http://json-schema.org/draft-04/schema#", + type: "object", + properties: { + score: { + type: "integer", + }, + reasoning: { + type: "string", + }, + }, + required: ["score", "reasoning"], + }, + }, + }, + }); + + const parsed = JSON.parse(response.choices[0].message.content!); + + return [ + { + name: "manual-factuality", + value: parsed.score, + comment: parsed.reasoning, + metadata: { reasoning: parsed.reasoning }, + }, + ]; + }; + + const levenshteinAverageRunEvaluator: RunEvaluator = async ({ + itemResults, + }) => { + const average = itemResults + .map((result) => + result.evaluations.filter((e) => e.name === "Levenshtein"), + ) + .flat() + .reduce((acc, curr, _, array) => { + return acc + (curr.value as number) / array.length; + }, 0); + + return { + name: "levenshtein-average", + value: average, + }; + }; + + beforeEach(async () => { + testEnv = await setupServerTestEnvironment(); + langfuse = new LangfuseClient(); + }); + + afterEach(async () => { + await teardownServerTestEnvironment(testEnv); + await langfuse.flush(); + }); + + it("should run an experiment on local dataset", async () => { + const result = await langfuse.experiment.run({ + name: "Euro capitals", + description: "Country capital experiment", + data: dataset, + task, + evaluators: [ + createEvaluatorFromAutoevals(Factuality), + createEvaluatorFromAutoevals(Levenshtein), + factualityEvaluator, + ], + runEvaluators: [levenshteinAverageRunEvaluator], + }); + + console.log(await result.format()); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + // Validate basic result structure + expect(result.itemResults).toHaveLength(3); + expect(result.runEvaluations).toHaveLength(1); + expect(result.runEvaluations[0]).toMatchObject({ + name: "levenshtein-average", + value: expect.any(Number), + }); + // Should have generated runName (experiment name + timestamp) + expect(result.runName).toMatch( + /^Euro capitals - \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/, + ); + // No datasetRunId for local datasets + expect(result.datasetRunId).toBeUndefined(); + + // Validate item results structure + result.itemResults.forEach((itemResult, index) => { + expect(itemResult).toMatchObject({ + output: expect.any(String), + evaluations: expect.arrayContaining([ + expect.objectContaining({ + name: "Factuality", + value: expect.any(Number), + }), + expect.objectContaining({ + name: "Levenshtein", + value: expect.any(Number), + }), + 
expect.objectContaining({ + name: "manual-factuality", + value: expect.any(Number), + }), + ]), + traceId: expect.any(String), + }); + + // Should have 3 evaluations per item + expect(itemResult.evaluations).toHaveLength(3); + // No datasetRunId for local datasets + expect(itemResult.datasetRunId).toBeUndefined(); + }); + }); + + it("should run an experiment on a langfuse dataset", async () => { + // create remote dataset + const datasetName = "euro-capitals-" + nanoid(); + await langfuse.api.datasets.create({ + name: datasetName, + description: "Collection of euro countries and capitals", + }); + + // create remote dataset items + await Promise.all( + dataset.map((item) => + langfuse.api.datasetItems.create({ datasetName, ...item }), + ), + ); + + const fetchedDataset = await langfuse.dataset.get(datasetName); + + const experimentName = "Euro capitals on LF dataset"; + const result = await fetchedDataset.runExperiment({ + name: experimentName, + description: "Country capital experiment", + task, + evaluators: [ + createEvaluatorFromAutoevals(Factuality), + createEvaluatorFromAutoevals(Levenshtein), + factualityEvaluator, + ], + runEvaluators: [levenshteinAverageRunEvaluator], + }); + + console.log(await result.format()); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + // Validate basic result structure + expect(result.itemResults).toHaveLength(3); + expect(result.runEvaluations).toHaveLength(1); + expect(result.runEvaluations[0]).toMatchObject({ + name: "levenshtein-average", + value: expect.any(Number), + }); + // Should have generated runName (experiment name + timestamp) + expect(result.runName).toMatch( + /^Euro capitals on LF dataset - \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/, + ); + expect(result.datasetRunId).toBeDefined(); + + // Validate item results structure + result.itemResults.forEach((itemResult, index) => { + expect(itemResult).toMatchObject({ + output: expect.any(String), + evaluations: expect.arrayContaining([ + expect.objectContaining({ + name: "Factuality", + value: expect.any(Number), + }), + expect.objectContaining({ + name: "Levenshtein", + value: expect.any(Number), + }), + expect.objectContaining({ + name: "manual-factuality", + value: expect.any(Number), + }), + ]), + traceId: expect.any(String), + datasetRunId: expect.any(String), + }); + + // Should have 3 evaluations per item + expect(itemResult.evaluations).toHaveLength(3); + }); + + // Fetch dataset run from API and validate against database + const datasetRun = await langfuse.api.datasets.getRun( + datasetName, + result.runName, + ); + + expect(datasetRun).toBeDefined(); + expect(datasetRun).toMatchObject({ + name: result.runName, + description: "Country capital experiment", + datasetId: fetchedDataset.id, + datasetName: datasetName, + }); + + // Validate dataset run items + expect(datasetRun.datasetRunItems).toHaveLength(3); + + // Each run item should correspond to one of our experiment results + result.itemResults.forEach((itemResult) => { + const correspondingRunItem = datasetRun.datasetRunItems.find( + (runItem) => runItem.traceId === itemResult.traceId, + ); + + expect(correspondingRunItem).toBeDefined(); + expect(correspondingRunItem).toMatchObject({ + traceId: itemResult.traceId, + datasetItemId: expect.any(String), + }); + }); + + // Validate that traces contain the expected scores + // Each trace should have 3 item-level evaluations + 1 run-level evaluation + const expectedTraceIds = result.itemResults.map((r) => r.traceId); + 
expect(expectedTraceIds).toHaveLength(3); + expectedTraceIds.forEach((traceId) => { + expect(traceId).toMatch(/^[a-f0-9]{32}$/); + }); + }); + + it("should support custom runName parameter", async () => { + // create remote dataset + const datasetName = "custom-run-name-test-" + nanoid(); + await langfuse.api.datasets.create({ + name: datasetName, + description: "Test custom run names", + }); + + // create remote dataset items + await Promise.all( + dataset + .slice(0, 2) + .map((item) => + langfuse.api.datasetItems.create({ datasetName, ...item }), + ), + ); + + const fetchedDataset = await langfuse.dataset.get(datasetName); + + const customRunName = "Custom Run Name " + nanoid(); + const result = await fetchedDataset.runExperiment({ + name: "Test Experiment", + runName: customRunName, + description: "Testing custom run name", + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + // Should use the custom run name exactly + expect(result.runName).toBe(customRunName); + expect(result.datasetRunId).toBeDefined(); + + // Fetch dataset run and verify it has the custom name + const datasetRun = await langfuse.api.datasets.getRun( + datasetName, + customRunName, + ); + + expect(datasetRun).toBeDefined(); + expect(datasetRun).toMatchObject({ + name: customRunName, + description: "Testing custom run name", + datasetId: fetchedDataset.id, + datasetName: datasetName, + }); + }); + + it("should support custom runName with local datasets", async () => { + const customRunName = "Local Custom Run " + nanoid(); + const result = await langfuse.experiment.run({ + name: "Local Test Experiment", + runName: customRunName, + description: "Testing custom run name with local data", + data: dataset.slice(0, 2), + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Should use the custom run name exactly + expect(result.runName).toBe(customRunName); + expect(result.itemResults).toHaveLength(2); + // No dataset run for local datasets + expect(result.datasetRunId).toBeUndefined(); + }); + + // Error Handling Tests + describe("Error Handling", () => { + it("should handle evaluator failures gracefully", async () => { + const failingEvaluator: Evaluator = async () => { + throw new Error("Evaluator failed"); + }; + + const result = await langfuse.experiment.run({ + name: "Error test", + description: "Test evaluator error handling", + data: dataset.slice(0, 1), // Just one item + task, + evaluators: [ + createEvaluatorFromAutoevals(Factuality), // This should work + failingEvaluator, // This should fail + ], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Should still complete the experiment + expect(result.itemResults).toHaveLength(1); + expect(result.itemResults[0].evaluations).toHaveLength(1); // Only the working evaluator + expect(result.itemResults[0].evaluations[0].name).toBe("Factuality"); + }); + + it("should handle task failures gracefully", async () => { + const failingTask: ExperimentTask = async () => { + throw new Error("Task failed"); + }; + + // The experiment should handle the task failure gracefully by skipping the failed item + const result = await langfuse.experiment.run({ + name: "Task error test", + description: "Test task error handling", + data: dataset.slice(0, 1), + task: failingTask, + evaluators: 
[createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Should complete experiment but skip the failed item + expect(result.itemResults).toHaveLength(0); + expect(result.runEvaluations).toHaveLength(0); + }); + + it("should handle mixed task success and failures", async () => { + const mixedTask: ExperimentTask = async ({ input }) => { + if (input === "Germany") { + throw new Error("Task failed for Germany"); + } + return `Capital of ${input}`; + }; + + const result = await langfuse.experiment.run({ + name: "Mixed task results test", + description: "Test mixed success/failure handling", + data: dataset.slice(0, 2), // Germany and France + task: mixedTask, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Should complete experiment with only successful items + expect(result.itemResults).toHaveLength(1); // Only France should succeed + expect(result.itemResults[0].output).toContain("France"); + expect(result.itemResults[0].evaluations).toHaveLength(1); + }); + + it("should handle run evaluator failures", async () => { + const failingRunEvaluator: RunEvaluator = async () => { + throw new Error("Run evaluator failed"); + }; + + const result = await langfuse.experiment.run({ + name: "Run evaluator error test", + description: "Test run evaluator error handling", + data: dataset.slice(0, 1), + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + runEvaluators: [failingRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Should complete experiment but run evaluations should be empty + expect(result.itemResults).toHaveLength(1); + expect(result.runEvaluations).toHaveLength(0); + }); + }); + + // Edge Cases Tests + describe("Edge Cases", () => { + it("should handle empty dataset", async () => { + const result = await langfuse.experiment.run({ + name: "Empty dataset test", + description: "Test empty dataset handling", + data: [], + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + runEvaluators: [levenshteinAverageRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(500); + + expect(result.itemResults).toHaveLength(0); + expect(result.runEvaluations).toHaveLength(1); // Run evaluators will still execute with empty data + expect(await result.format()).toContain("No experiment results"); + }); + + it("should handle dataset with missing fields", async () => { + const incompleteDataset = [ + { input: "Germany" }, // Missing expectedOutput + { expectedOutput: "Paris" }, // Missing input + { input: "Spain", expectedOutput: "Madrid" }, // Complete + ]; + + const result = await langfuse.experiment.run({ + name: "Incomplete data test", + description: "Test incomplete dataset handling", + data: incompleteDataset, + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(3); + // Should handle missing fields gracefully + result.itemResults.forEach((item) => { + expect(item.traceId).toBeDefined(); + expect(item.output).toBeDefined(); + }); + }); + + it("should handle very large dataset", async () => { + // Create a larger dataset for performance testing + const largeDataset = Array.from({ length: 20 }, (_, i) => ({ + input: `Country ${i}`, + 
expectedOutput: `Capital ${i}`, + })); + + const result = await langfuse.experiment.run({ + name: "Large dataset test", + description: "Test performance with larger dataset", + data: largeDataset, + task: async ({ input }) => `Output for ${input}`, + evaluators: [ + async () => ({ + name: "simple-eval", + value: Math.random(), + }), + ], + maxConcurrency: 5, // Test concurrency limit + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(3000); + + expect(result.itemResults).toHaveLength(20); + result.itemResults.forEach((item) => { + expect(item.evaluations).toHaveLength(1); + expect(item.traceId).toBeDefined(); + }); + }, 30000); + }); + + // New Features Tests + describe("New Features", () => { + it("should support evaluators returning single evaluation", async () => { + const singleEvaluationEvaluator: Evaluator = async ({ + input, + output, + }) => { + // Return single evaluation instead of array + return { + name: "single-eval", + value: input === "Germany" ? 1 : 0, + comment: `Single evaluation for ${input}`, + }; + }; + + const result = await langfuse.experiment.run({ + name: "Single evaluation test", + description: "Test single evaluation return", + data: dataset.slice(0, 2), + task, + evaluators: [singleEvaluationEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(2); + result.itemResults.forEach((item) => { + expect(item.evaluations).toHaveLength(1); + expect(item.evaluations[0]).toMatchObject({ + name: "single-eval", + value: expect.any(Number), + comment: expect.stringContaining("Single evaluation for"), + }); + }); + }); + + it("should support run evaluators returning single evaluation", async () => { + const singleRunEvaluator: RunEvaluator = async ({ itemResults }) => { + // Return single evaluation instead of array + return { + name: "single-run-eval", + value: itemResults.length, + comment: `Processed ${itemResults.length} items`, + }; + }; + + const result = await langfuse.experiment.run({ + name: "Single run evaluation test", + description: "Test single run evaluation return", + data: dataset.slice(0, 2), + task, + runEvaluators: [singleRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.runEvaluations).toHaveLength(1); + expect(result.runEvaluations[0]).toMatchObject({ + name: "single-run-eval", + value: 2, + comment: "Processed 2 items", + }); + }); + + it("should support format with includeItemResults option", async () => { + const result = await langfuse.experiment.run({ + name: "Format options test", + description: "Test format options", + data: dataset, + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + runEvaluators: [levenshteinAverageRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + // Test with includeItemResults: false (default) + const compactOutput = await result.format(); + expect(compactOutput).toContain("Individual Results: Hidden"); + expect(compactOutput).toContain( + "Call format({ includeItemResults: true })", + ); + expect(compactOutput).toContain("Format options test"); // Should still show summary + + // Test with includeItemResults: true + const fullOutput = await result.format({ includeItemResults: true }); + expect(fullOutput).toContain("1. Item 1:"); + expect(fullOutput).toContain("2. Item 2:"); + expect(fullOutput).toContain("3. 
Item 3:"); + + // Test default behavior (should be same as false) + const defaultOutput = await result.format(); + expect(defaultOutput).toEqual(compactOutput); + }); + }); + + // Concurrency and Performance Tests + describe("Concurrency and Performance", () => { + it("should respect maxConcurrency parameter", async () => { + let concurrentCount = 0; + let maxConcurrentReached = 0; + + const slowTask: ExperimentTask = async ({ input }) => { + concurrentCount++; + maxConcurrentReached = Math.max(maxConcurrentReached, concurrentCount); + + // Simulate slow operation + await new Promise((resolve) => setTimeout(resolve, 100)); + + concurrentCount--; + return `Processed ${input}`; + }; + + const testData = Array.from({ length: 10 }, (_, i) => ({ + input: `Item ${i}`, + expectedOutput: `Expected ${i}`, + })); + + const result = await langfuse.experiment.run({ + name: "Concurrency test", + description: "Test maxConcurrency parameter", + data: testData, + task: slowTask, + maxConcurrency: 3, + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + expect(result.itemResults).toHaveLength(10); + expect(maxConcurrentReached).toBeLessThanOrEqual(3); + }, 15000); + + it("should handle evaluators with different execution times", async () => { + const fastEvaluator: Evaluator = async () => ({ + name: "fast-eval", + value: 1, + }); + + const slowEvaluator: Evaluator = async () => { + await new Promise((resolve) => setTimeout(resolve, 200)); + return { + name: "slow-eval", + value: 0.5, + }; + }; + + const start = Date.now(); + const result = await langfuse.experiment.run({ + name: "Mixed speed evaluators test", + description: "Test evaluators with different execution times", + data: dataset.slice(0, 2), + task, + evaluators: [fastEvaluator, slowEvaluator], + }); + const duration = Date.now() - start; + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(2); + result.itemResults.forEach((item) => { + expect(item.evaluations).toHaveLength(2); + expect(item.evaluations.map((e) => e.name)).toContain("fast-eval"); + expect(item.evaluations.map((e) => e.name)).toContain("slow-eval"); + }); + + // Should complete in reasonable time (parallel execution) + expect(duration).toBeLessThan(2000); // Should be much faster than 400ms * 2 items sequentially + }, 10000); + }); + + // Data Persistence and API Integration Tests + describe("Data Persistence and API Integration", () => { + it("should persist scores correctly", async () => { + const datasetName = "score-persistence-test-" + nanoid(); + await langfuse.api.datasets.create({ name: datasetName }); + + const testItem = { + input: "Test input", + expectedOutput: "Test output", + }; + + const createdItem = await langfuse.api.datasetItems.create({ + datasetName, + ...testItem, + }); + + const fetchedDataset = await langfuse.dataset.get(datasetName); + + const testEvaluator: Evaluator = async () => ({ + name: "persistence-test-eval", + value: 0.85, + comment: "Test evaluation for persistence", + }); + + const testRunEvaluator: RunEvaluator = async () => ({ + name: "persistence-test-run-eval", + value: 0.9, + comment: "Test run evaluation for persistence", + }); + + const result = await fetchedDataset.runExperiment({ + name: "Score persistence test", + description: "Test score persistence", + task, + evaluators: [testEvaluator], + runEvaluators: [testRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(3000); + + // 
Validate scores are persisted + const datasetRun = await langfuse.api.datasets.getRun( + datasetName, + result.runName, + ); + + expect(datasetRun).toBeDefined(); + expect(datasetRun.datasetRunItems).toHaveLength(1); + + // Validate item-level scores are linked to traces + const runItem = datasetRun.datasetRunItems[0]; + expect(runItem.traceId).toBe(result.itemResults[0].traceId); + }); + + it("should handle multiple experiments on same dataset", async () => { + const datasetName = "multi-experiment-test-" + nanoid(); + await langfuse.api.datasets.create({ name: datasetName }); + + await Promise.all( + dataset + .slice(0, 2) + .map((item) => + langfuse.api.datasetItems.create({ datasetName, ...item }), + ), + ); + + const fetchedDataset = await langfuse.dataset.get(datasetName); + + // Run first experiment + const result1 = await fetchedDataset.runExperiment({ + name: "Experiment 1", + description: "First experiment", + task, + evaluators: [createEvaluatorFromAutoevals(Factuality)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + // Run second experiment + const result2 = await fetchedDataset.runExperiment({ + name: "Experiment 2", + description: "Second experiment", + task, + evaluators: [createEvaluatorFromAutoevals(Levenshtein)], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + // Both experiments should have different run IDs + expect(result1.datasetRunId).toBeDefined(); + expect(result2.datasetRunId).toBeDefined(); + expect(result1.datasetRunId).not.toBe(result2.datasetRunId); + + // Validate both runs exist in database + const run1 = await langfuse.api.datasets.getRun( + datasetName, + result1.runName, + ); + const run2 = await langfuse.api.datasets.getRun( + datasetName, + result2.runName, + ); + + expect(run1).toBeDefined(); + expect(run2).toBeDefined(); + expect(run1.id).not.toBe(run2.id); + }); + + it("should preserve dataset run metadata", async () => { + const datasetName = "metadata-test-" + nanoid(); + await langfuse.api.datasets.create({ name: datasetName }); + + await langfuse.api.datasetItems.create({ + datasetName, + input: "Test", + expectedOutput: "Test output", + }); + + const fetchedDataset = await langfuse.dataset.get(datasetName); + + const result = await fetchedDataset.runExperiment({ + name: "Metadata test experiment", + description: "Testing metadata preservation", + metadata: { testKey: "testValue", experimentVersion: "1.0" }, + task, + evaluators: [ + async () => ({ + name: "test-eval", + value: 1, + metadata: { evaluatorVersion: "2.0" }, + }), + ], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(2000); + + const datasetRun = await langfuse.api.datasets.getRun( + datasetName, + result.runName, + ); + + expect(datasetRun).toMatchObject({ + name: result.runName, + description: "Testing metadata preservation", + metadata: { testKey: "testValue", experimentVersion: "1.0" }, + }); + }); + }); + + // Different Evaluator Configurations Tests + describe("Different Evaluator Configurations", () => { + it("should work with no evaluators", async () => { + const result = await langfuse.experiment.run({ + name: "No evaluators test", + description: "Test experiment with no evaluators", + data: dataset.slice(0, 2), + task, + evaluators: [], // No evaluators + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(2); + result.itemResults.forEach((item) => { + 
expect(item.evaluations).toHaveLength(0); + expect(item.traceId).toBeDefined(); + expect(item.output).toBeDefined(); + }); + expect(result.runEvaluations).toHaveLength(0); + }); + + it("should work with only run evaluators", async () => { + const onlyRunEvaluator: RunEvaluator = async ({ itemResults }) => ({ + name: "run-only-eval", + value: itemResults.length * 10, + comment: `Run-level evaluation of ${itemResults.length} items`, + }); + + const result = await langfuse.experiment.run({ + name: "Only run evaluators test", + description: "Test with only run evaluators", + data: dataset.slice(0, 3), + task, + evaluators: [], // No item evaluators + runEvaluators: [onlyRunEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(3); + result.itemResults.forEach((item) => { + expect(item.evaluations).toHaveLength(0); // No item evaluations + expect(item.traceId).toBeDefined(); + }); + + expect(result.runEvaluations).toHaveLength(1); + expect(result.runEvaluations[0]).toMatchObject({ + name: "run-only-eval", + value: 30, // 3 items * 10 + }); + }); + + it("should handle mix of sync and async evaluators", async () => { + const asyncEvaluator: Evaluator = async ({ input }) => { + await new Promise((resolve) => setTimeout(resolve, 50)); + return { + name: "async-eval", + value: input.length / 10, + }; + }; + + // Simulated sync evaluator (still returns Promise per type signature) + const syncEvaluator: Evaluator = async ({ input }) => { + return { + name: "sync-eval", + value: input === "Germany" ? 1 : 0, + }; + }; + + const result = await langfuse.experiment.run({ + name: "Mixed sync/async evaluators test", + description: "Test mix of sync and async evaluators", + data: dataset.slice(0, 2), + task, + evaluators: [asyncEvaluator, syncEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(2); + result.itemResults.forEach((item) => { + expect(item.evaluations).toHaveLength(2); + const evalNames = item.evaluations.map((e) => e.name); + expect(evalNames).toContain("async-eval"); + expect(evalNames).toContain("sync-eval"); + }); + }); + + it("should handle evaluators returning different data types", async () => { + const numberEvaluator: Evaluator = async () => ({ + name: "number-eval", + value: 42, + }); + + const stringEvaluator: Evaluator = async () => ({ + name: "string-eval", + value: "excellent", + }); + + const booleanEvaluator: Evaluator = async () => ({ + name: "boolean-eval", + value: true, + dataType: "BOOLEAN", + }); + + const result = await langfuse.experiment.run({ + name: "Different data types test", + description: "Test evaluators with different return value types", + data: dataset.slice(0, 1), + task, + evaluators: [numberEvaluator, stringEvaluator, booleanEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(1); + const evaluations = result.itemResults[0].evaluations; + expect(evaluations).toHaveLength(3); + + const numberEval = evaluations.find((e) => e.name === "number-eval"); + const stringEval = evaluations.find((e) => e.name === "string-eval"); + const booleanEval = evaluations.find((e) => e.name === "boolean-eval"); + + expect(numberEval?.value).toBe(42); + expect(stringEval?.value).toBe("excellent"); + expect(booleanEval?.value).toBe(true); + }); + + it("should handle complex evaluator metadata and comments", 
async () => { + const complexEvaluator: Evaluator = async ({ + input, + output, + expectedOutput, + }) => [ + { + name: "detailed-eval", + value: 0.85, + comment: `Detailed analysis: input="${input}", output="${output}", expected="${expectedOutput}"`, + metadata: { + inputLength: input?.length || 0, + outputLength: output?.length || 0, + timestamp: new Date().toISOString(), + evaluatorVersion: "1.2.3", + }, + dataType: "NUMERIC" as const, + }, + { + name: "secondary-eval", + value: input === expectedOutput ? "perfect" : "imperfect", + comment: "Secondary evaluation result", + metadata: { secondary: true }, + }, + ]; + + const result = await langfuse.experiment.run({ + name: "Complex evaluator test", + description: "Test evaluators with complex metadata", + data: dataset.slice(0, 1), + task, + evaluators: [complexEvaluator], + }); + + await testEnv.spanProcessor.forceFlush(); + await waitForServerIngestion(1000); + + expect(result.itemResults).toHaveLength(1); + const evaluations = result.itemResults[0].evaluations; + expect(evaluations).toHaveLength(2); + + const detailedEval = evaluations.find((e) => e.name === "detailed-eval"); + expect(detailedEval).toMatchObject({ + name: "detailed-eval", + value: 0.85, + comment: expect.stringContaining("Detailed analysis"), + metadata: expect.objectContaining({ + inputLength: expect.any(Number), + evaluatorVersion: "1.2.3", + }), + dataType: "NUMERIC", + }); + + const secondaryEval = evaluations.find( + (e) => e.name === "secondary-eval", + ); + expect(secondaryEval).toMatchObject({ + name: "secondary-eval", + value: expect.any(String), + metadata: { secondary: true }, + }); + }); + }); +}); diff --git a/tests/e2e/openai.e2e.test.ts b/tests/e2e/openai.e2e.test.ts index fed19fef..67fb571b 100644 --- a/tests/e2e/openai.e2e.test.ts +++ b/tests/e2e/openai.e2e.test.ts @@ -148,7 +148,7 @@ describe("OpenAI integration E2E tests", () => { expect(content).toBeDefined(); await testEnv.spanProcessor.forceFlush(); - await waitForServerIngestion(2000); + await waitForServerIngestion(4000); const traces = await langfuseClient.api.trace.list({ name: generationName, diff --git a/tests/e2e/tracing.e2e.test.ts b/tests/e2e/tracing.e2e.test.ts index b098ffbe..5ca0851b 100644 --- a/tests/e2e/tracing.e2e.test.ts +++ b/tests/e2e/tracing.e2e.test.ts @@ -1,17 +1,18 @@ -import { describe, it, beforeEach, afterEach } from "vitest"; import { startObservation, startActiveObservation, observe, } from "@langfuse/tracing"; +import { nanoid } from "nanoid"; +import { describe, it, beforeEach, afterEach } from "vitest"; + +import { ServerAssertions } from "./helpers/serverAssertions.js"; import { setupServerTestEnvironment, teardownServerTestEnvironment, waitForServerIngestion, type ServerTestEnvironment, } from "./helpers/serverSetup.js"; -import { ServerAssertions } from "./helpers/serverAssertions.js"; -import { nanoid } from "nanoid"; describe("Server Export E2E Tests", () => { let testEnv: ServerTestEnvironment; From a7940a5ed74c3d763891891e8592d0152b0cedf3 Mon Sep 17 00:00:00 2001 From: Hassieb Pakzad <68423100+hassiebp@users.noreply.github.com> Date: Wed, 17 Sep 2025 13:16:58 +0200 Subject: [PATCH 2/2] chore: release v4.1.0 --- CHANGELOG.md | 6 ++++++ package.json | 2 +- packages/client/package.json | 2 +- packages/core/package.json | 2 +- packages/langchain/package.json | 2 +- packages/openai/package.json | 2 +- packages/otel/package.json | 2 +- packages/tracing/package.json | 2 +- 8 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md 
index 908b4b2d..905979fa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## [4.1.0](https://github.com/langfuse/langfuse-js/compare/v4.0.1...v4.1.0) (2025-09-17)
+
+### ✨ Features
+
+* **experiments:** add experiment runner ([#604](https://github.com/langfuse/langfuse-js/issues/604)) ([6a247dc](https://github.com/langfuse/langfuse-js/commit/6a247dc70ca5e797fa4e01121d7458fbc6bbceb9))
+
 ## [4.0.1](https://github.com/langfuse/langfuse-js/compare/v4.0.0...v4.0.1) (2025-09-11)
 
 ### 🐛 Bug Fixes
diff --git a/package.json b/package.json
index 086d155c..12bdcfde 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "langfuse-js",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "description": "Langfuse JavaScript / TypeScript SDK",
   "author": "Langfuse",
   "license": "MIT",
diff --git a/packages/client/package.json b/packages/client/package.json
index 3a525d63..469ad079 100644
--- a/packages/client/package.json
+++ b/packages/client/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/client",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "description": "Langfuse API client for universal JavaScript environments",
   "type": "module",
   "sideEffects": false,
diff --git a/packages/core/package.json b/packages/core/package.json
index f28728da..5c7953e5 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/core",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "description": "Core functions and utilities for Langfuse packages",
   "type": "module",
   "sideEffects": false,
diff --git a/packages/langchain/package.json b/packages/langchain/package.json
index 30f7959e..17141d58 100644
--- a/packages/langchain/package.json
+++ b/packages/langchain/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/langchain",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "description": "Langfuse integration for LangChain",
   "type": "module",
   "sideEffects": false,
diff --git a/packages/openai/package.json b/packages/openai/package.json
index a9cfb46d..9f3d7392 100644
--- a/packages/openai/package.json
+++ b/packages/openai/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/openai",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "description": "Langfuse integration for OpenAI SDK",
   "type": "module",
   "sideEffects": false,
diff --git a/packages/otel/package.json b/packages/otel/package.json
index 751fda64..4468ce68 100644
--- a/packages/otel/package.json
+++ b/packages/otel/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/otel",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "author": "Langfuse",
   "license": "MIT",
   "engines": {
diff --git a/packages/tracing/package.json b/packages/tracing/package.json
index a5fa6497..06c095d7 100644
--- a/packages/tracing/package.json
+++ b/packages/tracing/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langfuse/tracing",
-  "version": "4.0.1",
+  "version": "4.1.0",
   "author": "Langfuse",
   "license": "MIT",
   "engines": {