Run on dataset.
Run the Chain or language model on a dataset and store traces to the specified project name.
For the (usually faster) async version of this function, see arun_on_dataset.
run_on_dataset(
    client: Client | None,
    dataset_name: str,
    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
    *,
    evaluation: smith_eval.RunEvalConfig | None = None,
    dataset_version: datetime | str | None = None,
    concurrency_level: int = 5,
    project_name: str | None = None,
    project_metadata: dict[str, Any] | None = None,
    verbose: bool = False,
    revision_id: str | None = None,
    **kwargs: Any,
) -> dict[str, Any]

Examples:
from langsmith import Client
from langchain_openai import ChatOpenAI
from langchain_classic.chains import LLMChain
from langchain_classic.smith import RunEvalConfig, run_on_dataset

# Chains may have memory. Passing in a constructor function lets the
# evaluation framework avoid cross-contamination between runs.
def construct_chain():
    model = ChatOpenAI(temperature=0)
    chain = LLMChain.from_string(
        model,
        "What's the answer to {your_input_key}",
    )
    return chain

# Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
evaluation_config = RunEvalConfig(
    evaluators=[
        "qa",  # "Correctness" against a reference answer
        "embedding_distance",
        RunEvalConfig.Criteria("helpfulness"),
        RunEvalConfig.Criteria({
            "fifth-grader-score": "Do you have to be smarter than a fifth "
            "grader to answer this question?"
        }),
    ]
)

client = Client()
run_on_dataset(
    client,
    dataset_name="<my_dataset_name>",
    llm_or_chain_factory=construct_chain,
    evaluation=evaluation_config,
)
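For the async variant mentioned above, here is a minimal sketch. It assumes arun_on_dataset (in the same module) accepts the same arguments as run_on_dataset and reuses the client, chain factory, and evaluation config defined above:

import asyncio

from langchain_classic.smith import arun_on_dataset

async def main() -> None:
    # Awaitable counterpart of run_on_dataset; arguments assumed identical.
    await arun_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )

asyncio.run(main())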
You can also create custom evaluators by subclassing the StringEvaluator class or LangSmith's RunEvaluator class (a sketch of the latter follows the example below).
from typing import Optional

from langchain_classic.evaluation import StringEvaluator


class MyStringEvaluator(StringEvaluator):

    @property
    def requires_input(self) -> bool:
        return False

    @property
    def requires_reference(self) -> bool:
        return True

    @property
    def evaluation_name(self) -> str:
        return "exact_match"

    def _evaluate_strings(
        self, prediction, reference=None, input=None, **kwargs
    ) -> dict:
        return {"score": prediction == reference}


evaluation_config = RunEvalConfig(
    custom_evaluators=[MyStringEvaluator()],
)

run_on_dataset(
    client,
    dataset_name="<my_dataset_name>",
    llm_or_chain_factory=construct_chain,
    evaluation=evaluation_config,
)
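A minimal sketch of the RunEvaluator approach is shown below. It assumes the langsmith package's RunEvaluator, EvaluationResult, Run, and Example classes, and a chain that writes its output under the "text" key; the evaluator name and scoring logic are illustrative only, not part of this API.

from typing import Optional

from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run


class OutputLengthEvaluator(RunEvaluator):
    """Hypothetical evaluator that scores a run by the length of its text output."""

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        # run.outputs holds the chain's output dict; "text" is LLMChain's default key.
        text = str((run.outputs or {}).get("text", ""))
        return EvaluationResult(key="output_length", score=len(text))


evaluation_config = RunEvalConfig(
    custom_evaluators=[OutputLengthEvaluator()],
)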
| Name | Type | Description |
|---|---|---|
| dataset_name* | str | Name of the dataset to run the chain on. |
| llm_or_chain_factory* | MODEL_OR_CHAIN_FACTORY | Language model or Chain constructor to run over the dataset. The Chain constructor is used to permit independent calls on each example without carrying over state. |
| evaluation | smith_eval.RunEvalConfig \| None | Default: None. Configuration for evaluators to run on the results of the chain. |
| dataset_version | datetime \| str \| None | Default: None. Optional version of the dataset. |
| concurrency_level | int | Default: 5. The number of async tasks to run concurrently. |
| project_name | str \| None | Default: None. Name of the project to store the traces in. If not provided, a project name is generated automatically. |
| project_metadata | dict[str, Any] \| None | Default: None. Optional metadata to add to the project. Useful for storing information about the test variant (prompt version, model version, etc.). |
| client* | Client \| None | LangSmith client to use to access the dataset and to log feedback and run traces. |
| verbose | bool | Default: False. Whether to print progress. |
| revision_id | str \| None | Default: None. Optional revision identifier to assign to this test run, used to track the performance of different versions of your system. |
| **kwargs | Any | Default: {}. Should not be used; provided for backwards compatibility. |
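As a minimal sketch of the optional parameters above, reusing the names from the earlier examples (the metadata values, date, and revision label are placeholders, and the keys of the returned summary dictionary vary by version):

from datetime import datetime, timezone

results = run_on_dataset(
    client,
    dataset_name="<my_dataset_name>",
    llm_or_chain_factory=construct_chain,
    evaluation=evaluation_config,
    dataset_version=datetime(2024, 1, 1, tzinfo=timezone.utc),  # or a version tag string
    concurrency_level=10,
    project_name="<my_test_project>",
    project_metadata={"prompt_version": "v2"},  # placeholder metadata
    revision_id="2024-01-15-release",  # placeholder revision label
    verbose=True,
)
print(sorted(results))  # inspect the keys of the returned summary dict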