diff --git a/eli5/cli/duochat/evaluate_v1.py b/eli5/cli/duochat/evaluate_v1.py
index 90d8257343fdd6a1159b20c380985908456eb615..bca9cd788207c680d5790deff692937d2621661e 100644
--- a/eli5/cli/duochat/evaluate_v1.py
+++ b/eli5/cli/duochat/evaluate_v1.py
@@ -40,9 +40,12 @@
 from typing import Annotated, Optional
 
 import typer
 
 from eli5 import duochat
+from eli5.core.evaluators.config import build_evaluators, parse_from_yaml
+from eli5.duochat import evaluation
 
 __all__ = ["app"]
 
+
 app = typer.Typer()
@@ -71,20 +74,27 @@ def docs(
     ] = None,
     rate_limit: Annotated[
         int,
-        typer.Option(
-            help="Limits the number of evals and requests per rate_limit_period. "
-            "This is applied separately to evaluations and to predictions."
-        ),
+        typer.Option(help="Limit the number of requests to the Duo Chat API per rate_limit_period."),
     ] = 10,
     rate_limit_period: Annotated[int, typer.Option(help="Period where rate limit is counted.")] = 60,
     experiment_prefix: Annotated[
         str,
         typer.Option(help="A name prefix to make it easier to search for this experiment in LangSmith."),
     ] = "",
+    config_file: Annotated[
+        Path | None, typer.Option(exists=True, dir_okay=False, help="Path to a YAML evaluator configuration file.")
+    ] = None,
 ):
-    duochat.evaluation.docs(
+    evaluators = build_evaluators(
+        parse_from_yaml(config_file)
+        if config_file
+        else evaluation.config.EvaluateDocsDefaultConfigSet().get()
+    )
+
+    evaluation.docs(
         ctx.obj.langsmith_client,
         dataset=dataset,
+        evaluators=evaluators,
         limit=limit,
         offset=offset,
         example_ids=example_ids,
diff --git a/eli5/core/evaluators/__init__.py b/eli5/core/evaluators/__init__.py
index dd1d58d7038808b1b6eeb3c02aa70f80c6867e06..6ef3967a66f5ee66f6fc5f684e9d8727a7a36cec 100644
--- a/eli5/core/evaluators/__init__.py
+++ b/eli5/core/evaluators/__init__.py
@@ -1,2 +1,2 @@
-from eli5.core.evaluators import embeddings, pairwise
+from eli5.core.evaluators import config, embeddings, pairwise
 from eli5.core.evaluators.base import *
diff --git a/eli5/core/evaluators/config.py b/eli5/core/evaluators/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a601c793a5ca05dea389f8872202f029b196239
--- /dev/null
+++ b/eli5/core/evaluators/config.py
@@ -0,0 +1,192 @@
+from abc import ABC, abstractmethod
+from enum import StrEnum
+from pathlib import Path
+from typing import (
+    Any,
+    Callable,
+    Generic,
+    Optional,
+    Self,
+    Type,
+    TypeAlias,
+    TypeVar,
+    cast,
+)
+
+import yaml
+from langchain_anthropic import ChatAnthropic
+from langchain_core.language_models import BaseChatModel
+from langsmith.schemas import Example, Run
+from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator
+
+from eli5.core.evaluators.base import BaseEvaluator, BaseLLMEvaluator
+
+__all__ = [
+    "ModelClassProvider",
+    "ModelConfig",
+    "BaseEvaluatorConfig",
+    "BaseLLMEvaluatorConfig",
+    "BaseConfigSet",
+    "EvaluationConfig",
+    "evaluator_registry",
+    "parse_from_yaml",
+    "build_evaluators",
+]
+
+
+class ModelClassProvider(StrEnum):
+    ANTHROPIC = "anthropic"
+
+
+TypeEvaluatorInput = TypeVar("TypeEvaluatorInput")
+# pylint: disable-next=invalid-name
+TypeModelFactories: TypeAlias = dict[ModelClassProvider, Type[BaseChatModel]]
+
+
+class ModelConfig(BaseModel):
+    name: str
+    class_provider: ModelClassProvider
+    params: dict[str, Any] = Field(default_factory=dict)
+
+
+class BaseEvaluatorConfig(BaseModel, Generic[TypeEvaluatorInput]):
+    _evaluator_type: Type[BaseEvaluator] = PrivateAttr()
+
+    params: dict[str, Any] = Field(default_factory=dict)
+    prepare_data: Callable[[Run, Example], TypeEvaluatorInput]
+
+
+class BaseLLMEvaluatorConfig(BaseEvaluatorConfig):
+    model_config = ConfigDict(protected_namespaces=())
+
+    model: ModelConfig
+
+
+class BaseConfigSet(ABC):
+    @abstractmethod
+    def get(self) -> list[BaseEvaluatorConfig]:
+        pass
+
+
+class EvaluationConfig(BaseModel):
+    evaluators: list[BaseEvaluatorConfig]
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_data(cls, data: dict[str, Any]) -> dict[str, Any]:
+        registry = _EvaluatorConfigRegistry.instance()
+
+        evaluators = []
+        for evaluator in data["evaluators"]:
+            if config_type := registry.get(evaluator["name"]):
+                evaluators.append(config_type(**evaluator))
+            else:
+                raise ValueError(f"unsupported evaluator config {evaluator['name']}")
+
+        return {"evaluators": evaluators}
+
+
+class _EvaluatorConfigRegistry:
+    """
+    Registry of evaluator config classes; a Singleton, so only one instance exists.
+    """
+
+    _instance: Optional[Self] = None
+    configs: dict[str, Type[BaseEvaluatorConfig]]
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(_EvaluatorConfigRegistry, cls).__new__(cls)
+            cls._instance.configs = {}
+
+        return cls._instance
+
+    @classmethod
+    def instance(cls) -> Self:
+        return cls()
+
+    def register(self, name: str, config: Type[BaseEvaluatorConfig]):
+        self.configs[name] = config
+
+    def get(self, name: str) -> Optional[Type[BaseEvaluatorConfig]]:
+        if config := self.configs.get(name, None):
+            return config
+
+        return None
+
+    def list(self) -> dict[str, Type[BaseEvaluatorConfig]]:
+        return self.configs
+
+
+_model_factories: TypeModelFactories = {ModelClassProvider.ANTHROPIC: ChatAnthropic}
+
+
+def _build_llm_evaluator(
+    registry: _EvaluatorConfigRegistry, model_factories: TypeModelFactories, config: BaseLLMEvaluatorConfig
+) -> BaseEvaluator:
+    evaluator_type = cast(Type[BaseLLMEvaluator], config.evaluator_type)
+    model_type = model_factories.get(config.model.class_provider, None)
+    if not model_type:
+        raise ValueError(f"unknown model class provider {config.model.class_provider}")
+
+    return evaluator_type(
+        model=model_type(model=config.model.name, **config.model.params),
+        prepare_data=config.prepare_data,
+        **config.params,
+    )
+
+
+def evaluator_registry(evaluator: Type[BaseEvaluator]):
+    def decorator(config: Type[BaseEvaluatorConfig]) -> Type[BaseEvaluatorConfig]:
+        class ModifiedConfig(config):
+            @property
+            def name(self) -> str:
+                return self._evaluator_type.__name__
+
+            @property
+            def evaluator_type(self) -> Type[BaseEvaluator]:
+                return self._evaluator_type
+
+            @model_validator(mode="after")
+            def set_evaluator_type(self) -> Self:
+                # pylint: disable-next=attribute-defined-outside-init; defined in BaseEvaluatorConfig
+                self._evaluator_type = evaluator
+
+                return self
+
+        # Set the correct name for the modified class
+        ModifiedConfig.__name__ = config.__name__
+        ModifiedConfig.__qualname__ = config.__qualname__
+
+        # Register the config class
+        registry = _EvaluatorConfigRegistry.instance()
+        registry.register(evaluator.__name__, ModifiedConfig)
+
+        return ModifiedConfig
+
+    return decorator
+
+
+def parse_from_yaml(path: Path) -> list[BaseEvaluatorConfig]:
+    with open(path, "r") as fp:
+        config = EvaluationConfig(**yaml.safe_load(fp))
+
+    return config.evaluators
+
+
+def build_evaluators(
+    configs: list[BaseEvaluatorConfig], extend_model_factories: Optional[TypeModelFactories] = None
+) -> list[BaseEvaluator]:
+    registry = _EvaluatorConfigRegistry.instance()
+    model_factories = {**_model_factories, **(extend_model_factories or {})}
+
+    evaluators = []
+    for config in configs:
+        if isinstance(config, BaseLLMEvaluatorConfig):
+            # We currently support building LLM judges only.
+            # TODO: add building functions to support other evaluators as well.
+            evaluators.append(_build_llm_evaluator(registry, model_factories, config))
+        else:
+            raise TypeError(f"unsupported evaluator config type {type(config)}")
+
+    return evaluators
diff --git a/eli5/duochat/evaluation/__init__.py b/eli5/duochat/evaluation/__init__.py
index 8bbb9a4fcdeeb5398d18f2e80393fb2702095838..131ca6786ab2dd6cf203d67bef6c5026176d34d6 100644
--- a/eli5/duochat/evaluation/__init__.py
+++ b/eli5/duochat/evaluation/__init__.py
@@ -1,3 +1,4 @@
+from eli5.duochat.evaluation import config
 from eli5.duochat.evaluation.docs import *
 from eli5.duochat.evaluation.pairwise import *
 from eli5.duochat.evaluation.resources import *
diff --git a/eli5/duochat/evaluation/config/.examples/docs_multiple_llms_judges.yaml b/eli5/duochat/evaluation/config/.examples/docs_multiple_llms_judges.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e8ad3ad3f1827cb061e4f0ea612883cc49f16b32
--- /dev/null
+++ b/eli5/duochat/evaluation/config/.examples/docs_multiple_llms_judges.yaml
@@ -0,0 +1,23 @@
+.model_anthropic_params: &model_anthropic_params
+  temperature: 0.0
+  timeout: 60
+  max_tokens: 2048
+  max_retries: 1
+
+.cot_qa_doc_accuracy_params: &cot_qa_doc_accuracy_params
+  limits_calls: 10
+  limits_period: 60
+
+evaluators:
+  - name: CotQADocAccuracyEvaluator
+    model:
+      name: claude-3-haiku-20240307
+      class_provider: anthropic
+      params: *model_anthropic_params
+    params: *cot_qa_doc_accuracy_params
+  - name: CotQADocAccuracyEvaluator
+    model:
+      name: claude-3-sonnet-20240229
+      class_provider: anthropic
+      params: *model_anthropic_params
+    params: *cot_qa_doc_accuracy_params
diff --git a/eli5/duochat/evaluation/config/__init__.py b/eli5/duochat/evaluation/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a3d12e0ad22565a786e00ec0d1f2a7366e04c4
--- /dev/null
+++ b/eli5/duochat/evaluation/config/__init__.py
@@ -0,0 +1 @@
+from eli5.duochat.evaluation.config.docs import *
diff --git a/eli5/duochat/evaluation/config/docs.py b/eli5/duochat/evaluation/config/docs.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cc23ef1bdb060db59ff1fdb20d8f2ba093defab
--- /dev/null
+++ b/eli5/duochat/evaluation/config/docs.py
@@ -0,0 +1,42 @@
+from typing import Any, Callable
+
+from langsmith.schemas import Example, Run
+
+from eli5.core.evaluators.config import (
+    BaseConfigSet,
+    BaseEvaluatorConfig,
+    BaseLLMEvaluatorConfig,
+    ModelClassProvider,
+    ModelConfig,
+    evaluator_registry,
+)
+from eli5.duochat.evaluation.evaluators import (
+    CotQADocAccuracyEvaluator,
+    CotQADocEvaluatorInput,
+)
+
+__all__ = ["ConfigCotQADocAccuracyEvaluator", "EvaluateDocsDefaultConfigSet"]
+
+
+@evaluator_registry(CotQADocAccuracyEvaluator)
+class ConfigCotQADocAccuracyEvaluator(BaseLLMEvaluatorConfig):
+    """
+    Define config for the CotQADocAccuracyEvaluator evaluator and set default values.
+    """
+
+    model: ModelConfig = ModelConfig(
+        name="claude-3-sonnet-20240229",
+        class_provider=ModelClassProvider.ANTHROPIC,
+        params={"temperature": 0, "max_tokens": 2_048, "timeout": 60},
+    )
+    params: dict[str, Any] = {"inc_model_name_metric": False, "limits_calls": 10, "limits_period": 60}
+    prepare_data: Callable[[Run, Example], CotQADocEvaluatorInput] = lambda run, example: {
+        "query": example.inputs.get("question"),
+        "context": "\n".join([example.inputs.get("context"), example.outputs.get("possible_answer")]),
+        "answer": run.outputs.get("actual_answer", ""),
+    }
+
+
+class EvaluateDocsDefaultConfigSet(BaseConfigSet):
+    def get(self) -> list[BaseEvaluatorConfig]:
+        return [ConfigCotQADocAccuracyEvaluator()]
diff --git a/eli5/duochat/evaluation/docs.py b/eli5/duochat/evaluation/docs.py
index 9bdee2854f83ada9fa0dc8794717876bf3d366c1..a12c5fa671a38b7bbc147f7c0a532eaa9abb0050 100644
--- a/eli5/duochat/evaluation/docs.py
+++ b/eli5/duochat/evaluation/docs.py
@@ -1,13 +1,12 @@
 from datetime import datetime
 from typing import Optional
 
-from langchain_anthropic import ChatAnthropic
 from langsmith import Client
 from langsmith.evaluation import evaluate as ls_evaluate
 
+from eli5.core.evaluators import BaseEvaluator
 from eli5.core.utils import conditionally_rate_limited
 from eli5.duochat.clients import get_duo_chat_answer
-from eli5.duochat.evaluation.evaluators import CotQADocAccuracyEvaluator
 
 __all__ = ["docs"]
 
@@ -15,6 +14,7 @@
 def docs(
     client: Client,
     dataset: str,
+    evaluators: list[BaseEvaluator],
     limit: Optional[int] = None,
     offset: int = 0,
     example_ids: Optional[list[str]] = None,
@@ -29,6 +29,7 @@
         client (Client): A LangSmith Client instance for interacting with the LangSmith API.
         dataset (str): The name of the dataset to use for evaluation compatible with the format of
             'duo_chat.cot_qa_docs.1'. This dataset should contain examples with context, questions, and possible answers.
+        evaluators (list[BaseEvaluator]): The evaluators to run against each example's prediction.
         limit (int, optional): The maximum number of examples to evaluate. Defaults to 0 (no limit).
         offset (int, optional): The number of examples to skip before starting evaluation. Defaults to 0.
         example_ids (str, optional): list of IDs of example to evaluate. If provided, overrides limit and offset.
@@ -58,23 +59,6 @@
 
         return {"actual_answer": actual_answer}
 
-    # Initialize Claude model for LLM-based evaluation
-    model = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0.0)
-
-    # Set up evaluators for LangSmith.
-    evaluators = [
-        CotQADocAccuracyEvaluator(
-            model=model,
-            prepare_data=lambda run, example: {
-                "query": example.inputs.get("question"),
-                "context": "\n".join([example.inputs.get("context"), example.outputs.get("possible_answer")]),
-                "answer": run.outputs.get("actual_answer", ""),
-            },
-            limits_calls=rate_limit,
-            limits_period=rate_limit_period,
-        )
-    ]
-
     time = datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")
     prefix = experiment_prefix or f"Run {dataset} on GDK on {time}"
 
diff --git a/eli5/duochat/evaluation/evaluators/cot_qa_docs.py b/eli5/duochat/evaluation/evaluators/cot_qa_docs.py
index f1a6fdd419ad377a92fde3d681f34933e502c896..ded4e99b6035f6f9ada47088cef4d7aa94850c0a 100644
--- a/eli5/duochat/evaluation/evaluators/cot_qa_docs.py
+++ b/eli5/duochat/evaluation/evaluators/cot_qa_docs.py
@@ -68,6 +68,7 @@
     context, question, and answer. The evaluator assigns a score from 1 to 4.
""" + inc_model_name_metric: bool = True prepare_data: Callable[[Run, Example], CotQADocEvaluatorInput] retry_exceptions: tuple[Type[Exception]] = ( anthropic.InternalServerError, @@ -77,7 +78,13 @@ class CotQADocAccuracyEvaluator(BaseLLMEvaluator): @property def evaluation_name(self) -> str: - return "context-qa-accuracy" + name = "context-qa-accuracy" + if self.inc_model_name_metric: + # include the internal LLM name to the metric + # useful when we have one LLM judge type powered by different models + name += f"_{self.model.model}" + + return name def _build_llm_chain(self, model: BaseChatModel) -> Runnable: """