diff --git a/eli5/cli/duochat/evaluate_v1.py b/eli5/cli/duochat/evaluate_v1.py
index 90d8257343fdd6a1159b20c380985908456eb615..bca9cd788207c680d5790deff692937d2621661e 100644
--- a/eli5/cli/duochat/evaluate_v1.py
+++ b/eli5/cli/duochat/evaluate_v1.py
@@ -40,9 +40,12 @@
 from typing import Annotated, Optional
 
 import typer
 
 from eli5 import duochat
+from eli5.core.evaluators.config import build_evaluators, parse_from_yaml
+from eli5.duochat import evaluation
 
 __all__ = ["app"]
 
+
 app = typer.Typer()
@@ -71,20 +74,27 @@ def docs(
     ] = None,
     rate_limit: Annotated[
         int,
-        typer.Option(
-            help="Limits the number of evals and requests per rate_limit_period. "
-            "This is applied separately to evaluations and to predictions."
-        ),
+        typer.Option(help="Limit the number of requests to the Duo Chat API per rate_limit_period."),
     ] = 10,
     rate_limit_period: Annotated[int, typer.Option(help="Period where rate limit is counted.")] = 60,
     experiment_prefix: Annotated[
         str,
         typer.Option(help="A name prefix to make it easier to search for this experiment in LangSmith."),
     ] = "",
+    config_file: Annotated[
+        Path | None, typer.Option(exists=True, dir_okay=False, help="Path to a YAML evaluator configuration file.")
+    ] = None,
 ):
-    duochat.evaluation.docs(
+    evaluators = build_evaluators(
+        parse_from_yaml(config_file)
+        if config_file
+        else evaluation.config.EvaluateDocsDefaultConfigSet().get()
+    )
+
+    evaluation.docs(
         ctx.obj.langsmith_client,
         dataset=dataset,
+        evaluators=evaluators,
         limit=limit,
         offset=offset,
         example_ids=example_ids,
diff --git a/eli5/core/evaluators/__init__.py b/eli5/core/evaluators/__init__.py
index dd1d58d7038808b1b6eeb3c02aa70f80c6867e06..6ef3967a66f5ee66f6fc5f684e9d8727a7a36cec 100644
--- a/eli5/core/evaluators/__init__.py
+++ b/eli5/core/evaluators/__init__.py
@@ -1,2 +1,2 @@
-from eli5.core.evaluators import embeddings, pairwise
+from eli5.core.evaluators import config, embeddings, pairwise
 from eli5.core.evaluators.base import *
diff --git a/eli5/core/evaluators/config.py b/eli5/core/evaluators/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a601c793a5ca05dea389f8872202f029b196239
--- /dev/null
+++ b/eli5/core/evaluators/config.py
@@ -0,0 +1,192 @@
+from abc import ABC, abstractmethod
+from enum import StrEnum
+from pathlib import Path
+from typing import (
+    Any,
+    Callable,
+    Generic,
+    Optional,
+    Self,
+    Type,
+    TypeAlias,
+    TypeVar,
+    cast,
+)
+
+import yaml
+from langchain_anthropic import ChatAnthropic
+from langchain_core.language_models import BaseChatModel
+from langsmith.schemas import Example, Run
+from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator
+
+from eli5.core.evaluators.base import BaseEvaluator, BaseLLMEvaluator
+
+__all__ = [
+    "ModelClassProvider",
+    "ModelConfig",
+    "BaseEvaluatorConfig",
+    "BaseLLMEvaluatorConfig",
+    "BaseConfigSet",
+    "EvaluationConfig",
+    "evaluator_registry",
+    "parse_from_yaml",
+    "build_evaluators",
+]
+
+
+class ModelClassProvider(StrEnum):
+    ANTHROPIC = "anthropic"
+
+
+TypeEvaluatorInput = TypeVar("TypeEvaluatorInput")
+# pylint: disable-next=invalid-name
+TypeModelFactories: TypeAlias = dict[ModelClassProvider, Type[BaseChatModel]]
+
+
+class ModelConfig(BaseModel):
+    name: str
+    class_provider: ModelClassProvider
+    params: dict[str, Any] = Field(default_factory=dict)
+
+
+class BaseEvaluatorConfig(BaseModel, Generic[TypeEvaluatorInput]):
+    _evaluator_type: Type[BaseEvaluator] = PrivateAttr()
+
+    params: dict[str, Any] = Field(default_factory=dict)
+    prepare_data: Callable[[Run, Example], TypeEvaluatorInput]
+
+
+class BaseLLMEvaluatorConfig(BaseEvaluatorConfig):
+    model_config = ConfigDict(protected_namespaces=())
+
+    model: ModelConfig
+
+
+class BaseConfigSet(ABC):
+    @abstractmethod
+    def get(self) -> list[BaseEvaluatorConfig]:
+        pass
+
+
+class EvaluationConfig(BaseModel):
+    evaluators: list[BaseEvaluatorConfig]
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_data(cls, data: dict[str, Any]) -> dict[str, Any]:
+        registry = _EvaluatorConfigRegistry.instance()
+
+        evaluators = []
+        for evaluator in data["evaluators"]:
+            if config_type := registry.get(evaluator["name"]):
+                evaluators.append(config_type(**evaluator))
+            else:
+                raise ValueError(f"unsupported evaluator config {evaluator['name']}")
+
+        return {"evaluators": evaluators}
+
+
+class _EvaluatorConfigRegistry:
+    """
+    Registry of evaluator config classes; a Singleton, so only one instance exists.
+    """
+
+    _instance: Optional[Self] = None
+    configs: dict[str, Type[BaseEvaluatorConfig]]
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(_EvaluatorConfigRegistry, cls).__new__(cls)
+            cls._instance.configs = {}
+
+        return cls._instance
+
+    @classmethod
+    def instance(cls) -> Self:
+        return cls()
+
+    def register(self, name: str, config: Type[BaseEvaluatorConfig]):
+        self.configs[name] = config
+
+    def get(self, name: str) -> Optional[Type[BaseEvaluatorConfig]]:
+        if config := self.configs.get(name, None):
+            return config
+
+        return None
+
+    def list(self) -> dict[str, Type[BaseEvaluatorConfig]]:
+        return self.configs
+
+
+_model_factories: TypeModelFactories = {ModelClassProvider.ANTHROPIC: ChatAnthropic}
+
+
+def _build_llm_evaluator(
+    registry: _EvaluatorConfigRegistry, model_factories: TypeModelFactories, config: BaseLLMEvaluatorConfig
+) -> BaseEvaluator:
+    evaluator_type = cast(Type[BaseLLMEvaluator], config.evaluator_type)
+    model_type = model_factories.get(config.model.class_provider, None)
+    if not model_type:
+        raise ValueError(f"unknown model class provider {config.model.class_provider}")
+
+    return evaluator_type(
+        model=model_type(model=config.model.name, **config.model.params),
+        prepare_data=config.prepare_data,
+        **config.params,
+    )
+
+
+def evaluator_registry(evaluator: Type[BaseEvaluator]):
+    def decorator(config: Type[BaseEvaluatorConfig]) -> Type[BaseEvaluatorConfig]:
+        class ModifiedConfig(config):
+            @property
+            def name(self) -> str:
+                return self._evaluator_type.__name__
+
+            @property
+            def evaluator_type(self) -> Type[BaseEvaluator]:
+                return self._evaluator_type
+
+            @model_validator(mode="after")
+            def set_evaluator_type(self) -> Self:
+                # pylint: disable-next=attribute-defined-outside-init; defined in BaseEvaluatorConfig
+                self._evaluator_type = evaluator
+
+                return self
+
+        # Set the correct name for the modified class
+        ModifiedConfig.__name__ = config.__name__
+        ModifiedConfig.__qualname__ = config.__qualname__
+
+        # Register the config class
+        registry = _EvaluatorConfigRegistry.instance()
+        registry.register(evaluator.__name__, ModifiedConfig)
+
+        return ModifiedConfig
+
+    return decorator
+
+
+def parse_from_yaml(path: Path) -> list[BaseEvaluatorConfig]:
+    with open(path, "r") as fp:
+        config = EvaluationConfig(**yaml.safe_load(fp))
+
+    return config.evaluators
+
+
+def build_evaluators(
+    configs: list[BaseEvaluatorConfig], extend_model_factories: Optional[TypeModelFactories] = None
+) -> list[BaseEvaluator]:
+    registry = _EvaluatorConfigRegistry.instance()
+    model_factories = {**_model_factories, **(extend_model_factories or {})}
+
+    evaluators = []
+    for config in configs:
+        if isinstance(config, BaseLLMEvaluatorConfig):
+            # We currently support building LLM judges only.
+            # TODO: add building functions to support other evaluators as well.
+            evaluators.append(_build_llm_evaluator(registry, model_factories, config))
+        else:
+            raise TypeError(f"unsupported evaluator config type {type(config)}")
+
+    return evaluators
diff --git a/eli5/duochat/evaluation/__init__.py b/eli5/duochat/evaluation/__init__.py
index 8bbb9a4fcdeeb5398d18f2e80393fb2702095838..131ca6786ab2dd6cf203d67bef6c5026176d34d6 100644
--- a/eli5/duochat/evaluation/__init__.py
+++ b/eli5/duochat/evaluation/__init__.py
@@ -1,3 +1,4 @@
+from eli5.duochat.evaluation import config
 from eli5.duochat.evaluation.docs import *
 from eli5.duochat.evaluation.pairwise import *
 from eli5.duochat.evaluation.resources import *
diff --git a/eli5/duochat/evaluation/config/.examples/docs_multiple_llms_judges.yaml b/eli5/duochat/evaluation/config/.examples/docs_multiple_llms_judges.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e8ad3ad3f1827cb061e4f0ea612883cc49f16b32
--- /dev/null
+++ b/eli5/duochat/evaluation/config/.examples/docs_multiple_llms_judges.yaml
@@ -0,0 +1,23 @@
+.model_anthropic_params: &model_anthropic_params
+  temperature: 0.0
+  timeout: 60
+  max_tokens: 2048
+  max_retries: 1
+
+.cot_qa_doc_accuracy_params: &cot_qa_doc_accuracy_params
+  limits_calls: 10
+  limits_period: 60
+
+evaluators:
+  - name: CotQADocAccuracyEvaluator
+    model:
+      name: claude-3-haiku-20240307
+      class_provider: anthropic
+      params: *model_anthropic_params
+    params: *cot_qa_doc_accuracy_params
+  - name: CotQADocAccuracyEvaluator
+    model:
+      name: claude-3-sonnet-20240229
+      class_provider: anthropic
+      params: *model_anthropic_params
+    params: *cot_qa_doc_accuracy_params
diff --git a/eli5/duochat/evaluation/config/__init__.py b/eli5/duochat/evaluation/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a3d12e0ad22565a786e00ec0d1f2a7366e04c4
--- /dev/null
+++ b/eli5/duochat/evaluation/config/__init__.py
@@ -0,0 +1 @@
+from eli5.duochat.evaluation.config.docs import *
diff --git a/eli5/duochat/evaluation/config/docs.py b/eli5/duochat/evaluation/config/docs.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cc23ef1bdb060db59ff1fdb20d8f2ba093defab
--- /dev/null
+++ b/eli5/duochat/evaluation/config/docs.py
@@ -0,0 +1,42 @@
+from typing import Any, Callable
+
+from langsmith.schemas import Example, Run
+
+from eli5.core.evaluators.config import (
+    BaseConfigSet,
+    BaseEvaluatorConfig,
+    BaseLLMEvaluatorConfig,
+    ModelClassProvider,
+    ModelConfig,
+    evaluator_registry,
+)
+from eli5.duochat.evaluation.evaluators import (
+    CotQADocAccuracyEvaluator,
+    CotQADocEvaluatorInput,
+)
+
+__all__ = ["ConfigCotQADocAccuracyEvaluator", "EvaluateDocsDefaultConfigSet"]
+
+
+@evaluator_registry(CotQADocAccuracyEvaluator)
+class ConfigCotQADocAccuracyEvaluator(BaseLLMEvaluatorConfig):
+    """
+    Define config for the CotQADocAccuracyEvaluator evaluator and set default values.
+    """
+
+    model: ModelConfig = ModelConfig(
+        name="claude-3-sonnet-20240229",
+        class_provider=ModelClassProvider.ANTHROPIC,
+        params={"temperature": 0, "max_tokens": 2_048, "timeout": 60},
+    )
+    params: dict[str, Any] = {"inc_model_name_metric": False, "limits_calls": 10, "limits_period": 60}
+    prepare_data: Callable[[Run, Example], CotQADocEvaluatorInput] = lambda run, example: {
+        "query": example.inputs.get("question"),
+        "context": "\n".join([example.inputs.get("context"), example.outputs.get("possible_answer")]),
+        "answer": run.outputs.get("actual_answer", ""),
+    }
+
+
+class EvaluateDocsDefaultConfigSet(BaseConfigSet):
+    def get(self) -> list[BaseEvaluatorConfig]:
+        return [ConfigCotQADocAccuracyEvaluator()]
diff --git a/eli5/duochat/evaluation/docs.py b/eli5/duochat/evaluation/docs.py
index 9bdee2854f83ada9fa0dc8794717876bf3d366c1..a12c5fa671a38b7bbc147f7c0a532eaa9abb0050 100644
--- a/eli5/duochat/evaluation/docs.py
+++ b/eli5/duochat/evaluation/docs.py
@@ -1,13 +1,12 @@
 from datetime import datetime
 from typing import Optional
 
-from langchain_anthropic import ChatAnthropic
 from langsmith import Client
 from langsmith.evaluation import evaluate as ls_evaluate
 
+from eli5.core.evaluators import BaseEvaluator
 from eli5.core.utils import conditionally_rate_limited
 from eli5.duochat.clients import get_duo_chat_answer
-from eli5.duochat.evaluation.evaluators import CotQADocAccuracyEvaluator
 
 __all__ = ["docs"]
 
@@ -15,6 +14,7 @@
 def docs(
     client: Client,
     dataset: str,
+    evaluators: list[BaseEvaluator],
     limit: Optional[int] = None,
     offset: int = 0,
     example_ids: Optional[list[str]] = None,
@@ -29,6 +29,7 @@
         client (Client): A LangSmith Client instance for interacting with the LangSmith API.
         dataset (str): The name of the dataset to use for evaluation compatible with the format of
             'duo_chat.cot_qa_docs.1'. This dataset should contain examples with context, questions, and possible answers.
+        evaluators (list[BaseEvaluator]): The evaluators to run against each example's prediction.
         limit (int, optional): The maximum number of examples to evaluate. Defaults to 0 (no limit).
         offset (int, optional): The number of examples to skip before starting evaluation. Defaults to 0.
         example_ids (str, optional): list of IDs of example to evaluate. If provided, overrides limit and offset.
@@ -58,23 +59,6 @@
 
         return {"actual_answer": actual_answer}
 
-    # Initialize Claude model for LLM-based evaluation
-    model = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0.0)
-
-    # Set up evaluators for LangSmith.
-    evaluators = [
-        CotQADocAccuracyEvaluator(
-            model=model,
-            prepare_data=lambda run, example: {
-                "query": example.inputs.get("question"),
-                "context": "\n".join([example.inputs.get("context"), example.outputs.get("possible_answer")]),
-                "answer": run.outputs.get("actual_answer", ""),
-            },
-            limits_calls=rate_limit,
-            limits_period=rate_limit_period,
-        )
-    ]
-
     time = datetime.now().strftime("%Y-%m-%d %I:%M:%S %p")
     prefix = experiment_prefix or f"Run {dataset} on GDK on {time}"
 
diff --git a/eli5/duochat/evaluation/evaluators/cot_qa_docs.py b/eli5/duochat/evaluation/evaluators/cot_qa_docs.py
index f1a6fdd419ad377a92fde3d681f34933e502c896..ded4e99b6035f6f9ada47088cef4d7aa94850c0a 100644
--- a/eli5/duochat/evaluation/evaluators/cot_qa_docs.py
+++ b/eli5/duochat/evaluation/evaluators/cot_qa_docs.py
@@ -68,6 +68,7 @@
     context, question, and answer. The evaluator assigns a score from 1 to 4.
""" + inc_model_name_metric: bool = True prepare_data: Callable[[Run, Example], CotQADocEvaluatorInput] retry_exceptions: tuple[Type[Exception]] = ( anthropic.InternalServerError, @@ -77,7 +78,13 @@ class CotQADocAccuracyEvaluator(BaseLLMEvaluator): @property def evaluation_name(self) -> str: - return "context-qa-accuracy" + name = "context-qa-accuracy" + if self.inc_model_name_metric: + # include the internal LLM name to the metric + # useful when we have one LLM judge type powered by different models + name += f"_{self.model.model}" + + return name def _build_llm_chain(self, model: BaseChatModel) -> Runnable: """