34 changes: 29 additions & 5 deletions python/langsmith/evaluation/evaluator.py
@@ -17,7 +17,7 @@
)

from typing_extensions import TypedDict

from pydantic import root_validator
from langsmith import run_helpers as rh
from langsmith import schemas

@@ -67,6 +67,29 @@ class FeedbackConfig(TypedDict, total=False):
"""The maximum permitted value (if continuous type)."""
categories: Optional[list[Union[Category, dict]]]

class FeedbackConfigModel(BaseModel):
"""Validated configuration for feedback."""

type: Literal["continuous", "categorical", "freeform"]
min: Optional[Union[float, int]] = None
max: Optional[Union[float, int]] = None
categories: Optional[list[Union[Category, dict]]] = None

class Config:
extra = "forbid"

@root_validator(pre=True)
def reject_unknown_fields(cls, values):
known_fields = {"type", "min", "max", "categories"}
unknown = set(values.keys()) - known_fields
if unknown:
raise ValueError(
f"Unknown fields in feedback_config: {', '.join(unknown)}. "
"Please use only supported keys."
)
return values



class EvaluationResult(BaseModel):
"""Evaluation result."""
@@ -83,8 +106,9 @@ class EvaluationResult(BaseModel):
"""What the correct value should be, if applicable."""
evaluator_info: dict = Field(default_factory=dict)
"""Additional information about the evaluator."""
feedback_config: Optional[Union[FeedbackConfig, dict]] = None
"""The configuration used to generate this feedback."""
feedback_config: Optional[FeedbackConfigModel] = None
"""The configuration used to generate this feedback.
Unknown fields will raise a validation error."""
source_run_id: Optional[Union[uuid.UUID, str]] = None
"""The ID of the trace of the evaluator itself."""
target_run_id: Optional[Union[uuid.UUID, str]] = None
@@ -273,8 +297,8 @@ def _coerce_evaluation_result(
return EvaluationResult(**{"source_run_id": source_run_id, **result})
except ValidationError as e:
raise ValueError(
"Expected an EvaluationResult object, or dict with a metric"
f" 'key' and optional 'score'; got {result}"
f"Invalid EvaluationResult: {e.errors()}.\n"
f"Got: {result}"
) from e

def _coerce_evaluation_results(
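For context, a minimal standalone sketch of the validation pattern the new `FeedbackConfigModel` relies on, assuming pydantic v1 (which the `class Config` / `root_validator` style in the diff implies). `ExampleConfig` is a hypothetical stand-in, not part of the PR; the point is that `pre=True` runs against the raw input dict before field parsing, so unknown keys can be reported in one friendly message instead of pydantic's generic "extra fields not permitted" error, which is what the tests below assert on.

```python
# Sketch only: reproduces the reject-unknown-keys pattern, assuming pydantic v1.
from typing import Literal, Optional, Union

from pydantic import BaseModel, ValidationError, root_validator


class ExampleConfig(BaseModel):  # hypothetical stand-in for FeedbackConfigModel
    type: Literal["continuous", "categorical", "freeform"]
    min: Optional[Union[float, int]] = None
    max: Optional[Union[float, int]] = None

    class Config:
        extra = "forbid"  # belt-and-braces: also rejects extras at parse time

    @root_validator(pre=True)
    def reject_unknown_fields(cls, values):
        # pre=True sees the raw input dict before field validation,
        # so the error can name every offending key at once.
        unknown = set(values) - {"type", "min", "max"}
        if unknown:
            raise ValueError(f"Unknown fields: {', '.join(sorted(unknown))}")
        return values


ExampleConfig(type="continuous", min=0, max=1)       # accepted
try:
    ExampleConfig(type="continuous", threshold=1.0)  # unknown key
except ValidationError as e:
    print(e)  # mentions "Unknown fields: threshold"
```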
94 changes: 94 additions & 0 deletions python/tests/evaluation/test_feedback_config.py
@@ -0,0 +1,94 @@
import pytest
from pydantic import ValidationError
from langsmith.evaluation.evaluator import EvaluationResult

# 1. Should accept a valid feedback_config dict
def test_feedback_config_valid_dict():
result = EvaluationResult(
key="sentiment",
value="positive",
feedback_config={
"type": "continuous",
"min": 0,
"max": 1,
"categories": [{"label": "good", "value": 1}],
}
)
assert result.feedback_config.type == "continuous"
assert result.feedback_config.min == 0
assert result.feedback_config.max == 1
assert result.feedback_config.categories == [{"label": "good", "value": 1}]

# 2. Should raise a ValidationError if unknown fields are passed
def test_feedback_config_rejects_unknown_fields():
with pytest.raises(ValidationError) as excinfo:
EvaluationResult(
key="sentiment",
value="positive",
feedback_config={"type": "continuous", "threshold": 1.0}
)
assert "Unknown fields" in str(excinfo.value)
assert "threshold" in str(excinfo.value)

# 3. Should reject non-string literal values for 'type'
def test_feedback_config_literal_enforced():
with pytest.raises(ValidationError) as excinfo:
EvaluationResult(
key="sentiment",
value="positive",
feedback_config={"type": 1.0} # 🚫 invalid
)
assert "unexpected value" in str(excinfo.value)
assert "continuous" in str(excinfo.value)
assert "categorical" in str(excinfo.value)
assert "freeform" in str(excinfo.value)

# 4. Should work when only some valid fields are present
def test_feedback_config_partial_valid():
result = EvaluationResult(
key="toxicity",
value="low",
feedback_config={"type": "categorical"} # ✅ type is required, rest optional
)
assert result.feedback_config.type == "categorical"
assert result.feedback_config.min is None
assert result.feedback_config.max is None
assert result.feedback_config.categories is None

# 5. Should raise if feedback_config dict is missing 'type'
def test_feedback_config_missing_type_rejected():
with pytest.raises(ValidationError) as excinfo:
EvaluationResult(
key="sentiment",
value="positive",
feedback_config={"min": 0, "max": 1} # 🚫 missing type
)
assert "field required" in str(excinfo.value)
assert "type" in str(excinfo.value)

# 6. Original Example: Violates literal condition for type
def test_example_violates_literal_condition():
with pytest.raises(ValidationError):
EvaluationResult(
key="sentiment",
value="positive",
feedback_config={"type": 1.0, "threshold": 1.0} # 🚫 both issues
)

# 7. Original Example: Follows literal condition but extra data rejected
def test_example_follows_literal_condition_but_extra_rejected():
with pytest.raises(ValidationError) as excinfo:
EvaluationResult(
key="sentiment",
value="positive",
feedback_config={"type": "continuous", "threshold": 1.0} # 🚫 unknown field
)
assert "Unknown fields" in str(excinfo.value)

# 8. Original Example: No feedback_config at all (✅ allowed)
def test_feedback_config_optional_completely():
result = EvaluationResult(
key="sentiment",
value="positive"
)
assert result.feedback_config is None
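
A rough usage sketch (not part of this PR) of how a custom evaluator might attach a `feedback_config` under the schema above. The `grade_sentiment` function, its `run`/`example` parameters, and the placeholder score are assumptions for illustration; the `EvaluationResult` fields mirror the diff.

```python
# Sketch only: the dict passed as feedback_config is coerced into
# FeedbackConfigModel, and any unrecognized key now raises a ValidationError.
from langsmith.evaluation.evaluator import EvaluationResult


def grade_sentiment(run, example=None) -> EvaluationResult:  # hypothetical evaluator
    score = 0.9  # placeholder for real scoring logic
    return EvaluationResult(
        key="sentiment",
        score=score,
        feedback_config={
            "type": "continuous",  # must be one of the allowed literals
            "min": 0,
            "max": 1,
        },
    )
```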