
Commit 0322e3f

Chibionos, Chibi Vikram, and claude authored

fix: support Claude 4.5 models in LLM evaluators and gateway (#1269)

Co-authored-by: Chibi Vikram <chibivikram@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

1 parent 7390a18 commit 0322e3f

12 files changed: 530 additions & 15 deletions


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.8.15"
+version = "2.8.16"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"

samples/calculator/evaluations/eval-sets/default.json

Lines changed: 26 additions & 2 deletions
@@ -9,7 +9,9 @@
     "LLMJudgeOutputEvaluator",
     "LLMJudgeStrictJSONSimilarityOutputEvaluator",
     "TrajectoryEvaluator",
-    "CorrectOperatorEvaluator"
+    "CorrectOperatorEvaluator",
+    "LLMJudgeHaiku45",
+    "LLMJudgeSonnet45"
   ],
   "evaluations": [
     {
@@ -49,6 +51,16 @@
         },
         "TrajectoryEvaluator": {
           "expectedAgentBehavior": "The agent should correctly parse the multiply operation, perform the calculation 2 * 4, and return the result 8.0"
+        },
+        "LLMJudgeHaiku45": {
+          "expectedOutput": {
+            "result": 8.0
+          }
+        },
+        "LLMJudgeSonnet45": {
+          "expectedOutput": {
+            "result": 8.0
+          }
         }
       },
       "mockingStrategy": {
@@ -87,7 +99,9 @@
           "LLMJudgeOutputEvaluator": null,
           "LLMJudgeStrictJSONSimilarityOutputEvaluator": null,
           "TrajectoryEvaluator": null,
-          "CorrectOperatorEvaluator": null
+          "CorrectOperatorEvaluator": null,
+          "LLMJudgeHaiku45": null,
+          "LLMJudgeSonnet45": null
         }
       },
       {
@@ -127,6 +141,16 @@
         },
         "TrajectoryEvaluator": {
           "expectedAgentBehavior": "The agent should correctly parse the multiply operation, perform the calculation 2 * 4, and return the result 8.0"
+        },
+        "LLMJudgeHaiku45": {
+          "expectedOutput": {
+            "result": 8.0
+          }
+        },
+        "LLMJudgeSonnet45": {
+          "expectedOutput": {
+            "result": 8.0
+          }
         }
       }
     },
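
The two new judges are wired into the eval set the same way as the existing evaluators: registered once in the evaluator list, given an "expectedOutput" criterion in each evaluation, plus a null entry in the mocking section. Below is a minimal sketch of applying the same extension programmatically; the key names "evaluators" and "evaluationCriterias" are assumptions for illustration, since the hunks show the values but not every surrounding key.

import json

NEW_JUDGES = ["LLMJudgeHaiku45", "LLMJudgeSonnet45"]

with open("samples/calculator/evaluations/eval-sets/default.json") as f:
    eval_set = json.load(f)

# Register the judges alongside the existing evaluators (assumed key name).
for judge in NEW_JUDGES:
    if judge not in eval_set["evaluators"]:
        eval_set["evaluators"].append(judge)

# Give every evaluation a criterion for each new judge (assumed key name).
for evaluation in eval_set["evaluations"]:
    for judge in NEW_JUDGES:
        evaluation["evaluationCriterias"].setdefault(
            judge, {"expectedOutput": {"result": 8.0}}
        )

print(json.dumps(eval_set, indent=2))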

samples/calculator/evaluations/eval-sets/legacy.json

Lines changed: 3 additions & 1 deletion
@@ -8,7 +8,9 @@
     "equality-with-target-key",
     "llm-as-a-judge",
     "json-similarity",
-    "trajectory"
+    "trajectory",
+    "llm-as-a-judge-haiku-4.5",
+    "llm-as-a-judge-sonnet-4.5"
   ],
   "evaluations": [
     {
llm-as-a-judge-haiku-4.5.json

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+{
+  "fileName": "llm-as-a-judge-haiku-4.5.json",
+  "id": "llm-as-a-judge-haiku-4.5",
+  "name": "LLMAsAJudge Haiku 4.5 Evaluator",
+  "description": "An evaluator that judges the agent based on its run history and expected behavior using Claude Haiku 4.5",
+  "category": 3,
+  "type": 7,
+  "prompt": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nUserOrSyntheticInputGivenToAgent:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
+  "targetOutputKey": "*",
+  "model": "anthropic.claude-haiku-4-5-20251001-v1:0",
+  "maxTokens": 8000,
+  "createdAt": "2026-02-04T00:00:00.000Z",
+  "updatedAt": "2026-02-04T00:00:00.000Z"
+}
llm-as-a-judge-sonnet-4.5.json

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+{
+  "fileName": "llm-as-a-judge-sonnet-4.5.json",
+  "id": "llm-as-a-judge-sonnet-4.5",
+  "name": "LLMAsAJudge Sonnet 4.5 Evaluator",
+  "description": "An evaluator that judges the agent based on its run history and expected behavior using Claude Sonnet 4.5",
+  "category": 3,
+  "type": 7,
+  "prompt": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nUserOrSyntheticInputGivenToAgent:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
+  "targetOutputKey": "*",
+  "model": "anthropic.claude-sonnet-4-5-20250929-v1:0",
+  "maxTokens": 8000,
+  "createdAt": "2026-02-04T00:00:00.000Z",
+  "updatedAt": "2026-02-04T00:00:00.000Z"
+}
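
The two legacy evaluator definitions above differ only in name, id, file name, and model ID. One way to keep them in sync is to generate both from a shared template. A sketch under that assumption (make_judge is a hypothetical helper, not part of the SDK; the prompt and timestamps are elided here, use the full text from the files above):

import json

PROMPT = "As an expert evaluator, determine how well the agent did on a scale of 0-100. [...]"  # full prompt text as committed above

def make_judge(display: str, slug: str, model_id: str) -> dict:
    # Hypothetical generator mirroring the shared fields of the two files.
    return {
        "fileName": f"{slug}.json",
        "id": slug,
        "name": f"LLMAsAJudge {display} Evaluator",
        "description": f"An evaluator that judges the agent based on its run history and expected behavior using Claude {display}",
        "category": 3,
        "type": 7,
        "prompt": PROMPT,
        "targetOutputKey": "*",
        "model": model_id,
        "maxTokens": 8000,
    }

for display, slug, model_id in [
    ("Haiku 4.5", "llm-as-a-judge-haiku-4.5", "anthropic.claude-haiku-4-5-20251001-v1:0"),
    ("Sonnet 4.5", "llm-as-a-judge-sonnet-4.5", "anthropic.claude-sonnet-4-5-20250929-v1:0"),
]:
    print(json.dumps(make_judge(display, slug, model_id), indent=2))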
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+{
+  "version": "1.0",
+  "id": "LLMJudgeHaiku45",
+  "description": "Uses Claude Haiku 4.5 to judge semantic similarity between expected and actual output.",
+  "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity",
+  "evaluatorConfig": {
+    "name": "LLMJudgeHaiku45",
+    "targetOutputKey": "*",
+    "model": "anthropic.claude-haiku-4-5-20251001-v1:0",
+    "prompt": "Compare the following outputs and evaluate their semantic similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nProvide a score from 0-100 where 100 means semantically identical and 0 means completely different.",
+    "temperature": 0.0,
+    "maxTokens": 8000,
+    "defaultEvaluationCriteria": {
+      "expectedOutput": {
+        "result": 5.0
+      }
+    }
+  }
+}
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+{
+  "version": "1.0",
+  "id": "LLMJudgeSonnet45",
+  "description": "Uses Claude Sonnet 4.5 to judge semantic similarity between expected and actual output.",
+  "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity",
+  "evaluatorConfig": {
+    "name": "LLMJudgeSonnet45",
+    "targetOutputKey": "*",
+    "model": "anthropic.claude-sonnet-4-5-20250929-v1:0",
+    "prompt": "Compare the following outputs and evaluate their semantic similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nProvide a score from 0-100 where 100 means semantically identical and 0 means completely different.",
+    "temperature": 0.0,
+    "maxTokens": 8000,
+    "defaultEvaluationCriteria": {
+      "expectedOutput": {
+        "result": 5.0
+      }
+    }
+  }
+}
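
Both new-format evaluators share the same semantic-similarity prompt and differ only in the model ID. At evaluation time the {{ActualOutput}} and {{ExpectedOutput}} placeholders are filled in; a sketch of that substitution, assuming plain string replacement rather than the SDK's actual templating:

import json

def render_prompt(template: str, actual: dict, expected: dict) -> str:
    # Serialize both outputs so the judge sees the exact JSON being compared.
    return template.replace(
        "{{ActualOutput}}", json.dumps(actual)
    ).replace("{{ExpectedOutput}}", json.dumps(expected))

template = (
    "Compare the following outputs and evaluate their semantic similarity.\n\n"
    "Actual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\n"
    "Provide a score from 0-100 where 100 means semantically identical "
    "and 0 means completely different."
)
print(render_prompt(template, actual={"result": 8.0}, expected={"result": 8.0}))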

src/uipath/eval/evaluators/llm_as_judge_evaluator.py

Lines changed: 29 additions & 6 deletions
@@ -235,9 +235,16 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
             "tool_choice": tool_choice,
         }

-        # Only include max_tokens if explicitly set (don't pass None to API)
-        if self.evaluator_config.max_tokens is not None:
-            request_data["max_tokens"] = self.evaluator_config.max_tokens
+        # Set max_tokens - use explicit config value, or default for Claude 4.5 models
+        max_tokens_value = self.evaluator_config.max_tokens
+        if max_tokens_value is None:
+            # Claude 4.5 models require max_tokens, set default to 8000
+            if "claude-haiku-4-5" in model or "claude-sonnet-4-5" in model:
+                max_tokens_value = 8000
+
+        # Only include max_tokens if set (don't pass None to API)
+        if max_tokens_value is not None:
+            request_data["max_tokens"] = max_tokens_value

         if self.llm_service is None:
             raise UiPathEvaluationError(
@@ -252,14 +259,30 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
             f"🤖 Calling LLM evaluator with model: {model} (using function calling)"
         )
         max_tokens_str = (
-            str(self.evaluator_config.max_tokens)
-            if self.evaluator_config.max_tokens is not None
-            else "unset"
+            str(max_tokens_value) if max_tokens_value is not None else "unset"
         )
         logger.debug(
             f"Request data: model={model}, max_tokens={max_tokens_str}, temperature={self.evaluator_config.temperature}, tool_choice=required"
         )

+        # Log full request body for debugging
+        import copy
+
+        request_body_for_log = copy.deepcopy(request_data)
+        # Convert tool_choice to dict for logging
+        if "tool_choice" in request_body_for_log:
+            request_body_for_log["tool_choice"] = request_body_for_log[
+                "tool_choice"
+            ].model_dump()
+        # Convert tools to dict for logging
+        if "tools" in request_body_for_log:
+            request_body_for_log["tools"] = [
+                t.model_dump() for t in request_body_for_log["tools"]
+            ]
+        logger.info(
+            f"📤 Full request body:\n{json.dumps(request_body_for_log, indent=2)}"
+        )
+
         try:
             response = await self.llm_service(**request_data)
         except Exception as e:
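
The fix above keeps the precedence simple: an explicit max_tokens from the evaluator config still wins, Claude 4.5 models fall back to 8000 (the comment notes they require max_tokens), and all other models continue to omit the parameter. Isolated as a pure function for illustration (resolve_max_tokens is a sketch, not an SDK API):

def resolve_max_tokens(configured: int | None, model: str) -> int | None:
    """Mirror the inline logic from _get_llm_response above."""
    if configured is not None:
        return configured
    # Claude 4.5 models require max_tokens; default to 8000.
    if "claude-haiku-4-5" in model or "claude-sonnet-4-5" in model:
        return 8000
    return None  # omit the parameter from the request entirely

assert resolve_max_tokens(None, "anthropic.claude-haiku-4-5-20251001-v1:0") == 8000
assert resolve_max_tokens(4096, "anthropic.claude-sonnet-4-5-20250929-v1:0") == 4096
assert resolve_max_tokens(None, "gpt-4o") is None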

src/uipath/platform/chat/_llm_gateway_service.py

Lines changed: 52 additions & 4 deletions
@@ -507,15 +507,23 @@ class Country(BaseModel):
         )
         endpoint = Endpoint("/" + endpoint)

+        # Build request body - Claude models don't support some OpenAI-specific parameters
+        is_claude_model = "claude" in model.lower()
+
         request_body = {
             "messages": converted_messages,
             "max_tokens": max_tokens,
             "temperature": temperature,
-            "n": n,
-            "frequency_penalty": frequency_penalty,
-            "presence_penalty": presence_penalty,
-            "top_p": top_p,
         }
+
+        # Only add OpenAI-specific parameters for non-Claude models
+        if not is_claude_model:
+            request_body["n"] = n
+            request_body["frequency_penalty"] = frequency_penalty
+            request_body["presence_penalty"] = presence_penalty
+            if top_p is not None:
+                request_body["top_p"] = top_p
+
         if top_k is not None:
             request_body["top_k"] = top_k

@@ -557,8 +565,47 @@ class Country(BaseModel):
         headers = {
             **DEFAULT_LLM_HEADERS,
             "X-UiPath-LlmGateway-NormalizedApi-ModelName": model,
+            "X-UiPath-LLMGateway-AllowFull4xxResponse": "true",  # Debug: show full error details
         }

+        # Log the complete request for debugging
+        import json as json_module
+        import logging
+
+        logger = logging.getLogger(__name__)
+
+        logger.info("=" * 80)
+        logger.info("📤 LLM Gateway Normalized API Request")
+        logger.info("=" * 80)
+        logger.info(f"Model: {model}")
+        logger.info(f"Endpoint: {endpoint}")
+        logger.info(f"API Version: {NORMALIZED_API_VERSION}")
+        logger.info(f"Is Claude Model: {is_claude_model}")
+        logger.info("-" * 80)
+        logger.info("Headers:")
+        for key, value in headers.items():
+            logger.info(f"  {key}: {value}")
+        logger.info("-" * 80)
+        logger.info("Request Body:")
+        # Create a copy for logging with tools truncated for readability
+        log_body: dict[str, Any] = request_body.copy()
+        tools_list = log_body.get("tools")
+        if tools_list and isinstance(tools_list, list):
+            log_body["tools"] = f"[{len(tools_list)} tool(s)]"
+        messages_list = log_body.get("messages")
+        if messages_list and isinstance(messages_list, list):
+            log_body["messages"] = [
+                {
+                    **msg,
+                    "content": msg["content"][:100] + "..."
+                    if len(msg.get("content", "")) > 100
+                    else msg["content"],
+                }
+                for msg in messages_list
+            ]
+        logger.info(json_module.dumps(log_body, indent=2))
+        logger.info("=" * 80)
+
         async with get_llm_semaphore():
             response = await self.request_async(
                 "POST",
@@ -568,6 +615,7 @@ class Country(BaseModel):
                 headers=headers,
             )

+        logger.info(f"✅ Response received with status: {response.status_code}")
         return ChatCompletion.model_validate(response.json())

     def _convert_tool_to_uipath_format(self, tool: ToolDefinition) -> dict[str, Any]:
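
The first hunk's gating reads as a standalone rule: n, frequency_penalty, and presence_penalty are OpenAI-specific and are dropped for any model whose name contains "claude", while top_p is now only forwarded when explicitly set. A self-contained sketch of the same construction (build_request_body is illustrative, not the service's API):

from typing import Any

def build_request_body(
    model: str,
    messages: list[dict[str, Any]],
    max_tokens: int,
    temperature: float,
    n: int = 1,
    frequency_penalty: float = 0.0,
    presence_penalty: float = 0.0,
    top_p: float | None = None,
) -> dict[str, Any]:
    body: dict[str, Any] = {
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    # Claude models don't accept these OpenAI-specific parameters, so they
    # are only included for non-Claude models.
    if "claude" not in model.lower():
        body["n"] = n
        body["frequency_penalty"] = frequency_penalty
        body["presence_penalty"] = presence_penalty
        if top_p is not None:
            body["top_p"] = top_p
    return body

body = build_request_body(
    model="anthropic.claude-sonnet-4-5-20250929-v1:0",
    messages=[{"role": "user", "content": "What is 2 * 4?"}],
    max_tokens=8000,
    temperature=0.0,
)
assert "frequency_penalty" not in body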
