
Commit 0322e3f

Chibionos, Chibi Vikram, and claude authored

fix: support Claude 4.5 models in LLM evaluators and gateway (#1269)

Co-authored-by: Chibi Vikram <chibivikram@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

1 parent 7390a18 commit 0322e3f

12 files changed: 530 additions & 15 deletions


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.8.15"
+version = "2.8.16"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"

samples/calculator/evaluations/eval-sets/default.json

Lines changed: 26 additions & 2 deletions
@@ -9,7 +9,9 @@
     "LLMJudgeOutputEvaluator",
     "LLMJudgeStrictJSONSimilarityOutputEvaluator",
     "TrajectoryEvaluator",
-    "CorrectOperatorEvaluator"
+    "CorrectOperatorEvaluator",
+    "LLMJudgeHaiku45",
+    "LLMJudgeSonnet45"
   ],
   "evaluations": [
     {
@@ -49,6 +51,16 @@
         },
         "TrajectoryEvaluator": {
           "expectedAgentBehavior": "The agent should correctly parse the multiply operation, perform the calculation 2 * 4, and return the result 8.0"
+        },
+        "LLMJudgeHaiku45": {
+          "expectedOutput": {
+            "result": 8.0
+          }
+        },
+        "LLMJudgeSonnet45": {
+          "expectedOutput": {
+            "result": 8.0
+          }
         }
       },
       "mockingStrategy": {
@@ -87,7 +99,9 @@
           "LLMJudgeOutputEvaluator": null,
           "LLMJudgeStrictJSONSimilarityOutputEvaluator": null,
           "TrajectoryEvaluator": null,
-          "CorrectOperatorEvaluator": null
+          "CorrectOperatorEvaluator": null,
+          "LLMJudgeHaiku45": null,
+          "LLMJudgeSonnet45": null
         }
       },
       {
@@ -127,6 +141,16 @@
         },
         "TrajectoryEvaluator": {
           "expectedAgentBehavior": "The agent should correctly parse the multiply operation, perform the calculation 2 * 4, and return the result 8.0"
+        },
+        "LLMJudgeHaiku45": {
+          "expectedOutput": {
+            "result": 8.0
+          }
+        },
+        "LLMJudgeSonnet45": {
+          "expectedOutput": {
+            "result": 8.0
+          }
         }
       }
     },
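
The two new judges are wired into the eval set the same way as the existing evaluators: registered once in the evaluator list, given an "expectedOutput" criterion in each evaluation, plus a null entry in the mocking section. Below is a minimal sketch of applying the same extension programmatically; the key names "evaluators" and "evaluationCriterias" are assumptions for illustration, since the hunks show the values but not every surrounding key.

import json

NEW_JUDGES = ["LLMJudgeHaiku45", "LLMJudgeSonnet45"]

with open("samples/calculator/evaluations/eval-sets/default.json") as f:
    eval_set = json.load(f)

# Register the judges alongside the existing evaluators (assumed key name).
for judge in NEW_JUDGES:
    if judge not in eval_set["evaluators"]:
        eval_set["evaluators"].append(judge)

# Give every evaluation a criterion for each new judge (assumed key name).
for evaluation in eval_set["evaluations"]:
    for judge in NEW_JUDGES:
        evaluation["evaluationCriterias"].setdefault(
            judge, {"expectedOutput": {"result": 8.0}}
        )

print(json.dumps(eval_set, indent=2))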

samples/calculator/evaluations/eval-sets/legacy.json

Lines changed: 3 additions & 1 deletion
@@ -8,7 +8,9 @@
     "equality-with-target-key",
     "llm-as-a-judge",
     "json-similarity",
-    "trajectory"
+    "trajectory",
+    "llm-as-a-judge-haiku-4.5",
+    "llm-as-a-judge-sonnet-4.5"
   ],
   "evaluations": [
     {
llm-as-a-judge-haiku-4.5.json

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+{
+  "fileName": "llm-as-a-judge-haiku-4.5.json",
+  "id": "llm-as-a-judge-haiku-4.5",
+  "name": "LLMAsAJudge Haiku 4.5 Evaluator",
+  "description": "An evaluator that judges the agent based on its run history and expected behavior using Claude Haiku 4.5",
+  "category": 3,
+  "type": 7,
+  "prompt": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nUserOrSyntheticInputGivenToAgent:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
+  "targetOutputKey": "*",
+  "model": "anthropic.claude-haiku-4-5-20251001-v1:0",
+  "maxTokens": 8000,
+  "createdAt": "2026-02-04T00:00:00.000Z",
+  "updatedAt": "2026-02-04T00:00:00.000Z"
+}
llm-as-a-judge-sonnet-4.5.json

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+{
+  "fileName": "llm-as-a-judge-sonnet-4.5.json",
+  "id": "llm-as-a-judge-sonnet-4.5",
+  "name": "LLMAsAJudge Sonnet 4.5 Evaluator",
+  "description": "An evaluator that judges the agent based on its run history and expected behavior using Claude Sonnet 4.5",
+  "category": 3,
+  "type": 7,
+  "prompt": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nUserOrSyntheticInputGivenToAgent:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
+  "targetOutputKey": "*",
+  "model": "anthropic.claude-sonnet-4-5-20250929-v1:0",
+  "maxTokens": 8000,
+  "createdAt": "2026-02-04T00:00:00.000Z",
+  "updatedAt": "2026-02-04T00:00:00.000Z"
+}
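
The two legacy evaluator definitions above differ only in name, id, file name, and model ID. One way to keep them in sync is to generate both from a shared template. A sketch under that assumption (make_judge is a hypothetical helper, not part of the SDK; the prompt and timestamps are elided here, use the full text from the files above):

import json

PROMPT = "As an expert evaluator, determine how well the agent did on a scale of 0-100. [...]"  # full prompt text as committed above

def make_judge(display: str, slug: str, model_id: str) -> dict:
    # Hypothetical generator mirroring the shared fields of the two files.
    return {
        "fileName": f"{slug}.json",
        "id": slug,
        "name": f"LLMAsAJudge {display} Evaluator",
        "description": f"An evaluator that judges the agent based on its run history and expected behavior using Claude {display}",
        "category": 3,
        "type": 7,
        "prompt": PROMPT,
        "targetOutputKey": "*",
        "model": model_id,
        "maxTokens": 8000,
    }

for display, slug, model_id in [
    ("Haiku 4.5", "llm-as-a-judge-haiku-4.5", "anthropic.claude-haiku-4-5-20251001-v1:0"),
    ("Sonnet 4.5", "llm-as-a-judge-sonnet-4.5", "anthropic.claude-sonnet-4-5-20250929-v1:0"),
]:
    print(json.dumps(make_judge(display, slug, model_id), indent=2))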
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+{
+  "version": "1.0",
+  "id": "LLMJudgeHaiku45",
+  "description": "Uses Claude Haiku 4.5 to judge semantic similarity between expected and actual output.",
+  "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity",
+  "evaluatorConfig": {
+    "name": "LLMJudgeHaiku45",
+    "targetOutputKey": "*",
+    "model": "anthropic.claude-haiku-4-5-20251001-v1:0",
+    "prompt": "Compare the following outputs and evaluate their semantic similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nProvide a score from 0-100 where 100 means semantically identical and 0 means completely different.",
+    "temperature": 0.0,
+    "maxTokens": 8000,
+    "defaultEvaluationCriteria": {
+      "expectedOutput": {
+        "result": 5.0
+      }
+    }
+  }
+}
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+{
+  "version": "1.0",
+  "id": "LLMJudgeSonnet45",
+  "description": "Uses Claude Sonnet 4.5 to judge semantic similarity between expected and actual output.",
+  "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity",
+  "evaluatorConfig": {
+    "name": "LLMJudgeSonnet45",
+    "targetOutputKey": "*",
+    "model": "anthropic.claude-sonnet-4-5-20250929-v1:0",
+    "prompt": "Compare the following outputs and evaluate their semantic similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nProvide a score from 0-100 where 100 means semantically identical and 0 means completely different.",
+    "temperature": 0.0,
+    "maxTokens": 8000,
+    "defaultEvaluationCriteria": {
+      "expectedOutput": {
+        "result": 5.0
+      }
+    }
+  }
+}
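
Both new-format evaluators share the same semantic-similarity prompt and differ only in the model ID. At evaluation time the {{ActualOutput}} and {{ExpectedOutput}} placeholders are filled in; a sketch of that substitution, assuming plain string replacement rather than the SDK's actual templating:

import json

def render_prompt(template: str, actual: dict, expected: dict) -> str:
    # Serialize both outputs so the judge sees the exact JSON being compared.
    return template.replace(
        "{{ActualOutput}}", json.dumps(actual)
    ).replace("{{ExpectedOutput}}", json.dumps(expected))

template = (
    "Compare the following outputs and evaluate their semantic similarity.\n\n"
    "Actual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\n"
    "Provide a score from 0-100 where 100 means semantically identical "
    "and 0 means completely different."
)
print(render_prompt(template, actual={"result": 8.0}, expected={"result": 8.0}))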

src/uipath/eval/evaluators/llm_as_judge_evaluator.py

Lines changed: 29 additions & 6 deletions
@@ -235,9 +235,16 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
             "tool_choice": tool_choice,
         }

-        # Only include max_tokens if explicitly set (don't pass None to API)
-        if self.evaluator_config.max_tokens is not None:
-            request_data["max_tokens"] = self.evaluator_config.max_tokens
+        # Set max_tokens - use explicit config value, or default for Claude 4.5 models
+        max_tokens_value = self.evaluator_config.max_tokens
+        if max_tokens_value is None:
+            # Claude 4.5 models require max_tokens, set default to 8000
+            if "claude-haiku-4-5" in model or "claude-sonnet-4-5" in model:
+                max_tokens_value = 8000
+
+        # Only include max_tokens if set (don't pass None to API)
+        if max_tokens_value is not None:
+            request_data["max_tokens"] = max_tokens_value

         if self.llm_service is None:
             raise UiPathEvaluationError(
@@ -252,14 +259,30 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
             f"🤖 Calling LLM evaluator with model: {model} (using function calling)"
         )
         max_tokens_str = (
-            str(self.evaluator_config.max_tokens)
-            if self.evaluator_config.max_tokens is not None
-            else "unset"
+            str(max_tokens_value) if max_tokens_value is not None else "unset"
         )
         logger.debug(
             f"Request data: model={model}, max_tokens={max_tokens_str}, temperature={self.evaluator_config.temperature}, tool_choice=required"
         )

+        # Log full request body for debugging
+        import copy
+
+        request_body_for_log = copy.deepcopy(request_data)
+        # Convert tool_choice to dict for logging
+        if "tool_choice" in request_body_for_log:
+            request_body_for_log["tool_choice"] = request_body_for_log[
+                "tool_choice"
+            ].model_dump()
+        # Convert tools to dict for logging
+        if "tools" in request_body_for_log:
+            request_body_for_log["tools"] = [
+                t.model_dump() for t in request_body_for_log["tools"]
+            ]
+        logger.info(
+            f"📤 Full request body:\n{json.dumps(request_body_for_log, indent=2)}"
+        )
+
         try:
             response = await self.llm_service(**request_data)
         except Exception as e:
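
The fix above keeps the precedence simple: an explicit max_tokens from the evaluator config still wins, Claude 4.5 models fall back to 8000 (the comment notes they require max_tokens), and all other models continue to omit the parameter. Isolated as a pure function for illustration (resolve_max_tokens is a sketch, not an SDK API):

def resolve_max_tokens(configured: int | None, model: str) -> int | None:
    """Mirror the inline logic from _get_llm_response above."""
    if configured is not None:
        return configured
    # Claude 4.5 models require max_tokens; default to 8000.
    if "claude-haiku-4-5" in model or "claude-sonnet-4-5" in model:
        return 8000
    return None  # omit the parameter from the request entirely

assert resolve_max_tokens(None, "anthropic.claude-haiku-4-5-20251001-v1:0") == 8000
assert resolve_max_tokens(4096, "anthropic.claude-sonnet-4-5-20250929-v1:0") == 4096
assert resolve_max_tokens(None, "gpt-4o") is None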

src/uipath/platform/chat/_llm_gateway_service.py

Lines changed: 52 additions & 4 deletions
@@ -507,15 +507,23 @@ class Country(BaseModel):
         )
         endpoint = Endpoint("/" + endpoint)

+        # Build request body - Claude models don't support some OpenAI-specific parameters
+        is_claude_model = "claude" in model.lower()
+
         request_body = {
             "messages": converted_messages,
             "max_tokens": max_tokens,
             "temperature": temperature,
-            "n": n,
-            "frequency_penalty": frequency_penalty,
-            "presence_penalty": presence_penalty,
-            "top_p": top_p,
         }
+
+        # Only add OpenAI-specific parameters for non-Claude models
+        if not is_claude_model:
+            request_body["n"] = n
+            request_body["frequency_penalty"] = frequency_penalty
+            request_body["presence_penalty"] = presence_penalty
+            if top_p is not None:
+                request_body["top_p"] = top_p
+
         if top_k is not None:
             request_body["top_k"] = top_k

@@ -557,8 +565,47 @@ class Country(BaseModel):
         headers = {
             **DEFAULT_LLM_HEADERS,
             "X-UiPath-LlmGateway-NormalizedApi-ModelName": model,
+            "X-UiPath-LLMGateway-AllowFull4xxResponse": "true",  # Debug: show full error details
         }

+        # Log the complete request for debugging
+        import json as json_module
+        import logging
+
+        logger = logging.getLogger(__name__)
+
+        logger.info("=" * 80)
+        logger.info("📤 LLM Gateway Normalized API Request")
+        logger.info("=" * 80)
+        logger.info(f"Model: {model}")
+        logger.info(f"Endpoint: {endpoint}")
+        logger.info(f"API Version: {NORMALIZED_API_VERSION}")
+        logger.info(f"Is Claude Model: {is_claude_model}")
+        logger.info("-" * 80)
+        logger.info("Headers:")
+        for key, value in headers.items():
+            logger.info(f"  {key}: {value}")
+        logger.info("-" * 80)
+        logger.info("Request Body:")
+        # Create a copy for logging with tools truncated for readability
+        log_body: dict[str, Any] = request_body.copy()
+        tools_list = log_body.get("tools")
+        if tools_list and isinstance(tools_list, list):
+            log_body["tools"] = f"[{len(tools_list)} tool(s)]"
+        messages_list = log_body.get("messages")
+        if messages_list and isinstance(messages_list, list):
+            log_body["messages"] = [
+                {
+                    **msg,
+                    "content": msg["content"][:100] + "..."
+                    if len(msg.get("content", "")) > 100
+                    else msg["content"],
+                }
+                for msg in messages_list
+            ]
+        logger.info(json_module.dumps(log_body, indent=2))
+        logger.info("=" * 80)
+
         async with get_llm_semaphore():
             response = await self.request_async(
                 "POST",
@@ -568,6 +615,7 @@ class Country(BaseModel):
                 headers=headers,
             )

+        logger.info(f"✅ Response received with status: {response.status_code}")
         return ChatCompletion.model_validate(response.json())

     def _convert_tool_to_uipath_format(self, tool: ToolDefinition) -> dict[str, Any]:
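
The first hunk's gating reads as a standalone rule: n, frequency_penalty, and presence_penalty are OpenAI-specific and are dropped for any model whose name contains "claude", while top_p is now only forwarded when explicitly set. A self-contained sketch of the same construction (build_request_body is illustrative, not the service's API):

from typing import Any

def build_request_body(
    model: str,
    messages: list[dict[str, Any]],
    max_tokens: int,
    temperature: float,
    n: int = 1,
    frequency_penalty: float = 0.0,
    presence_penalty: float = 0.0,
    top_p: float | None = None,
) -> dict[str, Any]:
    body: dict[str, Any] = {
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    # Claude models don't accept these OpenAI-specific parameters, so they
    # are only included for non-Claude models.
    if "claude" not in model.lower():
        body["n"] = n
        body["frequency_penalty"] = frequency_penalty
        body["presence_penalty"] = presence_penalty
        if top_p is not None:
            body["top_p"] = top_p
    return body

body = build_request_body(
    model="anthropic.claude-sonnet-4-5-20250929-v1:0",
    messages=[{"role": "user", "content": "What is 2 * 4?"}],
    max_tokens=8000,
    temperature=0.0,
)
assert "frequency_penalty" not in body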
