diff --git a/sagemaker-train/README.rst b/sagemaker-train/README.rst
index 90e306c42d..c1a1195902 100644
--- a/sagemaker-train/README.rst
+++ b/sagemaker-train/README.rst
@@ -47,7 +47,7 @@ Table of Contents
 
 
 Installing the SageMaker Python SDK Train
------------------------------------
+-----------------------------------------
 
 You can install from source by cloning this repository and running a pip install
 command in the root directory of the repository:
diff --git a/sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py b/sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py
index 5d37e53f8c..d6bad422c6 100644
--- a/sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py
+++ b/sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py
@@ -301,7 +301,7 @@ class BenchMarkEvaluator(BaseEvaluator):
 
     benchmark: _Benchmark
     subtasks: Optional[Union[str, List[str]]] = None
-    evaluate_base_model: bool = True
+    evaluate_base_model: bool = False
 
     _hyperparameters: Optional[Any] = None
 
diff --git a/sagemaker-train/src/sagemaker/train/evaluate/custom_scorer_evaluator.py b/sagemaker-train/src/sagemaker/train/evaluate/custom_scorer_evaluator.py
index 290a6f80ba..78d297006c 100644
--- a/sagemaker-train/src/sagemaker/train/evaluate/custom_scorer_evaluator.py
+++ b/sagemaker-train/src/sagemaker/train/evaluate/custom_scorer_evaluator.py
@@ -137,7 +137,7 @@ class CustomScorerEvaluator(BaseEvaluator):
     _hyperparameters: Optional[Any] = None
 
     # Template-required fields
-    evaluate_base_model: bool = True
+    evaluate_base_model: bool = False
 
     @validator('dataset', pre=True)
     def _resolve_dataset(cls, v):
diff --git a/sagemaker-train/src/sagemaker/train/evaluate/llm_as_judge_evaluator.py b/sagemaker-train/src/sagemaker/train/evaluate/llm_as_judge_evaluator.py
index 98e1c50c48..8438b65688 100644
--- a/sagemaker-train/src/sagemaker/train/evaluate/llm_as_judge_evaluator.py
+++ b/sagemaker-train/src/sagemaker/train/evaluate/llm_as_judge_evaluator.py
@@ -123,7 +123,7 @@ class LLMAsJudgeEvaluator(BaseEvaluator):
     custom_metrics: Optional[str] = None
 
     # Template-required fields
-    evaluate_base_model: bool = True
+    evaluate_base_model: bool = False
 
     @validator('dataset', pre=True)
     def _resolve_dataset(cls, v):
diff --git a/sagemaker-train/tests/unit/train/evaluate/test_benchmark_evaluator.py b/sagemaker-train/tests/unit/train/evaluate/test_benchmark_evaluator.py
index 858bb12d32..d87a435ba0 100644
--- a/sagemaker-train/tests/unit/train/evaluate/test_benchmark_evaluator.py
+++ b/sagemaker-train/tests/unit/train/evaluate/test_benchmark_evaluator.py
@@ -130,7 +130,7 @@ def test_benchmark_evaluator_initialization_minimal(mock_artifact, mock_resolve)
 
     assert evaluator.benchmark == _Benchmark.MMLU
     assert evaluator.model == DEFAULT_MODEL
-    assert evaluator.evaluate_base_model is True
+    assert evaluator.evaluate_base_model is False
     assert evaluator.subtasks == "ALL"
 
 
@@ -525,7 +525,7 @@ def test_benchmark_evaluator_get_benchmark_template_additions(mock_artifact, moc
     assert additions['strategy'] == 'zs_cot'
     assert additions['evaluation_metric'] == 'accuracy'
     assert additions['subtask'] == 'abstract_algebra'
-    assert additions['evaluate_base_model'] is True
+    assert additions['evaluate_base_model'] is False
 
 
 @patch('sagemaker.train.common_utils.recipe_utils._is_nova_model')
diff --git a/sagemaker-train/tests/unit/train/evaluate/test_custom_scorer_evaluator.py b/sagemaker-train/tests/unit/train/evaluate/test_custom_scorer_evaluator.py
index 1f37632903..9267cc7f73 100644
--- a/sagemaker-train/tests/unit/train/evaluate/test_custom_scorer_evaluator.py
+++ b/sagemaker-train/tests/unit/train/evaluate/test_custom_scorer_evaluator.py
@@ -85,7 +85,7 @@ def test_custom_scorer_evaluator_initialization_minimal(mock_artifact, mock_reso
     assert evaluator.evaluator == _BuiltInMetric.PRIME_MATH
     assert evaluator.dataset == DEFAULT_DATASET
     assert evaluator.model == DEFAULT_MODEL
-    assert evaluator.evaluate_base_model is True
+    assert evaluator.evaluate_base_model is False
 
 
 @patch('sagemaker.train.common_utils.model_resolution._resolve_base_model')
@@ -952,7 +952,7 @@ def test_custom_scorer_evaluator_get_custom_scorer_template_additions_builtin(
     assert additions['task'] == 'gen_qa'
     assert additions['strategy'] == 'gen_qa'
     assert additions['evaluation_metric'] == 'all'
-    assert additions['evaluate_base_model'] is True
+    assert additions['evaluate_base_model'] is False
     assert additions['evaluator_arn'] is None
     assert additions['preset_reward_function'] == 'prime_math'
     assert 'temperature' in additions
diff --git a/sagemaker-train/tests/unit/train/evaluate/test_llm_as_judge_evaluator.py b/sagemaker-train/tests/unit/train/evaluate/test_llm_as_judge_evaluator.py
index 5af23f7960..60f89b6b69 100644
--- a/sagemaker-train/tests/unit/train/evaluate/test_llm_as_judge_evaluator.py
+++ b/sagemaker-train/tests/unit/train/evaluate/test_llm_as_judge_evaluator.py
@@ -67,7 +67,7 @@ def test_llm_as_judge_evaluator_initialization_minimal(mock_artifact, mock_resol
     assert evaluator.evaluator_model == DEFAULT_EVALUATOR_MODEL
     assert evaluator.dataset == DEFAULT_DATASET
     assert evaluator.model == DEFAULT_MODEL
-    assert evaluator.evaluate_base_model is True
+    assert evaluator.evaluate_base_model is False
     assert evaluator.builtin_metrics is None
     assert evaluator.custom_metrics is None
@@ -472,7 +472,7 @@ def test_llm_as_judge_evaluator_get_llmaj_template_additions(mock_artifact, mock
     assert additions['top_p'] == '1.0'
     # pipeline_name is no longer in template additions - it's resolved dynamically in execution.py
     assert 'pipeline_name' not in additions
-    assert additions['evaluate_base_model'] is True
+    assert additions['evaluate_base_model'] is False
 
     # Verify S3 upload was called
     mock_s3_upload.assert_called_once()