droidrun · RasulOs · Jun 11, 2026 · Jun 11, 2026
diff --git a/mobilerun/agent/providers/setup_service.py b/mobilerun/agent/providers/setup_service.py
@@ -18,6 +18,10 @@
 DEFAULT_KWARGS_BY_VARIANT: dict[str, dict[str, int]] = {
     "anthropic_oauth": {"max_tokens": 1024},
     "gemini_oauth_code_assist": {"max_tokens": 1024},
+    # Without this, llama-index asks Ollama for the model's MAXIMUM context,
+    # which allocates the full KV cache (256K-context models -> ~19 GB) and
+    # spills to CPU on typical machines. context_window=-1 restores model-max.
+    "Ollama": {"context_window": 32768},
 }
 
 HIDDEN_ROLE_FALLBACKS: tuple[str, ...] = ("app_opener", "structured_output")

diff --git a/mobilerun/agent/utils/llm_picker.py b/mobilerun/agent/utils/llm_picker.py
@@ -88,6 +88,78 @@ def _validate_gemini_oauth_model(model: object) -> None:
         )
 
 
+# Default Ollama context size. llama-index's own default (-1) resolves to the
+# model's maximum context, which allocates the full KV cache up front (e.g. a
+# 256K-context model -> ~19 GB, spilling to CPU) — and because mobilerun sends
+# num_ctx per request, it overrides every Ollama-side setting
+# (OLLAMA_CONTEXT_LENGTH, Modelfile, /set parameter), so users cannot fix it
+# server-side. ``context_window: -1`` in profile kwargs restores model-max.
+_OLLAMA_DEFAULT_CONTEXT_WINDOW = 32768
+
+_warned_ollama_kwargs: set[str] = set()
+
+
+def _prepare_ollama_kwargs(kwargs: dict[str, Any], llm_class: Any) -> dict[str, Any]:
+    """Translate provider-portable kwargs for llama-index's Ollama class.
+
+    ``max_tokens`` is not an Ollama constructor field and pydantic silently
+    drops it; translate it to ``additional_kwargs.num_predict`` (an explicit
+    ``num_predict`` wins). Also default ``context_window`` (see
+    ``_OLLAMA_DEFAULT_CONTEXT_WINDOW``) and keep it aligned with an explicit
+    ``additional_kwargs.num_ctx`` so the -1 path's hidden ``client.show()``
+    lookup is never triggered by mobilerun defaults. Works on copies, so the
+    caller's ``kwargs`` and ``additional_kwargs`` dicts are never mutated.
+    """
+    kwargs = dict(kwargs)
+    additional_kwargs = dict(kwargs.get("additional_kwargs") or {})
+
+    if "max_tokens" in kwargs and "max_tokens" not in llm_class.model_fields:
+        max_tokens = kwargs.pop("max_tokens")
+        valid_max_tokens = None
+        if not isinstance(max_tokens, bool) and max_tokens is not None:
+            try:
+                valid_max_tokens = int(max_tokens)
+            except (TypeError, ValueError, OverflowError):
+                pass  # OverflowError: int(float("inf")); warned about below
+        if valid_max_tokens is None:
+            logger.warning(
+                f"Ignoring non-integer max_tokens={max_tokens!r} for Ollama."
+            )
+        elif "num_predict" in additional_kwargs:
+            if additional_kwargs["num_predict"] != valid_max_tokens:
+                logger.warning(
+                    f"Both max_tokens={valid_max_tokens} and "
+                    f"additional_kwargs.num_predict="
+                    f"{additional_kwargs['num_predict']} are set for Ollama; "
+                    f"num_predict wins."
+                )
+        else:
+            additional_kwargs["num_predict"] = valid_max_tokens
+
+    if kwargs.get("context_window") is None:
+        context_window = _OLLAMA_DEFAULT_CONTEXT_WINDOW
+        if "num_ctx" in additional_kwargs:
+            try:
+                context_window = int(additional_kwargs["num_ctx"])
+            except (TypeError, ValueError, OverflowError):
+                pass  # unusable num_ctx: keep the default context window
+        kwargs["context_window"] = context_window
+
+    if additional_kwargs:
+        kwargs["additional_kwargs"] = additional_kwargs
+
+    for key, value in kwargs.items():  # warn once per option the class lacks
+        if value is None or key in llm_class.model_fields:
+            continue
+        if key not in _warned_ollama_kwargs:
+            _warned_ollama_kwargs.add(key)
+            logger.warning(
+                f"Ollama does not accept the {key!r} option; it will be ignored."
+            )
+
+    return kwargs
+
+
 def _load_openai_responses(**kwargs: Any) -> LLM:
     from llama_index.llms.openai.responses import OpenAIResponses
 
@@ -232,6 +304,7 @@ def load_llm(provider_name: str, model: str | None = None, **kwargs: Any) -> LLM
         from llama_index.llms.ollama import Ollama
 
         llm_class = Ollama
+        kwargs = _prepare_ollama_kwargs(kwargs, Ollama)
     elif provider_name == "Anthropic":
         return _load_anthropic(**kwargs)
     elif provider_name == "OpenRouter":

diff --git a/mobilerun/cli/configure_wizard.py b/mobilerun/cli/configure_wizard.py
@@ -259,6 +259,18 @@ def _set_profile_max_tokens(profile: Any, value: int) -> None:
     profile.kwargs["max_tokens"] = value
 
 
+def _set_profile_context_window(profile: Any, value: int) -> None:
+    """Rebind ``profile.kwargs`` to a copy that has ``context_window`` set."""
+    profile.kwargs = dict(profile.kwargs, context_window=value)
+
+
+def _any_ollama_profile(config) -> bool:
+    """Report whether any profile in ``config.llm_profiles`` uses Ollama."""
+    # Profiles without a ``provider`` attribute count as non-Ollama.
+    providers = (getattr(p, "provider", "") for p in config.llm_profiles.values())
+    return any(name == "Ollama" for name in providers)
+
+
 def _toggle_label(enabled: bool) -> str:
     """Return a toggle indicator: ON/OFF."""
     return "[ON]" if enabled else "[OFF]"
@@ -289,31 +301,40 @@ def _configure_advanced_settings(
         vision_on = _is_vision_enabled(config)
         reasoning_on = config.agent.reasoning
 
+        choices = [
+            SelectChoice(
+                value="vision",
+                label=f"Vision {_toggle_label(vision_on)}",
+            ),
+            SelectChoice(
+                value="reasoning",
+                label=f"Reasoning {_toggle_label(reasoning_on)}",
+            ),
+            SelectChoice(
+                value="max_steps",
+                label="Maximum steps",
+            ),
+            SelectChoice(
+                value="temperature",
+                label="Temperature",
+            ),
+            SelectChoice(
+                value="max_tokens",
+                label="Max tokens",
+            ),
+        ]
+        if _any_ollama_profile(config):
+            choices.append(
+                SelectChoice(
+                    value="context_window",
+                    label="Context window (Ollama)",
+                )
+            )
+        choices.append(SelectChoice(value="done", label="Done"))
+
         selected = _select_with_back(
             "Advanced settings",
-            [
-                SelectChoice(
-                    value="vision",
-                    label=f"Vision {_toggle_label(vision_on)}",
-                ),
-                SelectChoice(
-                    value="reasoning",
-                    label=f"Reasoning {_toggle_label(reasoning_on)}",
-                ),
-                SelectChoice(
-                    value="max_steps",
-                    label="Maximum steps",
-                ),
-                SelectChoice(
-                    value="temperature",
-                    label="Temperature",
-                ),
-                SelectChoice(
-                    value="max_tokens",
-                    label="Max tokens",
-                ),
-                SelectChoice(value="done", label="Done"),
-            ],
+            choices,
             default=default_selection,
         )
 
@@ -346,6 +367,20 @@ def _configure_advanced_settings(
             for role in _ALL_CONFIG_ROLES:
                 if role in config.llm_profiles:
                     _set_profile_max_tokens(config.llm_profiles[role], value)
+        elif selected == "context_window":
+            current_value = config.llm_profiles[_ALL_CONFIG_ROLES[0]].kwargs.get(
+                "context_window", 32768
+            )
+            try:
+                current_default = int(current_value)
+            except (TypeError, ValueError):
+                current_default = 32768
+            value = _prompt_int(
+                console, "Context window (-1 = model max)", default=current_default
+            )
+            for role in _ALL_CONFIG_ROLES:
+                if role in config.llm_profiles:
+                    _set_profile_context_window(config.llm_profiles[role], value)
 
         default_selection = selected
 

diff --git a/mobilerun/config_example.yaml b/mobilerun/config_example.yaml
@@ -76,6 +76,16 @@ llm_profiles:
     # api_key_source: auto  # auto = saved env file first, then shell env; env = shell only; file = saved env file
     # kwargs: # optional kwargs, add api_key in kwargs if not already in .env
     #   max_tokens: 8192
+    #
+    # Ollama example — max_tokens is translated to Ollama's num_predict, and
+    # context_window controls num_ctx (defaults to 32768; -1 = model maximum,
+    # which preallocates the full KV cache and can spill to CPU):
+    # provider: Ollama
+    # model: qwen3:8b
+    # base_url: http://localhost:11434
+    # kwargs:
+    #   max_tokens: 2048
+    #   context_window: 32768
 
   # Executor: Selects and executes atomic actions
   executor:

diff --git a/mobilerun/config_manager/loader.py b/mobilerun/config_manager/loader.py
@@ -39,7 +39,9 @@ def load(cls, config_path: Optional[str] = None) -> MobileConfig:
         Load config with resolution order:
         1. Explicit config_path argument
         2. MOBILERUN_CONFIG env var
-        3. User config (~/.config/mobilerun/config.yaml)
+        3. User config (platformdirs config dir for "droidrun", e.g.
+           ~/Library/Application Support/droidrun/config.yaml on macOS,
+           ~/.config/droidrun/config.yaml on Linux)
         4. Package defaults (creates user config)
         """
         if config_path:

diff --git a/tests/test_llm_picker.py b/tests/test_llm_picker.py
@@ -161,3 +161,147 @@ def test_anthropic_current_catalog_models_have_metadata(model: str) -> None:
 
     assert metadata.model_name == model
     assert metadata.context_window > 0
+
+
+# --- Ollama kwarg translation (max_tokens / context_window) ------------------
+
+
+import logging
+
+
+@pytest.fixture
+def mobilerun_caplog(caplog):
+    """Yield caplog with the non-propagating "mobilerun" logger wired in."""
+    mobilerun_logger = logging.getLogger("mobilerun")
+    # Remember the flag so teardown can put it back after the test.
+    saved_propagate, mobilerun_logger.propagate = mobilerun_logger.propagate, True
+    caplog.set_level(logging.WARNING, logger="mobilerun")
+    yield caplog
+    mobilerun_logger.propagate = saved_propagate
+
+
+def _ollama_class():
+    """Import llama-index's Ollama class lazily and return it."""
+    import llama_index.llms.ollama as ollama_module
+    return ollama_module.Ollama
+
+
+def _prepare(kwargs):
+    """Run ``_prepare_ollama_kwargs`` on ``kwargs`` with the real Ollama class."""
+    from mobilerun.agent.utils import llm_picker
+    return llm_picker._prepare_ollama_kwargs(kwargs, _ollama_class())
+
+
+def test_ollama_max_tokens_translates_to_num_predict() -> None:
+    """``max_tokens`` is removed and re-emitted as ``num_predict``."""
+    prepared = _prepare({"model": "qwen3:0.6b", "max_tokens": 2048})
+    assert prepared["additional_kwargs"]["num_predict"] == 2048
+    assert "max_tokens" not in prepared
+
+
+def test_ollama_explicit_num_predict_wins_over_max_tokens(mobilerun_caplog) -> None:
+    """A conflicting ``max_tokens`` loses to ``num_predict`` and is warned about."""
+    conflicting = {
+        "model": "qwen3:0.6b",
+        "max_tokens": 2048,
+        "additional_kwargs": {"num_predict": 512},
+    }
+    prepared = _prepare(conflicting)
+    logged = [record.message for record in mobilerun_caplog.records]
+    assert prepared["additional_kwargs"]["num_predict"] == 512
+    assert any("num_predict wins" in message for message in logged)
+
+
+def test_ollama_equal_num_predict_and_max_tokens_no_warning(mobilerun_caplog) -> None:
+    """Agreeing ``max_tokens`` and ``num_predict`` values produce no warning."""
+    agreeing = {
+        "model": "qwen3:0.6b",
+        "max_tokens": 512,
+        "additional_kwargs": {"num_predict": 512},
+    }
+    prepared = _prepare(agreeing)
+    logged = [record.message for record in mobilerun_caplog.records]
+    assert prepared["additional_kwargs"]["num_predict"] == 512
+    assert all("num_predict wins" not in message for message in logged)
+
+
+def test_ollama_numeric_string_max_tokens_is_converted() -> None:
+    """A digit-string ``max_tokens`` is converted to ``num_predict`` 1024."""
+    prepared = _prepare({"model": "qwen3:0.6b", "max_tokens": "1024"})
+    assert prepared["additional_kwargs"]["num_predict"] == 1024
+
+
+@pytest.mark.parametrize("bad", ["lots", True, None])
+def test_ollama_invalid_max_tokens_warns_and_skips(bad, mobilerun_caplog) -> None:
+    """Unusable ``max_tokens`` values are dropped with a warning, not translated."""
+    prepared = _prepare({"model": "qwen3:0.6b", "max_tokens": bad})
+    assert "max_tokens" not in prepared
+    assert "num_predict" not in prepared.get("additional_kwargs", {})
+    assert any("Ignoring non-integer max_tokens" in rec.message for rec in mobilerun_caplog.records)
+
+
+def test_ollama_context_window_defaults_to_32k() -> None:
+    """With no context settings, ``context_window`` falls back to 32768."""
+    prepared = _prepare({"model": "qwen3:0.6b"})
+    assert prepared["context_window"] == 32768
+
+
+@pytest.mark.parametrize("explicit", [8192, -1])
+def test_ollama_explicit_context_window_is_preserved(explicit) -> None:
+    """A caller-supplied ``context_window`` (including -1) passes through as-is."""
+    prepared = _prepare({"model": "qwen3:0.6b", "context_window": explicit})
+    assert prepared["context_window"] == explicit
+
+
+def test_ollama_num_ctx_mirrors_into_context_window() -> None:
+    """An explicit ``num_ctx`` is kept and copied into ``context_window``."""
+    options = {"num_ctx": 16384}
+    prepared = _prepare({"model": "qwen3:0.6b", "additional_kwargs": options})
+
+    assert prepared["additional_kwargs"]["num_ctx"] == 16384
+    assert prepared["context_window"] == 16384
+
+
+def test_ollama_non_numeric_num_ctx_falls_back_to_default() -> None:
+    """A ``num_ctx`` that is not a number leaves ``context_window`` at 32768."""
+    options = {"num_ctx": "max"}
+    prepared = _prepare({"model": "qwen3:0.6b", "additional_kwargs": options})
+
+    assert prepared["context_window"] == 32768
+
+
+def test_ollama_unknown_kwarg_warns_once(mobilerun_caplog) -> None:
+    """Two calls with the same unsupported option log only a single warning."""
+    from mobilerun.agent.utils import llm_picker
+    # Forget any earlier warning so this test observes the first one itself.
+    llm_picker._warned_ollama_kwargs.discard("frobnicate")
+    for _ in range(2):
+        _prepare({"model": "qwen3:0.6b", "frobnicate": 1})
+    hits = sum("'frobnicate'" in rec.message for rec in mobilerun_caplog.records)
+    assert hits == 1
+
+
+def test_ollama_translation_disabled_if_class_grows_max_tokens_field() -> None:
+    """When the class declares ``max_tokens`` itself, the kwarg is left alone."""
+    from mobilerun.agent.utils.llm_picker import _prepare_ollama_kwargs
+    # Stand-in class that exposes nothing but a ``model_fields`` mapping.
+    class FakeOllama:
+        model_fields = dict.fromkeys(("model", "max_tokens", "context_window"))
+
+    prepared = _prepare_ollama_kwargs({"model": "m", "max_tokens": 99}, FakeOllama)
+    assert "additional_kwargs" not in prepared
+    assert prepared["max_tokens"] == 99
+
+
+def test_load_llm_ollama_end_to_end_applies_translation() -> None:
+    """``load_llm`` applies the Ollama kwarg translation to the returned LLM."""
+    ollama_llm = load_llm("Ollama", model="qwen3:0.6b", max_tokens=256)
+    assert ollama_llm.additional_kwargs["num_predict"] == 256
+    assert ollama_llm.context_window == 32768
+    assert not (hasattr(ollama_llm, "max_tokens") and "max_tokens" in type(ollama_llm).model_fields)
+
+
+def test_ollama_wizard_default_includes_context_window() -> None:
+    """The setup defaults for the "Ollama" variant pin ``context_window`` to 32768."""
+    from mobilerun.agent.providers import setup_service
+    assert setup_service.DEFAULT_KWARGS_BY_VARIANT["Ollama"] == {"context_window": 32768}