30 changes: 6 additions & 24 deletions demos/common/export_models/README.md
@@ -41,30 +41,11 @@ python export_model.py text_generation --help
```
Expected Output:
```console
usage: export_model.py text_generation [-h]
[--model_repository_path MODEL_REPOSITORY_PATH]
--source_model SOURCE_MODEL
[--model_name MODEL_NAME]
[--weight-format PRECISION]
[--config_file_path CONFIG_FILE_PATH]
[--overwrite_models]
[--target_device TARGET_DEVICE]
[--ov_cache_dir OV_CACHE_DIR]
[--extra_quantization_params EXTRA_QUANTIZATION_PARAMS]
[--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}]
[--kv_cache_precision {u8}]
[--enable_prefix_caching]
[--disable_dynamic_split_fuse]
[--max_num_batched_tokens MAX_NUM_BATCHED_TOKENS]
[--max_num_seqs MAX_NUM_SEQS]
[--cache_size CACHE_SIZE]
[--draft_source_model DRAFT_SOURCE_MODEL]
[--draft_model_name DRAFT_MODEL_NAME]
[--max_prompt_len MAX_PROMPT_LEN]
[--prompt_lookup_decoding]
[--reasoning_parser {qwen3,gptoss}]
[--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss}]
[--enable_tool_guided_generation]
usage: export_model.py text_generation [-h] [--model_repository_path MODEL_REPOSITORY_PATH] --source_model SOURCE_MODEL [--model_name MODEL_NAME] [--weight-format PRECISION] [--config_file_path CONFIG_FILE_PATH] [--overwrite_models] [--target_device TARGET_DEVICE] [--ov_cache_dir OV_CACHE_DIR]
[--extra_quantization_params EXTRA_QUANTIZATION_PARAMS] [--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}] [--kv_cache_precision {u8}] [--enable_prefix_caching] [--disable_dynamic_split_fuse] [--max_num_batched_tokens MAX_NUM_BATCHED_TOKENS] [--max_num_seqs MAX_NUM_SEQS]
[--cache_size CACHE_SIZE] [--draft_source_model DRAFT_SOURCE_MODEL] [--draft_model_name DRAFT_MODEL_NAME] [--draft_eagle3] [--max_prompt_len MAX_PROMPT_LEN] [--prompt_lookup_decoding] [--reasoning_parser {qwen3,gptoss}]
[--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss}] [--enable_tool_guided_generation]

options:
-h, --help show this help message and exit
--model_repository_path MODEL_REPOSITORY_PATH
@@ -102,6 +83,7 @@ options:
HF model name or path to the local folder with PyTorch or OpenVINO draft model. Using this option will create configuration for speculative decoding
--draft_model_name DRAFT_MODEL_NAME
Draft model name that should be used in the deployment. Equal to draft_source_model if HF model name is used. Available only if draft_source_model has been specified.
--draft_eagle3 Set this flag if you use an EAGLE3 draft model for speculative decoding
--max_prompt_len MAX_PROMPT_LEN
Sets NPU specific property for maximum number of tokens in the prompt. Not effective if target device is not NPU
--prompt_lookup_decoding
10 changes: 8 additions & 2 deletions demos/common/export_models/export_model.py
@@ -49,6 +49,7 @@ def add_common_arguments(parser):
'Using this option will create configuration for speculative decoding', dest='draft_source_model')
parser_text.add_argument('--draft_model_name', required=False, default=None, help='Draft model name that should be used in the deployment. '
'Equal to draft_source_model if HF model name is used. Available only if draft_source_model has been specified.', dest='draft_model_name')
parser_text.add_argument('--draft_eagle3', action='store_true', help='Set this flag if you use an EAGLE3 draft model for speculative decoding', dest='draft_eagle3')
parser_text.add_argument('--max_prompt_len', required=False, type=int, default=None, help='Sets NPU specific property for maximum number of tokens in the prompt. '
'Not effective if target device is not NPU', dest='max_prompt_len')
parser_text.add_argument('--prompt_lookup_decoding', action='store_true', help='Set pipeline to use prompt lookup decoding', dest='prompt_lookup_decoding')
@@ -237,7 +238,8 @@ def add_common_arguments(parser):
device: "{{target_device|default("CPU", true)}}",
{%- if draft_model_dir_name %}
# Speculative decoding configuration
draft_models_path: "./{{draft_model_dir_name}}",{% endif %}
draft_models_path: "./{{draft_model_dir_name}}",
draft_device: "{{target_device|default("CPU", true)}}",{% endif %}
{%- if reasoning_parser %}
reasoning_parser: "{{reasoning_parser}}",{% endif %}
{%- if tool_parser %}
@@ -429,7 +431,11 @@ def export_text_generation_model(model_repository_path, source_model, model_name
else: # assume HF model name or local pytorch model folder
    print("Exporting draft LLM model to ", draft_llm_model_path)
    if not os.path.isdir(draft_llm_model_path) or args['overwrite_models']:
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(draft_source_model, precision, draft_llm_model_path)
        additional_options = ""
        if args["draft_eagle3"]:
            print("Using eagle3 option for the draft model export")
            additional_options += " --eagle3 --task text-generation-with-past"
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {} {}".format(draft_source_model, precision, additional_options, draft_llm_model_path)
Copilot AI Jan 22, 2026

The format string placeholders are misaligned with the arguments. The order should be: draft_source_model, precision, draft_llm_model_path, and additional_options should be inserted before draft_llm_model_path in the command string, not as a separate placeholder.

Suggested change
optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {} {}".format(draft_source_model, precision, additional_options, draft_llm_model_path)
optimum_command = (
    "optimum-cli export openvino "
    f"--model {draft_source_model} "
    f"--weight-format {precision} "
    "--trust-remote-code"
    f"{additional_options} "
    f"{draft_llm_model_path}"
)

        if os.system(optimum_command):
            raise ValueError("Failed to export llm model", source_model)

1 change: 1 addition & 0 deletions demos/common/export_models/requirements.txt
@@ -4,6 +4,7 @@
--pre
optimum-intel@git+https://github.com/huggingface/optimum-intel.git
accelerate
datasets
diffusers # for image generation
einops
nncf@git+https://github.com/openvinotoolkit/nncf.git
152 changes: 150 additions & 2 deletions demos/continuous_batching/speculative_decoding/README.md
@@ -15,6 +15,154 @@ This demo shows how to use speculative decoding in the model serving scenario, b

**Model Server deployment**: Installed Docker Engine or OVMS binary package according to the [baremetal deployment guide](../../../docs/deploying_server_baremetal.md)

# Eagle3
Currently, using [EAGLE3](https://github.com/SafeAILab/EAGLE) requires some specific preparation steps, hence this dedicated section.

## Model considerations

For this demo we picked a pair of models from the [available models](https://github.com/SafeAILab/EAGLE#eagle-3-models-on-hugging-face):
- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) as the main model
- [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3) as the draft model

Both are exported in INT4 precision.

## Model preparation

Python environment setup:
```console
# Install regular requirements for OVMS export script
curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py
pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt

# Override optimum-intel with version supporting eagle3
python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@xufang/add_eagle3_draft_model_conversion

mkdir models
```

Run the `export_model.py` script to download and quantize the models:

```console
python export_model.py text_generation --source_model Qwen/Qwen3-8B --draft_source_model Tengyunw/qwen3_8b_eagle3 --draft_eagle3 --weight-format int4 --config_file_path models/config.json --model_repository_path models
```

The draft model inherits all scheduler properties from the main model.

You should have a model folder like below:
```
models
├── config.json
└── Qwen
└── Qwen3-8B
├── added_tokens.json
├── chat_template.jinja
├── config.json
├── generation_config.json
├── graph.pbtxt
├── merges.txt
├── openvino_config.json
├── openvino_detokenizer.bin
├── openvino_detokenizer.xml
├── openvino_model.bin
├── openvino_model.xml
├── openvino_tokenizer.bin
├── openvino_tokenizer.xml
├── special_tokens_map.json
├── Tengyunw-qwen3_8b_eagle3
│   ├── config.json
│   ├── generation_config.json
│   ├── openvino_model.bin
│   └── openvino_model.xml
├── tokenizer_config.json
├── tokenizer.json
└── vocab.json
```

## Server Deployment

:::{dropdown} **Deploying with Docker**
```bash
docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:weekly --rest_port 8000 --config_path /workspace/config.json
```

Running the above command starts the container without accelerator support.
To deploy on devices other than CPU, change the `target_device` parameter in the `export_model.py` call and follow the [AI accelerators guide](../../../docs/accelerators.md) for the additionally required docker parameters.
:::

:::{dropdown} **Deploying on Bare Metal**

Assuming you have unpacked the model server package, make sure to:

- **On Windows**: run `setupvars` script
- **On Linux**: set `LD_LIBRARY_PATH` and `PATH` environment variables

as mentioned in the [deployment guide](../../../docs/deploying_server_baremetal.md), in every new shell that will start OpenVINO Model Server.

Depending on how you prepared the models in the first step of this demo, they are deployed to either CPU or GPU (this is defined in `config.json`). If you run on GPU, make sure the appropriate drivers are installed so the device is accessible to the model server.

```bat
ovms --rest_port 8000 --config_path ./models/config.json
```
:::
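
Once the server is up, you can send a single request to confirm the deployment responds before running the benchmark. The snippet below is a minimal sketch using the OpenAI Python client against the chat completions endpoint started above; the prompt and generation parameters are only illustrative:

```python
from openai import OpenAI

# The model server exposes an OpenAI-compatible API under the /v3 prefix.
client = OpenAI(base_url="http://localhost:8000/v3", api_key="unused")

stream = client.chat.completions.create(
    model="Qwen/Qwen3-8B",
    messages=[{"role": "user", "content": "Write a quicksort function in Python."}],
    max_tokens=100,
    stream=True,
)
for chunk in stream:
    # Print streamed tokens as they arrive.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```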

## Check performance

Let's check how the deployed model performs by running a benchmark. For that purpose we can use the vLLM benchmark script and the sonnet dataset.

Install vLLM and download the sonnet dataset:
```bash
pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu
curl https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/benchmarks/sonnet.txt -o sonnet.txt
```

Run the benchmark with 100 requests sent sequentially:
```bash
vllm bench serve --dataset-name sonnet --dataset-path sonnet.txt --backend openai-chat --host localhost --port 8000 --endpoint /v3/chat/completions --max-concurrency 1 --tokenizer Qwen/Qwen3-8B --model Qwen/Qwen3-8B --num_prompts 100

Starting initial single prompt test run...
Skipping endpoint ready check.
Starting main benchmark run...
Traffic request rate: inf
Burstiness factor: 1.0 (Poisson process)
Maximum request concurrency: 1
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [06:59<00:00, 4.19s/it]
tip: install termplotlib and gnuplot to plot the metrics
============ Serving Benchmark Result ============
Successful requests: 100
Failed requests: 0
Maximum request concurrency: 1
Benchmark duration (s): 419.00
Total input tokens: 54256
Total generated tokens: 15000
Request throughput (req/s): 0.24
Output token throughput (tok/s): 35.80
Peak output token throughput (tok/s): 16.00
Peak concurrent requests: 2.00
Total token throughput (tok/s): 165.29
---------------Time to First Token----------------
Mean TTFT (ms): 426.71
Median TTFT (ms): 424.97
P99 TTFT (ms): 635.37
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 25.25
Median TPOT (ms): 25.09
P99 TPOT (ms): 29.22
---------------Inter-token Latency----------------
Mean ITL (ms): 66.29
Median ITL (ms): 66.75
P99 ITL (ms): 72.11
==================================================
```

## Limitations

Eagle3 deployments currently have the following known limitations:
- stateful mode (pipeline_type: LM) is not supported,
- concurrency is not supported (max 1 request is processed at a time),
- prefix caching is not supported.

# Classic Models

## Model considerations

From the functional perspective both main and draft models must use the same tokenizer, so the tokens from the draft model are correctly matched in the main model.
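
As a quick sanity check, you can compare how the two tokenizers split a sample prompt before exporting. This is only an illustrative sketch; the model IDs below are examples (gated models may require a Hugging Face token), so substitute the pair you actually plan to deploy:

```python
from transformers import AutoTokenizer

# Example model IDs; replace with your own main/draft pair.
main_tok = AutoTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
draft_tok = AutoTokenizer.from_pretrained("amd/AMD-Llama-135m")

sample = "def quicksort(numbers):"
main_ids = main_tok(sample)["input_ids"]
draft_ids = draft_tok(sample)["input_ids"]
print("main :", main_ids)
print("draft:", draft_ids)
print("tokenizations match:", main_ids == draft_ids)
```
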
@@ -51,7 +199,7 @@ python export_model.py text_generation --source_model meta-llama/CodeLlama-7b-hf
Draft model inherits all scheduler properties from the main model.

You should have a model folder like below:
```console
```
models
├── config.json
└── meta-llama
@@ -157,7 +305,7 @@ stream = client.completions.create(
prompt="<s>def quicksort(numbers):",
temperature=0,
max_tokens=100,
extra_body={"num_assistant_tokens": 5}
extra_body={"num_assistant_tokens": 5},
stream=True,
)
for chunk in stream: