From c21492ca40771be3b4f5b881105fb76461dbc52e Mon Sep 17 00:00:00 2001
From: sdelannoypavy <solene.delannoypavy_externe@rte-france.com>
Date: Fri, 29 May 2026 22:18:35 +0200
Subject: [PATCH 1/9] add mirror descent

---
 .../MirrorDescent/mirror_descent.jl           | 139 ++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 src/algorithms/MirrorDescent/mirror_descent.jl

diff --git a/src/algorithms/MirrorDescent/mirror_descent.jl b/src/algorithms/MirrorDescent/mirror_descent.jl
new file mode 100644
index 0000000..c167b7c
--- /dev/null
+++ b/src/algorithms/MirrorDescent/mirror_descent.jl
@@ -0,0 +1,139 @@
+"""
+$TYPEDEF
+
+Mirror Descent algorithm for learning coordinated solutions.
+
+This algorithm is designed for stochastic benchmarks.
+
+Reference: <https://arxiv.org/abs/2505.04757>
+
+# Fields
+$TYPEDFIELDS
+"""
+@kwdef struct MirrorDescent{A} <: AbstractImitationAlgorithm
+    "inner imitation algorithm for supervised learning"
+    inner_algorithm::A = PerturbedFenchelYoungLossImitation()
+end
+
+"""
+$TYPEDSIGNATURES
+Generate a dataset for the provided benchmark and train a DFLPolicy using the Mirror Descent algorithm.
+
+# Core training method
+"""
+
+
+function train_policy(
+    algorithm::MirrorDescent,
+    benchmark::ExogenousStochasticBenchmark;
+    dataset_size=30,
+    epochs=10,
+    iterations=10,
+    κ = 1.0,
+    metrics::Tuple=(),
+    seed=nothing,
+)
+
+    train_dataset = generate_dataset(benchmark, dataset_size; seed=seed)
+
+    # Initialize model and create policy
+    model = generate_statistical_model(benchmark; seed=seed)
+    maximizer = generate_maximizer(benchmark)
+    policy = DFLPolicy(model, maximizer)
+
+    # vector because we store one history per iteration
+    histories_per_iteration = MVHistory[]
+
+    anticipative_solver = generate_anticipative_solver(benchmark;) 
+    parametric_anticipative_solver = generate_parametric_anticipative_solver(benchmark;) 
+
+    # perturb = true correspond to "real" iterations of mirror descent
+    # we compute solutions with the penalized anticipative solver  + perturbation
+
+    # perturb = false corresponds to imitation learning:
+    # we use the anticipative solver without perturbation,
+    # which is useful to start with one iteration of pure imitation learning
+    perturb = false
+
+    # Train policy
+    for n_it in 1:iterations
+        println("Iteration $n_it / $iterations")
+
+        if n_it > 1
+            perturb = true
+        end
+
+
+        # Generate anticipative solutions as training data
+        augmented_dataset = augment_dataset(
+            algorithm.inner_algorithm, benchmark, train_dataset, model, maximizer, anticipative_solver, parametric_anticipative_solver;
+            κ = κ, perturb = perturb
+        )
+
+
+        # Train policy on augmented dataset
+        history = train_policy!(
+            algorithm.inner_algorithm,
+            policy,
+            augmented_dataset;
+            epochs = epochs,
+            metrics = metrics,
+            maximizer_kwargs=sample -> sample.context,
+        )
+
+        push!(histories_per_iteration, history)
+    end
+
+    return histories_per_iteration, policy
+end
+
+
+function augment_dataset(
+    algorithm::PerturbedFenchelYoungLossImitation,
+    bench::ExogenousStochasticBenchmark,
+    train_dataset::AbstractArray,
+    model,
+    maximizer,
+    anticipative_solver,
+    parametric_anticipative_solver;
+    κ = 1.0,
+    perturb = false
+)
+
+    (; nb_samples, ε, threaded, training_optimizer, seed) = algorithm
+
+    augmented_dataset = Vector{DataSample}()
+
+    if perturb
+        perturbed_maximizer = PerturbedAdditive(
+            parametric_anticipative_solver; ε=κ*ε, nb_samples=nb_samples
+        )
+    end
+
+
+    for sample in train_dataset
+
+        θ = model(sample.x)
+
+        if perturb
+            if is_minimization_problem(bench)
+                y = perturbed_maximizer(-κ*θ; scenario = sample.scenario, context = sample) 
+            else
+                y = perturbed_maximizer(κ*θ; scenario = sample.scenario, context = sample)
+            end
+        else
+            y = anticipative_solver(sample.scenario; context = sample)
+        end
+
+        augmented_datasample = DataSample(;
+            x = sample.x,
+            y,
+            instance = sample.context,
+            extra = sample.extra
+        )
+
+        push!(augmented_dataset, augmented_datasample)
+    end
+
+    return augmented_dataset
+end
\ No newline at end of file

From 2c13ca07fb8cb3bf49d3fcd5fe063b069591e5d2 Mon Sep 17 00:00:00 2001
From: sdelannoypavy <solene.delannoypavy_externe@rte-france.com>
Date: Fri, 29 May 2026 22:49:13 +0200
Subject: [PATCH 2/9] corrected bug

---
 src/DecisionFocusedLearningAlgorithms.jl |  3 ++-
 test.jl                                  | 26 ++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 test.jl

diff --git a/src/DecisionFocusedLearningAlgorithms.jl b/src/DecisionFocusedLearningAlgorithms.jl
index d7a6250..19fdf70 100644
--- a/src/DecisionFocusedLearningAlgorithms.jl
+++ b/src/DecisionFocusedLearningAlgorithms.jl
@@ -25,6 +25,7 @@ include("algorithms/abstract_algorithm.jl")
 include("algorithms/supervised/fyl.jl")
 include("algorithms/supervised/anticipative_imitation.jl")
 include("algorithms/supervised/dagger.jl")
+include("algorithms/MirrorDescent/mirror_descent.jl")
 
 export TrainingContext
 
@@ -41,7 +42,7 @@ export AbstractMetric,
 
 export AbstractAlgorithm, AbstractImitationAlgorithm
 export PerturbedFenchelYoungLossImitation,
-    DAgger, AnticipativeImitation, train_policy!, train_policy
+    DAgger, AnticipativeImitation, train_policy!, train_policy, MirrorDescent
 export AbstractPolicy, DFLPolicy
 
 end
diff --git a/test.jl b/test.jl
new file mode 100644
index 0000000..dadf851
--- /dev/null
+++ b/test.jl
@@ -0,0 +1,26 @@
+# To be used to visualize loss across iterations
+
+using DecisionFocusedLearningAlgorithms
+using DecisionFocusedLearningBenchmarks
+
+benchmark = ContextualStochasticArgmaxBenchmark()
+
+anticipative_solver = generate_anticipative_solver(benchmark)
+algorithm = DecisionFocusedLearningAlgorithms.MirrorDescent()
+
+κ = 0.1
+train_dataset_size = 5
+nb_epochs          = 2
+nb_iterations      = 2
+seed               = 3
+
+histories_r, _ = DecisionFocusedLearningAlgorithms.train_policy(
+    algorithm, benchmark;
+    dataset_size = train_dataset_size,
+    epochs       = nb_epochs,
+    iterations   = nb_iterations,
+    seed         = seed,
+    κ            = κ,
+)
+
+

From 7635f1fdeac040e3e595e30e2b42592e7c248410 Mon Sep 17 00:00:00 2001
From: sdelannoypavy <solene.delannoypavy_externe@rte-france.com>
Date: Fri, 29 May 2026 23:01:38 +0200
Subject: [PATCH 3/9] corrected bug

---
 src/algorithms/MirrorDescent/mirror_descent.jl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/algorithms/MirrorDescent/mirror_descent.jl b/src/algorithms/MirrorDescent/mirror_descent.jl
index c167b7c..2f282e2 100644
--- a/src/algorithms/MirrorDescent/mirror_descent.jl
+++ b/src/algorithms/MirrorDescent/mirror_descent.jl
@@ -66,7 +66,7 @@ function train_policy(
 
         # Generate anticipative solutions as training data
         augmented_dataset = augment_dataset(
-            algorithm.inner_algorithm, benchmark, train_dataset, model, maximizer, anticipative_solver, parametric_anticipative_solver;
+            algorithm.inner_algorithm, benchmark, train_dataset, model, anticipative_solver, parametric_anticipative_solver;
             κ = κ, perturb = perturb
         )
 
@@ -90,10 +90,9 @@ end
 
 function augment_dataset(
     algorithm::PerturbedFenchelYoungLossImitation,
-    bench::ExogenousStochasticBenchmark,
+    bench::AbstractStochasticBenchmark,
     train_dataset::AbstractArray,
     model,
-    maximizer,
     anticipative_solver,
     parametric_anticipative_solver;
     κ = 1.0,

From d9ad94d361dbedf06e111c083a4a2e0a9d45d71e Mon Sep 17 00:00:00 2001
From: sdelannoypavy <solene.delannoypavy_externe@rte-france.com>
Date: Fri, 29 May 2026 23:11:14 +0200
Subject: [PATCH 4/9] Delete test.jl

---
 test.jl | 26 --------------------------
 1 file changed, 26 deletions(-)
 delete mode 100644 test.jl

diff --git a/test.jl b/test.jl
deleted file mode 100644
index dadf851..0000000
--- a/test.jl
+++ /dev/null
@@ -1,26 +0,0 @@
-# To be used to visualize loss across iterations
-
-using DecisionFocusedLearningAlgorithms
-using DecisionFocusedLearningBenchmarks
-
-benchmark = ContextualStochasticArgmaxBenchmark()
-
-anticipative_solver = generate_anticipative_solver(benchmark)
-algorithm = DecisionFocusedLearningAlgorithms.MirrorDescent()
-
-κ = 0.1
-train_dataset_size = 5
-nb_epochs          = 2
-nb_iterations      = 2
-seed               = 3
-
-histories_r, _ = DecisionFocusedLearningAlgorithms.train_policy(
-    algorithm, benchmark;
-    dataset_size = train_dataset_size,
-    epochs       = nb_epochs,
-    iterations   = nb_iterations,
-    seed         = seed,
-    κ            = κ,
-)
-
-

From 0f7347bc6a686db3094d6eb1b93acc87bb4092c7 Mon Sep 17 00:00:00 2001
From: sdelannoypavy <solene.delannoypavy_externe@rte-france.com>
Date: Mon, 8 Jun 2026 11:47:00 +0200
Subject: [PATCH 5/9] Fix review comments

---
 Project.toml                                  |  14 +-
 .../mirror_descent/mirror_descent.jl          | 196 ++++++++++++++++++
 test/mirror_descent.jl                        |  95 +++++++++
 3 files changed, 301 insertions(+), 4 deletions(-)
 create mode 100644 src/algorithms/mirror_descent/mirror_descent.jl
 create mode 100644 test/mirror_descent.jl

diff --git a/Project.toml b/Project.toml
index a4b06c5..2418b61 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,17 +1,17 @@
 name = "DecisionFocusedLearningAlgorithms"
 uuid = "46d52364-bc3b-4fac-a992-eb1d3ef2de15"
-version = "0.2.0"
 authors = ["Members of JuliaDecisionFocusedLearning and contributors"]
-
-[workspace]
-projects = ["docs", "test"]
+version = "0.2.0"
 
 [deps]
 DecisionFocusedLearningBenchmarks = "2fbe496a-299b-4c81-bab5-c44dfc55cf20"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 InferOpt = "4846b161-c94e-4150-8dac-c7ae193c601f"
+Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -21,12 +21,18 @@ ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7"
 [compat]
 DecisionFocusedLearningBenchmarks = "0.5.0, 0.6"
 DocStringExtensions = "0.9.5"
+Documenter = "1.17.0"
 Flux = "0.16.9"
 InferOpt = "0.7.1"
+Literate = "2.21.0"
 MLUtils = "0.4.8"
+Plots = "1.41.6"
 ProgressMeter = "1.11.0"
 Random = "1.11.0"
 Statistics = "1.11.1"
 UnicodePlots = "3.8.2"
 ValueHistories = "0.5.6"
 julia = "1.11"
+
+[workspace]
+projects = ["docs", "test"]
diff --git a/src/algorithms/mirror_descent/mirror_descent.jl b/src/algorithms/mirror_descent/mirror_descent.jl
new file mode 100644
index 0000000..b0847cb
--- /dev/null
+++ b/src/algorithms/mirror_descent/mirror_descent.jl
@@ -0,0 +1,196 @@
+"""
+$TYPEDEF
+
+Mirror Descent algorithm for learning coordinated solutions.
+
+This algorithm is designed for stochastic benchmarks.
+
+Reference: <https://arxiv.org/abs/2505.04757>
+
+# Fields
+$TYPEDFIELDS
+"""
+@kwdef struct MirrorDescent{A<:PerturbedFenchelYoungLossImitation} <: AbstractAlgorithm
+    "inner imitation algorithm for supervised learning"
+    inner_algorithm::A = PerturbedFenchelYoungLossImitation()
+end
+
+"""
+$TYPEDSIGNATURES
+
+Train a DFLPolicy using the Mirror Descent algorithm on a provided training dataset.
+
+# Core training method
+
+# Arguments
+- `epochs`: number of training epochs per iteration
+- `iterations`: number of mirror descent iterations
+- `κ`: scaling factor applied to the model output `θ` (negated for minimization problems) before it is passed to the perturbed anticipative solver
+- `metrics`: tuple of metrics to track during training
+- `verbose`: if true, prints progress at each iteration
+- `imitation_start`: if true, the first iteration uses pure imitation learning (no perturbation)
+
+"""
+function train_policy!(
+    benchmark::ExogenousStochasticBenchmark,
+    algorithm::MirrorDescent,
+    policy::DFLPolicy,
+    train_dataset,
+    anticipative_solver,
+    perturbed_anticipative_solver;
+    epochs=10,
+    iterations=10,
+    κ=1.0,
+    metrics::Tuple=(),
+    verbose::Bool=false,
+    imitation_start::Bool=true
+)
+
+    augmented_dataset = train_dataset
+    return map(1:iterations) do n_it
+        if verbose
+            println("Iteration $n_it / $iterations")
+        end
+
+        perturb = n_it > 1 || !imitation_start
+
+        augmented_dataset = augment_dataset(
+            benchmark, augmented_dataset, policy.statistical_model, anticipative_solver, perturbed_anticipative_solver;
+            κ=κ, perturb=perturb
+        )
+
+        train_policy!(
+            algorithm.inner_algorithm,
+            policy,
+            augmented_dataset;
+            epochs=epochs,
+            metrics=metrics,
+            maximizer_kwargs=sample -> sample.context,
+        )
+    end
+end
+
+"""
+$TYPEDSIGNATURES
+
+Generate a dataset for the provided benchmark and train a DFLPolicy using the Mirror Descent algorithm.
+
+# Benchmark convenience wrapper
+
+This high-level function handles all setup from the benchmark and returns a trained policy.
+
+# Arguments
+- `dataset_size`: number of samples in the training dataset
+- `epochs`: number of training epochs per iteration
+- `iterations`: number of mirror descent iterations
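+- `κ`: scaling factor applied both to the perturbation magnitude (`ε` becomes `κ * ε`) and to the model output `θ` passed to the perturbed anticipative solver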
+- `metrics`: tuple of metrics to track during training
+- `seed`: random seed for reproducibility
+- `verbose`: if true, prints progress at each iteration
+- `imitation_start`: if true, the first iteration uses pure imitation learning (no perturbation)
+- `model_kwargs`: additional keyword arguments passed to `generate_statistical_model`
+- `maximizer_kwargs`: additional keyword arguments passed to `generate_maximizer`
+- `solver_kwargs`: additional keyword arguments passed to `generate_anticipative_solver` and `generate_parametric_anticipative_solver`
+- `nb_scenarios`: number of scenarios per instance. 
+- `context_per_instance`: number of contexts per instance. 
+"""
+
+
+
+function train_policy(
+    algorithm::MirrorDescent,
+    benchmark::ExogenousStochasticBenchmark;
+    dataset_size=30,
+    epochs=10,
+    iterations=10,
+    κ=1.0,
+    metrics::Tuple=(),
+    seed=nothing,
+    verbose::Bool=false,
+    imitation_start::Bool=true,
+    model_kwargs=(;),
+    maximizer_kwargs=(;),
+    solver_kwargs=(;),
+    nb_scenarios = 1,
+    context_per_instance = 1,
+)
+    train_dataset = generate_dataset(benchmark, dataset_size; nb_scenarios=nb_scenarios, contexts_per_instance=context_per_instance, seed=seed)
+
+    model = generate_statistical_model(benchmark; seed=seed, model_kwargs...)
+    maximizer = generate_maximizer(benchmark; maximizer_kwargs...)
+    policy = DFLPolicy(model, maximizer)
+
+    anticipative_solver = generate_anticipative_solver(benchmark; solver_kwargs...)
+    parametric_anticipative_solver = generate_parametric_anticipative_solver(benchmark; solver_kwargs...)
+    (; nb_samples, ε, threaded, seed) = algorithm.inner_algorithm
+    perturbed_anticipative_solver = PerturbedAdditive((θ; scenario, kwargs...) -> parametric_anticipative_solver(θ, scenario; kwargs...); ε=κ*ε, nb_samples=nb_samples, seed=seed, threaded=threaded)
+
+
+    histories_per_iteration = train_policy!(
+        benchmark, algorithm, policy, train_dataset, anticipative_solver, perturbed_anticipative_solver;
+        epochs=epochs, iterations=iterations, κ=κ, metrics=metrics, verbose=verbose, imitation_start=imitation_start
+    )
+
+    return histories_per_iteration, policy
+end
+
+function augment_dataset(
+    bench::ExogenousStochasticBenchmark,
+    train_dataset::AbstractArray,
+    model,
+    anticipative_solver,
+    perturbed_anticipative_solver;
+    κ=1.0,
+    perturb=false
+)
+    return _augment_dataset(
+        Val(fieldtype(eltype(train_dataset), :y) !== Nothing),
+        bench, train_dataset, model, anticipative_solver, perturbed_anticipative_solver;
+        κ=κ, perturb=perturb
+    )
+end
+
+# Raw dataset (samples have no y) → create new DataSamples
+function _augment_dataset(
+    ::Val{false},
+    bench, train_dataset, model, anticipative_solver, perturbed_anticipative_solver;
+    κ=1.0, perturb=false
+)
+    return map(train_dataset) do sample
+        θ = model(sample.x)
+        if perturb
+            if is_minimization_problem(bench)
+                y = perturbed_anticipative_solver(-κ*θ; scenario=sample.scenario, sample.context...)
+            else
+                y = perturbed_anticipative_solver(κ*θ; scenario=sample.scenario, sample.context...)
+            end
+        else
+            y = anticipative_solver(sample.scenario; sample.context...)
+        end
+        DataSample(sample; y=y)
+    end
+end
+
+# Augmented dataset (samples already have y) → update y in place
+function _augment_dataset(
+    ::Val{true},
+    bench, train_dataset, model, anticipative_solver, perturbed_anticipative_solver;
+    κ=1.0, perturb=false
+)
+    for (i, sample) in enumerate(train_dataset)
+        θ = model(sample.x)
+        if perturb
+            if is_minimization_problem(bench)
+                y = perturbed_anticipative_solver(-κ*θ; scenario=sample.scenario, sample.context...)
+            else
+                y = perturbed_anticipative_solver(κ*θ; scenario=sample.scenario, sample.context...)
+            end
+        else
+            y = anticipative_solver(sample.scenario; sample.context...)
+        end
+        ET = eltype(sample.y)
+        y_converted = convert(typeof(sample.y), ET <: Integer ? round.(ET, y) : y)
+        train_dataset[i] = DataSample(sample; y=y_converted)
+    end
+    return train_dataset
+end
\ No newline at end of file
diff --git a/test/mirror_descent.jl b/test/mirror_descent.jl
new file mode 100644
index 0000000..0a42cc3
--- /dev/null
+++ b/test/mirror_descent.jl
@@ -0,0 +1,95 @@
+using DecisionFocusedLearningAlgorithms
+using DecisionFocusedLearningBenchmarks
+using Test
+using ValueHistories
+using Statistics: mean
+
+@testset "MirrorDescent Training" begin
+
+    @testset "MirrorDescent - ContextualStochasticArgmax basic" begin
+        benchmark = ContextualStochasticArgmaxBenchmark()
+        algorithm = MirrorDescent()
+
+        histories, policy = train_policy(
+            algorithm, benchmark;
+            dataset_size=5, epochs=2, iterations=2, seed=0
+        )
+
+        @test histories isa Vector
+        @test length(histories) == 2
+        @test all(h isa MVHistory for h in histories)
+        @test all(haskey(h, :training_loss) for h in histories)
+        @test policy isa DFLPolicy
+    end
+
+    @testset "MirrorDescent - StochasticVehicleScheduling basic" begin
+        benchmark = StochasticVehicleSchedulingBenchmark()
+        algorithm = MirrorDescent()
+
+        histories, policy = train_policy(
+            algorithm, benchmark;
+            dataset_size=1, epochs=2, iterations=2, seed=0
+        )
+
+        @test histories isa Vector
+        @test length(histories) == 2
+        @test all(h isa MVHistory for h in histories)
+        @test all(haskey(h, :training_loss) for h in histories)
+        @test policy isa DFLPolicy
+    end
+
+    @testset "MirrorDescent - imitation_start=false" begin
+        benchmark = ContextualStochasticArgmaxBenchmark()
+        algorithm = MirrorDescent()
+
+        histories, policy = train_policy(
+            algorithm, benchmark;
+            dataset_size=5, epochs=2, iterations=2, seed=0, imitation_start=false
+        )
+
+        @test histories isa Vector
+        @test length(histories) == 2
+        @test policy isa DFLPolicy
+    end
+
+    @testset "MirrorDescent - performance improves over iterations" begin
+        benchmark = ContextualStochasticArgmaxBenchmark()
+        algorithm = MirrorDescent()
+
+        val_dataset = generate_dataset(benchmark, 100; seed=99)
+
+        val_metric = FunctionMetric(:val_obj, val_dataset) do ctx, data
+            vals = map(data) do s
+                θ = ctx.policy.statistical_model(s.x)
+                y = ctx.policy.maximizer(θ; s.context...)
+                Float64(DecisionFocusedLearningBenchmarks.objective_value(benchmark, s, y))
+            end
+            (val_obj = mean(vals),)
+        end
+
+        histories, policy = train_policy(
+            algorithm, benchmark;
+            dataset_size=20, epochs=3, iterations=5, seed=0, metrics=(val_metric,)
+        )
+
+        val_objs = [get(histories[i], :val_obj)[2][end] for i in 1:5]
+
+        # Performance should improve at each iteration
+        @test (val_objs[4] > val_objs[1])
+    end
+
+    @testset "MirrorDescent - with metrics" begin
+        benchmark = ContextualStochasticArgmaxBenchmark()
+        algorithm = MirrorDescent()
+
+        metrics = (FunctionMetric(ctx -> ctx.epoch, :epoch),)
+
+        histories, policy = train_policy(
+            algorithm, benchmark;
+            dataset_size=5, epochs=2, iterations=2, seed=0, metrics=metrics
+        )
+
+        @test all(haskey(h, :epoch) for h in histories)
+    end
+
+end

From 5da6cc99525bec91c0f3a20a75ed513d7e9aa328 Mon Sep 17 00:00:00 2001
From: BatyLeo <leo.baty67@gmail.com>
Date: Fri, 19 Jun 2026 17:57:42 +0200
Subject: [PATCH 6/9] style: formatting + cleanup useless dependencies

---
 Project.toml                                  |  12 +--
 .../MirrorDescent/mirror_descent.jl           |  47 ++++----
 .../mirror_descent/mirror_descent.jl          | 100 +++++++++++++-----
 test/mirror_descent.jl                        |  37 ++++---
 4 files changed, 123 insertions(+), 73 deletions(-)

diff --git a/Project.toml b/Project.toml
index 2418b61..1a71616 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,15 +1,16 @@
 name = "DecisionFocusedLearningAlgorithms"
 uuid = "46d52364-bc3b-4fac-a992-eb1d3ef2de15"
-authors = ["Members of JuliaDecisionFocusedLearning and contributors"]
 version = "0.2.0"
+authors = ["Members of JuliaDecisionFocusedLearning and contributors"]
+
+[workspace]
+projects = ["docs", "test"]
 
 [deps]
 DecisionFocusedLearningBenchmarks = "2fbe496a-299b-4c81-bab5-c44dfc55cf20"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
-Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 InferOpt = "4846b161-c94e-4150-8dac-c7ae193c601f"
-Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
@@ -21,10 +22,8 @@ ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7"
 [compat]
 DecisionFocusedLearningBenchmarks = "0.5.0, 0.6"
 DocStringExtensions = "0.9.5"
-Documenter = "1.17.0"
 Flux = "0.16.9"
 InferOpt = "0.7.1"
-Literate = "2.21.0"
 MLUtils = "0.4.8"
 Plots = "1.41.6"
 ProgressMeter = "1.11.0"
@@ -33,6 +32,3 @@ Statistics = "1.11.1"
 UnicodePlots = "3.8.2"
 ValueHistories = "0.5.6"
 julia = "1.11"
-
-[workspace]
-projects = ["docs", "test"]
diff --git a/src/algorithms/MirrorDescent/mirror_descent.jl b/src/algorithms/MirrorDescent/mirror_descent.jl
index 2f282e2..3f5ee0c 100644
--- a/src/algorithms/MirrorDescent/mirror_descent.jl
+++ b/src/algorithms/MirrorDescent/mirror_descent.jl
@@ -22,18 +22,16 @@ Generate a dataset for the provided benchmark and train a DFLPolicy using the Mi
 # Core training method
 """
 
-
 function train_policy(
     algorithm::MirrorDescent,
     benchmark::ExogenousStochasticBenchmark;
     dataset_size=30,
     epochs=10,
     iterations=10,
-    κ = 1.0,
+    κ=1.0,
     metrics::Tuple=(),
     seed=nothing,
 )
-
     train_dataset = generate_dataset(benchmark, dataset_size; seed=seed)
 
     # Initialize model and create policy
@@ -44,8 +42,8 @@ function train_policy(
     # vector because we store one history per iteration
     histories_per_iteration = MVHistory[]
 
-    anticipative_solver = generate_anticipative_solver(benchmark;) 
-    parametric_anticipative_solver = generate_parametric_anticipative_solver(benchmark;) 
+    anticipative_solver = generate_anticipative_solver(benchmark;)
+    parametric_anticipative_solver = generate_parametric_anticipative_solver(benchmark;)
 
     # perturb = true correspond to "real" iterations of mirror descent
     # we compute solutions with the penalized anticipative solver  + perturbation
@@ -63,21 +61,25 @@ function train_policy(
             perturb = true
         end
 
-
         # Generate anticipative solutions as training data
         augmented_dataset = augment_dataset(
-            algorithm.inner_algorithm, benchmark, train_dataset, model, anticipative_solver, parametric_anticipative_solver;
-            κ = κ, perturb = perturb
+            algorithm.inner_algorithm,
+            benchmark,
+            train_dataset,
+            model,
+            anticipative_solver,
+            parametric_anticipative_solver;
+            κ=κ,
+            perturb=perturb,
         )
 
-
         # Train policy on augmented dataset
         history = train_policy!(
             algorithm.inner_algorithm,
             policy,
             augmented_dataset;
-            epochs = epochs,
-            metrics = metrics,
+            epochs=epochs,
+            metrics=metrics,
             maximizer_kwargs=sample -> sample.context,
         )
 
@@ -87,7 +89,6 @@ function train_policy(
     return histories_per_iteration, policy
 end
 
-
 function augment_dataset(
     algorithm::PerturbedFenchelYoungLossImitation,
     bench::AbstractStochasticBenchmark,
@@ -95,44 +96,38 @@ function augment_dataset(
     model,
     anticipative_solver,
     parametric_anticipative_solver;
-    κ = 1.0,
-    perturb = false
+    κ=1.0,
+    perturb=false,
 )
-
     (; nb_samples, ε, threaded, training_optimizer, seed) = algorithm
 
     augmented_dataset = Vector{DataSample}()
 
     if perturb
         perturbed_maximizer = PerturbedAdditive(
-            parametric_anticipative_solver; ε=κ*ε, nb_samples=nb_samples
+            parametric_anticipative_solver; ε=κ * ε, nb_samples=nb_samples
         )
     end
 
-
     for sample in train_dataset
-
         θ = model(sample.x)
 
         if perturb
             if is_minimization_problem(bench)
-                y = perturbed_maximizer(-κ*θ; scenario = sample.scenario, context = sample) 
+                y = perturbed_maximizer(-κ * θ; scenario=sample.scenario, context=sample)
             else
-                y = perturbed_maximizer(κ*θ; scenario = sample.scenario, context = sample)
+                y = perturbed_maximizer(κ * θ; scenario=sample.scenario, context=sample)
             end
         else
-            y = anticipative_solver(sample.scenario; context = sample)
+            y = anticipative_solver(sample.scenario; context=sample)
         end
 
         augmented_datasample = DataSample(;
-            x = sample.x,
-            y,
-            instance = sample.context,
-            extra = sample.extra
+            x=sample.x, y, instance=sample.context, extra=sample.extra
         )
 
         push!(augmented_dataset, augmented_datasample)
     end
 
     return augmented_dataset
-end
\ No newline at end of file
+end
diff --git a/src/algorithms/mirror_descent/mirror_descent.jl b/src/algorithms/mirror_descent/mirror_descent.jl
index b0847cb..d773c85 100644
--- a/src/algorithms/mirror_descent/mirror_descent.jl
+++ b/src/algorithms/mirror_descent/mirror_descent.jl
@@ -43,9 +43,8 @@ function train_policy!(
     κ=1.0,
     metrics::Tuple=(),
     verbose::Bool=false,
-    imitation_start::Bool=true
+    imitation_start::Bool=true,
 )
-
     augmented_dataset = train_dataset
     return map(1:iterations) do n_it
         if verbose
@@ -55,8 +54,13 @@ function train_policy!(
         perturb = n_it > 1 || !imitation_start
 
         augmented_dataset = augment_dataset(
-            benchmark, augmented_dataset, policy.statistical_model, anticipative_solver, perturbed_anticipative_solver;
-            κ=κ, perturb=perturb
+            benchmark,
+            augmented_dataset,
+            policy.statistical_model,
+            anticipative_solver,
+            perturbed_anticipative_solver;
+            κ=κ,
+            perturb=perturb,
         )
 
         train_policy!(
@@ -95,8 +99,6 @@ This high-level function handles all setup from the benchmark and returns a trai
 - `context_per_instance`: number of contexts per instance. 
 """
 
-
-
 function train_policy(
     algorithm::MirrorDescent,
     benchmark::ExogenousStochasticBenchmark;
@@ -111,24 +113,47 @@ function train_policy(
     model_kwargs=(;),
     maximizer_kwargs=(;),
     solver_kwargs=(;),
-    nb_scenarios = 1,
-    context_per_instance = 1,
+    nb_scenarios=1,
+    context_per_instance=1,
 )
-    train_dataset = generate_dataset(benchmark, dataset_size; nb_scenarios=nb_scenarios, contexts_per_instance=context_per_instance, seed=seed)
+    train_dataset = generate_dataset(
+        benchmark,
+        dataset_size;
+        nb_scenarios=nb_scenarios,
+        contexts_per_instance=context_per_instance,
+        seed=seed,
+    )
 
     model = generate_statistical_model(benchmark; seed=seed, model_kwargs...)
     maximizer = generate_maximizer(benchmark; maximizer_kwargs...)
     policy = DFLPolicy(model, maximizer)
 
     anticipative_solver = generate_anticipative_solver(benchmark; solver_kwargs...)
-    parametric_anticipative_solver = generate_parametric_anticipative_solver(benchmark; solver_kwargs...)
+    parametric_anticipative_solver = generate_parametric_anticipative_solver(
+        benchmark; solver_kwargs...
+    )
     (; nb_samples, ε, threaded, seed) = algorithm.inner_algorithm
-    perturbed_anticipative_solver = PerturbedAdditive((θ; scenario, kwargs...) -> parametric_anticipative_solver(θ, scenario; kwargs...); ε=κ*ε, nb_samples=nb_samples, seed=seed, threaded=threaded)
-
+    perturbed_anticipative_solver = PerturbedAdditive(
+        (θ; scenario, kwargs...) -> parametric_anticipative_solver(θ, scenario; kwargs...);
+        ε=κ * ε,
+        nb_samples=nb_samples,
+        seed=seed,
+        threaded=threaded,
+    )
 
     histories_per_iteration = train_policy!(
-        benchmark, algorithm, policy, train_dataset, anticipative_solver, perturbed_anticipative_solver;
-        epochs=epochs, iterations=iterations, κ=κ, metrics=metrics, verbose=verbose, imitation_start=imitation_start
+        benchmark,
+        algorithm,
+        policy,
+        train_dataset,
+        anticipative_solver,
+        perturbed_anticipative_solver;
+        epochs=epochs,
+        iterations=iterations,
+        κ=κ,
+        metrics=metrics,
+        verbose=verbose,
+        imitation_start=imitation_start,
     )
 
     return histories_per_iteration, policy
@@ -141,28 +166,42 @@ function augment_dataset(
     anticipative_solver,
     perturbed_anticipative_solver;
     κ=1.0,
-    perturb=false
+    perturb=false,
 )
     return _augment_dataset(
         Val(fieldtype(eltype(train_dataset), :y) !== Nothing),
-        bench, train_dataset, model, anticipative_solver, perturbed_anticipative_solver;
-        κ=κ, perturb=perturb
+        bench,
+        train_dataset,
+        model,
+        anticipative_solver,
+        perturbed_anticipative_solver;
+        κ=κ,
+        perturb=perturb,
     )
 end
 
 # Raw dataset (samples have no y) → create new DataSamples
 function _augment_dataset(
     ::Val{false},
-    bench, train_dataset, model, anticipative_solver, perturbed_anticipative_solver;
-    κ=1.0, perturb=false
+    bench,
+    train_dataset,
+    model,
+    anticipative_solver,
+    perturbed_anticipative_solver;
+    κ=1.0,
+    perturb=false,
 )
     return map(train_dataset) do sample
         θ = model(sample.x)
         if perturb
             if is_minimization_problem(bench)
-                y = perturbed_anticipative_solver(-κ*θ; scenario=sample.scenario, sample.context...)
+                y = perturbed_anticipative_solver(
+                    -κ * θ; scenario=sample.scenario, sample.context...
+                )
             else
-                y = perturbed_anticipative_solver(κ*θ; scenario=sample.scenario, sample.context...)
+                y = perturbed_anticipative_solver(
+                    κ * θ; scenario=sample.scenario, sample.context...
+                )
             end
         else
             y = anticipative_solver(sample.scenario; sample.context...)
@@ -174,16 +213,25 @@ end
 # Augmented dataset (samples already have y) → update y in place
 function _augment_dataset(
     ::Val{true},
-    bench, train_dataset, model, anticipative_solver, perturbed_anticipative_solver;
-    κ=1.0, perturb=false
+    bench,
+    train_dataset,
+    model,
+    anticipative_solver,
+    perturbed_anticipative_solver;
+    κ=1.0,
+    perturb=false,
 )
     for (i, sample) in enumerate(train_dataset)
         θ = model(sample.x)
         if perturb
             if is_minimization_problem(bench)
-                y = perturbed_anticipative_solver(-κ*θ; scenario=sample.scenario, sample.context...)
+                y = perturbed_anticipative_solver(
+                    -κ * θ; scenario=sample.scenario, sample.context...
+                )
             else
-                y = perturbed_anticipative_solver(κ*θ; scenario=sample.scenario, sample.context...)
+                y = perturbed_anticipative_solver(
+                    κ * θ; scenario=sample.scenario, sample.context...
+                )
             end
         else
             y = anticipative_solver(sample.scenario; sample.context...)
@@ -193,4 +241,4 @@ function _augment_dataset(
         train_dataset[i] = DataSample(sample; y=y_converted)
     end
     return train_dataset
-end
\ No newline at end of file
+end
diff --git a/test/mirror_descent.jl b/test/mirror_descent.jl
index 0a42cc3..3373c64 100644
--- a/test/mirror_descent.jl
+++ b/test/mirror_descent.jl
@@ -5,14 +5,12 @@ using ValueHistories
 using Statistics: mean
 
 @testset "MirrorDescent Training" begin
-
     @testset "MirrorDescent - ContextualStochasticArgmax basic" begin
         benchmark = ContextualStochasticArgmaxBenchmark()
         algorithm = MirrorDescent()
 
         histories, policy = train_policy(
-            algorithm, benchmark;
-            dataset_size=5, epochs=2, iterations=2, seed=0
+            algorithm, benchmark; dataset_size=5, epochs=2, iterations=2, seed=0
         )
 
         @test histories isa Vector
@@ -27,8 +25,7 @@ using Statistics: mean
         algorithm = MirrorDescent()
 
         histories, policy = train_policy(
-            algorithm, benchmark;
-            dataset_size=1, epochs=2, iterations=2, seed=0
+            algorithm, benchmark; dataset_size=1, epochs=2, iterations=2, seed=0
         )
 
         @test histories isa Vector
@@ -43,8 +40,13 @@ using Statistics: mean
         algorithm = MirrorDescent()
 
         histories, policy = train_policy(
-            algorithm, benchmark;
-            dataset_size=5, epochs=2, iterations=2, seed=0, imitation_start=false
+            algorithm,
+            benchmark;
+            dataset_size=5,
+            epochs=2,
+            iterations=2,
+            seed=0,
+            imitation_start=false,
         )
 
         @test histories isa Vector
@@ -64,12 +66,17 @@ using Statistics: mean
                 y = ctx.policy.maximizer(θ; s.context...)
                 Float64(DecisionFocusedLearningBenchmarks.objective_value(benchmark, s, y))
             end
-            (val_obj = mean(vals),)
+            (val_obj=mean(vals),)
         end
 
         histories, policy = train_policy(
-            algorithm, benchmark;
-            dataset_size=20, epochs=3, iterations=5, seed=0, metrics=(val_metric,)
+            algorithm,
+            benchmark;
+            dataset_size=20,
+            epochs=3,
+            iterations=5,
+            seed=0,
+            metrics=(val_metric,),
         )
 
         val_objs = [get(histories[i], :val_obj)[2][end] for i in 1:5]
@@ -85,11 +92,15 @@ using Statistics: mean
         metrics = (FunctionMetric(ctx -> ctx.epoch, :epoch),)
 
         histories, policy = train_policy(
-            algorithm, benchmark;
-            dataset_size=5, epochs=2, iterations=2, seed=0, metrics=metrics
+            algorithm,
+            benchmark;
+            dataset_size=5,
+            epochs=2,
+            iterations=2,
+            seed=0,
+            metrics=metrics,
         )
 
         @test all(haskey(h, :epoch) for h in histories)
     end
-
 end

From e3d6cca027ec1bcb10a2eff1a91a2b784479a0fe Mon Sep 17 00:00:00 2001
From: BatyLeo <leo.baty67@gmail.com>
Date: Sun, 21 Jun 2026 03:02:22 +0200
Subject: [PATCH 7/9] refactor: reorganize the code and fix failing tests

---
 Project.toml                                  |   4 +-
 src/DecisionFocusedLearningAlgorithms.jl      |   2 +-
 .../MirrorDescent/mirror_descent.jl           | 133 --------
 .../mirror_descent/mirror_descent.jl          | 305 +++++++++---------
 test/Project.toml                             |   1 -
 test/mirror_descent.jl                        |   6 +-
 6 files changed, 156 insertions(+), 295 deletions(-)
 delete mode 100644 src/algorithms/MirrorDescent/mirror_descent.jl

diff --git a/Project.toml b/Project.toml
index 1a71616..fc60d0c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,7 +12,6 @@ DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 InferOpt = "4846b161-c94e-4150-8dac-c7ae193c601f"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
-Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -20,12 +19,11 @@ UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7"
 
 [compat]
-DecisionFocusedLearningBenchmarks = "0.5.0, 0.6"
+DecisionFocusedLearningBenchmarks = "0.6.1"
 DocStringExtensions = "0.9.5"
 Flux = "0.16.9"
 InferOpt = "0.7.1"
 MLUtils = "0.4.8"
-Plots = "1.41.6"
 ProgressMeter = "1.11.0"
 Random = "1.11.0"
 Statistics = "1.11.1"
diff --git a/src/DecisionFocusedLearningAlgorithms.jl b/src/DecisionFocusedLearningAlgorithms.jl
index 19fdf70..38d9ec6 100644
--- a/src/DecisionFocusedLearningAlgorithms.jl
+++ b/src/DecisionFocusedLearningAlgorithms.jl
@@ -25,7 +25,7 @@ include("algorithms/abstract_algorithm.jl")
 include("algorithms/supervised/fyl.jl")
 include("algorithms/supervised/anticipative_imitation.jl")
 include("algorithms/supervised/dagger.jl")
-include("algorithms/MirrorDescent/mirror_descent.jl")
+include("algorithms/mirror_descent/mirror_descent.jl")
 
 export TrainingContext
 
diff --git a/src/algorithms/MirrorDescent/mirror_descent.jl b/src/algorithms/MirrorDescent/mirror_descent.jl
deleted file mode 100644
index 3f5ee0c..0000000
--- a/src/algorithms/MirrorDescent/mirror_descent.jl
+++ /dev/null
@@ -1,133 +0,0 @@
-"""
-$TYPEDEF
-
-Mirror Descent algorithm for learning coordinated solutions.
-
-This algorithm is designed for stochastic benchmarks.
-
-Reference: <https://arxiv.org/abs/2505.04757>
-
-# Fields
-$TYPEDFIELDS
-"""
-@kwdef struct MirrorDescent{A} <: AbstractImitationAlgorithm
-    "inner imitation algorithm for supervised learning"
-    inner_algorithm::A = PerturbedFenchelYoungLossImitation()
-end
-
-"""
-$TYPEDSIGNATURES
-Generate a dataset for the provided benchmark and train a DFLPolicy using the Mirror Descent algorithm.
-
-# Core training method
-"""
-
-function train_policy(
-    algorithm::MirrorDescent,
-    benchmark::ExogenousStochasticBenchmark;
-    dataset_size=30,
-    epochs=10,
-    iterations=10,
-    κ=1.0,
-    metrics::Tuple=(),
-    seed=nothing,
-)
-    train_dataset = generate_dataset(benchmark, dataset_size; seed=seed)
-
-    # Initialize model and create policy
-    model = generate_statistical_model(benchmark; seed=seed)
-    maximizer = generate_maximizer(benchmark)
-    policy = DFLPolicy(model, maximizer)
-
-    # vector because we store one history per iteration
-    histories_per_iteration = MVHistory[]
-
-    anticipative_solver = generate_anticipative_solver(benchmark;)
-    parametric_anticipative_solver = generate_parametric_anticipative_solver(benchmark;)
-
-    # perturb = true correspond to "real" iterations of mirror descent
-    # we compute solutions with the penalized anticipative solver  + perturbation
-
-    # perturb = false correspond to imitation learning
-    # we use the anticipative solver without perturbation
-    # usefull to start with one iteration of pure imitation learning
-    perturb = false
-
-    # Train policy
-    for n_it in 1:iterations
-        println("Iteration $n_it / $iterations")
-
-        if n_it > 1
-            perturb = true
-        end
-
-        # Generate anticipative solutions as training data
-        augmented_dataset = augment_dataset(
-            algorithm.inner_algorithm,
-            benchmark,
-            train_dataset,
-            model,
-            anticipative_solver,
-            parametric_anticipative_solver;
-            κ=κ,
-            perturb=perturb,
-        )
-
-        # Train policy on augmented dataset
-        history = train_policy!(
-            algorithm.inner_algorithm,
-            policy,
-            augmented_dataset;
-            epochs=epochs,
-            metrics=metrics,
-            maximizer_kwargs=sample -> sample.context,
-        )
-
-        push!(histories_per_iteration, history)
-    end
-
-    return histories_per_iteration, policy
-end
-
-function augment_dataset(
-    algorithm::PerturbedFenchelYoungLossImitation,
-    bench::AbstractStochasticBenchmark,
-    train_dataset::AbstractArray,
-    model,
-    anticipative_solver,
-    parametric_anticipative_solver;
-    κ=1.0,
-    perturb=false,
-)
-    (; nb_samples, ε, threaded, training_optimizer, seed) = algorithm
-
-    augmented_dataset = Vector{DataSample}()
-
-    if perturb
-        perturbed_maximizer = PerturbedAdditive(
-            parametric_anticipative_solver; ε=κ * ε, nb_samples=nb_samples
-        )
-    end
-
-    for sample in train_dataset
-        θ = model(sample.x)
-
-        if perturb
-            if is_minimization_problem(bench)
-                y = perturbed_maximizer(-κ * θ; scenario=sample.scenario, context=sample)
-            else
-                y = perturbed_maximizer(κ * θ; scenario=sample.scenario, context=sample)
-            end
-        else
-            y = anticipative_solver(sample.scenario; context=sample)
-        end
-
-        augmented_datasample = DataSample(;
-            x=sample.x, y, instance=sample.context, extra=sample.extra
-        )
-
-        push!(augmented_dataset, augmented_datasample)
-    end
-
-    return augmented_dataset
-end
diff --git a/src/algorithms/mirror_descent/mirror_descent.jl b/src/algorithms/mirror_descent/mirror_descent.jl
index d773c85..a4b65d9 100644
--- a/src/algorithms/mirror_descent/mirror_descent.jl
+++ b/src/algorithms/mirror_descent/mirror_descent.jl
@@ -15,24 +15,88 @@ $TYPEDFIELDS
     inner_algorithm::A = PerturbedFenchelYoungLossImitation()
 end
 
+# Label each sample with the anticipative solution of its scenario (returns a new dataset)
+function _augment_with_anticipative(dataset, anticipative_solver)
+    function label(s)
+        return DataSample(s; y=anticipative_solver(s.scenario; s.context...))
+    end
+    return map(label, dataset)
+end
+
+# Re-label `sample` via the perturbed solver at the κ-scaled, sign-adjusted model output
+function _perturbed_sample(sample, model, perturbed_solver, is_minimization, κ)
+    prediction = model(sample.x)
+    direction = is_minimization ? -κ * prediction : κ * prediction
+    solution = perturbed_solver(direction; scenario=sample.scenario, sample.context...)
+    return DataSample(sample; y=solution)
+end
+
+# Build a new dataset in which every sample is re-labelled by `_perturbed_sample`;
+# entries of the input `dataset` are not reassigned.
+function _augment_with_perturbed(dataset, model, perturbed_solver, is_minimization; κ=1.0)
+    relabel(s) = _perturbed_sample(s, model, perturbed_solver, is_minimization, κ)
+    return map(relabel, dataset)
+end
+
+# In-place counterpart of `_augment_with_perturbed`: overwrite every entry of `dataset`
+# with its `_perturbed_sample` re-labelling and return the same `dataset` object.
+function _augment_with_perturbed!(dataset, model, perturbed_solver, is_minimization; κ=1.0)
+    for idx in eachindex(dataset)
+        s = dataset[idx]
+        dataset[idx] = _perturbed_sample(s, model, perturbed_solver, is_minimization, κ)
+    end
+    return dataset
+end
+
+# Run `md_iters` rounds of perturbed relabeling then inner training (one result per round)
+function _mirror_descent_loop(
+    algorithm,
+    policy,
+    input_dataset,
+    perturbed_solver,
+    is_minimization;
+    md_iters,
+    epochs,
+    κ,
+    metrics,
+    verbose,
+)
+    # Label once out of place (fresh collection); rounds after the first re-label it in place.
+    dataset = _augment_with_perturbed(
+        input_dataset, policy.statistical_model, perturbed_solver, is_minimization; κ
+    )
+    return map(1:md_iters) do n_it
+        verbose && println("Mirror descent iteration $n_it / $md_iters")
+        if n_it > 1  # round 1 trains on the dataset built above
+            _augment_with_perturbed!(
+                dataset, policy.statistical_model, perturbed_solver, is_minimization; κ
+            )
+        end
+        return train_policy!(algorithm.inner_algorithm, policy, dataset; epochs, metrics)
+    end
+end
+
 """
 $TYPEDSIGNATURES
 
 Train a DFLPolicy using the Mirror Descent algorithm on a provided training dataset.
 
-# Core training method
+When `imitation_start=true`, the first iteration is a pure imitation step using
+`anticipative_solver`; subsequent iterations are the mirror descent loop using
+`perturbed_anticipative_solver`.
 
 # Arguments
-- `epochs`: number of training epochs per iteration
-- `iterations`: number of mirror descent iterations
-- `κ`: scaling factor for the perturbation magnitude
-- `metrics`: tuple of metrics to track during training
-- `verbose`: if true, prints progress at each iteration
-- `imitation_start`: if true, the first iteration uses pure imitation learning (no perturbation)
+- `iterations=10`: total number of mirror descent iterations (includes the imitation step
+when `imitation_start=true`)
+- `epochs=10`: number of inner training epochs per mirror descent iteration
+- `κ=1.0`: scaling factor applied to `θ` before passing it to the perturbed solver
+- `metrics::Tuple=()`: metrics forwarded to the inner training algorithm
+- `verbose=false`: if true, prints progress at each iteration
+- `imitation_start=true`: if true, run a pure imitation step against the
+  anticipative solver as the first iteration
+- `is_minimization=true`: set to false if the objective is a maximization problem
 """
-
 function train_policy!(
-    benchmark::ExogenousStochasticBenchmark,
     algorithm::MirrorDescent,
     policy::DFLPolicy,
     train_dataset,
@@ -44,34 +108,44 @@ function train_policy!(
     metrics::Tuple=(),
     verbose::Bool=false,
     imitation_start::Bool=true,
+    is_minimization::Bool=true,
 )
-    augmented_dataset = train_dataset
-    return map(1:iterations) do n_it
-        if verbose
-            println("Iteration $n_it / $iterations")
-        end
-
-        perturb = n_it > 1 || !imitation_start
-
-        augmented_dataset = augment_dataset(
-            benchmark,
-            augmented_dataset,
-            policy.statistical_model,
-            anticipative_solver,
-            perturbed_anticipative_solver;
-            κ=κ,
-            perturb=perturb,
+    if imitation_start
+        verbose && println("Imitation step")
+        dataset = _augment_with_anticipative(train_dataset, anticipative_solver)
+        h_imitation = train_policy!(
+            algorithm.inner_algorithm, policy, dataset; epochs, metrics
         )
-
-        train_policy!(
-            algorithm.inner_algorithm,
+        md_iters = iterations - 1
+        md_iters >= 1 || return [h_imitation]
+        rest = _mirror_descent_loop(
+            algorithm,
             policy,
-            augmented_dataset;
-            epochs=epochs,
-            metrics=metrics,
-            maximizer_kwargs=sample -> sample.context,
+            dataset,
+            perturbed_anticipative_solver,
+            is_minimization;
+            md_iters,
+            epochs,
+            κ,
+            metrics,
+            verbose,
         )
+        return pushfirst!(rest, h_imitation)
     end
+
+    # else
+    return _mirror_descent_loop(
+        algorithm,
+        policy,
+        train_dataset,
+        perturbed_anticipative_solver,
+        is_minimization;
+        md_iters=iterations,
+        epochs,
+        κ,
+        metrics,
+        verbose,
+    )
 end
 
 """
@@ -79,60 +153,65 @@ $TYPEDSIGNATURES
 
 Generate a dataset for the provided benchmark and train a DFLPolicy using the Mirror Descent algorithm.
 
-# Benchmark convenience wrapper
-
-This high-level function handles all setup from the benchmark and returns a trained policy.
+This high-level wrapper builds every component (`model`, `maximizer`,
+`anticipative_solver`, `parametric_anticipative_solver`, `train_dataset`) from the
+benchmark, each exposed as an optional keyword so callers can override any of them
+without dropping to [`train_policy!`](@ref).
 
 # Arguments
-- `dataset_size`: number of samples in the training dataset
-- `epochs`: number of training epochs per iteration
-- `iterations`: number of mirror descent iterations
-- `κ`: scaling factor for the perturbation magnitude
-- `metrics`: tuple of metrics to track during training
-- `seed`: random seed for reproducibility
-- `verbose`: if true, prints progress at each iteration
-- `imitation_start`: if true, the first iteration uses pure imitation learning (no perturbation)
-- `model_kwargs`: additional keyword arguments passed to `generate_statistical_model`
-- `maximizer_kwargs`: additional keyword arguments passed to `generate_maximizer`
-- `solver_kwargs`: additional keyword arguments passed to `generate_anticipative_solver` and `generate_parametric_anticipative_solver`
-- `nb_scenarios`: number of scenarios per instance. 
-- `context_per_instance`: number of contexts per instance. 
+- `dataset_size=30`: number of samples in the training dataset
+(used when `train_dataset` is not provided)
+- `nb_scenarios=1`: number of scenarios per instance
+(used when `train_dataset` is not provided)
+- `context_per_instance=1`: number of contexts per instance
+(used when `train_dataset` is not provided)
+- `seed=nothing`: random seed for reproducibility
+(used in `model` and `train_dataset` when not provided)
+- `model`: statistical model to wrap in the policy
+(defaults to `generate_statistical_model(benchmark; seed)`)
+- `maximizer`: combinatorial oracle to wrap in the policy
+(defaults to `generate_maximizer(benchmark)`)
+- `anticipative_solver`: oracle used in pure-imitation iterations
+(defaults to `generate_anticipative_solver(benchmark)`)
+- `parametric_anticipative_solver`: parametric oracle wrapped in `PerturbedAdditive` for
+mirror-descent iterations (defaults to `generate_parametric_anticipative_solver(benchmark)`)
+- `train_dataset`: training dataset (defaults to `generate_dataset(benchmark, dataset_size; ...)`)
+- `epochs=10`: number of inner training epochs per mirror descent iteration
+- `iterations=10`: total number of mirror descent iterations
+- `κ=1.0`: scaling factor applied to `θ` before passing it to the perturbed solver
+- `metrics::Tuple=()`: metrics forwarded to the inner training algorithm
+- `verbose=false`: if true, prints a banner at each iteration
+- `imitation_start=true`: if true, run a pure imitation step against the anticipative solver as the
+first iteration
 """
-
 function train_policy(
     algorithm::MirrorDescent,
     benchmark::ExogenousStochasticBenchmark;
     dataset_size=30,
-    epochs=10,
-    iterations=10,
-    κ=1.0,
-    metrics::Tuple=(),
-    seed=nothing,
-    verbose::Bool=false,
-    imitation_start::Bool=true,
-    model_kwargs=(;),
-    maximizer_kwargs=(;),
-    solver_kwargs=(;),
     nb_scenarios=1,
     context_per_instance=1,
-)
-    train_dataset = generate_dataset(
+    seed=nothing,
+    model=generate_statistical_model(benchmark; seed=seed),
+    maximizer=generate_maximizer(benchmark),
+    anticipative_solver=generate_anticipative_solver(benchmark),
+    parametric_anticipative_solver=generate_parametric_anticipative_solver(benchmark),
+    train_dataset=generate_dataset(
         benchmark,
         dataset_size;
         nb_scenarios=nb_scenarios,
         contexts_per_instance=context_per_instance,
         seed=seed,
-    )
-
-    model = generate_statistical_model(benchmark; seed=seed, model_kwargs...)
-    maximizer = generate_maximizer(benchmark; maximizer_kwargs...)
+    ),
+    epochs=10,
+    iterations=10,
+    κ=1.0,
+    metrics::Tuple=(),
+    verbose::Bool=false,
+    imitation_start::Bool=true,
+)
     policy = DFLPolicy(model, maximizer)
 
-    anticipative_solver = generate_anticipative_solver(benchmark; solver_kwargs...)
-    parametric_anticipative_solver = generate_parametric_anticipative_solver(
-        benchmark; solver_kwargs...
-    )
-    (; nb_samples, ε, threaded, seed) = algorithm.inner_algorithm
+    (; nb_samples, ε, threaded) = algorithm.inner_algorithm
     perturbed_anticipative_solver = PerturbedAdditive(
         (θ; scenario, kwargs...) -> parametric_anticipative_solver(θ, scenario; kwargs...);
         ε=κ * ε,
@@ -142,7 +221,6 @@ function train_policy(
     )
 
     histories_per_iteration = train_policy!(
-        benchmark,
         algorithm,
         policy,
         train_dataset,
@@ -154,91 +232,8 @@ function train_policy(
         metrics=metrics,
         verbose=verbose,
         imitation_start=imitation_start,
+        is_minimization=is_minimization_problem(benchmark),
     )
 
     return histories_per_iteration, policy
 end
-
-function augment_dataset(
-    bench::ExogenousStochasticBenchmark,
-    train_dataset::AbstractArray,
-    model,
-    anticipative_solver,
-    perturbed_anticipative_solver;
-    κ=1.0,
-    perturb=false,
-)
-    return _augment_dataset(
-        Val(fieldtype(eltype(train_dataset), :y) !== Nothing),
-        bench,
-        train_dataset,
-        model,
-        anticipative_solver,
-        perturbed_anticipative_solver;
-        κ=κ,
-        perturb=perturb,
-    )
-end
-
-# Raw dataset (samples have no y) → create new DataSamples
-function _augment_dataset(
-    ::Val{false},
-    bench,
-    train_dataset,
-    model,
-    anticipative_solver,
-    perturbed_anticipative_solver;
-    κ=1.0,
-    perturb=false,
-)
-    return map(train_dataset) do sample
-        θ = model(sample.x)
-        if perturb
-            if is_minimization_problem(bench)
-                y = perturbed_anticipative_solver(
-                    -κ * θ; scenario=sample.scenario, sample.context...
-                )
-            else
-                y = perturbed_anticipative_solver(
-                    κ * θ; scenario=sample.scenario, sample.context...
-                )
-            end
-        else
-            y = anticipative_solver(sample.scenario; sample.context...)
-        end
-        DataSample(sample; y=y)
-    end
-end
-
-# Augmented dataset (samples already have y) → update y in place
-function _augment_dataset(
-    ::Val{true},
-    bench,
-    train_dataset,
-    model,
-    anticipative_solver,
-    perturbed_anticipative_solver;
-    κ=1.0,
-    perturb=false,
-)
-    for (i, sample) in enumerate(train_dataset)
-        θ = model(sample.x)
-        if perturb
-            if is_minimization_problem(bench)
-                y = perturbed_anticipative_solver(
-                    -κ * θ; scenario=sample.scenario, sample.context...
-                )
-            else
-                y = perturbed_anticipative_solver(
-                    κ * θ; scenario=sample.scenario, sample.context...
-                )
-            end
-        else
-            y = anticipative_solver(sample.scenario; sample.context...)
-        end
-        ET = eltype(sample.y)
-        y_converted = convert(typeof(sample.y), ET <: Integer ? round.(ET, y) : y)
-        train_dataset[i] = DataSample(sample; y=y_converted)
-    end
-    return train_dataset
-end
diff --git a/test/Project.toml b/test/Project.toml
index 6940310..b596e5f 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -15,7 +15,6 @@ DecisionFocusedLearningAlgorithms = {path = ".."}
 
 [compat]
 Aqua = "0.8"
-DecisionFocusedLearningBenchmarks = "0.5"
 Documenter = "1"
 JuliaFormatter = "2"
 MLUtils = "0.4"
diff --git a/test/mirror_descent.jl b/test/mirror_descent.jl
index 3373c64..171eb1d 100644
--- a/test/mirror_descent.jl
+++ b/test/mirror_descent.jl
@@ -64,9 +64,11 @@ using Statistics: mean
             vals = map(data) do s
                 θ = ctx.policy.statistical_model(s.x)
                 y = ctx.policy.maximizer(θ; s.context...)
-                Float64(DecisionFocusedLearningBenchmarks.objective_value(benchmark, s, y))
+                return Float64(
+                    DecisionFocusedLearningBenchmarks.objective_value(benchmark, s, y)
+                )
             end
-            (val_obj=mean(vals),)
+            return (val_obj=mean(vals),)
         end
 
         histories, policy = train_policy(

From 521ae9d326ce8d86cfa0a2f516b2a369345ff647 Mon Sep 17 00:00:00 2001
From: BatyLeo <leo.baty67@gmail.com>
Date: Sun, 21 Jun 2026 11:38:41 +0200
Subject: [PATCH 8/9] test: connect and improve tests

---
 test/Project.toml      |  1 +
 test/mirror_descent.jl | 53 ++++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl       |  4 ++++
 3 files changed, 58 insertions(+)

diff --git a/test/Project.toml b/test/Project.toml
index b596e5f..261fc39 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -7,6 +7,7 @@ InferOpt = "4846b161-c94e-4150-8dac-c7ae193c601f"
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
 JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7"
 
diff --git a/test/mirror_descent.jl b/test/mirror_descent.jl
index 171eb1d..435f0e3 100644
--- a/test/mirror_descent.jl
+++ b/test/mirror_descent.jl
@@ -4,6 +4,19 @@ using Test
 using ValueHistories
 using Statistics: mean
 
+# Metric `:val_obj`: mean benchmark objective of the policy's decisions on the metric's data
+function _val_obj_metric(benchmark, val_data)
+    return FunctionMetric(:val_obj, val_data) do ctx, data
+        objectives = map(data) do s
+            θ = ctx.policy.statistical_model(s.x)
+            y = ctx.policy.maximizer(θ; s.context...)
+            obj = DecisionFocusedLearningBenchmarks.objective_value(benchmark, s, y)
+            return Float64(obj)
+        end
+        return (val_obj=mean(objectives),)
+    end
+end
+
 @testset "MirrorDescent Training" begin
     @testset "MirrorDescent - ContextualStochasticArgmax basic" begin
         benchmark = ContextualStochasticArgmaxBenchmark()
@@ -105,4 +118,44 @@ using Statistics: mean
 
         @test all(haskey(h, :epoch) for h in histories)
     end
+
+    @testset "MirrorDescent - trained beats untrained (ContextualStochasticArgmax)" begin
+        benchmark = ContextualStochasticArgmaxBenchmark()
+        val_data = generate_dataset(benchmark, 50; seed=99)  # training below uses seed=0
+
+        histories, _ = train_policy(
+            MirrorDescent(),
+            benchmark;
+            dataset_size=20,
+            epochs=5,
+            iterations=5,
+            seed=0,
+            metrics=(_val_obj_metric(benchmark, val_data),),
+        )
+
+        obj_untrained = get(histories[1], :val_obj)[2][1]  # first value, first history
+        obj_trained = get(histories[end], :val_obj)[2][end]  # last value, last history
+
+        @test obj_trained > obj_untrained  # expects the objective to increase
+    end
+
+    @testset "MirrorDescent - trained beats untrained (StochasticVehicleScheduling)" begin
+        benchmark = StochasticVehicleSchedulingBenchmark()
+        val_data = generate_dataset(benchmark, 10; seed=99)  # training below uses seed=0
+
+        histories, _ = train_policy(
+            MirrorDescent(),
+            benchmark;
+            dataset_size=10,
+            epochs=5,
+            iterations=2,
+            seed=0,
+            metrics=(_val_obj_metric(benchmark, val_data),),
+        )
+
+        obj_untrained = get(histories[1], :val_obj)[2][1]  # first value, first history
+        obj_trained = get(histories[end], :val_obj)[2][end]  # last value, last history
+
+        @test obj_trained < obj_untrained  # expects the objective to decrease
+    end
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 02565a1..88bbf7c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -13,4 +13,8 @@ using DecisionFocusedLearningAlgorithms
     @testset "DAgger" begin
         include("dagger.jl")
     end
+
+    @testset "MirrorDescent" begin
+        include("mirror_descent.jl")
+    end
 end

From 502c76e6057a29b317a12536d1f58af0ff745227 Mon Sep 17 00:00:00 2001
From: BatyLeo <leo.baty67@gmail.com>
Date: Sun, 21 Jun 2026 12:08:16 +0200
Subject: [PATCH 9/9] ci: move formatting test to its own ci job, and only run
 it on latest julia version

---
 .github/workflows/Format.yml | 35 +++++++++++++++++++++++++++++++++++
 test/Project.toml            |  2 --
 test/code.jl                 |  7 -------
 3 files changed, 35 insertions(+), 9 deletions(-)
 create mode 100644 .github/workflows/Format.yml

diff --git a/.github/workflows/Format.yml b/.github/workflows/Format.yml
new file mode 100644
index 0000000..b9dfbe6
--- /dev/null
+++ b/.github/workflows/Format.yml
@@ -0,0 +1,35 @@
+name: Format
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} # only cancel runs on PR refs
+jobs:
+  format-check:
+    name: JuliaFormatter
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      actions: write # NOTE(review): presumably needed by julia-actions/cache cleanup; confirm
+      contents: read
+    steps:
+      - uses: actions/checkout@v7
+      - uses: julia-actions/setup-julia@v3
+        with:
+          version: '1'
+      - uses: julia-actions/cache@v3
+      - name: Run JuliaFormatter
+        shell: julia --color=yes {0}
+        run: |
+          using Pkg
+          Pkg.activate(; temp=true)  # temporary environment
+          Pkg.add(name="JuliaFormatter", version="2")
+          using JuliaFormatter
+          if !format(".", verbose=true, overwrite=false)  # overwrite=false: check only
+              @error "Code is not formatted. Run `julia -e 'using JuliaFormatter; format(\".\")'` locally."
+              exit(1)
+          end
diff --git a/test/Project.toml b/test/Project.toml
index 0a6a919..603374a 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -5,7 +5,6 @@ DecisionFocusedLearningBenchmarks = "2fbe496a-299b-4c81-bab5-c44dfc55cf20"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 InferOpt = "4846b161-c94e-4150-8dac-c7ae193c601f"
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
-JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
@@ -19,7 +18,6 @@ Aqua = "0.8"
 DecisionFocusedLearningAlgorithms = "0.2.0"
 DecisionFocusedLearningBenchmarks = "0.6.1"
 Documenter = "1"
-JuliaFormatter = "2"
 MLUtils = "0.4"
 Test = "1"
 ValueHistories = "0.5"
diff --git a/test/code.jl b/test/code.jl
index 3f74eb9..75c76c1 100644
--- a/test/code.jl
+++ b/test/code.jl
@@ -1,7 +1,6 @@
 using Aqua
 using Documenter
 using JET
-using JuliaFormatter
 
 using DecisionFocusedLearningAlgorithms
 
@@ -20,12 +19,6 @@ end
     )
 end
 
-@testset "JuliaFormatter" begin
-    @test JuliaFormatter.format(
-        DecisionFocusedLearningAlgorithms; verbose=false, overwrite=false
-    )
-end
-
 @testset "Documenter" begin
     Documenter.doctest(DecisionFocusedLearningAlgorithms)
 end