creative_writing/writing_bench.py at main · PrimeIntellect-ai/creative_writing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
"""
Generate creative writing tasks for the Creative Writing RL Environment.

Generates N creative writing prompts with task metadata and comprehensive rubrics.
Each rubric contains a series of yes/no questions to check if the response
met all the instructions in the prompt exactly.

Uses Pydantic models for structured output matching the evaluation JSON format.
"""

import asyncio
import json
import os
import re
import random
from pathlib import Path
from typing import Any, Literal
import uuid

from openai import AsyncOpenAI, APIConnectionError, APITimeoutError, RateLimitError
from pydantic import BaseModel, Field

# Default batch size. Not referenced by any function in the visible part of
# this file — presumably consumed by the script entry point; verify.
N = 50  # Number of tasks to generate
# Upper bound on in-flight API requests; generate_tasks() sizes its semaphore
# with this value.
MAX_CONCURRENCY = 50

# Shared async OpenAI client, constructed at import time. os.environ.get()
# yields None when OPENAI_API_KEY is unset.
# NOTE(review): confirm how AsyncOpenAI behaves when api_key is None.
client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Pydantic models for structured output

class RubricCriterion(BaseModel):
    """A single rubric criterion for evaluating a response."""
    # One element of RubricCriteriaList.criteria. generate_rubric() dumps each
    # instance with model_dump(), so these attribute names become the dict keys
    # read by format_task_for_output() and format_task_for_verifiers().
    #
    # NOTE(review): Pydantic exposes the class docstring and the Field
    # descriptions in the generated JSON schema, so this text is presumably
    # visible to the model when the class is used for structured output —
    # treat wording changes here as prompt changes.

    # The yes/no question (or statement) to be checked against a response.
    criterion: str = Field(..., description="Clear statement describing what should be checked")
    # Whether the check is mechanically verifiable or needs judgment.
    instruction_type: Literal["Objective", "Subjective"] = Field(
        ...,
        description="Objective = can be definitively verified; Subjective = requires interpretation/judgment"
    )
    # Whether the requirement is spelled out in the prompt or only implied.
    instruction_necessity: Literal["Explicit", "Implicit"] = Field(
        ...,
        description="Explicit = directly stated in prompt; Implicit = reasonably inferred from context"
    )
    # How important the requirement is.
    instruction_priority: Literal["Must have", "Nice to have"] = Field(
        ...,
        description="Must have = critical requirement; Nice to have = enhances quality but not required"
    )
    # Coarse category of the criterion; restricted to the six labels below.
    instruction_dimension: Literal[
        "Instruction Following",
        "Writing Quality",
        "Correctness",
        "Format",
        "Verbosity",
        "Other"
    ] = Field(
        ...,
        description="The dimension/category this criterion evaluates"
    )


class RubricCriteriaList(BaseModel):
    """List of rubric criteria for a task."""
    # Top-level structured-output schema: generate_rubric() passes this class
    # as the response format and reads `.criteria` from the parsed result.
    # No length bounds are declared here; generate_rubric() only rejects
    # results with fewer than 5 entries.
    criteria: list[RubricCriterion] = Field(..., description="List of rubric criteria")


class RubricGrade(BaseModel):
    """Grade for a single rubric criterion."""
    # Not referenced in the visible part of this file — presumably used by a
    # grading step elsewhere; verify before removing.

    # Free-text justification for the verdict.
    rationale: str = Field(..., description="Brief explanation of the evaluation")
    # Binary verdict for the criterion.
    passed: bool = Field(..., description="True if the response satisfies the criterion, false otherwise")


class RubricGradesList(BaseModel):
    """List of grades for all rubric criteria."""
    # Container for RubricGrade items. Not referenced in the visible part of
    # this file.
    # NOTE(review): the description says "one per criterion", but nothing in
    # this model enforces that the length matches a rubric — confirm the
    # consumer checks it.
    grades: list[RubricGrade] = Field(..., description="List of grades, one per criterion")


class CreativeWritingTask(BaseModel):
    """A complete creative writing task with prompt and rubric."""
    # The 14 fields below are the same names generate_single_task() lists in
    # `required_fields`. This model is not referenced in the visible part of
    # the file: generate_single_task() parses the model's reply with
    # json.loads() and checks key presence by hand instead.
    # NOTE(review): despite the docstring, no rubric field is declared here.
    prompt_id: str = Field(..., description="Unique identifier for the prompt")
    genre: str = Field(..., description="Genre of the story")
    difficulty: str = Field(..., description="Difficulty level")
    pov: str = Field(..., description="Point of view")
    setting: str = Field(..., description="Setting type")
    target_age: str = Field(..., description="Target audience age group")
    geography: str = Field(..., description="Geographic location")
    gender: str = Field(..., description="Main character gender")
    relationship: str = Field(..., description="Secondary character relationship")
    conflict_type: str = Field(..., description="Type of conflict")
    theme: str = Field(..., description="Central theme")
    tone: str = Field(..., description="Narrative tone")
    # The only non-string field: the requested story length in words.
    word_constraint: int = Field(..., description="Target word count")
    prompt_text: str = Field(..., description="The creative writing prompt")


# ---------------------------------------------------------------------------
# Task-specification vocabularies
# ---------------------------------------------------------------------------
# Each list enumerates the permitted values for one metadata field of a
# generated task; generate_tasks() draws one value per field for every task.

# Story genre
GENRES = [
    "horror", "romance", "mystery", "sci-fi", "literary_fiction",
    "thriller", "fantasy", "humor", "historical_fiction", "dystopian",
    "magical_realism", "western", "gothic", "adventure", "drama",
]

# How demanding the writing task should be
DIFFICULTIES = [
    "easy",
    "medium",
    "hard",
]

# Narrative point of view
POVS = [
    "first_person",
    "second_person",
    "third_person",
]

# Kind of place the story is set in
SETTINGS = [
    "medieval",
    "city",
    "rural_town",
    "suburb",
    "space",
    "underwater",
    "desert",
    "forest",
    "mountain",
    "island",
]

# Intended readership
TARGET_AGES = [
    "children",
    "teens",
    "adults",
    "parents",
    "couples",
    "elderly",
]

# Continent-level location
GEOGRAPHIES = [
    "Africa",
    "Antarctica",
    "Asia",
    "Australia",
    "Europe",
    "North_America",
    "South_America",
]

# Gender of the protagonist
GENDERS = [
    "male",
    "female",
    "non_binary",
]

# How the secondary character is related to the protagonist
RELATIONSHIPS = [
    "friend",
    "sister",
    "parent",
    "brother",
    "boyfriend",
    "girlfriend",
    "wife",
    "husband",
    "classmate",
    "coworker",
]

# Central conflict driving the plot
CONFLICT_TYPES = [
    "character_vs_self",
    "character_vs_other_character",
    "character_vs_society",
    "character_vs_nature",
    "character_vs_machine",
    "character_vs_unknown",
]

# Central theme
THEMES = [
    "freedom",
    "memory",
    "belief",
    "identity",
    "justice",
    "belonging",
    "grief",
]

# Narrative tone
TONES = [
    "eerie",
    "hopeful",
    "melancholic",
    "frenetic",
    "whimsical",
    "solemn",
    "humorous",
]

# Exception types that retry_async() treats as transient and retries with
# back-off. Anything not listed here (including the ValueError raised by the
# generate_* helpers when the API returns no content) propagates immediately.
RETRYABLE_EXCEPTIONS = (
    APIConnectionError,
    APITimeoutError,
    RateLimitError,
    asyncio.TimeoutError,
    ConnectionError,
)

async def retry_async(
    func,
    max_retries: int = 7,
    initial_delay: float = 1.0,
    backoff_factor: float = 2.0,
    max_delay: float = 60.0,
    jitter_factor: float = 0.5,
):
    """Await ``func()`` and retry it with exponential back-off on transient errors.

    Args:
        func: Zero-argument callable returning an awaitable; it is called
            afresh for every attempt.
        max_retries: Total number of attempts (not the number of *re*-tries).
        initial_delay: Base delay in seconds before the second attempt.
        backoff_factor: Multiplier applied to the base delay per attempt.
        max_delay: Cap on the base delay (jitter is added on top of the cap).
        jitter_factor: Upper bound of the random extra delay, as a fraction
            of the base delay.

    Returns:
        Whatever ``func()`` returns on the first successful attempt.

    Raises:
        RuntimeError: If ``max_retries`` is below 1, i.e. no attempt would be
            made at all.
        Exception: The error from the final attempt if every attempt failed
            with one of ``RETRYABLE_EXCEPTIONS``; any other exception raised
            by ``func`` propagates immediately without retrying.
    """
    if max_retries < 1:
        # With no attempts the loop below would never run; fail loudly with
        # the same exception type and message the function has always used.
        raise RuntimeError(f"Retry logic failed after {max_retries} attempts: None")

    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            return await func()
        except RETRYABLE_EXCEPTIONS:
            if attempt == final_attempt:
                # Out of attempts: surface the original error to the caller.
                raise
            # Exponential back-off, capped, plus random jitter so concurrent
            # callers do not all retry at the same instant.
            base_delay = min(initial_delay * (backoff_factor ** attempt), max_delay)
            jitter = base_delay * jitter_factor * random.random()
            await asyncio.sleep(base_delay + jitter)

    # Unreachable: the final attempt either returned or re-raised above.
    raise RuntimeError(f"Retry logic failed after {max_retries} attempts")


async def generate_with_semaphore(
    semaphore: asyncio.Semaphore,
    prompt: str,
    temperature: float = 0.9,
    model: str = "gpt-4.1-mini",
) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        semaphore: Limits how many requests are in flight at once.
        prompt: Content of the single user message.
        temperature: Sampling temperature forwarded to the API.
        model: Chat model to query; defaults to the model this module has
            always used.

    Returns:
        The text content of the first choice.

    Raises:
        ValueError: If the API returns no text content. ValueError is not in
            ``RETRYABLE_EXCEPTIONS``, so this is not retried.
        Exception: Whatever ``retry_async`` re-raises once retries are
            exhausted, or any non-retryable API error.
    """
    async def make_request() -> str:
        # The semaphore is acquired per attempt, so back-off sleeps inside
        # retry_async() do not occupy a concurrency slot.
        async with semaphore:
            response = await client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
            )
        content = response.choices[0].message.content
        if content is None:
            raise ValueError("API returned None content")
        return content

    return await retry_async(make_request)


async def generate_with_structured_output(
    semaphore: asyncio.Semaphore,
    prompt: str,
    response_format: type[BaseModel],
    temperature: float = 0.7,
    model: str = "gpt-4.1-mini",
) -> BaseModel:
    """Send ``prompt`` and return the reply parsed into ``response_format``.

    Args:
        semaphore: Limits how many requests are in flight at once.
        prompt: Content of the single user message.
        response_format: Pydantic model class the reply must conform to.
        temperature: Sampling temperature forwarded to the API.
        model: Chat model to query; defaults to the model this module has
            always used.

    Returns:
        The parsed object taken from the first choice (an instance of
        ``response_format`` as produced by the client's ``parse`` helper).

    Raises:
        ValueError: If the parsed payload is missing. ValueError is not in
            ``RETRYABLE_EXCEPTIONS``, so this is not retried.
        Exception: Whatever ``retry_async`` re-raises once retries are
            exhausted, or any non-retryable API error.
    """
    async def make_request() -> BaseModel:
        # The semaphore is acquired per attempt, so back-off sleeps inside
        # retry_async() do not occupy a concurrency slot.
        async with semaphore:
            response = await client.beta.chat.completions.parse(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                response_format=response_format,
                temperature=temperature,
            )
        parsed = response.choices[0].message.parsed
        if parsed is None:
            raise ValueError("API returned None parsed content")
        return parsed

    return await retry_async(make_request)


# Task generation

async def generate_single_task(
    semaphore: asyncio.Semaphore,
    task_index: int,
    genre: str,
    difficulty: str,
    pov: str,
    setting: str,
    target_age: str,
    geography: str,
    gender: str,
    relationship: str,
    conflict_type: str,
    theme: str,
    tone: str,
    word_constraint: int,
) -> dict[str, Any] | None:
    """Generate a single creative writing task with comprehensive rubric.

    Builds a meta-prompt from the given specification values, asks the model
    (via ``generate_with_semaphore``) for a JSON object that echoes the
    metadata and adds an original ``prompt_text``, then attaches a rubric
    produced by ``generate_rubric``.

    Args:
        semaphore: Shared concurrency limiter passed through to the API helpers.
        task_index: Position of this task in the batch; used for ``prompt_id``
            (``gen_NNNN``) and in log messages.
        genre, difficulty, pov, setting, target_age, geography, gender,
        relationship, conflict_type, theme, tone: Specification values
            interpolated into the meta-prompt.
        word_constraint: Target story length in words.

    Returns:
        The parsed task dict with ``task_id``, ``prompt_id`` and ``rubric``
        added, or ``None`` when the reply is not valid JSON, a required field
        is missing, rubric generation fails, or any other exception occurs.
        Failures are reported with a printed warning rather than raised.
    """

    # Meta-prompt. Single braces are filled in now with this task's values;
    # doubled braces are f-string escapes, so the example templates and the
    # JSON skeleton reach the model with literal single braces.
    prompt = f"""Generate a creative writing prompt with the following specifications:

SPECIFICATIONS:
- Genre: {genre}
- Difficulty: {difficulty}
- POV: {pov}
- Setting: {setting}
- Target age: {target_age}
- Geography: {geography}
- Main character gender: {gender}
- Secondary character relationship: {relationship}
- Conflict type: {conflict_type}
- Theme: {theme}
- Tone: {tone}
- Word constraint: {word_constraint} words

HERE ARE SOME EXAMPLE PROMPT STYLES for reference. DO NOT copy these structures directly.
Instead, use them to understand the level of specificity and how fields are incorporated,
then create something ORIGINAL with a DIFFERENT structure, scenario, and narrative setup:

EXAMPLE 1 (Traditional narrative):
Write a {{genre}} short story for a {{target_age}} audience, told in {{pov}}.
Set it in {{setting}} within {{geography}}, where local customs strongly shape daily life.
The main character is a {{gender}} whose closest ally is their {{relationship}}, but the story's main {{conflict_type}} arises when the protagonist quietly breaks an important tradition.
Explore the theme of {{theme}} with a mostly {{tone}} tone, and resolve the conflict in a way that feels believable rather than neatly moralistic.
Keep the story to {{word_constraint}} words.

EXAMPLE 2 (Emotional distance):
Write a {{genre}} story in {{pov}}, aimed at {{target_age}} readers, set primarily in {{setting}} in {{geography}}.
Your main character is a {{gender}} who has grown emotionally distant from their {{relationship}}.
Center the story on {{conflict_type}} sparked by a new piece of technology that suddenly exposes a long-held secret.
Use a {{tone}} tone as you explore the theme of {{theme}}, and let the ending leave some questions unanswered.
Limit yourself to {{word_constraint}} words, focusing on one key day when everything changes.

EXAMPLE 3 (Confined setting):
Craft a {{genre}} short story at a {{difficulty}} level appropriate for {{target_age}} readers.
The narrative should be in {{pov}}, set in a confined {{setting}} located in {{geography}}, where everyone knows everyone.
Your {{gender}} protagonist relies heavily on their {{relationship}}, until a sudden {{conflict_type}} forces them to act alone for the first time.
Explore the theme of {{theme}} through small, concrete choices rather than speeches, keeping the tone primarily {{tone}}.
Tell the whole story in {{word_constraint}} words, with one clear turning point at the midpoint.

EXAMPLE 4 (Journey based):
Write a {{genre}} story in {{pov}} for {{target_age}} readers, set during a journey across {{geography}}.
Most scenes should unfold in and around {{setting}}, which creates practical obstacles for the characters.
The main character is a {{gender}} traveling with their {{relationship}}, and the core {{conflict_type}} emerges when they must choose between safety and honesty.
Use a mostly {{tone}} tone as you explore the theme of {{theme}}, allowing the landscape to mirror the changing emotions.
Stay within {{word_constraint}} words, focusing on a single difficult decision.

EXAMPLE 5 (Routine disruption):
At a {{difficulty}} level, write a {{genre}} short story in {{pov}} suitable for {{target_age}} readers.
Set the story in {{setting}} in {{geography}}, a place where routines are rarely questioned.
Your {{gender}} protagonist and their {{relationship}} become divided by a {{conflict_type}} that starts as something small and easily ignored.
Quietly explore {{theme}} through how they speak, avoid speaking, and what they choose not to do, maintaining a {{tone}} tone.
Keep the narrative tight and focused, with a maximum of {{word_constraint}} words.

EXAMPLE 6 (Event mishap):
Write a {{genre}} story for a {{target_age}} audience, told in {{pov}}, set during a festival or holiday in {{setting}} in {{geography}}.
The main character is a {{gender}} person who has complicated feelings about their {{relationship}}, and the main {{conflict_type}} surfaces when a public ritual goes wrong.
Let the story explore {{theme}}, with a tone that starts {{tone}} but darkens slightly as tensions rise.
Use specific sensory details (sounds, smells, textures) to bring the celebration and its unraveling to life.
Tell the entire story in about {{word_constraint}} words.

EXAMPLE 7 (Broken connections):
Create a {{genre}} short story in {{pov}}, written at a {{difficulty}} level that still works for {{target_age}} readers.
The story takes place in a relatively isolated {{setting}} somewhere in {{geography}}.
Your {{gender}} protagonist's only regular contact is their {{relationship}}, but a growing {{conflict_type}} makes even that connection feel fragile.
Explore the theme of {{theme}} with a predominantly {{tone}} tone, and end on an image that suggests change rather than explaining it.
Stay under {{word_constraint}} words, centering on one conversation that changes their relationship.

EXAMPLE 8 (Reunion):
Write a {{genre}} story in {{pov}} for {{target_age}} readers, taking place over a single night in {{setting}} within {{geography}}.
The main character is a {{gender}} person who unexpectedly meets their {{relationship}} after a long separation.
Let the central {{conflict_type}} revolve around whether to reopen old wounds or leave them closed, and explore {{theme}} through subtext more than direct discussion.
Maintain a {{tone}} tone, with the setting subtly reflecting the shifting emotional stakes.
Convey the entire encounter in {{word_constraint}} words.

EXAMPLE 9 (Mystery):
Your {{gender}} protagonist wakes up in {{setting}} with no memory of the last 24 hours.
Their {{relationship}} claims something happened that the protagonist can't believe.
In {{word_constraint}} words, write this {{genre}} confrontation in {{pov}}, exploring {{theme}} with a {{tone}} tone.
Set the story in {{geography}} and let the {{conflict_type}} build to a single moment of clarity.
Write for a {{target_age}} audience at {{difficulty}} difficulty.

EXAMPLE 10 (One room setting):
Constraints: exactly {{word_constraint}} words, {{pov}} POV, set entirely in one room within a {{setting}} in {{geography}}.
Write a {{genre}} piece for {{target_age}} readers where a {{gender}} character must convince their {{relationship}} of something they themselves doubt.
The {{conflict_type}} hinges on {{theme}}. Maintain a {{tone}} tone even as the emotional stakes rise.
No flashbacks. Everything happens in present scene.

EXAMPLE 11 (Play type scene):
Open on: A {{gender}} stranger arriving at a {{setting}} in {{geography}}, looking for someone.
Close on: That same person leaving, changed by what they learned.
In between: A {{genre}} story of exactly {{word_constraint}} words exploring {{conflict_type}} between the protagonist and their {{relationship}}.
Use {{pov}} throughout. Theme: {{theme}}. Tone: {{tone}}. Target audience: {{target_age}}.

EXAMPLE 12 (Relationship evolution):
Write a {{genre}} story in {{pov}} that takes place in {{setting}}, {{geography}}.
Your {{gender}} protagonist receives an object from their {{relationship}} that forces them to confront {{theme}}.
The {{conflict_type}} should drive the narrative, with the {{tone}} tone shifting subtly as realizations land.
Aim for {{word_constraint}} words, suitable for {{target_age}} readers.
Let the ending reframe something from the opening.

IMPORTANT: Your generated prompt must have a DIFFERENT structure and scenario than the examples above.
Be creative. The examples show the basic quality bar and how to incorporate the fields. These are not the only valid formats.
Invent a fresh framing, a unique scenario, and an unexpected angle.

The prompt you generate should:
- Be a specific, detailed instruction for writing a short piece of exactly {word_constraint} words
- Incorporate at least 8 of the specifications above naturally into the prompt
- Make semantic sense (for example, the prompt should not ask for a story with a funny tone that is scary in nature; similarly, the protagonist should not be exploring a romantic relationship with their sibling)
- Include concrete constraints (word count, paragraph count, character names, setting details, POV constraints, age constraints, and the like)
- Have clear requirements that can be evaluated objectively AND subjectively
- Be interesting and engaging for a writer to write and lead to a short story that is good literature
- Avoid requiring images, audio, or video (text only)

Return only valid JSON in this exact format:
{{
  "prompt_id": "gen_{task_index:04d}",
  "genre": "{genre}",
  "difficulty": "{difficulty}",
  "pov": "{pov}",
  "setting": "{setting}",
  "target_age": "{target_age}",
  "geography": "{geography}",
  "gender": "{gender}",
  "relationship": "{relationship}",
  "conflict_type": "{conflict_type}",
  "theme": "{theme}",
  "tone": "{tone}",
  "word_constraint": {word_constraint},
  "prompt_text": "Your ORIGINAL creative writing prompt here..."
}}

Output only valid JSON, no additional text or markdown formatting."""

    try:
        # Uses the helper's default temperature (0.9).
        result = await generate_with_semaphore(semaphore, prompt)

        # The model may wrap its JSON in a Markdown code fence despite the
        # instruction above; peel off a leading ```json / ``` and a trailing ```.
        result = result.strip()
        if result.startswith("```json"):
            result = result[7:]
        if result.startswith("```"):
            result = result[3:]
        if result.endswith("```"):
            result = result[:-3]
        result = result.strip()

        task = json.loads(result)

        # Ensure required fields are provided
        # NOTE(review): only key presence is checked — the echoed metadata
        # values are not compared with the arguments passed to this function,
        # and their types (e.g. word_constraint) are not validated.
        required_fields = ["prompt_id", "genre", "difficulty", "pov", "setting", "target_age", "geography", "gender", "relationship", "conflict_type", "theme", "tone", "word_constraint", "prompt_text"]
        for field in required_fields:
            if field not in task:
                print(f"  WARNING: Task {task_index} missing field '{field}', skipping")
                return None

        # Generate unique IDs; prompt_id is overwritten so it never depends on
        # what the model echoed back.
        task["task_id"] = str(uuid.uuid4())
        task["prompt_id"] = f"gen_{task_index:04d}"

        # Generate the rubric for this task (a second API call); a task
        # without a usable rubric is discarded.
        rubric = await generate_rubric(semaphore, task)
        if rubric:
            task["rubric"] = rubric
        else:
            print(f"  WARNING: Task {task_index} rubric generation failed, skipping")
            return None

        print(f"  ✓ Generated task {task_index}: {genre}/{difficulty} with {len(rubric)} rubric criteria")
        return task

    except json.JSONDecodeError as e:
        print(f"  WARNING: Task {task_index} JSON parse error: {e}")
        return None
    except Exception as e:
        # Deliberate catch-all: one failed task must not abort the whole
        # batch gathered in generate_tasks().
        print(f"  WARNING: Task {task_index} generation error: {e}")
        return None


async def generate_rubric(
    semaphore: asyncio.Semaphore,
    task: dict[str, Any]
) -> list[dict[str, str]] | None:
    """Generate a comprehensive rubric with yes/no criteria for the given task.

    Builds a rubric-writing prompt from the task's ``prompt_text`` and
    metadata, requests structured output shaped like ``RubricCriteriaList``
    (temperature 0.7), and converts each criterion to a plain dict.

    Args:
        semaphore: Shared concurrency limiter passed through to the API helper.
        task: Task dict; missing metadata keys are rendered as ``N/A`` and a
            missing ``prompt_text`` as an empty string.

    Returns:
        A list of criterion dicts (keys as declared on ``RubricCriterion``),
        or ``None`` if fewer than 5 criteria came back or any exception was
        raised. Failures are reported with a printed warning, not raised.
    """

    prompt_text = task.get("prompt_text", "")

    # Doubled braces are not needed below: apart from the interpolated task
    # values, the prompt contains no literal braces.
    rubric_prompt = f"""You are an expert at creating evaluation rubrics for creative writing prompts.

Given the following creative writing prompt, create a comprehensive rubric consisting of YES/NO evaluation criteria.
Each criterion MUST be phrased as a question that can be answered with YES or NO.

CREATIVE WRITING PROMPT:
{prompt_text}

TASK METADATA:
- Genre: {task.get('genre', 'N/A')}
- Difficulty: {task.get('difficulty', 'N/A')}
- POV: {task.get('pov', 'N/A')}
- Setting: {task.get('setting', 'N/A')}
- Target age: {task.get('target_age', 'N/A')}
- Geography: {task.get('geography', 'N/A')}
- Main character gender: {task.get('gender', 'N/A')}
- Secondary character relationship: {task.get('relationship', 'N/A')}
- Conflict type: {task.get('conflict_type', 'N/A')}
- Theme: {task.get('theme', 'N/A')}
- Tone: {task.get('tone', 'N/A')}
- Word constraint: {task.get('word_constraint', 'N/A')} words

Create 8-15 rubric criteria. Each criterion MUST be a YES/NO question that checks if a specific requirement was met.

GUIDELINES FOR CREATING CRITERIA:
1. Extract EVERY explicit instruction from the prompt (word count, POV, setting, character details, etc.)
2. Include both objective criteria (can be definitively checked) and subjective criteria (requires judgment)
3. Each criterion MUST be phrased as a YES/NO question (e.g., "Is the story written in first person POV?" or "Does the story maintain a horror tone throughout?")
4. Do NOT phrase criteria as statements - they must be questions answerable with YES or NO
5. Cover: format requirements, character requirements, setting requirements, plot requirements, tone/theme requirements, and any special constraints

For each criterion, you MUST provide ALL of these fields:
- "criterion": A YES/NO question describing what should be checked (MUST end with a question mark)
- "instruction_type": Either "Objective" (can be definitively verified, like word count or POV) or "Subjective" (requires interpretation, like tone or quality)
- "instruction_necessity": Either "Explicit" (directly stated in the prompt) or "Implicit" (reasonably inferred from context or good writing practice)
- "instruction_priority": Either "Must have" (critical requirement that must be met) or "Nice to have" (enhances quality but not strictly required)
- "instruction_dimension": One of "Instruction Following", "Writing Quality", "Correctness", "Format", "Verbosity", or "Other"

EXAMPLE CRITERIA (note: all are YES/NO questions):
For a prompt asking for "a 300-word horror story in first person, set in a haunted house, featuring a female protagonist named Sarah":

1. criterion: "Is the story approximately 300 words in length (within 10% tolerance)?"
   instruction_type: "Objective"
   instruction_necessity: "Explicit"
   instruction_priority: "Must have"
   instruction_dimension: "Instruction Following"

2. criterion: "Is the story written in first person POV using 'I' statements?"
   instruction_type: "Objective"
   instruction_necessity: "Explicit"
   instruction_priority: "Must have"
   instruction_dimension: "Instruction Following"

3. criterion: "Does the story maintain a horror genre with elements that create tension, fear, or unease?"
   instruction_type: "Subjective"
   instruction_necessity: "Explicit"
   instruction_priority: "Must have"
   instruction_dimension: "Instruction Following"

4. criterion: "Does the story have a coherent narrative structure with a beginning, middle, and end?"
   instruction_type: "Subjective"
   instruction_necessity: "Implicit"
   instruction_priority: "Nice to have"
   instruction_dimension: "Writing Quality"
"""

    try:
        rubric_result = await generate_with_structured_output(
            semaphore,
            rubric_prompt,
            RubricCriteriaList,
            temperature=0.7
        )

        # Flatten the Pydantic objects to plain dicts so the rubric can be
        # stored on the task and serialised with json.
        rubric = [criterion.model_dump() for criterion in rubric_result.criteria]

        # Validate rubric has enough criteria
        # NOTE(review): the prompt asks for 8-15 criteria but anything from 5
        # upwards is accepted, and no upper bound is enforced here even though
        # format_task_for_output() keeps only the first 15 — confirm intended.
        if len(rubric) < 5:
            print(f"  WARNING: Rubric only has {len(rubric)} criteria, expected at least 5")
            return None

        return rubric

    except Exception as e:
        # Deliberate catch-all: the caller treats None as "skip this task".
        print(f"  WARNING: Rubric generation error: {e}")
        return None

def format_task_for_output(task: dict[str, Any], max_criteria: int = 15) -> dict[str, Any]:
    """Format a task with flat rubric structure matching the evaluation JSON format.

    The result contains the task's identifying and metadata fields followed by
    exactly ``max_criteria`` rubric slots. Slot ``i`` contributes five keys of
    the form ``"rubric - {i}. <field>"``; slots beyond the end of the task's
    rubric are filled with empty strings so every record has the same columns.

    Args:
        task: Task dict as produced by ``generate_single_task``. Missing
            string fields default to ``""``, ``word_constraint`` to ``0``, and
            a missing ``task_id`` to a freshly generated UUID4 string.
        max_criteria: Number of rubric slots to emit. Criteria beyond this
            count are dropped from the flat record.

    Returns:
        A new flat dict; ``task`` is not modified.
    """
    text_fields = (
        "prompt_id", "prompt_text", "genre", "difficulty", "pov", "setting",
        "target_age", "geography", "gender", "relationship", "conflict_type",
        "theme", "tone",
    )
    rubric_fields = (
        "criterion",
        "instruction_type",
        "instruction_necessity",
        "instruction_priority",
        "instruction_dimension",
    )

    output: dict[str, Any] = {
        # Only mint a UUID when the task genuinely lacks an id.
        "task_id": task["task_id"] if "task_id" in task else str(uuid.uuid4()),
    }
    for name in text_fields:
        output[name] = task.get(name, "")
    output["word_constraint"] = task.get("word_constraint", 0)

    # Format rubric with numbered flat structure for eval JSON (1-based slots).
    rubric = task.get("rubric", [])
    for slot in range(1, max_criteria + 1):
        # An empty dict makes every field of an unused slot fall back to "".
        criterion = rubric[slot - 1] if slot <= len(rubric) else {}
        for field in rubric_fields:
            output[f"rubric - {slot}. {field}"] = criterion.get(field, "")

    return output


def format_task_for_verifiers(task: dict[str, Any]) -> dict[str, Any]:
    """Convert a task into the ``prompt`` + ``info`` record used by the verifiers library.

    ``prompt`` is a one-message chat (the task's ``prompt_text`` as the user
    turn). ``info`` is a JSON *string* holding the task metadata and a
    ``rubrics`` list in which each criterion is re-keyed to ``id`` (first 50
    characters of the criterion text), ``text``, ``type``, ``necessity``,
    ``priority`` and ``dimension``. Missing values fall back to defaults.
    """
    string_fields = (
        "task_id", "prompt_id", "genre", "difficulty", "pov", "setting",
        "target_age", "geography", "gender", "relationship", "conflict_type",
        "theme", "tone",
    )

    # Key insertion order is preserved by json.dumps, so build the dict in the
    # order the serialized record is expected to have.
    info: dict[str, Any] = {name: task.get(name, "") for name in string_fields}
    info["word_constraint"] = task.get("word_constraint", 0)
    info["rubrics"] = [
        {
            "id": entry.get("criterion", "")[:50],
            "text": entry.get("criterion", ""),
            "type": entry.get("instruction_type", "Objective"),
            "necessity": entry.get("instruction_necessity", "Explicit"),
            "priority": entry.get("instruction_priority", "Must have"),
            "dimension": entry.get("instruction_dimension", "Instruction Following"),
        }
        for entry in task.get("rubric", [])
    ]

    user_message = {"role": "user", "content": task.get("prompt_text", "")}
    return {"prompt": [user_message], "info": json.dumps(info)}


def _balanced_picks(options: list[str], count: int) -> list[str]:
    """Return ``count`` values that cycle evenly through ``options``, in random order."""
    picks = [options[i % len(options)] for i in range(count)]
    random.shuffle(picks)
    return picks


async def generate_tasks(num_tasks: int) -> list[dict[str, Any]]:
    """Generate multiple creative writing tasks with rubrics.

    One ``generate_single_task`` coroutine is created per task and all of
    them are awaited together; concurrency of the underlying API calls is
    bounded by a semaphore of size ``MAX_CONCURRENCY``.

    Args:
        num_tasks: Number of tasks to attempt.

    Returns:
        The successfully generated tasks, in index order. Attempts that
        returned ``None`` are dropped, so the list may be shorter than
        ``num_tasks``.
    """

    semaphore = asyncio.Semaphore(MAX_CONCURRENCY)

    # Ensure diversity in task outputs.
    # Every field still cycles evenly through its options, but each field is
    # shuffled independently. Indexing every list with the same ``i % len``
    # tied together all lists of equal length (e.g. "easy" always came with
    # "first_person" and "male", "freedom" always with "eerie" and "Africa").
    columns = [
        _balanced_picks(options, num_tasks)
        for options in (
            GENRES, DIFFICULTIES, POVS, SETTINGS, TARGET_AGES, GEOGRAPHIES,
            GENDERS, RELATIONSHIPS, CONFLICT_TYPES, THEMES, TONES,
        )
    ]

    generation_tasks = []
    for i, (
        genre, difficulty, pov, setting, target_age, geography,
        gender, relationship, conflict_type, theme, tone,
    ) in enumerate(zip(*columns)):
        word_constraint = random.randint(150, 500)
        generation_tasks.append(
            generate_single_task(
                semaphore, i, genre, difficulty, pov, setting, target_age,
                geography, gender, relationship, conflict_type, theme, tone, word_constraint
            )
        )

    print(f"Generating {num_tasks} tasks with rubrics (max concurrency {MAX_CONCURRENCY})...")
    # generate_single_task() reports its own failures by returning None, so
    # gather() is not expected to raise for an individual task.
    results = await asyncio.gather(*generation_tasks)

    # Remove any failed generations
    valid_tasks = [task for task in results if task is not None]

    print(f"\nSuccessfully generated {len(valid_tasks)}/{num_tasks} tasks with rubrics")
    return valid_tasks


def save_results(
    tasks: list[dict[str, Any]],
    output_dir: Path = Path("."),
    format_type: str = "both"
) -> list[Path]:
    """
    Save generated tasks to JSON files.

    Files are named ``generated_tasks{N}_eval.json`` (flat format) and
    ``generated_tasks{N}_train.json`` (verifiers format), where ``N`` is one
    more than the highest number already used by a ``generated_tasks*.json``
    file in ``output_dir`` (starting at 0), so earlier runs are never
    overwritten.

    Args:
        tasks: List of generated tasks
        output_dir: Directory to save files; created (with parents) if it
            does not exist yet
        format_type: "flat" for eval format, "verifiers" for verifiers format, "both" for both.
            Any other value writes nothing.

    Returns:
        Paths of the files written, in the order they were written (empty if
        ``format_type`` matched no format).
    """
    output_dir = Path(output_dir)
    # Results are saved after all the API work is done; do not lose them just
    # because the target directory has not been created yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Make sure new files do not overwrite existing ones
    used_numbers = [
        int(found.group(1))
        for path in output_dir.glob("generated_tasks*.json")
        if (found := re.match(r"generated_tasks(\d+)", path.name))
    ]
    next_num = max(used_numbers, default=-1) + 1
    output_files: list[Path] = []

    if format_type in ("flat", "both"):
        # Save in flat format
        flat_tasks = [format_task_for_output(task) for task in tasks]
        flat_filename = output_dir / f"generated_tasks{next_num}_eval.json"
        with open(flat_filename, "w", encoding="utf-8") as f:
            json.dump(flat_tasks, f, indent=2)
        print(f"Eval format saved to {flat_filename}")
        output_files.append(flat_filename)

    if format_type in ("verifiers", "both"):
        # Save in verifiers format
        verifiers_tasks = [format_task_for_verifiers(task) for task in tasks]
        verifiers_filename = output_dir / f"generated_tasks{next_num}_train.json"
        output = {
            "num_tasks": len(verifiers_tasks),
            "tasks": verifiers_tasks
        }
        with open(verifiers_filename, "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)
        print(f"Verifiers format saved to {verifiers_filename}")
        output_files.append(verifiers_filename)

    return output_files


def print_stats(tasks: list[dict[str, Any]]):
    """Print a summary report for a batch of generated tasks.

    Reports the task count, how many tasks fall under each genre and
    difficulty (tasks lacking the key are counted as "unknown"), the
    min/max/average word constraint, and per-task rubric figures broken
    down by instruction type and priority. Prints a short notice and
    returns early when ``tasks`` is empty.
    """
    if not tasks:
        print("No tasks generated.")
        return

    def tally(field):
        # Occurrences of each value of `field` across all tasks.
        counts = {}
        for entry in tasks:
            label = entry.get(field, "unknown")
            counts[label] = counts.get(label, 0) + 1
        return counts

    def matches_per_task(field, wanted):
        # For every task, how many rubric criteria have `field` equal to `wanted`.
        return [
            len([c for c in entry.get("rubric", []) if c.get(field) == wanted])
            for entry in tasks
        ]

    def mean(values):
        return sum(values) / len(values)

    # Gather every figure up front so nothing is printed if the data is malformed.
    by_genre = tally("genre")
    by_difficulty = tally("difficulty")
    word_limits = [entry.get("word_constraint", 0) for entry in tasks]
    criteria_totals = [len(entry.get("rubric", [])) for entry in tasks]
    rubric_breakdown = (
        ("Objective", matches_per_task("instruction_type", "Objective")),
        ("Subjective", matches_per_task("instruction_type", "Subjective")),
        ("'Must have'", matches_per_task("instruction_priority", "Must have")),
        ("'Nice to have'", matches_per_task("instruction_priority", "Nice to have")),
    )

    rule = "=" * 60
    print("\n" + rule)
    print("GENERATION STATISTICS")
    print(rule)
    print(f"Total tasks: {len(tasks)}")
    print()

    for heading, counts in (
        ("Genre distribution:", by_genre),
        ("Difficulty distribution:", by_difficulty),
    ):
        print(heading)
        for label, occurrences in sorted(counts.items()):
            print(f"  {label}: {occurrences}")
        print()

    print("Word constraint range:")
    print(f"  Min: {min(word_limits)}")
    print(f"  Max: {max(word_limits)}")
    print(f"  Avg: {mean(word_limits):.0f}")
    print()

    print("Rubric statistics:")
    print(
        f"  Total criteria per task - Min: {min(criteria_totals)}, "
        f"Max: {max(criteria_totals)}, Avg: {mean(criteria_totals):.1f}"
    )
    for label, per_task in rubric_breakdown:
        print(f"  {label} criteria - Avg: {mean(per_task):.1f}")
    print(rule)


async def main():
    """Run the full pipeline: announce settings, generate tasks, save and summarise them.

    Prints a banner with the configured target count, concurrency limit and
    number of genres, awaits task generation, then either writes the results
    in both output formats and prints statistics plus the saved file paths,
    or reports that nothing was generated.
    """
    banner = (
        "Creative Writing Task Generator (with Pydantic Rubrics)",
        "=" * 60,
        f"Target tasks: {N}",
        f"Max concurrency: {MAX_CONCURRENCY}",
        f"Genres: {len(GENRES)}",
        "",
    )
    for line in banner:
        print(line)

    generated = await generate_tasks(N)

    # Nothing to save or summarise when every generation failed.
    if not generated:
        print("\n✗ No tasks were generated successfully.")
        return

    saved_paths = save_results(generated, format_type="both")
    print_stats(generated)
    print("\n✓ Complete! Results saved to:")
    for saved in saved_paths:
        print(f"  - {saved}")


# Script entry point: when this file is executed directly (not imported),
# run the async main() coroutine to completion on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())