neulab · neubig · Jun 3, 2026 · github-actions · Jun 3, 2026
diff --git a/datasets/AlienKevin_SWE-ZERO-12M-trajectories/README.md b/datasets/AlienKevin_SWE-ZERO-12M-trajectories/README.md
@@ -0,0 +1,31 @@
+# SWE-ZERO 12M Trajectories Dataset
+
+## Description
+
+SWE-ZERO 12M Trajectories is a large-scale execution-free agentic coding trace dataset. It contains mini-swe-agent v1 style shell trajectories sampled from real GitHub PR snapshots, intended for mid-training coding agents on repository navigation, editing, and bash-based tool use.
+
+## Dataset Information
+
+- **Source URL**: https://huggingface.co/datasets/AlienKevin/SWE-ZERO-12M-trajectories
+- **License**: Apache-2.0
+- **Split used**: `train`
+- **Approximate size**: 12,290,800 rollouts, 122,908 unique PRs, 3,222 repositories, 16 programming languages, and 112B tokens according to the dataset card.
+- **Source task dataset**: nebius/SWE-rebench-V2-PRs
+- **Trajectory format**: mini-swe-agent v1
+- **Bootstrapping model**: ricdomolm/mini-coder-1.7b
+
+## Schema Mapping
+
+Each raw record holds a `messages` list of chat-style entries with `role` and `content` fields, which are converted as follows:
+
+- `system` messages are skipped because they only define the mini-swe-agent response format and execution-free shell constraints.
+- `user` messages that do not begin with `Observation:` (in practice, the initial task message) become `TextObservation(source="user")`.
+- Later `user` messages, identified by a leading `Observation:`, become `TextObservation(source="environment")` with the prefix and the whitespace that follows it removed.
+- `assistant` messages containing fenced `bash` blocks become `CodeAction(language="bash")`. The final bash block supplies the command, the text before it is preserved as the action description after removing a leading `THOUGHT:` label, and any text after it is discarded.
+- `assistant` messages without a bash block become `MessageAction` entries so malformed or terminal natural-language turns are preserved.
+
+The standardized trajectory details preserve the raw `instance_id`, repository, `trajectory_format`, `exit_status`, and `duration_sec`. Trajectory IDs are derived deterministically from the instance ID plus a content hash because the source dataset contains many independent rollouts per PR with the same `instance_id`.
+
+## Known Limitations
+
+The dataset card describes this corpus as a mid-training dataset rather than a verified SFT dataset. The trajectories are execution-free, not validated against tests, and many rollouts terminate with `incomplete` or other non-submitted statuses. This converter preserves those trajectories instead of filtering to submitted-only samples.
diff --git a/datasets/AlienKevin_SWE-ZERO-12M-trajectories/extract_raw.py b/datasets/AlienKevin_SWE-ZERO-12M-trajectories/extract_raw.py
@@ -0,0 +1,21 @@
+import json
+import signal
+
+from datasets import load_dataset
+
+DATASET_NAME = "AlienKevin/SWE-ZERO-12M-trajectories"
+
+signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+
+
+def main():
+    dataset = load_dataset(DATASET_NAME, split="train", streaming=True)
+    for item in dataset:
+        try:
+            print(json.dumps(item))
+        except BrokenPipeError:
+            return
+
+
+if __name__ == "__main__":  # run the extraction only when executed as a script
+    main()
diff --git a/datasets/AlienKevin_SWE-ZERO-12M-trajectories/metadata.json b/datasets/AlienKevin_SWE-ZERO-12M-trajectories/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/AlienKevin_SWE-ZERO-12M-trajectories/raw_to_standardized.py b/datasets/AlienKevin_SWE-ZERO-12M-trajectories/raw_to_standardized.py
@@ -0,0 +1,97 @@
+import hashlib
+import json
+import re
+import sys
+
+from schema_raw import SchemaRaw
+
+from schema.action.code import CodeAction
+from schema.action.message import MessageAction
+from schema.observation.text import TextObservation
+from schema.tool_call_links import create_trajectory_with_tool_call_links
+from schema.trajectory import Trajectory
+
+OBSERVATION_PREFIX = "Observation:"  # user turns starting with this become environment output
+_BASH_BLOCK_RE = re.compile(r"```bash\s*\n(.*?)\n?```", re.DOTALL | re.IGNORECASE)
+_THOUGHT_PREFIX_RE = re.compile(r"^THOUGHT:\s*", re.IGNORECASE)  # leading label, any case
+
+
+def strip_thought_prefix(content: str) -> str:  # trim, then drop a leading "THOUGHT:" label
+    return _THOUGHT_PREFIX_RE.sub("", content.strip()).strip()
+
+
+def normalize_observation(content: str) -> str:
+    if content.startswith(OBSERVATION_PREFIX):
+        return content[len(OBSERVATION_PREFIX) :].lstrip()
+    return content
+
+
+def trajectory_id(data: SchemaRaw) -> str:
+    serialized_messages = json.dumps(
+        [message.model_dump() for message in data.messages],
+        sort_keys=True,
+        ensure_ascii=False,
+    )
+    digest = hashlib.sha1(serialized_messages.encode("utf-8")).hexdigest()[:12]
+    return f"{data.instance_id}-{digest}"
+
+
+def convert_user_message(content: str) -> TextObservation:
+    if content.startswith(OBSERVATION_PREFIX):
+        return TextObservation(content=normalize_observation(content), source="environment")
+    return TextObservation(content=content, source="user")
+
+
+def convert_assistant_message(content: str) -> CodeAction | MessageAction:
+    bash_matches = list(_BASH_BLOCK_RE.finditer(content))
+    if not bash_matches:
+        return MessageAction(content=content)
+
+    match = bash_matches[-1]
+    description = strip_thought_prefix(content[: match.start()])
+    command = match.group(1).strip()
+    return CodeAction(language="bash", content=command, description=description or None)
+
+
+def convert_step(step) -> list:
+    if step.role == "system":
+        return []
+    if step.role == "user":
+        return [convert_user_message(step.content)]
+    if step.role == "assistant":
+        return [convert_assistant_message(step.content)]
+    print(f"Unknown role: {step.role}", file=sys.stderr)
+    return []
+
+
+def process_data(data: SchemaRaw) -> Trajectory | None:
+    content = []
+    for step in data.messages:
+        content.extend(convert_step(step))
+
+    if not content:
+        return None
+
+    return create_trajectory_with_tool_call_links(
+        id=trajectory_id(data),
+        content=content,
+        details={
+            "instance_id": data.instance_id,
+            "repo": data.repo,
+            "trajectory_format": data.trajectory_format,
+            "exit_status": data.exit_status,
+            "duration_sec": data.duration_sec,
+        },
+    )
+
+
+if __name__ == "__main__":
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+        raw_data = json.loads(line)
+        data = SchemaRaw(**raw_data)
+        standardized_data = process_data(data)
+        if standardized_data:
+            print(standardized_data.model_dump_json())
diff --git a/datasets/AlienKevin_SWE-ZERO-12M-trajectories/sample_raw.json b/datasets/AlienKevin_SWE-ZERO-12M-trajectories/sample_raw.json