@@ -625,10 +674,15 @@ export function UserDashboard() {
)}
- {/* Hide Stats Link */}
+ {/* Toggle Stats */}
-
- Hide stats
+ setShowStats((v) => !v)}
+ >
+ {showStats ? "Hide stats" : "Show stats"}
diff --git a/app-next/src/components/dataset/dataset-edit-form.tsx b/app-next/src/components/dataset/dataset-edit-form.tsx
index 4a4270e5..9c57ffef 100644
--- a/app-next/src/components/dataset/dataset-edit-form.tsx
+++ b/app-next/src/components/dataset/dataset-edit-form.tsx
@@ -2,18 +2,24 @@
import { useState } from "react";
import { useRouter } from "next/navigation";
+import { useLocale } from "next-intl";
import Link from "next/link";
-import { ArrowLeft, Save, Loader2, AlertTriangle } from "lucide-react";
+import { ArrowLeft, Save, Loader2, AlertTriangle, X, Plus } from "lucide-react";
import { Button } from "@/components/ui/button";
+import { Badge } from "@/components/ui/badge";
import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { Textarea } from "@/components/ui/textarea";
+import { useToast } from "@/hooks/use-toast";
interface DatasetEditFormProps {
datasetId: number;
datasetName: string;
isOwner: boolean;
+ hasApiKey: boolean;
+ isLocalUser: boolean;
+ initialTags: string[];
initialValues: {
description: string;
creator: string;
@@ -33,38 +39,53 @@ export function DatasetEditForm({
datasetId,
datasetName,
isOwner,
+ hasApiKey,
+ isLocalUser,
+ initialTags,
initialValues,
features,
}: DatasetEditFormProps) {
const router = useRouter();
+ const locale = useLocale();
+ const { toast } = useToast();
const [values, setValues] = useState(initialValues);
+ const [tags, setTags] = useState
(initialTags);
+ const [tagInput, setTagInput] = useState("");
+ const [tagInputError, setTagInputError] = useState(null);
const [saving, setSaving] = useState(false);
- const [error, setError] = useState(null);
- const [success, setSuccess] = useState(false);
+
+ const TAG_PATTERN = /^[a-zA-Z0-9_.-]+$/;
const handleChange = (
field: keyof typeof values,
value: string,
) => {
setValues((prev) => ({ ...prev, [field]: value }));
- setError(null);
- setSuccess(false);
+ };
+
+ const addTag = () => {
+ const trimmed = tagInput.trim();
+ if (!trimmed) return;
+ if (!TAG_PATTERN.test(trimmed)) {
+ setTagInputError("Only letters, numbers, underscores, hyphens, and dots are allowed.");
+ return;
+ }
+ if (!tags.includes(trimmed)) {
+ setTags((prev) => [...prev, trimmed]);
+ }
+ setTagInput("");
+ setTagInputError(null);
};
const handleSubmit = async (e: React.FormEvent) => {
e.preventDefault();
setSaving(true);
- setError(null);
- setSuccess(false);
try {
const res = await fetch(`/api/datasets/${datasetId}/edit`, {
method: "POST",
headers: { "Content-Type": "application/json" },
- body: JSON.stringify({
- ...values,
- isOwner,
- }),
+ body: JSON.stringify({ ...values, isOwner }),
});
if (!res.ok) {
@@ -72,14 +93,42 @@ export function DatasetEditForm({
throw new Error(data.error || `Failed to save (${res.status})`);
}
- setSuccess(true);
- // Redirect back to dataset page after short delay
+ // Apply tag changes — diff against initialTags
+ const toAdd = tags.filter((t) => !initialTags.includes(t));
+ const toRemove = initialTags.filter((t) => !tags.includes(t));
+
+ // One request per changed tag: POST adds a tag, DELETE removes one.
+ const tagChanges = [
+   ...toAdd.map((tag) => ({ tag, method: "POST" })),
+   ...toRemove.map((tag) => ({ tag, method: "DELETE" })),
+ ];
+ const tagResults = await Promise.all(
+   tagChanges.map(async ({ tag, method }) => {
+     const tagRes = await fetch(`/api/datasets/${datasetId}/tags`, {
+       method,
+       headers: { "Content-Type": "application/json" },
+       body: JSON.stringify({ tag }),
+     });
+     return { tag, ok: tagRes.ok };
+   }),
+ );
+
+ // fetch() resolves even on 4xx/5xx, so every response has to be checked;
+ // otherwise a rejected tag change would still end in the success toast.
+ const failedTags = tagResults.filter((r) => !r.ok).map((r) => r.tag);
+ if (failedTags.length > 0) {
+   throw new Error(
+     `Dataset fields were saved, but these tags could not be updated: ${failedTags.join(", ")}`,
+   );
+ }
+
+ toast({
+ title: "Changes saved",
+ description: "Redirecting back to dataset...",
+ });
+
setTimeout(() => {
- router.push(`/datasets/${datasetId}`);
+ router.push(`/${locale}/datasets/${datasetId}`);
router.refresh();
}, 1500);
} catch (err) {
- setError(err instanceof Error ? err.message : "Failed to save changes");
+ toast({
+ title: "Failed to save",
+ description: err instanceof Error ? err.message : "Failed to save changes",
+ variant: "destructive",
+ });
} finally {
setSaving(false);
}
@@ -90,7 +139,7 @@ export function DatasetEditForm({
{/* Header */}
@@ -103,16 +152,18 @@ export function DatasetEditForm({
- {/* Status messages */}
- {error && (
-
- )}
- {success && (
-
- Changes saved successfully! Redirecting...
+ {/* Warning: user has no valid OpenML API key (e.g. local dev account) */}
+ {(!hasApiKey || isLocalUser) && (
+
+
+
+
Saving is unavailable in this environment
+
+ {isLocalUser
+ ? "This account was created locally and does not have a valid OpenML API key. Dataset edits cannot be saved to the OpenML backend in a local development environment."
+ : "Your session does not include an OpenML API key. Saving changes requires signing in with a valid OpenML account."}
+
+
)}
@@ -282,14 +333,69 @@ export function DatasetEditForm({
)}
+ {/* Tags */}
+
+
+ Tags
+
+
+
+ {tags.map((tag) => (
+
+ {tag}
+ setTags((prev) => prev.filter((t) => t !== tag))}
+ className="hover:text-destructive ml-0.5 rounded transition-colors"
+ aria-label={`Remove tag ${tag}`}
+ >
+
+
+
+ ))}
+ {tags.length === 0 && (
+
No tags yet.
+ )}
+
+
+
{
+ setTagInput(e.target.value);
+ setTagInputError(null);
+ }}
+ onKeyDown={(e) => {
+ if (e.key === "Enter") {
+ e.preventDefault();
+ addTag();
+ }
+ }}
+ className={tagInputError ? "border-destructive" : ""}
+ />
+
+
+ Add
+
+
+ {tagInputError ? (
+ {tagInputError}
+ ) : (
+
+ Tags are applied when you save. Only letters, numbers, _ - . allowed.
+
+ )}
+
+
+
{/* Actions */}
-
+
Cancel
-
+
{saving ? (
<>
diff --git a/app-next/src/components/dataset/dataset-upload-form.tsx b/app-next/src/components/dataset/dataset-upload-form.tsx
new file mode 100644
index 00000000..8ab5b65e
--- /dev/null
+++ b/app-next/src/components/dataset/dataset-upload-form.tsx
@@ -0,0 +1,526 @@
+"use client";
+
+import { useState, useRef } from "react";
+import { useRouter } from "next/navigation";
+import { useSession } from "next-auth/react";
+import { format as formatDate } from "date-fns";
+import { CalendarIcon } from "lucide-react";
+import {
+ Card,
+ CardContent,
+ CardDescription,
+ CardFooter,
+ CardHeader,
+ CardTitle,
+} from "@/components/ui/card";
+import { Input } from "@/components/ui/input";
+import { Label } from "@/components/ui/label";
+import { Textarea } from "@/components/ui/textarea";
+import { Button } from "@/components/ui/button";
+import {
+ Select,
+ SelectContent,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from "@/components/ui/select";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
+import { Calendar } from "@/components/ui/calendar";
+import {
+ Popover,
+ PopoverContent,
+ PopoverTrigger,
+} from "@/components/ui/popover";
+import { cn } from "@/lib/utils";
+import { Loader2, UploadCloud, FileText, AlertCircle } from "lucide-react";
+
+const MAX_FILE_SIZE_MB = 100;
+const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
+const ALLOWED_EXTENSIONS = [
+ ".arff",
+ ".xrff",
+ ".csv",
+ ".json",
+ ".parquet",
+ ".feather",
+];
+
+export function DatasetUploadForm() {
+ const { data: _session } = useSession();
+ const router = useRouter();
+
+ const [file, setFile] = useState(null);
+ const [fileError, setFileError] = useState(null);
+ const [isDragging, setIsDragging] = useState(false);
+ const [name, setName] = useState("");
+ const [description, setDescription] = useState("");
+ // Additional Metadata
+ const [creator, setCreator] = useState("");
+ const [contributor, setContributor] = useState("");
+ const [collectionDate, setCollectionDate] = useState();
+ const [language, setLanguage] = useState("");
+ const [licence, setLicence] = useState("Publicly available");
+ const [defaultTargetAttribute, setDefaultTargetAttribute] = useState("");
+ const [ignoreAttribute, setIgnoreAttribute] = useState("");
+ const [citation, setCitation] = useState("");
+ const [tags, setTags] = useState("");
+ const [tagsError, setTagsError] = useState(null);
+
+ const [isLoading, setIsLoading] = useState(false);
+ const [error, setError] = useState(null);
+ const errorRef = useRef(null);
+
+ const TAG_PATTERN = /^[a-zA-Z0-9_.-]+$/;
+
+ /**
+  * Validate a comma-separated list of tags.
+  *
+  * Each entry is trimmed; empty entries are ignored and the rest must match
+  * TAG_PATTERN.
+  *
+  * @param value - Raw contents of the tags field.
+  * @returns null when the input is blank or every tag is valid, otherwise a
+  *   message naming the offending tags.
+  */
+ const validateTags = (value: string): string | null => {
+   if (!value.trim()) return null;
+   const invalid = value
+     .split(",")
+     .map((t) => t.trim())
+     .filter((t) => t && !TAG_PATTERN.test(t));
+   return invalid.length > 0
+     ? `Invalid tag(s): ${invalid.join(", ")} — only letters, numbers, underscores, hyphens, and dots are allowed.`
+     : null;
+ };
+
+ /**
+  * Validate a picked or dropped file and store it in state.
+  *
+  * The file is rejected (and the current selection cleared) when its
+  * extension is not in ALLOWED_EXTENSIONS or it exceeds MAX_FILE_SIZE_BYTES.
+  * On success, the dataset name is prefilled from the file name if the user
+  * has not typed one yet.
+  *
+  * @param selected - The file chosen by the user.
+  */
+ const validateAndSetFile = (selected: File) => {
+   setFileError(null);
+
+   // Extension = everything from the last dot on. A name without a dot has no
+   // extension at all (previously the whole name was reported as one).
+   const dotIndex = selected.name.lastIndexOf(".");
+   const ext = dotIndex >= 0 ? selected.name.slice(dotIndex).toLowerCase() : "";
+   if (!ALLOWED_EXTENSIONS.includes(ext)) {
+     const problem = ext
+       ? `Unsupported file type "${ext}".`
+       : "The file has no extension.";
+     // List what is really accepted instead of naming ARFF only.
+     setFileError(`${problem} Supported formats: ${ALLOWED_EXTENSIONS.join(", ")}.`);
+     setFile(null);
+     return;
+   }
+
+   if (selected.size > MAX_FILE_SIZE_BYTES) {
+     setFileError(
+       `File is too large (${(selected.size / 1024 / 1024).toFixed(1)} MB). Maximum file size is ${MAX_FILE_SIZE_MB} MB.`,
+     );
+     setFile(null);
+     return;
+   }
+
+   setFile(selected);
+   if (!name) {
+     // Default the dataset name to the file name without its extension; fall
+     // back to the full name when stripping would leave nothing (".arff").
+     const fileName = selected.name;
+     setName(fileName.substring(0, fileName.lastIndexOf(".")) || fileName);
+   }
+ };
+
+ const handleFileChange = (e: React.ChangeEvent) => {
+ if (e.target.files?.[0]) validateAndSetFile(e.target.files[0]);
+ };
+
+ const handleDrop = (e: React.DragEvent) => {
+ e.preventDefault();
+ setIsDragging(false);
+ if (e.dataTransfer.files?.[0]) validateAndSetFile(e.dataTransfer.files[0]);
+ };
+
+ /**
+  * Form submit handler.
+  *
+  * Checks the required fields and the tag list, posts everything as
+  * multipart form data to /api/datasets/upload, and on success navigates to
+  * the new dataset (or to the dataset list when the response has no usable
+  * id). Failures are shown through the `error` state.
+  */
+ const handleSubmit = async (e: React.SyntheticEvent) => {
+   e.preventDefault();
+
+   if (!file || !name || !description.trim()) {
+     setError("Please provide a file, a dataset name, and a description.");
+     return;
+   }
+
+   const tagProblem = validateTags(tags);
+   if (tagProblem) {
+     setTagsError(tagProblem);
+     return;
+   }
+
+   // Scroll the error alert into view shortly after the error has been set.
+   const revealError = () => {
+     setTimeout(() => {
+       errorRef.current?.scrollIntoView({
+         behavior: "smooth",
+         block: "center",
+       });
+     }, 50);
+   };
+
+   setIsLoading(true);
+   setError(null);
+
+   try {
+     // Text fields in the order they are appended after the file itself.
+     const textFields: [string, string][] = [
+       ["name", name],
+       ["description", description],
+       ["format", "arff"],
+       ["creator", creator],
+       ["contributor", contributor],
+       [
+         "collection_date",
+         collectionDate ? formatDate(collectionDate, "yyyy-MM-dd") : "",
+       ],
+       ["language", language],
+       ["licence", licence],
+       ["default_target_attribute", defaultTargetAttribute],
+       ["ignore_attribute", ignoreAttribute],
+       ["citation", citation],
+       ["tags", tags],
+     ];
+     const payload = new FormData();
+     payload.append("file", file);
+     for (const [key, value] of textFields) {
+       payload.append(key, value);
+     }
+
+     const response = await fetch("/api/datasets/upload", {
+       method: "POST",
+       body: payload,
+     });
+     const result = await response.json();
+
+     if (!response.ok) {
+       setError(result.error || "Upload failed. Please try again.");
+       revealError();
+       return;
+     }
+
+     // An id of "new" (or none) means there is no dataset page to open yet.
+     const hasRealId = result.id && result.id !== "new";
+     router.push(hasRealId ? `/datasets/${result.id}` : "/datasets");
+     router.refresh();
+   } catch {
+     setError("Failed to upload dataset. Please try again.");
+     revealError();
+   } finally {
+     setIsLoading(false);
+   }
+ };
+
+ return (
+
+
+
+
+ Upload Dataset
+
+
+ Share your data with the OpenML community. Supported formats: ARFF,
+ CSV, JSON, Parquet, Feather.
+
+
+
+
+
+ );
+}
diff --git a/app-next/src/components/task/task-create-form.tsx b/app-next/src/components/task/task-create-form.tsx
new file mode 100644
index 00000000..9f54d0b4
--- /dev/null
+++ b/app-next/src/components/task/task-create-form.tsx
@@ -0,0 +1,400 @@
+"use client";
+
+import { useState, useEffect, useMemo } from "react";
+import { useRouter } from "next/navigation";
+import { useSession } from "next-auth/react";
+import {
+ Card,
+ CardContent,
+ CardDescription,
+ CardFooter,
+ CardHeader,
+ CardTitle,
+} from "@/components/ui/card";
+import { Input } from "@/components/ui/input";
+import { Label } from "@/components/ui/label";
+import { Button } from "@/components/ui/button";
+import {
+ Select,
+ SelectContent,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from "@/components/ui/select";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
+import { Loader2, PlusCircle, AlertCircle, Clock, Info } from "lucide-react";
+import { APP_CONFIG } from "@/lib/config";
+
+/**
+ * Estimation procedures selectable for each task type, keyed by the task-type
+ * values used in the form ("classification", "regression", ...).
+ *
+ * An empty list (clustering) means the form shows no estimation-procedure
+ * field for that task type.
+ * NOTE(review): `id` is presumably the OpenML estimation-procedure ID sent as
+ * `estimation_procedure` — confirm against the API.
+ */
+const ESTIMATION_PROCEDURES: Record<string, { id: string; label: string }[]> = {
+  classification: [
+    { id: "1", label: "10-fold Crossvalidation" },
+    { id: "2", label: "5 times 2-fold Crossvalidation" },
+    { id: "3", label: "10 times 10-fold Crossvalidation" },
+    { id: "4", label: "Leave-One-Out" },
+    { id: "16", label: "Holdout (66% train / 34% test)" },
+    { id: "6", label: "10-fold Learning Curve" },
+  ],
+  regression: [
+    { id: "1", label: "10-fold Crossvalidation" },
+    { id: "2", label: "5 times 2-fold Crossvalidation" },
+    { id: "3", label: "10 times 10-fold Crossvalidation" },
+    { id: "16", label: "Holdout (66% train / 34% test)" },
+  ],
+  learningcurve: [
+    { id: "13", label: "10-fold Learning Curve" },
+    { id: "14", label: "10 times 10-fold Learning Curve" },
+  ],
+  supervised: [{ id: "1", label: "10-fold Crossvalidation" }],
+  clustering: [],
+};
+
+/**
+ * Evaluation measure names selectable for each task type, keyed by the same
+ * task-type values as the estimation-procedure table.
+ *
+ * An empty list (clustering) means the form shows no evaluation-measure
+ * field for that task type.
+ */
+const EVALUATION_MEASURES: Record<string, string[]> = {
+  classification: [
+    "predictive_accuracy",
+    "area_under_roc_curve",
+    "f_measure",
+    "kappa",
+    "precision",
+    "recall",
+    "matthews_correlation_coefficient",
+    "mean_absolute_error",
+  ],
+  regression: [
+    "mean_absolute_error",
+    "root_mean_squared_error",
+    "mean_squared_error",
+    "relative_absolute_error",
+    "root_relative_squared_error",
+  ],
+  learningcurve: ["predictive_accuracy", "area_under_roc_curve"],
+  supervised: ["predictive_accuracy"],
+  clustering: [],
+};
+
+export function TaskCreateForm() {
+ const { data: session } = useSession();
+ const router = useRouter();
+
+ const [taskType, setTaskType] = useState("classification");
+ const [datasetId, setDatasetId] = useState("");
+ const [targetName, setTargetName] = useState("");
+ const [evaluationMeasure, setEvaluationMeasure] = useState("");
+ const [estimationProcedure, setEstimationProcedure] = useState("1");
+
+ const [isLoading, setIsLoading] = useState(false);
+ const [error, setError] = useState(null);
+ const [existingTaskId, setExistingTaskId] = useState(null);
+ const [datasetStatus, setDatasetStatus] = useState(null);
+ const [datasetFeatures, setDatasetFeatures] = useState<
+ { name: string; dataType: string }[]
+ >([]);
+ const [isFetchingDataset, setIsFetchingDataset] = useState(false);
+
+ // Look up the dataset's status and feature list whenever the ID field
+ // changes, debounced by 500 ms. A blank or non-numeric ID clears both.
+ useEffect(() => {
+   if (!datasetId || isNaN(Number(datasetId))) {
+     setDatasetStatus(null);
+     setDatasetFeatures([]);
+     setIsFetchingDataset(false);
+     return;
+   }
+   setIsFetchingDataset(true);
+
+   // Clearing the timer alone does not stop a lookup that has already
+   // started. Aborting on cleanup keeps a slow response for a previous ID
+   // (or for an unmounted form) from overwriting the current state.
+   const controller = new AbortController();
+   const timer = setTimeout(async () => {
+     try {
+       const base = APP_CONFIG.openmlApiUrl || "https://www.openml.org";
+       const [infoRes, featRes] = await Promise.all([
+         fetch(`${base}/api/v1/json/data/${datasetId}`, {
+           signal: controller.signal,
+         }),
+         fetch(`${base}/api/v1/json/data/features/${datasetId}`, {
+           signal: controller.signal,
+         }),
+       ]);
+
+       let status: string | null = null;
+       if (infoRes.ok) {
+         const json = await infoRes.json();
+         status = json?.["data_set_description"]?.status ?? null;
+       }
+
+       let features: { name: string; dataType: string }[] = [];
+       if (featRes.ok) {
+         const json = await featRes.json();
+         // Ignored columns and row identifiers cannot be prediction targets.
+         features = (json?.["data_features"]?.["feature"] ?? [])
+           .filter(
+             (f: { is_ignore: string; is_row_identifier: string }) =>
+               f.is_ignore !== "true" && f.is_row_identifier !== "true",
+           )
+           .map((f: { name: string; data_type: string }) => ({
+             name: f.name,
+             dataType: f.data_type,
+           }));
+       }
+
+       if (controller.signal.aborted) return;
+       setDatasetStatus(status);
+       setDatasetFeatures(features);
+     } catch {
+       // An aborted lookup is superseded, not failed: leave state alone.
+       if (controller.signal.aborted) return;
+       setDatasetStatus(null);
+       setDatasetFeatures([]);
+     } finally {
+       // A superseded lookup must not clear the spinner of its successor.
+       if (!controller.signal.aborted) setIsFetchingDataset(false);
+     }
+   }, 500);
+
+   return () => {
+     clearTimeout(timer);
+     controller.abort();
+   };
+ }, [datasetId]);
+
+ // Columns offered as prediction target: numeric ones for regression,
+ // non-numeric ones for every other task type. When no column matches, all
+ // features are offered rather than an empty list.
+ const validTargetFeatures = useMemo(() => {
+   if (datasetFeatures.length === 0) return [];
+   const wantNumeric = taskType === "regression";
+   const matching = datasetFeatures.filter(
+     (feature) => (feature.dataType === "numeric") === wantNumeric,
+   );
+   return matching.length > 0 ? matching : datasetFeatures;
+ }, [datasetFeatures, taskType]);
+
+ /**
+  * Form submit handler.
+  *
+  * Validates the inputs, posts the task definition to /api/tasks/create and
+  * navigates to the new task. When the backend reports that an identical
+  * task already exists, its id is stored so the form can link to it instead
+  * of showing an error.
+  */
+ const handleSubmit = async (e: React.SyntheticEvent) => {
+   e.preventDefault();
+   const isClustering = taskType === "clustering";
+   if (!datasetId || !taskType || (!targetName && !isClustering)) {
+     setError("Please fill in all required fields.");
+     return;
+   }
+
+   // parseInt() would silently turn "12abc" or "1.5" into a different ID, so
+   // convert strictly and reject anything that is not a positive integer.
+   const numericDatasetId = Number(datasetId);
+   if (!Number.isInteger(numericDatasetId) || numericDatasetId <= 0) {
+     setError("Dataset ID must be a positive whole number.");
+     return;
+   }
+
+   if (!session) {
+     setError("You must be signed in to create a task.");
+     return;
+   }
+
+   setIsLoading(true);
+   setError(null);
+   setExistingTaskId(null);
+
+   try {
+     const response = await fetch("/api/tasks/create", {
+       method: "POST",
+       headers: { "Content-Type": "application/json" },
+       body: JSON.stringify({
+         task_type: taskType,
+         dataset_id: numericDatasetId,
+         // Empty strings become undefined so the keys are omitted from JSON.
+         target_name: targetName || undefined,
+         estimation_procedure: estimationProcedure || undefined,
+         evaluation_measure: evaluationMeasure || undefined,
+       }),
+     });
+
+     const data = await response.json();
+     if (!response.ok) {
+       // Only a string can be pattern-matched or rendered as a message.
+       const message = typeof data.error === "string" ? data.error : "";
+       const idMatch = message.match(/matched id\(s\): \[(\d+)\]/);
+       if (idMatch) {
+         setExistingTaskId(idMatch[1]);
+       } else {
+         setError(message || "Failed to create task. Please try again.");
+       }
+       return;
+     }
+
+     router.push(`/tasks/${data.id}`);
+     router.refresh();
+   } catch {
+     setError("Failed to create task. Please try again.");
+   } finally {
+     setIsLoading(false);
+   }
+ };
+
+ return (
+
+
+
+
+ Define Task
+
+
+ Create a new machine learning task for an existing dataset.
+
+
+
+
+
+ {existingTaskId && (
+
+
+ Task already exists
+
+ This exact task configuration already exists.{" "}
+
+ View task {existingTaskId} β
+
+
+
+ )}
+ {error && (
+
+
+ Error
+ {error}
+
+ )}
+
+
+
+ Task Type *
+ {
+ setTaskType(v);
+ setTargetName("");
+ setEvaluationMeasure("");
+ setEstimationProcedure("1");
+ }}
+ >
+
+
+
+
+ Supervised Classification
+ Supervised Regression
+ Learning Curve
+ Clustering
+ Supervised
+
+
+
+
+
+
Dataset ID *
+
setDatasetId(e.target.value)}
+ required
+ />
+ {datasetStatus === "in_preparation" ? (
+
+
+ This dataset is still being processed by OpenML. Tasks can
+ only be created once the dataset is active .
+ Please try again in a few minutes.
+
+ ) : datasetStatus === "active" ? (
+
+ Dataset is active and ready for tasks.
+
+ ) : (
+
+ The ID of the dataset to use for this task.
+
+ )}
+
+
+ {taskType !== "clustering" && (
+
+
Target Feature *
+ {validTargetFeatures.length > 0 ? (
+
setTargetName(e.target.value)}
+ required
+ className="border-input bg-background ring-offset-background placeholder:text-muted-foreground focus-visible:ring-ring flex h-10 w-full rounded-md border px-3 py-2 text-sm focus-visible:ring-2 focus-visible:ring-offset-2 focus-visible:outline-none"
+ >
+ Select a column...
+ {validTargetFeatures.map((f) => (
+
+ {f.name}
+
+ ))}
+
+ ) : (
+
setTargetName(e.target.value)}
+ required
+ />
+ )}
+
+ {validTargetFeatures.length > 0
+ ? `${validTargetFeatures.length} valid target column${validTargetFeatures.length === 1 ? "" : "s"} available (${taskType === "regression" ? "numeric" : "nominal"}).`
+ : "The name of the target attribute (column) to predict."}
+
+
+ )}
+
+ {(ESTIMATION_PROCEDURES[taskType]?.length ?? 0) > 0 && (
+
+
+ Estimation Procedure *
+
+
setEstimationProcedure(e.target.value)}
+ required
+ className="border-input bg-background ring-offset-background focus-visible:ring-ring flex h-10 w-full rounded-md border px-3 py-2 text-sm focus-visible:ring-2 focus-visible:ring-offset-2 focus-visible:outline-none"
+ >
+ {ESTIMATION_PROCEDURES[taskType].map((p) => (
+
+ {p.label}
+
+ ))}
+
+
+ How to split the data for evaluation.
+
+
+ )}
+
+ {(EVALUATION_MEASURES[taskType]?.length ?? 0) > 0 && (
+
+
Evaluation Measure
+
setEvaluationMeasure(e.target.value)}
+ className="border-input bg-background ring-offset-background focus-visible:ring-ring flex h-10 w-full rounded-md border px-3 py-2 text-sm focus-visible:ring-2 focus-visible:ring-offset-2 focus-visible:outline-none"
+ >
+ None (optional)
+ {EVALUATION_MEASURES[taskType].map((m) => (
+
+ {m}
+
+ ))}
+
+
+ Optional. Evaluation metric for this task.
+
+
+ )}
+
+
+
+
+ router.back()}>
+ Cancel
+
+
+ {isLoading ? (
+ <>
+
+ Creating...
+ >
+ ) : (
+ "Create Task"
+ )}
+
+
+
+
+ );
+}
diff --git a/app-next/src/hooks/use-op-speed.ts b/app-next/src/hooks/use-op-speed.ts
new file mode 100644
index 00000000..27269bc3
--- /dev/null
+++ b/app-next/src/hooks/use-op-speed.ts
@@ -0,0 +1,62 @@
+"use client";
+
+import { useEffect, useRef } from "react";
+
+/**
+ * A hook to measure operation speed and rendering performance.
+ *
+ * In development builds it logs every committed render of the calling
+ * component. It also returns a `measure()` helper for timing a specific
+ * synchronous piece of work.
+ *
+ * @param operationName - The name of the operation being monitored; used as
+ *   the prefix of every log line.
+ * @returns An object with `measure(fn, label?)`, which runs `fn`
+ *   synchronously, logs the elapsed time in development, and returns `fn`'s
+ *   result.
+ */
+export function useOpSpeed(operationName: string) {
+  const renderCount = useRef(0);
+
+  // High-resolution clock where available, wall clock otherwise.
+  const now = (): number =>
+    typeof performance !== "undefined" ? performance.now() : Date.now();
+
+  // No dependency array: the effect runs after every committed render.
+  // It cannot observe when rendering started (render must stay pure), so it
+  // only records that a commit happened. For precise timing of expensive
+  // logic, wrap that logic in the measure() function returned below.
+  useEffect(() => {
+    const committedAt = now();
+
+    renderCount.current += 1;
+
+    if (process.env.NODE_ENV === "development") {
+      console.log(
+        `[OpSpeed] ${operationName} committed update #${renderCount.current} at ${committedAt.toFixed(2)}ms`,
+      );
+    }
+  });
+
+  // Return a function to manually measure specific code blocks
+  return {
+    measure: <T>(fn: () => T, label?: string): T => {
+      // Reading the clock is fine in event handlers and effects. When this is
+      // called during render (e.g. inside useMemo) strict tooling may flag
+      // it, but it is the only way to time a synchronous block.
+      const start = now();
+      const result = fn();
+      const end = now();
+      if (process.env.NODE_ENV === "development") {
+        console.log(
+          `[OpSpeed] ${operationName}${label ? `:${label}` : ""} execution time: ${(
+            end - start
+          ).toFixed(2)}ms`,
+        );
+      }
+      return result;
+    },
+  };
+}
diff --git a/app-next/src/hooks/useDatasetStats.ts b/app-next/src/hooks/useDatasetStats.ts
new file mode 100644
index 00000000..f1756a07
--- /dev/null
+++ b/app-next/src/hooks/useDatasetStats.ts
@@ -0,0 +1,131 @@
+"use client";
+
+import { useState, useEffect } from "react";
+
+export interface NumericDistribution {
+ type: "numeric";
+ bins: number[];
+ counts: number[];
+ mean: number | null;
+ std: number | null;
+ min: number | null;
+ max: number | null;
+ missing: number;
+}
+
+export interface NominalDistribution {
+ type: "nominal";
+ categories: string[];
+ counts: number[];
+ missing: number;
+}
+
+export type FeatureDistribution = NumericDistribution | NominalDistribution;
+
+export interface CorrelationMatrix {
+ features: string[];
+ matrix: number[][];
+}
+
+export interface DatasetPreview {
+ columns: string[];
+ rows: (string | number | null)[][];
+ total_rows: number;
+}
+
+/** Statistics payload for one dataset: distributions, correlation and a row preview. */
+export interface DatasetStatistics {
+  // NOTE(review): presumably keyed by feature name — confirm against the API.
+  distribution: Record<string, FeatureDistribution>;
+  // null when no correlation matrix is available.
+  correlation: CorrelationMatrix | null;
+  preview: DatasetPreview;
+}
+
+export interface DatasetStatsResponse {
+ dataset_id: number;
+ computed_at: string;
+ cached: boolean;
+ statistics: DatasetStatistics;
+}
+
+export interface DatasetStatsState {
+ stats: DatasetStatistics | null;
+ isLoading: boolean;
+ error: string | null;
+ cached: boolean;
+}
+
+/**
+ * Hook to fetch precomputed dataset statistics from the Flask API via the
+ * Next.js proxy route. This replaces client-side parquet parsing for
+ * distribution/correlation charts.
+ *
+ * @param datasetId - OpenML dataset ID; nothing is fetched while it is falsy
+ * @param maxPreviewRows - Maximum rows to include in preview (default: 100)
+ * @param enabled - Whether to fetch data (default: true)
+ * @returns The current stats, loading flag, error message and cache flag
+ */
+export function useDatasetStats(
+  datasetId: number | string | undefined,
+  maxPreviewRows: number = 100,
+  enabled: boolean = true,
+) {
+  const [state, setState] = useState<DatasetStatsState>({
+    stats: null,
+    isLoading: false,
+    error: null,
+    cached: false,
+  });
+
+  useEffect(() => {
+    if (!datasetId || !enabled) {
+      // A request cancelled by the previous cleanup never reports back, so
+      // clear a pending loading flag here or it would stay true forever.
+      setState((prev) =>
+        prev.isLoading ? { ...prev, isLoading: false } : prev,
+      );
+      return;
+    }
+
+    // Set by the cleanup so a superseded request cannot write state.
+    let cancelled = false;
+
+    async function fetchStats() {
+      setState((prev) => ({ ...prev, isLoading: true, error: null }));
+
+      try {
+        // Use Next.js API route proxy to avoid CORS issues. The ID may be a
+        // string, so encode it before placing it in the path.
+        const url = `/api/datasets/${encodeURIComponent(String(datasetId))}/stats?max_preview_rows=${maxPreviewRows}`;
+        const response = await fetch(url);
+
+        if (!response.ok) {
+          // The error body is optional; fall back to the HTTP status text.
+          const errorData = await response.json().catch(() => ({}));
+          const msg =
+            errorData.error || `Failed to fetch stats: ${response.statusText}`;
+          console.warn("Dataset stats unavailable:", msg);
+          if (cancelled) return;
+          setState({ stats: null, isLoading: false, error: msg, cached: false });
+          return;
+        }
+
+        const data: DatasetStatsResponse = await response.json();
+
+        if (cancelled) return;
+
+        setState({
+          stats: data.statistics,
+          isLoading: false,
+          error: null,
+          cached: data.cached,
+        });
+      } catch (err) {
+        console.error("Failed to fetch dataset stats:", err);
+        if (cancelled) return;
+
+        setState({
+          stats: null,
+          isLoading: false,
+          error: err instanceof Error ? err.message : "Failed to load statistics",
+          cached: false,
+        });
+      }
+    }
+
+    fetchStats();
+
+    return () => {
+      cancelled = true;
+    };
+  }, [datasetId, maxPreviewRows, enabled]);
+
+  return state;
+}
diff --git a/app-next/src/hooks/useParquetData.ts b/app-next/src/hooks/useParquetData.ts
index 0a6145e9..5f8cb342 100644
--- a/app-next/src/hooks/useParquetData.ts
+++ b/app-next/src/hooks/useParquetData.ts
@@ -192,7 +192,16 @@ export function useParquetData(
return;
}
if (!arffResponse.ok) {
- throw new Error(`Failed to fetch ARFF: ${arffResponse.statusText}`);
+ // File not available (server error, not processed yet, etc.) — fail silently
+ setState({
+ data: null,
+ columns: [],
+ rowCount: 0,
+ isLoading: false,
+ error: null,
+ isTooLarge: false,
+ });
+ return;
}
const arffText = await arffResponse.text();
@@ -310,15 +319,31 @@ export function useParquetData(
// Initialize parquet-wasm (uses cached module if already loaded)
const parquetModule = await initParquetWasm();
- // Read parquet using parquet-wasm
+ // Read parquet using parquet-wasm.
+ // Some parquet files use encodings unsupported by parquet-wasm
+ // (e.g. boolean dictionary packing). Catch wasm errors here and
+ // fall back to ARFF silently instead of surfacing the error.
const parquetBytes = new Uint8Array(arrayBuffer);
-
- // readParquet returns a parquet-wasm Table, convert to Arrow IPC stream
- const wasmTable = parquetModule.readParquet(parquetBytes);
- const ipcStream = wasmTable.intoIPCStream();
-
- // Parse the Arrow IPC stream into an apache-arrow Table
- const table: Table = tableFromIPC(ipcStream);
+ let table: Table;
+ try {
+ const wasmTable = parquetModule.readParquet(parquetBytes);
+ const ipcStream = wasmTable.intoIPCStream();
+ table = tableFromIPC(ipcStream);
+ } catch (_wasmErr) {
+ if (arffUrl && !cancelled) {
+ await tryArff(arffUrl);
+ } else if (!cancelled) {
+ setState({
+ data: null,
+ columns: [],
+ rowCount: 0,
+ isLoading: false,
+ error: null,
+ isTooLarge: false,
+ });
+ }
+ return;
+ }
// Extract column names
const columns = table.schema.fields.map((f) => f.name);
diff --git a/app-next/src/hooks/usePlotlyTheme.ts b/app-next/src/hooks/usePlotlyTheme.ts
new file mode 100644
index 00000000..bf10bcf7
--- /dev/null
+++ b/app-next/src/hooks/usePlotlyTheme.ts
@@ -0,0 +1,28 @@
+"use client";
+
+import { useTheme } from "next-themes";
+
+/**
+ * Plotly layout colours shared by every chart, so that all charts look the
+ * same in dark and in light mode.
+ *
+ * @returns `isDark` plus the font, grid, zero-line, background and hover
+ *   label settings for the currently resolved theme.
+ */
+export function usePlotlyTheme() {
+  const { resolvedTheme } = useTheme();
+  const isDark = resolvedTheme === "dark";
+
+  // Choose between the dark-mode and the light-mode variant of a colour.
+  const pick = (dark: string, light: string) => (isDark ? dark : light);
+
+  const font = { color: pick("rgba(250,250,250,0.7)", "rgba(0,0,0,0.7)") };
+  // Hover tooltip: always white text and border so it stays readable on any
+  // marker colour.
+  const hoverlabel = {
+    font: { color: "white" },
+    bordercolor: "white",
+  };
+
+  return {
+    isDark,
+    font,
+    gridcolor: pick("rgba(255,255,255,0.1)", "rgba(0,0,0,0.08)"),
+    zerolinecolor: pick("rgba(255,255,255,0.2)", "rgba(0,0,0,0.2)"),
+    paper_bgcolor: "transparent" as const,
+    plot_bgcolor: pick("rgba(255,255,255,0.03)", "rgba(0,0,0,0.02)"),
+    hoverlabel,
+  };
+}
diff --git a/app-next/src/lib/api/dataset.ts b/app-next/src/lib/api/dataset.ts
index 6cf28d77..520d199c 100644
--- a/app-next/src/lib/api/dataset.ts
+++ b/app-next/src/lib/api/dataset.ts
@@ -37,14 +37,12 @@ export async function fetchDataset(id: string): Promise {
return data._source as Dataset;
} catch (error) {
- // Log error for debugging (server-side only)
- console.error(`Error fetching dataset ${id}:`, error);
-
- // Re-throw notFound() errors
- if (error instanceof Error && error.message === "NEXT_NOT_FOUND") {
+ // Re-throw notFound() — Next.js uses digest, not message
+ if ((error as { digest?: string })?.digest === "NEXT_NOT_FOUND") {
throw error;
}
+ console.error(`Error fetching dataset ${id}:`, error);
throw new Error("Failed to load dataset");
}
}
diff --git a/app-next/src/lib/api/flow.ts b/app-next/src/lib/api/flow.ts
index 3f57c873..a5679fc1 100644
--- a/app-next/src/lib/api/flow.ts
+++ b/app-next/src/lib/api/flow.ts
@@ -84,7 +84,7 @@ export async function fetchFlowVersions(name: string): Promise {
}
const data = await response.json();
- return data.hits.hits.map((hit: any) => hit._source as Flow);
+ return data.hits.hits.map((hit: { _source: Flow }) => hit._source);
} catch (error) {
console.error("Error fetching flow versions:", error);
return [];
diff --git a/app-next/src/lib/api/measure.ts b/app-next/src/lib/api/measure.ts
new file mode 100644
index 00000000..68a4b60c
--- /dev/null
+++ b/app-next/src/lib/api/measure.ts
@@ -0,0 +1,100 @@
+import { Measure } from "@/types/measure";
+import { notFound } from "next/navigation";
+import { getElasticsearchUrl } from "@/lib/elasticsearch";
+
+const ES_INDEX = "measure";
+
+/**
+ * Load a single measure document from Elasticsearch.
+ *
+ * Triggers the Next.js not-found page when the document does not exist.
+ *
+ * @param id - Document ID in the measure index.
+ * @returns The document's `_source`.
+ * @throws Error("Failed to load measure") on any other fetch or parse failure.
+ */
+export async function fetchMeasure(id: string): Promise<Measure> {
+  let measure: Measure | null = null;
+
+  try {
+    const response = await fetch(
+      // Encode the ID so reserved characters cannot alter the request path.
+      getElasticsearchUrl(`${ES_INDEX}/_doc/${encodeURIComponent(id)}`),
+      {
+        next: {
+          revalidate: 3600,
+          tags: [`measure-${id}`],
+        },
+        headers: {
+          "Content-Type": "application/json",
+        },
+      },
+    );
+
+    // A 404 is "missing", not a failure: leave `measure` null.
+    if (response.status !== 404) {
+      if (!response.ok) {
+        throw new Error(`Failed to fetch measure: ${response.statusText}`);
+      }
+
+      const data = await response.json();
+      if (data.found && data._source) {
+        measure = data._source as Measure;
+      }
+    }
+  } catch (error) {
+    console.error(`Error fetching measure ${id}:`, error);
+    throw new Error("Failed to load measure");
+  }
+
+  // notFound() works by throwing. Calling it outside the try block means the
+  // catch above never has to recognise (and re-throw) that internal error by
+  // its framework-specific digest string.
+  if (measure === null) {
+    notFound();
+  }
+  return measure;
+}
+
+interface RelatedTask {
+ task_id: number;
+ task_type: string;
+ task_type_id: number;
+ source_data?: {
+ data_id?: number;
+ name?: string;
+ };
+ runs?: number;
+}
+
+/**
+ * Find tasks that list the given evaluation measure, most-run first.
+ *
+ * Best effort: any failure is logged and yields an empty list.
+ *
+ * @param measureName - Exact measure name matched against
+ *   `evaluation_measures.keyword`.
+ * @returns Up to 50 matching tasks, or [] on failure.
+ */
+export async function fetchRelatedTasks(
+  measureName: string,
+): Promise<RelatedTask[]> {
+  try {
+    const response = await fetch(getElasticsearchUrl("task/_search"), {
+      method: "POST",
+      next: {
+        revalidate: 3600,
+        tags: [`measure-tasks-${measureName}`],
+      },
+      headers: {
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        query: {
+          term: {
+            "evaluation_measures.keyword": measureName,
+          },
+        },
+        _source: [
+          "task_id",
+          "task_type",
+          "task_type_id",
+          "source_data",
+          "runs",
+        ],
+        size: 50,
+        sort: [{ runs: { order: "desc" } }],
+      }),
+    });
+
+    if (!response.ok) {
+      // Log the status so a failing search is distinguishable from a
+      // measure that simply has no tasks.
+      console.error(
+        `Error fetching related tasks for "${measureName}": HTTP ${response.status}`,
+      );
+      return [];
+    }
+
+    const data = await response.json();
+    return (data.hits?.hits || []).map(
+      (hit: { _source: RelatedTask }) => hit._source,
+    );
+  } catch (error) {
+    console.error("Error fetching related tasks:", error);
+    return [];
+  }
+}
diff --git a/app-next/src/lib/api/run.ts b/app-next/src/lib/api/run.ts
new file mode 100644
index 00000000..290ac6a6
--- /dev/null
+++ b/app-next/src/lib/api/run.ts
@@ -0,0 +1,175 @@
+import { APP_CONFIG } from "@/lib/config";
+
+/** Ensure a value that may be a single item or undefined becomes an array. */
+function normalizeArray<T>(value: T[] | T | undefined | null): T[] {
+  if (value == null) return [];
+  return Array.isArray(value) ? value : [value];
+}
+
+/**
+ * Run data as returned by the OpenML REST API (/api/v1/json/run/{id}).
+ * This differs from the Elasticsearch-indexed shape in src/types/run.ts.
+ */
+export interface RunDetail {
+  run_id: number;
+  uploader?: string;
+  uploader_id?: number;
+  upload_time?: string;
+  flow_id?: number;
+  flow_name?: string;
+  task_id?: number;
+  task?: {
+    task_id?: number;
+    task_type?: string;
+    source_data?: {
+      data_id?: number;
+      name?: string;
+    };
+  };
+  visibility?: string;
+  error_message?: string | null;
+  tag?: string[]; // getRun normalizes this to an array
+  parameter_setting?: Array<{
+    name: string;
+    value: string | number | boolean | null;
+  }>;
+  output_data?: {
+    evaluation?: Array<{
+      name: string;
+      value: string | number;
+      stdev?: string | number;
+      array_data?: Record<string, number>; // getRun parses JSON strings into this map
+      per_fold?: Array<number>; // per-fold values merged in by getRun
+    }>;
+  };
+  nr_of_likes?: number;
+  nr_of_downloads?: number;
+  nr_of_issues?: number;
+  nr_of_downvotes?: number;
+  setup_string?: string;
+}
+
+interface RunApiResponse { // JSON envelope read by getRun: a `run` payload and/or an `error` object
+ run?: RunDetail;
+ error?: { code: string; message: string };
+}
+
+/**
+ * Fetch a single run by ID from the OpenML REST API.
+ * Normalizes array fields and merges per-fold evaluation rows into the
+ * `per_fold` array of the summary row with the same name.
+ * Returns `{ run, error }` - never throws.
+ */
+export async function getRun(
+  runId: number,
+): Promise<{ run: RunDetail | null; error: string | null }> {
+  try {
+    const apiUrl = APP_CONFIG.urlApi || "https://www.openml.org/api/v1";
+    const response = await fetch(`${apiUrl}/json/run/${runId}`, {
+      next: { revalidate: 3600 },
+      headers: { Accept: "application/json" },
+    });
+
+    if (!response.ok) {
+      if (response.status === 404) {
+        return { run: null, error: `Run #${runId} not found` };
+      }
+      return {
+        run: null,
+        error: `Failed to fetch run: HTTP ${response.status}`,
+      };
+    }
+
+    const contentType = response.headers.get("content-type");
+    if (!contentType || !contentType.includes("application/json")) {
+      return { run: null, error: "Invalid response format from API" };
+    }
+
+    const data: RunApiResponse = await response.json();
+
+    if (data.error) {
+      return { run: null, error: data.error.message || "Unknown API error" };
+    }
+
+    // The OpenML XML-to-JSON API collapses single-element arrays to plain
+    // values. Normalize array fields so callers can rely on .length.
+    if (data.run) {
+      data.run.tag = normalizeArray(data.run.tag);
+      data.run.parameter_setting = normalizeArray(data.run.parameter_setting);
+      if (data.run.output_data) {
+        data.run.output_data.evaluation = normalizeArray(
+          data.run.output_data.evaluation,
+        );
+
+        // The API returns both summary rows (no repeat/fold) and per-fold
+        // rows (with repeat + fold) under the same name. Merge per-fold
+        // values into each summary entry's `per_fold` array and discard
+        // the individual fold rows so every name is unique.
+        const rawEvals = data.run.output_data.evaluation as Array<
+          Record<string, unknown>
+        >;
+        const summaryMap = new Map<
+          string,
+          (typeof data.run.output_data.evaluation)[number]
+        >();
+        const foldValues = new Map<string, number[]>();
+
+        for (const ev of rawEvals) {
+          const name = ev.name as string;
+          if (ev.repeat != null || ev.fold != null) {
+            // Per-fold row - collect the value
+            const arr = foldValues.get(name) ?? [];
+            arr.push(
+              typeof ev.value === "number"
+                ? ev.value
+                : parseFloat(String(ev.value)),
+            );
+            foldValues.set(name, arr);
+          } else {
+            // Summary row - keep as the canonical entry
+            summaryMap.set(
+              name,
+              ev as (typeof data.run.output_data.evaluation)[number],
+            );
+          }
+        }
+
+        // Attach per-fold arrays to their summary entries
+        for (const [name, folds] of foldValues) {
+          const summary = summaryMap.get(name);
+          if (summary) {
+            summary.per_fold = folds;
+          }
+        }
+
+        // Parse array_data JSON strings into objects
+        for (const ev of summaryMap.values()) {
+          if (typeof ev.array_data === "string") {
+            try {
+              const parsed = JSON.parse(ev.array_data as string);
+              if (Array.isArray(parsed)) {
+                // Convert array [0.5, 0.8] into { "class_0": 0.5, "class_1": 0.8 }
+                const obj: Record<string, number> = {};
+                parsed.forEach((v: number, i: number) => {
+                  obj[`class_${i}`] = v;
+                });
+                ev.array_data = obj;
+              } else if (typeof parsed === "object" && parsed !== null) {
+                ev.array_data = parsed;
+              }
+            } catch {
+              // Leave as-is if not valid JSON
+            }
+          }
+        }
+
+        data.run.output_data.evaluation = Array.from(summaryMap.values());
+      }
+    }
+
+    return { run: data.run || null, error: null };
+  } catch (error) {
+    console.error("Failed to fetch run:", error);
+    return { run: null, error: "Failed to connect to OpenML API" };
+  }
+}
diff --git a/app-next/src/lib/api/study.ts b/app-next/src/lib/api/study.ts
new file mode 100644
index 00000000..285b289b
--- /dev/null
+++ b/app-next/src/lib/api/study.ts
@@ -0,0 +1,35 @@
+import { getElasticsearchUrl } from "@/lib/elasticsearch";
+
+export interface StudyData { // shape that fetchStudy casts an Elasticsearch study `_source` to
+ study_id: number;
+ study_type: string;
+ name: string;
+ description?: string;
+ uploader?: string;
+ uploader_id?: number;
+ date?: string;
+ visibility?: string;
+ datasets_included?: number;
+ tasks_included?: number;
+ flows_included?: number;
+ runs_included?: number;
+}
+
+/**
+ * Fetch study metadata from Elasticsearch.
+ * @throws Error if the response is not OK or the document is missing.
+ */
+export async function fetchStudy(id: string): Promise<StudyData> {
+  // Encode the id so it cannot add path segments or query parameters.
+  const url = getElasticsearchUrl(`study/_doc/${encodeURIComponent(id)}`);
+  const res = await fetch(url, { next: { revalidate: 3600 } });
+  if (!res.ok) {
+    throw new Error(`Study ${id} not found`);
+  }
+
+  const data = await res.json();
+  if (!data.found || !data._source) {
+    throw new Error(`Study ${id} not found`);
+  }
+  return data._source as StudyData;
+}
diff --git a/app-next/src/lib/api/task.ts b/app-next/src/lib/api/task.ts
index 6e554fee..10ecd5c1 100644
--- a/app-next/src/lib/api/task.ts
+++ b/app-next/src/lib/api/task.ts
@@ -39,12 +39,11 @@ export async function fetchTask(id: string): Promise {
return data._source as Task;
} catch (error) {
- console.error(`Error fetching task ${id}:`, error);
-
- if (error instanceof Error && error.message === "NEXT_NOT_FOUND") {
+ if ((error as { digest?: string })?.digest === "NEXT_NOT_FOUND") {
throw error;
}
+ console.error(`Error fetching task ${id}:`, error);
throw new Error("Failed to load task");
}
}
diff --git a/app-next/src/lib/api/user.ts b/app-next/src/lib/api/user.ts
new file mode 100644
index 00000000..61bcae0e
--- /dev/null
+++ b/app-next/src/lib/api/user.ts
@@ -0,0 +1,39 @@
+import { APP_CONFIG } from "@/lib/config";
+
+/**
+ * Basic user info returned by the OpenML REST API (/api/v1/json/user/{id}).
+ */
+export interface UserInfo { // getUser casts the response's `user` field to this shape without runtime validation
+ id: string | number; // NOTE(review): presumably the API may return either form - confirm
+ username?: string;
+ first_name?: string;
+ last_name?: string;
+ image?: string;
+ bio?: string;
+ date_registered?: string;
+}
+
+/**
+ * Fetch basic user info by ID from the OpenML REST API.
+ * Returns `null` on any error (never throws).
+ */
+export async function getUser(userId: string): Promise<UserInfo | null> {
+  try {
+    const apiUrl = APP_CONFIG.urlApi || "https://www.openml.org/api/v1";
+    // Encode the id so it cannot add path segments or query parameters.
+    const url = `${apiUrl}/json/user/${encodeURIComponent(userId)}`;
+    const response = await fetch(url, {
+      next: { revalidate: 3600 },
+      headers: { Accept: "application/json" },
+    });
+
+    if (!response.ok) {
+      return null;
+    }
+
+    const data = await response.json();
+    return (data.user as UserInfo) || null;
+  } catch {
+    return null;
+  }
+}
diff --git a/app-next/src/types/measure.ts b/app-next/src/types/measure.ts
new file mode 100644
index 00000000..5d79fc04
--- /dev/null
+++ b/app-next/src/types/measure.ts
@@ -0,0 +1,15 @@
+export interface Measure { // shape that fetchMeasure casts an Elasticsearch `_source` document to
+ measure_id?: number; // NOTE(review): presumably only the id matching measure_type is set - confirm
+ quality_id?: number;
+ proc_id?: number;
+ eval_id?: number;
+ name: string;
+ description?: string;
+ measure_type: "data_quality" | "evaluation_measure" | "estimation_procedure";
+ date?: string;
+ min?: number;
+ max?: number;
+ unit?: string;
+ higherIsBetter?: boolean;
+ stratified_sampling?: string;
+}
diff --git a/app-next/src/types/next-auth.d.ts b/app-next/src/types/next-auth.d.ts
index 4568c81f..03807daf 100644
--- a/app-next/src/types/next-auth.d.ts
+++ b/app-next/src/types/next-auth.d.ts
@@ -2,35 +2,44 @@ import { DefaultSession, DefaultUser } from "next-auth";
import { DefaultJWT } from "next-auth/jwt";
declare module "next-auth" {
- interface Session {
+ interface User extends DefaultUser {
+ id: string;
+ username: string;
+ firstName?: string;
+ lastName?: string;
+ image?: string | null;
+ session_hash?: string | null;
+ apikey?: string;
+ accessToken?: string;
+ isLocalUser?: boolean;
+ openmlUserId?: string;
+ }
+
+ interface Session extends DefaultSession {
user: {
id: string;
username: string;
firstName?: string;
lastName?: string;
+ image?: string | null;
+ isLocalUser?: boolean;
+ openmlUserId?: string;
} & DefaultSession["user"];
- accessToken?: string;
apikey?: string;
- }
-
- interface User extends DefaultUser {
- id: string;
- username?: string;
accessToken?: string;
- session_hash?: string | null;
- firstName?: string;
- lastName?: string;
- apikey?: string;
}
}
declare module "next-auth/jwt" {
interface JWT extends DefaultJWT {
- accessToken?: string;
userId?: string;
username?: string;
- apikey?: string;
firstName?: string;
lastName?: string;
+ picture?: string | null;
+ apikey?: string;
+ accessToken?: string;
+ isLocalUser?: boolean;
+ openmlUserId?: string;
}
}
diff --git a/app-next/src/types/task.ts b/app-next/src/types/task.ts
index efc5628b..533a31f2 100644
--- a/app-next/src/types/task.ts
+++ b/app-next/src/types/task.ts
@@ -1,5 +1,4 @@
-/**
- * OpenML Task Entity
+/** OpenML Task Entity
* Represents a machine learning task in the OpenML platform
*/
export interface Task {
@@ -20,7 +19,7 @@ export interface Task {
data_id: number;
name: string;
};
- source_data_name?: string; // For backward compatibility
+ source_data_name?: string; // backward compatibility
// Task configuration
target_feature?: string;
@@ -65,7 +64,6 @@ export interface Task {
// Quality metrics
quality?: Record;
- // Dates
// Dates
upload_date?: string;
date?: string;
diff --git a/server/data/views.py b/server/data/views.py
index a37a4413..cf81cfc8 100644
--- a/server/data/views.py
+++ b/server/data/views.py
@@ -1,6 +1,7 @@
import json
import os
import tempfile
+from datetime import datetime
from pathlib import Path
from urllib.parse import parse_qs, urlparse
@@ -13,6 +14,12 @@
from werkzeug.utils import secure_filename
from server.setup import setup_openml_config
+from server.src.dashboard.caching import (
+ CACHE_DIR_DASHBOARD,
+ load_cached_stats,
+ save_stats_cache,
+)
+from server.src.dashboard.helpers import compute_dataset_stats
from server.utils import current_user
data_blueprint = Blueprint(
@@ -121,7 +128,7 @@ def data_edit():
citation=citation,
language=language,
original_data_url=original_data_url,
- paper_url=paper_url
+ paper_url=paper_url,
)
elif owner == "true":
default_target_attribute = j_obj["default_target_attribute"]
@@ -144,33 +151,54 @@ def data_edit():
ignore_attribute=ignore_attribute,
row_id_attribute=row_id_attribute,
original_data_url=original_data_url,
- paper_url=paper_url
+ paper_url=paper_url,
)
return str("data edit successful")
@data_blueprint.route("/data-upload", methods=["POST"])
-@jwt_required()
def data_upload():
"""
Function to upload dataset
"""
- user = current_user()
- user_api_key = user.session_hash
+ user_api_key = request.form.get("api_key") or (
+ request.files.get("metadata")
+ and json.loads(request.files["metadata"].read()).get("api_key")
+ )
+ if not user_api_key:
+ return jsonify({"msg": "api_key required"}), 401
+
+ # Set server URL explicitly - avoids double /api/api/ from URL_API env var
+ openml.config.server = os.getenv(
+ "OPENML_SERVER_URL", "https://www.openml.org/api/v1/xml"
+ )
+ openml.config.apikey = user_api_key
data_file = request.files["dataset"]
metadata = request.files["metadata"]
with tempfile.TemporaryDirectory() as tmpdirname:
- path = Path(tmpdirname) / f"{user_api_key}?{secure_filename(data_file.filename)}"
+ path = (
+ Path(tmpdirname) / f"{user_api_key}?{secure_filename(data_file.filename)}"
+ )
data_file.save(path)
metadata = metadata.read()
metadata = json.loads(metadata)
- dataset_name = metadata["dataset_name"]
- description = metadata["description"]
+ def _sanitize(text):
+ """Replace Unicode smart quotes and dashes with ASCII equivalents."""
+ if not text:
+ return text
+ return (
+ text.replace("\u2018", "'").replace("\u2019", "'") # ' '
+ .replace("\u201c", '"').replace("\u201d", '"') # " "
+ .replace("\u2013", "-").replace("\u2014", "-") # β β
+ )
+
+ dataset_name = _sanitize(metadata["dataset_name"]).replace(" ", "_")
+ description = _sanitize(metadata["description"])
creator = metadata["creator"] or None
contributor = metadata["contributor"] or None
collection_date = metadata["collection_date"] or None
@@ -185,57 +213,58 @@ def data_upload():
supported_extensions = [".csv", ".parquet", ".json", ".feather", ".arff"]
if file_extension not in supported_extensions:
- return jsonify({"msg": "format not supported"})
-
- elif file_extension == ".arff":
- with open(path, "r") as arff_file:
- arff_dict = arff.load(arff_file)
- attribute_names, dtypes = zip(*arff_dict["attributes"])
- data = pd.DataFrame(arff_dict["data"], columns=attribute_names)
- for attribute_name, dtype in arff_dict["attributes"]:
- # 'real' and 'numeric' are probably interpreted correctly.
- # Date support needs to be added.
- if isinstance(dtype, list):
- data[attribute_name] = data[attribute_name].astype("category")
- df = data
-
- elif file_extension == ".csv":
- df = pd.read_csv(path)
-
- elif file_extension == ".json":
- df = pd.read_json(path)
-
- elif file_extension == ".parquet":
- df = pd.read_parquet(path)
-
- elif file_extension == ".feather":
- df = pd.read_feather(path)
-
- oml_dataset = openml.datasets.create_dataset(
- name=dataset_name,
- description=description,
- data=df,
- creator=creator,
- contributor=contributor,
- collection_date=collection_date,
- licence=licence,
- language=language,
- attributes="auto",
- default_target_attribute=def_tar_att,
- ignore_attribute=ignore_attribute,
- citation=citation,
- )
- oml_dataset.publish()
+ return jsonify({"msg": f"Unsupported file format '{file_extension}'. Supported formats: CSV, JSON, Parquet, Feather, ARFF."}), 422
- # TODO Add error for bad dataset
- return jsonify({"msg": "dataset uploaded"}), 200
+ try:
+ if file_extension == ".arff":
+ with open(path, "r") as arff_file:
+ arff_dict = arff.load(arff_file)
+ attribute_names, dtypes = zip(*arff_dict["attributes"])
+ data = pd.DataFrame(arff_dict["data"], columns=attribute_names)
+ for attribute_name, dtype in arff_dict["attributes"]:
+ if isinstance(dtype, list):
+ data[attribute_name] = data[attribute_name].astype("category")
+ df = data
+ elif file_extension == ".csv":
+ df = pd.read_csv(path)
+ elif file_extension == ".json":
+ df = pd.read_json(path)
+ elif file_extension == ".parquet":
+ df = pd.read_parquet(path)
+ elif file_extension == ".feather":
+ df = pd.read_feather(path)
+ except Exception as e:
+ return jsonify({"msg": f"Could not read file: {e}"}), 422
+
+ try:
+ oml_dataset = openml.datasets.create_dataset(
+ name=dataset_name,
+ description=description,
+ data=df,
+ creator=creator,
+ contributor=contributor,
+ collection_date=collection_date,
+ licence=licence,
+ language=language,
+ attributes="auto",
+ default_target_attribute=def_tar_att,
+ ignore_attribute=ignore_attribute,
+ citation=citation,
+ )
+ oml_dataset.publish()
+ except ValueError as e:
+ return jsonify({"msg": str(e)}), 422
+ except Exception as e:
+ return jsonify({"msg": f"Upload failed: {e}"}), 500
+
+ return jsonify({"msg": "dataset uploaded", "id": str(oml_dataset.dataset_id)}), 200
@data_blueprint.route("/data-tag", methods=["POST"])
@jwt_required()
def data_tag():
j_obj = request.get_json()
- tag = j_obj['tag']
+ tag = j_obj["tag"]
url = request.args.get("url")
parsed = urlparse(url)
dataset_id = parse_qs(parsed.query)["id"]
@@ -244,3 +273,79 @@ def data_tag():
dataset = openml.datasets.get_dataset(dataset_id)
dataset.push_tag(tag)
+
+@data_blueprint.route("/api/v1/datasets/<int:dataset_id>/stats", methods=["GET"])
+def get_dataset_stats(dataset_id):
+    """
+    Returns JSON statistics for a dataset.
+
+    Query params:
+        - max_preview_rows (int, default=100): Max rows in preview
+        - force_refresh (bool, default=False): Skip cache, recompute.
+          Only "1", "true", "yes" and "on" (case-insensitive) enable it.
+
+    Returns:
+        {
+            "dataset_id": int,
+            "computed_at": str (ISO timestamp),
+            "cached": bool,
+            "statistics": {
+                "distribution": {...},
+                "correlation": {...},
+                "preview": {...}
+            }
+        }
+        On failure: {"error": str} with HTTP status 500.
+    """
+    try:
+        max_preview = request.args.get("max_preview_rows", 100, type=int)
+        # bool("false") is True, so type=bool would treat any non-empty value
+        # (including "false" and "0") as a refresh request; parse explicitly.
+        refresh_arg = request.args.get("force_refresh", "")
+        force_refresh = refresh_arg.strip().lower() in ("1", "true", "yes", "on")
+
+        # Try loading from JSON cache first
+        if not force_refresh:
+            cached_stats = load_cached_stats(dataset_id)
+            if cached_stats is not None:
+                # Validate cache has expected max_preview_rows
+                current_preview_rows = len(
+                    cached_stats.get("preview", {}).get("rows", [])
+                )
+                if current_preview_rows >= max_preview:
+                    # Cache is valid, return it
+                    # NOTE(review): computed_at is the response time, not the cache time.
+                    return jsonify(
+                        {
+                            "dataset_id": dataset_id,
+                            "computed_at": datetime.utcnow().isoformat() + "Z",
+                            "cached": True,
+                            "statistics": cached_stats,
+                        }
+                    ), 200
+
+        # Cache miss or force refresh - compute stats
+        stats = compute_dataset_stats(dataset_id, max_preview_rows=max_preview)
+
+        # Save to JSON cache
+        save_stats_cache(dataset_id, stats)
+
+        return jsonify(
+            {
+                "dataset_id": dataset_id,
+                "computed_at": datetime.utcnow().isoformat() + "Z",
+                "cached": False,
+                "statistics": stats,
+            }
+        ), 200
+
+    except Exception as e:
+        # Log the error for debugging
+        import logging
+
+        logger = logging.getLogger("data")
+        logger.error(
+            f"Error computing stats for dataset {dataset_id}: {str(e)}", exc_info=True
+        )
+
+        return jsonify({"error": str(e)}), 500