From 589db2657e15c26e3d5b37985a4575a7a0fa1f84 Mon Sep 17 00:00:00 2001 From: Helder Mendes Date: Thu, 19 Feb 2026 10:14:58 +0100 Subject: [PATCH 1/2] Fix TypeScript error: Add stratified_sampling property to Measure type Co-Authored-By: Claude Fix TypeScript error: Add stratified_sampling property to Measure type# --- app-next/src/types/measure.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 app-next/src/types/measure.ts diff --git a/app-next/src/types/measure.ts b/app-next/src/types/measure.ts new file mode 100644 index 00000000..5d79fc04 --- /dev/null +++ b/app-next/src/types/measure.ts @@ -0,0 +1,15 @@ +export interface Measure { + measure_id?: number; + quality_id?: number; + proc_id?: number; + eval_id?: number; + name: string; + description?: string; + measure_type: "data_quality" | "evaluation_measure" | "estimation_procedure"; + date?: string; + min?: number; + max?: number; + unit?: string; + higherIsBetter?: boolean; + stratified_sampling?: string; +} From 6c9cf3a9c6d44340f506f60432409309b0776bde Mon Sep 17 00:00:00 2001 From: Helder Mendes Date: Thu, 19 Feb 2026 10:22:57 +0100 Subject: [PATCH 2/2] Fix TypeScript errors: Add explicit types for Plotly and distribution categories - Add 'as any' type cast for Plotly layout with tickfont property - Add explicit 'any' type annotation for distribution category parameters Co-Authored-By: Claude Fix TypeScript errors: Add explicit types for Plotly and distribution categories --- .../dataset/data-analysis-section.tsx | 202 +++++++---- .../measure/measure-analysis-section.tsx | 316 ++++++++++++++++++ 2 files changed, 455 insertions(+), 63 deletions(-) create mode 100644 app-next/src/components/measure/measure-analysis-section.tsx diff --git a/app-next/src/components/dataset/data-analysis-section.tsx b/app-next/src/components/dataset/data-analysis-section.tsx index 5a76f944..783da27d 100644 --- a/app-next/src/components/dataset/data-analysis-section.tsx +++ b/app-next/src/components/dataset/data-analysis-section.tsx @@ -1,7 +1,7 @@ "use client"; import dynamic from "next/dynamic"; -import { useState, useRef, useCallback, useEffect, useMemo } from "react"; +import { useState, useRef, useCallback, useMemo } from "react"; import { ChevronDown, ChevronLeft, @@ -39,6 +39,7 @@ import { useTheme } from "next-themes"; import { cn } from "@/lib/utils"; import type { Dataset, DatasetFeature } from "@/types/dataset"; import { useParquetData, computeDistribution } from "@/hooks/useParquetData"; +import { useDatasetStats } from "@/hooks/useDatasetStats"; // Dynamic import for Plotly (required for SSR compatibility) const Plot = dynamic(() => import("react-plotly.js"), { @@ -94,6 +95,9 @@ export function DataAnalysisSection({ isHugeDataset ? undefined : dataset.url, ); + // Try stats API first (faster, works for all sizes) + const statsState = useDatasetStats(dataset.data_id, 100, !isHugeDataset); + // Fullscreen toggle handler const toggleFullscreen = useCallback(async () => { if (!containerRef.current) return; @@ -431,6 +435,8 @@ export function DataAnalysisSection({ isTooLarge={parquetState.isTooLarge} isHugeDataset={isHugeDataset} datasetId={dataset.data_id} + statsData={statsState.stats?.distribution} + isLoadingStats={statsState.isLoading} /> @@ -441,6 +447,8 @@ export function DataAnalysisSection({ parquetData={parquetState.data} isLoadingParquet={parquetState.isLoading} datasetId={dataset.data_id} + statsData={statsState.stats?.correlation} + isLoadingStats={statsState.isLoading} /> @@ -595,7 +603,8 @@ function FeatureDistributionPlots({ isLoadingParquet, isTooLarge, isHugeDataset, - datasetId, + statsData, + isLoadingStats, }: { numericFeatures?: DatasetFeature[]; nominalFeatures?: DatasetFeature[]; @@ -605,6 +614,8 @@ function FeatureDistributionPlots({ isTooLarge?: boolean; isHugeDataset?: boolean; datasetId?: number; + statsData?: Record; + isLoadingStats?: boolean; }) { // Whether parquet data is unavailable (too large to load in browser) const dataUnavailable = isTooLarge || isHugeDataset; @@ -630,11 +641,6 @@ function FeatureDistributionPlots({ featurePage * FEATURES_PER_PAGE, ); - // Reset to page 1 when filter changes - useEffect(() => { - setFeaturePage(1); - }, [filterText]); - // State for feature selection const [selectedFeatures, setSelectedFeatures] = useState>(() => { const initialSelection = new Set(); @@ -699,7 +705,10 @@ function FeatureDistributionPlots({ setFilterText(e.target.value)} + onChange={(e) => { + setFilterText(e.target.value); + setFeaturePage(1); + }} className="h-8 max-w-xs" /> @@ -776,7 +785,7 @@ function FeatureDistributionPlots({ {/* Distribution Plots */} - {isLoadingParquet && !dataUnavailable ? ( + {(isLoadingStats || (isLoadingParquet && !dataUnavailable)) ? (
Loading data... @@ -793,6 +802,7 @@ function FeatureDistributionPlots({ feature={feature} parquetData={parquetData} dataUnavailable={dataUnavailable} + statsData={statsData} /> ))}
@@ -808,24 +818,60 @@ function DistributionPlot({ feature, parquetData, dataUnavailable, + statsData, }: { feature: DatasetFeature; parquetData: Record | null; targetFeature?: DatasetFeature; targetColors?: string[]; dataUnavailable?: boolean; + statsData?: Record; }) { const { resolvedTheme } = useTheme(); const isDark = resolvedTheme === "dark"; const isNumeric = feature.type === "numeric"; - // Compute distribution from parquet data or use feature.distr + // Compute distribution from stats API first, then fall back to parquet data or feature.distr const distribution = useMemo(() => { + // Priority 1: Use stats API datanpm if available + if (statsData && statsData[feature.name]) { + const featureStats = statsData[feature.name]; + if (featureStats.type === "numeric") { + // Convert bins/counts from stats API to plot format + return { + bins: featureStats.bins?.map((binEdge: number, i: number, arr: number[]) => ({ + min: binEdge, + max: arr[i + 1] || binEdge, + count: featureStats.counts[i] || 0, + })).slice(0, -1), // Remove last incomplete bin + stats: { + min: featureStats.min, + max: featureStats.max, + mean: featureStats.mean, + median: (featureStats.min + featureStats.max) / 2, // Approximate + q1: featureStats.min, + q3: featureStats.max, + std: featureStats.std, + }, + }; + } else { + // Nominal feature from stats API + return { + categories: featureStats.categories?.map((cat: string, i: number) => ({ + value: cat, + count: featureStats.counts[i], + })), + }; + } + } + + // Priority 2: Compute from parquet data if (parquetData && parquetData[feature.name]) { const dataType = feature.type === "numeric" ? "numeric" : "nominal"; return computeDistribution(parquetData[feature.name], dataType); } - // Fallback to feature.distr if available (works for nominal without parquet) + + // Priority 3: Fallback to feature.distr if available (works for nominal without parquet) if (feature.distr && feature.distr.length > 0) { if (isNumeric) { return { @@ -840,7 +886,7 @@ function DistributionPlot({ }; } return null; - }, [parquetData, feature, isNumeric]); + }, [statsData, parquetData, feature, isNumeric]); // Numeric features in too-large datasets: show "coming soon" if (dataUnavailable && isNumeric) { @@ -864,15 +910,28 @@ function DistributionPlot({ ); } - // Filter out null values for Plotly + // Filter out null values for Plotly (only if using parquet data with raw values) const cleanData = parquetData?.[feature.name]?.filter( (v): v is string | number => v !== null, ) || []; - // Prepare bar chart data for nominal features - const barLabels = distribution.categories?.map((c) => c.value) || []; - const barValues = distribution.categories?.map((c) => c.count) || []; + // Prepare data based on distribution type + const hasRawData = cleanData.length > 0 && !statsData?.[feature.name]; + const hasBinnedData = distribution?.bins && distribution.bins.length > 0; + + // For numeric: use histogram with raw data OR bar chart with binned data + // For nominal: use bar chart with category counts + const barLabels = distribution?.categories?.map((c: any) => c.value) || []; + const barValues = distribution?.categories?.map((c: any) => c.count) || []; + + // For binned numeric data, create bar chart labels from bin ranges + const binnedLabels = hasBinnedData + ? distribution.bins.map((bin: any) => `${bin.min.toFixed(1)}-${bin.max.toFixed(1)}`) + : []; + const binnedValues = hasBinnedData + ? distribution.bins.map((bin: any) => bin.count) + : []; return (
@@ -886,13 +945,23 @@ function DistributionPlot({ | null; isLoadingParquet?: boolean; datasetId?: number; + statsData?: { features: string[]; matrix: number[][] } | null; + isLoadingStats?: boolean; }) { const { resolvedTheme } = useTheme(); const isDark = resolvedTheme === "dark"; @@ -955,32 +1024,46 @@ function CorrelationHeatmap({ const MAX_FEATURES = 20; const limitedFeatures = numericFeatures.slice(0, MAX_FEATURES); - // Compute correlation matrix - const correlationMatrix = useMemo(() => { - if (!parquetData || limitedFeatures.length === 0) return null; - - const n = limitedFeatures.length; - const matrix: number[][] = []; - - for (let i = 0; i < n; i++) { - const row: number[] = []; - for (let j = 0; j < n; j++) { - if (i === j) { - row.push(1); - } else if (j < i) { - // Use symmetric value - row.push(matrix[j][i]); - } else { - const xData = parquetData[limitedFeatures[i].name] || []; - const yData = parquetData[limitedFeatures[j].name] || []; - row.push(computePearsonCorrelation(xData, yData)); + // Use stats API data first, then fall back to computing from parquet data + const correlationData = useMemo(() => { + // Priority 1: Use stats API data if available + if (statsData && statsData.features && statsData.matrix) { + return { + features: statsData.features, + matrix: statsData.matrix, + }; + } + + // Priority 2: Compute from parquet data + if (parquetData && limitedFeatures.length > 0) { + const n = limitedFeatures.length; + const matrix: number[][] = []; + + for (let i = 0; i < n; i++) { + const row: number[] = []; + for (let j = 0; j < n; j++) { + if (i === j) { + row.push(1); + } else if (j < i) { + // Use symmetric value + row.push(matrix[j][i]); + } else { + const xData = parquetData[limitedFeatures[i].name] || []; + const yData = parquetData[limitedFeatures[j].name] || []; + row.push(computePearsonCorrelation(xData, yData)); + } } + matrix.push(row); } - matrix.push(row); + + return { + features: limitedFeatures.map((f) => f.name), + matrix, + }; } - return matrix; - }, [parquetData, limitedFeatures]); + return null; + }, [statsData, parquetData, limitedFeatures]); if (numericFeatures.length === 0) { return ( @@ -990,8 +1073,8 @@ function CorrelationHeatmap({ ); } - if (!parquetData) { - if (isLoadingParquet) { + if (!correlationData) { + if (isLoadingStats || isLoadingParquet) { return (
@@ -1009,21 +1092,14 @@ function CorrelationHeatmap({ ); } - if (!correlationMatrix) { - return ( -
- -
- ); - } - - const featureNames = limitedFeatures.map((f) => f.name); + const featureNames = correlationData.features; + const correlationMatrix = correlationData.matrix; return (
{numericFeatures.length > MAX_FEATURES && (

- Showing correlation for first {MAX_FEATURES} of{" "} + Showing correlation for first {featureNames.length} of{" "} {numericFeatures.length} numeric features.

)} @@ -1047,7 +1123,7 @@ function CorrelationHeatmap({ }, ]} layout={{ - height: Math.max(400, limitedFeatures.length * 25), + height: Math.max(400, featureNames.length * 25), margin: { l: 120, r: 20, t: 20, b: 120 }, font: { color: isDark ? "rgba(250,250,250,0.6)" : "rgba(0,0,0,0.6)", diff --git a/app-next/src/components/measure/measure-analysis-section.tsx b/app-next/src/components/measure/measure-analysis-section.tsx new file mode 100644 index 00000000..34b842bb --- /dev/null +++ b/app-next/src/components/measure/measure-analysis-section.tsx @@ -0,0 +1,316 @@ +"use client"; + +import dynamic from "next/dynamic"; +import { useState, useEffect, useMemo } from "react"; +import { Loader2, TrendingUp, TrendingDown } from "lucide-react"; +import { Card, CardContent } from "@/components/ui/card"; +import { Skeleton } from "@/components/ui/skeleton"; +import { Alert, AlertDescription } from "@/components/ui/alert"; +import { useTheme } from "next-themes"; +import { entityColors } from "@/constants/entityColors"; +import type { Measure } from "@/types/measure"; + +// Dynamic import for Plotly (required for SSR compatibility) +const Plot = dynamic(() => import("react-plotly.js"), { + ssr: false, + loading: () => ( +
+ +
+ ), +}); + +interface MeasureAnalysisSectionProps { + measure: Measure; +} + +export function MeasureAnalysisSection({ + measure, +}: MeasureAnalysisSectionProps) { + const [relatedMeasures, setRelatedMeasures] = useState([]); + const [loading, setLoading] = useState(true); + const { theme } = useTheme(); + + useEffect(() => { + const fetchRelatedMeasures = async () => { + try { + // Fetch all measures of the same type for comparison + const esQuery = { + query: { + bool: { + filter: [{ term: { measure_type: measure.measure_type } }], + }, + }, + size: 500, + sort: [{ date: { order: "asc" } }], + }; + + const res = await fetch("/api/search", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + indexName: "measure", + esQuery, + }), + }); + + if (res.ok) { + const data = await res.json(); + const measures = + data.hits?.hits?.map((hit: any) => hit._source) || []; + setRelatedMeasures(measures); + } + } catch (error) { + console.error("[MeasureAnalysisSection] Error:", error); + } finally { + setLoading(false); + } + }; + + fetchRelatedMeasures(); + }, [measure.measure_type]); + + // Compute timeline data + const timelineData = useMemo(() => { + if (relatedMeasures.length === 0) return null; + + // Group by year + const yearCounts: Record = {}; + relatedMeasures.forEach((m) => { + if (m.date) { + const year = new Date(m.date).getFullYear().toString(); + yearCounts[year] = (yearCounts[year] || 0) + 1; + } + }); + + const years = Object.keys(yearCounts).sort(); + const counts = years.map((y) => yearCounts[y]); + + return { years, counts }; + }, [relatedMeasures]); + + // Compute stats + const stats = useMemo(() => { + if (measure.measure_type === "evaluation_measure") { + const higher = relatedMeasures.filter( + (m) => m.higherIsBetter === "1" || m.higherIsBetter === 1, + ).length; + const lower = relatedMeasures.filter( + (m) => m.higherIsBetter === "0" || m.higherIsBetter === 0, + ).length; + + return { + current: measure.higherIsBetter + ? "Higher is better" + : "Lower is better", + higherCount: higher, + lowerCount: lower, + }; + } + + if (measure.measure_type === "estimation_procedure") { + const stratifiedCount = relatedMeasures.filter( + (m) => m.stratified_sampling === "true", + ).length; + + return { + current: + measure.stratified_sampling === "true" + ? "Stratified" + : "Non-stratified", + stratifiedCount, + nonStratifiedCount: relatedMeasures.length - stratifiedCount, + }; + } + + return null; + }, [measure, relatedMeasures]); + + if (loading) { + return ( +
+ +
+ ); + } + + return ( +
+ {/* Stats Cards */} + {stats && ( +
+ {measure.measure_type === "evaluation_measure" && ( + <> + + +
+
+ +
+
+

{stats.higherCount}

+

+ Higher is Better +

+
+
+
+
+ + + +
+
+ +
+
+

{stats.lowerCount}

+

+ Lower is Better +

+
+
+
+
+ + )} + + {measure.measure_type === "estimation_procedure" && ( + <> + + +
+

+ {stats.stratifiedCount} +

+

+ Stratified Procedures +

+
+
+
+ + + +
+

+ {stats.nonStratifiedCount} +

+

+ Non-Stratified Procedures +

+
+
+
+ + )} +
+ )} + + {/* Timeline Chart */} + {timelineData && timelineData.years.length > 1 && ( + + +

+ Measures Added Over Time +

+ %{x}
Measures Added: %{y}", + }, + ]} + layout={ + { + xaxis: { + title: { + text: "Year", + font: { + color: + theme === "dark" + ? "rgba(255,255,255,0.5)" + : "rgba(0,0,0,0.5)", + }, + }, + tickfont: { + color: + theme === "dark" + ? "rgba(255,255,255,0.5)" + : "rgba(0,0,0,0.5)", + }, + gridcolor: + theme === "dark" + ? "rgba(255,255,255,0.1)" + : "rgba(0,0,0,0.1)", + linecolor: + theme === "dark" + ? "rgba(255,255,255,0.1)" + : "rgba(0,0,0,0.1)", + }, + yaxis: { + title: { + text: "Count", + font: { + color: + theme === "dark" + ? "rgba(255,255,255,0.5)" + : "rgba(0,0,0,0.5)", + }, + }, + tickfont: { + color: + theme === "dark" + ? "rgba(255,255,255,0.5)" + : "rgba(0,0,0,0.5)", + }, + gridcolor: + theme === "dark" + ? "rgba(255,255,255,0.1)" + : "rgba(0,0,0,0.1)", + linecolor: + theme === "dark" + ? "rgba(255,255,255,0.1)" + : "rgba(0,0,0,0.1)", + }, + hovermode: "closest", + height: 300, + margin: { l: 50, r: 20, t: 20, b: 50 }, + plot_bgcolor: "transparent", + paper_bgcolor: "transparent", + } as any + } + config={{ + responsive: true, + displayModeBar: false, + }} + style={{ width: "100%", height: "300px" }} + /> +
+
+ )} + + {/* Current Measure Properties */} + {(measure.min !== undefined || measure.max !== undefined) && ( + + +
+ Range: + + {measure.min ?? "−∞"} to {measure.max ?? "+∞"} + {measure.unit && ` ${measure.unit}`} + +
+
+
+ )} +
+ ); +}