From 868ed98492370ed92ede7b534e3ee6c5af5bf054 Mon Sep 17 00:00:00 2001
From: Tim Reichelt
Date: Fri, 21 Nov 2025 10:49:58 +0000
Subject: [PATCH 1/4] Add IFS uncompressed data

---
Review notes (this section is not part of the commit message):
* load_hplp_data: the reference file is requested with a timeout and HTTP
  errors are raised; step=0 is no longer mistaken for "no step given"; the
  error text now matches the `step <= 120` split.
* regrid_to_regular: the grid check raises ValueError before regridding
  instead of asserting after it; grid sizes are rounded, not truncated.
* The blob ids on the "index" lines of ifs_uncompressed.py predate these
  changes.

 README.md                                     |   2 +-
 pyproject.toml                                |   4 +-
 .../data_loader/datasets/all.py               |   1 +
 .../data_loader/datasets/ifs_uncompressed.py  | 232 ++++++++++++++++++
 4 files changed, 237 insertions(+), 2 deletions(-)
 create mode 100644 src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py

diff --git a/README.md b/README.md
index 207819e..9eb9493 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ To download all the data used for the benchmark run the following commands:
 ```bash
 uv run python -m climatebenchpress.data_loader.datasets.esa_biomass_cci
 uv run python -m climatebenchpress.data_loader.datasets.cams
-uv run python -m climatebenchpress.data_loader.datasets.era5
+uv run python -m climatebenchpress.data_loader.datasets.ifs_uncompressed
 uv run python -m climatebenchpress.data_loader.datasets.nextgems
 uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_ta
 uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_tos
diff --git a/pyproject.toml b/pyproject.toml
index b5418dd..d6b9e78 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,9 @@ dependencies = [
     "cf-xarray~=0.10.0",
     "cftime~=1.6.0",
     "dask>=2024.12.0,<2025.4",
+    "earthkit-regrid>=0.5.0",
     "fsspec>=2024.10.0,<2025.4",
+    "gribscan>=0.0.14",
     "healpy~=1.18.0",
     # These versions need to be pinned to be compatible with the NextGEMS
     # catalog at https://data.nextgems-h2020.eu/online.yaml.
@@ -52,5 +54,5 @@ where = ["src"]
 addopts = ["--import-mode=importlib"]
 
 [[tool.mypy.overrides]]
-module = ["fsspec.*", "intake.*", "healpy.*"]
+module = ["fsspec.*", "intake.*", "healpy.*", "earthkit.*"]
 follow_untyped_imports = true
diff --git a/src/climatebenchpress/data_loader/datasets/all.py b/src/climatebenchpress/data_loader/datasets/all.py
index 41d69d2..0f3700e 100644
--- a/src/climatebenchpress/data_loader/datasets/all.py
+++ b/src/climatebenchpress/data_loader/datasets/all.py
@@ -4,4 +4,5 @@
 from .cmip6.all import *
 from .era5 import *
 from .esa_biomass_cci import *
+from .ifs_uncompressed import *
 from .nextgems import *
diff --git a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py
new file mode 100644
index 0000000..666fce5
--- /dev/null
+++ b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py
@@ -0,0 +1,232 @@
+__all__ = ["IFSUncompressedDataset"]
+
+import argparse
+from pathlib import Path
+
+import earthkit.regrid
+import numpy as np
+import requests
+import xarray as xr
+
+from .. import (
+    monitor,
+    open_downloaded_canonicalized_dataset,
+    open_downloaded_tiny_canonicalized_dataset,
+)
+from .abc import Dataset
+
+BASE_URL = "https://object-store.os-api.cci1.ecmwf.int/esiwacebucket"
+
+
+class IFSUncompressedDataset(Dataset):
+    """Dataset for IFS uncompressed data.
+
+    Contains data from the [hplp](https://apps.ecmwf.int/ifs-experiments/rd/hplp/)
+    experiment from the Integrated Forecasting System (IFS) model. Crucially,
+    this dataset contains uncompressed 64-bit floating point data.
+    """
+
+    name = "ifs-uncompressed"
+
+    @staticmethod
+    def download(download_path: Path, progress: bool = True):
+        """Fetch the hplp surface fields, regrid them and store them as Zarr.
+
+        Selects ``msl``, ``10u`` and ``10v``, interpolates them from the O400
+        grid to a regular 0.25 degree grid and writes the result to
+        ``download_path / "ifs_uncompressed.zarr"``.
+        """
+        ds = load_hplp_data(leveltype="sfc", gridtype="reduced_gg")
+        ds = ds[["msl", "10u", "10v"]]
+        ds_regridded = regrid_to_regular(
+            ds,
+            in_grid={"grid": "O400"},
+            out_grid={"grid": [0.25, 0.25]},
+        )
+        downloadfile = download_path / "ifs_uncompressed.zarr"
+        with monitor.progress_bar(progress):
+            ds_regridded.to_zarr(
+                downloadfile, mode="w", encoding=dict(), compute=False
+            ).compute()
+
+    @staticmethod
+    def open(download_path: Path) -> xr.Dataset:
+        """Open the Zarr store written by ``download``."""
+        ds = xr.open_dataset(download_path / "ifs_uncompressed.zarr")
+
+        # Needed to make the dataset CF-compliant.
+        ds.longitude.attrs["axis"] = "X"
+        ds.latitude.attrs["axis"] = "Y"
+        ds.time.attrs["standard_name"] = "time"
+        return ds
+
+
+def load_hplp_data(leveltype=None, gridtype=None, step=None, remap=False):
+    """Open one GRIB file of the hplp experiment through its reference file.
+
+    Function taken from: https://github.com/climet-eu/compression-lab-notebooks/blob/main/04-example-datasets/01-hplp.ipynb.
+
+    Parameters
+    ----------
+    leveltype : str
+        One of "pl", "ml", "sfc" or "wave".
+    gridtype : str, optional
+        Grid of the stored fields, e.g. "reduced_gg" or "sh". Required for the
+        "ml" and "pl" level types.
+    step : int, optional
+        Forecast step in hours. Required for "ml" data on the "reduced_gg" grid,
+        where it selects one of the two files; ignored otherwise.
+    remap : bool
+        Select the "_O400" variant of an "sh" file.
+
+    Returns
+    -------
+    xr.Dataset
+        Dataset opened through the ``reference://`` filesystem.
+
+    Raises
+    ------
+    ValueError
+        If the arguments do not identify a file.
+    requests.HTTPError
+        If the reference file cannot be downloaded.
+    """
+    if leveltype not in {"pl", "ml", "sfc", "wave"}:
+        raise ValueError(
+            f"Invalid leveltype: '{leveltype}'. Available leveltypes: pl, ml, sfc, wave"
+        )
+
+    if leveltype in {"ml", "pl"} and not gridtype:
+        raise ValueError(
+            f"Gridtype is required for leveltype '{leveltype}'. Available: reduced_gg, sh"
+        )
+
+    if remap and gridtype != "sh":
+        raise ValueError("Only 'sh' fields can be remapped.")
+
+    if leveltype == "wave" and gridtype != "reduced_ll":
+        print("Warning: Wave model data are stored on a reduced_ll grid.")
+
+    if leveltype == "sfc" and gridtype != "reduced_gg":
+        print("Warning: Surface level data are stored on a reduced_gg grid.")
+
+    # Compare against None so that the valid step 0 is not mistaken for "unset".
+    if step is not None and not (leveltype == "ml" and gridtype == "reduced_gg"):
+        print(
+            "Warning: Specifying 'step' is unnecessary for this configuration and will be ignored."
+        )
+
+    if leveltype in {"sfc", "wave"}:
+        url = f"{BASE_URL}/hplp/hplp_{leveltype}.grib"
+    elif leveltype == "ml" and gridtype == "reduced_gg":
+        if step is None:
+            raise ValueError(
+                "The ml reduced_gg data are split into two parts:\n"
+                " - Steps: 0, 12, 24, 36, 48, 60, 72, 84, 96, 108, 120 (2020-07-21T00:00:00 to 2020-07-26T00:00:00)\n"
+                " - Steps: 132, 144, 156, 168, 180, 192, 204, 216, 228, 240 (2020-07-26T12:00:00 to 2020-07-31T00:00:00)\n"
+                "Specify a step smaller than or equal to 120 for accessing the first part, \n"
+                "and a step greater or equal to 132 for accessing the second part."
+            )
+        if step <= 120:
+            url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}_levels_0_120.grib"
+        else:
+            url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}_levels_132_240.grib"
+    else:
+        url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}" + (
+            "_O400.grib" if remap else ".grib"
+        )
+
+    # Raise on HTTP errors instead of parsing an error page as JSON, and do not
+    # wait forever on an unresponsive object store.
+    response = requests.get(f"{url}.ref", timeout=60)
+    response.raise_for_status()
+    ref = response.json()
+
+    print(f"Loading dataset {url}")
+
+    return xr.open_dataset(
+        "reference://",
+        engine="zarr",
+        backend_kwargs=dict(storage_options=dict(fo=ref, asynchronous=False)),
+        consolidated=False,
+    )
+
+
+def regrid_to_regular(ds, in_grid, out_grid):
+    """Regrid dataset to a regular lat-lon grid.
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        The input dataset to regrid. Every data variable is regridded one
+        ``time`` slice at a time.
+    in_grid : dict
+        The input grid specification for earthkit.regrid.interpolate
+    out_grid : dict
+        The output grid specification for earthkit.regrid.interpolate. Must be
+        a regular lat-lon grid with equal spacing in latitude and longitude,
+        e.g. {"grid": [0.25, 0.25]}.
+
+    Returns
+    -------
+    xr.Dataset
+        Dataset with dimensions (time, latitude, longitude); latitudes run from
+        90 to -90 and longitudes from 0 to 360 minus the grid spacing.
+
+    Raises
+    ------
+    ValueError
+        If the latitude and longitude spacing of ``out_grid`` differ.
+    """
+    dx = out_grid["grid"][0]
+    # Validate before the expensive interpolation; unlike ``assert`` this check
+    # is not stripped when Python runs with -O.
+    if dx != out_grid["grid"][1]:
+        raise ValueError(
+            "Only grids with equal latitude and longitude spacing are supported."
+        )
+
+    out_data = {var: [] for var in ds.data_vars}
+    for var in ds.data_vars:
+        for time in ds.time:
+            r = earthkit.regrid.interpolate(
+                ds[var].sel(time=time).values,
+                in_grid=in_grid,
+                out_grid=out_grid,
+                method="linear",
+            )
+            out_data[var].append(r)
+
+    # round() rather than truncation: a quotient that lands just below an
+    # integer because of floating-point error must not lose a grid point.
+    lats = np.linspace(90, -90, int(round(180 / dx)) + 1)
+    lons = np.linspace(0, 360 - dx, int(round(360 / dx)))
+    coords = {
+        "time": ds.time,
+        "latitude": lats,
+        "longitude": lons,
+    }
+    out_ds = xr.Dataset(
+        {
+            var: (("time", "latitude", "longitude"), out_data[var])
+            for var in ds.data_vars
+        },
+        coords=coords,
+    )
+    return out_ds
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--basepath", type=Path, default=Path())
+    args = parser.parse_args()
+
+    ds = open_downloaded_canonicalized_dataset(
+        IFSUncompressedDataset, basepath=args.basepath
+    )
+    open_downloaded_tiny_canonicalized_dataset(
+        IFSUncompressedDataset, basepath=args.basepath
+    )
+
+    for v, da in ds.items():
+        print(f"- {v}: {da.dims}")
From 04f1ec7b0b5cbc5dbd7c1ec87f8a6b9047002087 Mon Sep 17 00:00:00 2001
From: Tim Reichelt
Date: Tue, 25 Nov 2025 11:59:50 +0000
Subject: [PATCH 2/4] Pin earthkit-regrid version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d6b9e78..d4251c7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ dependencies = [
     "cf-xarray~=0.10.0",
     "cftime~=1.6.0",
     "dask>=2024.12.0,<2025.4",
-    "earthkit-regrid>=0.5.0",
+    "earthkit-regrid~=0.5.0",
     "fsspec>=2024.10.0,<2025.4",
     "gribscan>=0.0.14",
     "healpy~=1.18.0",
From b3a6137088faf84f2c7365d695b42e653b9f2d37 Mon Sep 17 00:00:00 2001
From: Tim Reichelt
Date: Mon, 15 Dec 2025 09:38:08 +0000
Subject: [PATCH 3/4] Refresh uv cache before installation

---
 .github/workflows/ci.yml | 2 +-
 pyproject.toml           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0eb312b..f77bea8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -47,7 +47,7 @@ jobs:
         run: uv python install
 
       - name: Install the package
-        run: uv sync --all-extras --all-groups && uv pip install .
+        run: uv sync --refresh --all-extras --all-groups && uv pip install .
 
       - name: Run tests
         run: uv run pytest
diff --git a/pyproject.toml b/pyproject.toml
index d4251c7..c42f0f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ dependencies = [
     "dask>=2024.12.0,<2025.4",
     "earthkit-regrid~=0.5.0",
     "fsspec>=2024.10.0,<2025.4",
-    "gribscan>=0.0.14",
+    "gribscan~=0.0.14",
     "healpy~=1.18.0",
     # These versions need to be pinned to be compatible with the NextGEMS
     # catalog at https://data.nextgems-h2020.eu/online.yaml.
From ecffc828c8783b9000f6d286330067c8802d08b0 Mon Sep 17 00:00:00 2001
From: Tim Reichelt
Date: Mon, 15 Dec 2025 09:50:19 +0000
Subject: [PATCH 4/4] Use permalink

---
Review note (not part of the commit message): the hunk below is anchored on
the docstring layout of load_hplp_data as revised in patch 1 of this series.

 src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py
index 666fce5..adece58 100644
--- a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py
+++ b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py
@@ -64,7 +64,7 @@ def open(download_path: Path) -> xr.Dataset:
 def load_hplp_data(leveltype=None, gridtype=None, step=None, remap=False):
     """Open one GRIB file of the hplp experiment through its reference file.
 
-    Function taken from: https://github.com/climet-eu/compression-lab-notebooks/blob/main/04-example-datasets/01-hplp.ipynb.
+    Function taken from: https://github.com/climet-eu/compression-lab-notebooks/blob/d297ee98be916359fde16ab36f0f9e0681662df8/04-example-datasets/01-hplp.ipynb.
 
     Parameters
     ----------