From 3cb05b0b26f52c89b498619d8f6cd65e4f3f20e7 Mon Sep 17 00:00:00 2001 From: Christopher Harrop Date: Mon, 18 May 2026 09:46:28 -0600 Subject: [PATCH 1/5] Update spack-stack to 2.1.0 and ubuntu to 26.04 --- .github/workflows/docker.yml | 554 ++++++++++++----------------------- README.md | 29 +- docker-compose-test.yml | 14 +- docker-compose.yml | 14 +- frontend/Dockerfile | 227 ++++++++------ master/Dockerfile | 22 +- node/Dockerfile | 22 +- test/test_hello.sh | 9 +- 8 files changed, 391 insertions(+), 500 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 23d0fbb..86bf355 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -8,16 +8,110 @@ on: workflow_dispatch: env: - REGISTRY_FRONTEND_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend - REGISTRY_MASTER_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master - REGISTRY_NODE_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node - AWS_REGION: us-east-2 + SPACK_STACK_VERSION: 2.1.0 + REGISTRY_FRONTEND_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend + REGISTRY_MASTER_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master + REGISTRY_NODE_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node jobs: + resolve_versions: + name: Resolve Version Tags + runs-on: ubuntu-latest + outputs: + ubuntu_version: ${{ steps.resolve.outputs.ubuntu_version }} + steps: + - + name: Resolve concrete ubuntu:latest version + id: resolve + run: | + UBUNTU_VERSION=$(docker run --rm ubuntu:latest bash -lc '. 
/etc/os-release && echo "$VERSION_ID"') + echo "ubuntu_version=${UBUNTU_VERSION}" >> "$GITHUB_OUTPUT" + echo "Resolved ubuntu:latest to VERSION_ID=${UBUNTU_VERSION}" + + build-test-push-amd64: + runs-on: ubuntu2204-8c-32g-300ssd + needs: + - resolve_versions + timeout-minutes: 360 + permissions: + packages: write + contents: read + id-token: write + steps: + - + name: Checkout repository + uses: actions/checkout@v4 + - + name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + logout: false + - + name: Prune pre-loaded GHA docker images + run: | + docker images + docker image prune -a -f + docker images + - + name: Build and start containers + run: docker compose -f docker-compose-test.yml up --build --pull never -d + - + name: Check cluster logs + run: docker compose -f docker-compose-test.yml logs + - + name: Check status of the cluster containers + run: docker compose -f docker-compose-test.yml ps + - + name: Check status of Slurm + run: docker exec spack-stack-frontend sinfo + - + name: Run a Slurm job + run: docker exec spack-stack-frontend srun hostname + - + name: Test ssh access to Slurm compute nodes + run: | + docker exec spack-stack-frontend timeout 1s ssh slurmnode1 hostname + docker exec spack-stack-frontend timeout 1s ssh slurmnode2 hostname + docker exec spack-stack-frontend timeout 1s ssh slurmnode3 hostname + - + name: Load spack-stack envs + run: | + docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/modules/Core ; module load stack-gcc stack-openmpi jedi-mpas-env; module list" + docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/modules/Core ; module load stack-gcc stack-openmpi jedi-fv3-env; module list" + - + name: Compile and run MPI program + run: docker exec spack-stack-frontend bash -l -c "cd test; ./test_hello.sh" + - + name: Tag and push tested amd64 images + run: | + 
VERSION_TAG="ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }}" + + docker tag ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest ${{ env.REGISTRY_FRONTEND_IMAGE }}:${VERSION_TAG} + + docker tag ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest ${{ env.REGISTRY_MASTER_IMAGE }}:${VERSION_TAG} + + docker tag ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest ${{ env.REGISTRY_NODE_IMAGE }}:${VERSION_TAG} + + docker push ${{ env.REGISTRY_FRONTEND_IMAGE }}:latest + docker push ${{ env.REGISTRY_FRONTEND_IMAGE }}:${VERSION_TAG} + docker push ${{ env.REGISTRY_MASTER_IMAGE }}:latest + docker push ${{ env.REGISTRY_MASTER_IMAGE }}:${VERSION_TAG} + docker push ${{ env.REGISTRY_NODE_IMAGE }}:latest + docker push ${{ env.REGISTRY_NODE_IMAGE }}:${VERSION_TAG} + - + name: Shut down Slurm cluster containers + if: always() + run: docker compose -f docker-compose-test.yml down + build-frontend-arm64: runs-on: LinuxARM64-8core-32G-300Gb - #needs: docker_compose_test + needs: + - resolve_versions + - build-test-push-amd64 timeout-minutes: 360 permissions: packages: write @@ -51,21 +145,6 @@ jobs: - name: Test Docker Installation run: docker run hello-world - - - name: Install AWS CLI - run: sudo apt-get install -y --no-install-recommends awscli - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: ${{ env.AWS_REGION }} - role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/${{ secrets.AWS_GITHUB_ROLE }} - role-duration-seconds: 21600 # 6 hours - role-session-name: spackstackslurmcluster-github-actions - - - name: Test authentication - run: | - aws sts get-caller-identity - name: Checkout repository uses: actions/checkout@v4 @@ -77,6 +156,7 @@ jobs: images: ${{ env.REGISTRY_FRONTEND_IMAGE }} tags: | type=raw,value=latest + type=raw,value=ubuntu-${{ 
needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} flavor: | latest=true prefix= @@ -108,9 +188,7 @@ jobs: platforms: linux/arm64 labels: ${{ steps.meta.outputs.labels }} secrets: | - "access_key_id=${{ env.AWS_ACCESS_KEY_ID }}" - "secret_access_key=${{ env.AWS_SECRET_ACCESS_KEY }}" - "session_token=${{ env.AWS_SESSION_TOKEN }}" + "github_token=${{ secrets.GITHUB_TOKEN }}" cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache,mode=max outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true @@ -136,153 +214,11 @@ jobs: with: limit-access-to-actor: true - build-frontend-amd64: - runs-on: ubuntu2204-8c-32g-300ssd - #needs: docker_compose_test - timeout-minutes: 360 - permissions: - packages: write - contents: read - id-token: write - steps: - - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install AWS CLI - run: sudo apt-get install -y --no-install-recommends awscli - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: ${{ env.AWS_REGION }} - role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/${{ secrets.AWS_GITHUB_ROLE }} - role-duration-seconds: 21600 # 6 hours - role-session-name: spackstackslurmcluster-github-actions - - - name: Test authentication - run: | - aws sts get-caller-identity - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_FRONTEND_IMAGE }} - tags: | - type=raw,value=latest - flavor: | - latest=true - prefix= - suffix= - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - logout: false - - - name: 
Prune pre-loaded GHA docker images - run: | - docker images - docker image prune -a -f - docker images - - - name: Build spack-stack and push by digest - id: build - uses: docker/build-push-action@v5 - with: - context: ./frontend - file: ./frontend/Dockerfile - platforms: linux/amd64 - secrets: | - "access_key_id=${{ env.AWS_ACCESS_KEY_ID }}" - "secret_access_key=${{ env.AWS_SECRET_ACCESS_KEY }}" - "session_token=${{ env.AWS_SESSION_TOKEN }}" - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache,mode=max - outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: frontend-digests-linux-amd64 - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 - - - name: Debug session - if: ${{ failure() }} - uses: mxschmitt/action-tmate@v3 - timeout-minutes: 60 - with: - limit-access-to-actor: true - - merge-frontend: - runs-on: ubuntu-latest - needs: - - build-frontend-amd64 - - build-frontend-arm64 - steps: - - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Download digests - uses: actions/download-artifact@v4 - with: - path: /tmp/digests - pattern: frontend-digests-* - merge-multiple: true - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_FRONTEND_IMAGE }} - tags: | - type=raw,value=latest - flavor: | - latest=true - prefix= - suffix= - - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - 
password: ${{ secrets.GITHUB_TOKEN }} - logout: false - - - name: Create manifest list and push - working-directory: /tmp/digests - run: | - docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - $(printf '${{ env.REGISTRY_FRONTEND_IMAGE }}@sha256:%s ' *) - - - name: Inspect image - run: | - docker buildx imagetools inspect ${{ env.REGISTRY_FRONTEND_IMAGE }}:${{ steps.meta.outputs.version }} - build-master-arm64: runs-on: LinuxARM64-8core-32G-300Gb - #needs: docker_compose_test + needs: + - resolve_versions + - build-test-push-amd64 timeout-minutes: 360 permissions: packages: write @@ -327,6 +263,7 @@ jobs: images: ${{ env.REGISTRY_MASTER_IMAGE }} tags: | type=raw,value=latest + type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} flavor: | latest=true prefix= @@ -375,15 +312,44 @@ jobs: if-no-files-found: error retention-days: 1 - build-master-amd64: - runs-on: ubuntu2204-8c-32g-300ssd - #needs: docker_compose_test + build-node-arm64: + runs-on: LinuxARM64-8core-32G-300Gb + needs: + - resolve_versions + - build-test-push-amd64 timeout-minutes: 360 permissions: packages: write contents: read id-token: write steps: + - + # Beta ARM runners do not have Docker installed + name: Install Docker + run: | + # Uninstall incompatible packages + for pkg in docker.io containerd runc; do sudo apt-get remove $pkg; done + # Add Docker's official GPG key: + sudo apt-get update + sudo apt-get install ca-certificates curl + sudo install -m 0755 -d /etc/apt/keyrings + sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc + sudo chmod a+r /etc/apt/keyrings/docker.asc + # Add the repository to Apt sources: + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. 
/etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + sudo apt-get update -y + # Install docker packages + sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + # Allow runner use to run docker without sudo + sudo usermod -aG docker $USER + sudo apt-get install acl + sudo setfacl --modify user:$USER:rw /var/run/docker.sock + - + name: Test Docker Installation + run: docker run hello-world - name: Checkout repository uses: actions/checkout@v4 @@ -392,9 +358,10 @@ jobs: id: meta uses: docker/metadata-action@v5 with: - images: ${{ env.REGISTRY_MASTER_IMAGE }} + images: ${{ env.REGISTRY_NODE_IMAGE }} tags: | type=raw,value=latest + type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} flavor: | latest=true prefix= @@ -417,17 +384,17 @@ jobs: docker image prune -a -f docker images - - name: Build spack-stack and push by digest + name: Build and push by digest id: build uses: docker/build-push-action@v5 with: - context: ./master - file: ./master/Dockerfile - platforms: linux/amd64 + context: ./node + file: ./node/Dockerfile + platforms: linux/arm64 labels: ${{ steps.meta.outputs.labels }} - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache,mode=max - outputs: type=image,name=${{ env.REGISTRY_MASTER_IMAGE }},push-by-digest=true,name-canonical=true,push=true + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache,mode=max + outputs: type=image,name=${{ env.REGISTRY_NODE_IMAGE }},push-by-digest=true,name-canonical=true,push=true - name: Export digest run: | @@ -438,16 +405,21 @@ jobs: name: Upload digest 
uses: actions/upload-artifact@v4 with: - name: master-digests-linux-amd64 + name: node-digests-linux-arm64 path: /tmp/digests/* if-no-files-found: error retention-days: 1 - merge-master: + merge-frontend: runs-on: ubuntu-latest needs: - - build-master-amd64 - - build-master-arm64 + - build-test-push-amd64 + - build-frontend-arm64 + - resolve_versions + permissions: + packages: write + contents: read + id-token: write steps: - name: Checkout repository @@ -457,7 +429,7 @@ jobs: uses: actions/download-artifact@v4 with: path: /tmp/digests - pattern: master-digests-* + pattern: frontend-digests-* merge-multiple: true - name: Set up Docker Buildx @@ -467,9 +439,10 @@ jobs: id: meta uses: docker/metadata-action@v5 with: - images: ${{ env.REGISTRY_MASTER_IMAGE }} + images: ${{ env.REGISTRY_FRONTEND_IMAGE }} tags: | type=raw,value=latest + type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} flavor: | latest=true prefix= @@ -487,134 +460,50 @@ jobs: working-directory: /tmp/digests run: | docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) 
| join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - $(printf '${{ env.REGISTRY_MASTER_IMAGE }}@sha256:%s ' *) + ${{ env.REGISTRY_FRONTEND_IMAGE }}:ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} \ + $(printf '${{ env.REGISTRY_FRONTEND_IMAGE }}@sha256:%s ' *) - name: Inspect image run: | - docker buildx imagetools inspect ${{ env.REGISTRY_MASTER_IMAGE }}:${{ steps.meta.outputs.version }} + docker buildx imagetools inspect ${{ env.REGISTRY_FRONTEND_IMAGE }}:${{ steps.meta.outputs.version }} - build-node-arm64: - runs-on: LinuxARM64-8core-32G-300Gb - #needs: docker_compose_test - timeout-minutes: 360 + merge-master: + runs-on: ubuntu-latest + needs: + - build-test-push-amd64 + - build-master-arm64 + - resolve_versions permissions: packages: write contents: read id-token: write steps: - - - # Beta ARM runners do not have Docker installed - name: Install Docker - run: | - # Uninstall incompatible packages - for pkg in docker.io containerd runc; do sudo apt-get remove $pkg; done - # Add Docker's official GPG key: - sudo apt-get update - sudo apt-get install ca-certificates curl - sudo install -m 0755 -d /etc/apt/keyrings - sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc - sudo chmod a+r /etc/apt/keyrings/docker.asc - # Add the repository to Apt sources: - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ - $(. 
/etc/os-release && echo "$VERSION_CODENAME") stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update -y - # Install docker packages - sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - # Allow runner use to run docker without sudo - sudo usermod -aG docker $USER - sudo apt-get install acl - sudo setfacl --modify user:$USER:rw /var/run/docker.sock - - - name: Test Docker Installation - run: docker run hello-world - name: Checkout repository uses: actions/checkout@v4 - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 + name: Download digests + uses: actions/download-artifact@v4 with: - images: ${{ env.REGISTRY_NODE_IMAGE }} - tags: | - type=raw,value=latest - flavor: | - latest=true - prefix= - suffix= + path: /tmp/digests + pattern: master-digests-* + merge-multiple: true - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - logout: false - - - name: Prune pre-loaded GHA docker images - run: | - docker images - docker image prune -a -f - docker images - - - name: Build and push by digest - id: build - uses: docker/build-push-action@v5 - with: - context: ./node - file: ./node/Dockerfile - platforms: linux/arm64 - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache,mode=max - outputs: type=image,name=${{ env.REGISTRY_NODE_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: 
node-digests-linux-arm64 - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 - - build-node-amd64: - runs-on: ubuntu2204-8c-32g-300ssd - #needs: docker_compose_test - timeout-minutes: 360 - permissions: - packages: write - contents: read - id-token: write - steps: - - - name: Checkout repository - uses: actions/checkout@v4 - name: Docker meta id: meta uses: docker/metadata-action@v5 with: - images: ${{ env.REGISTRY_NODE_IMAGE }} + images: ${{ env.REGISTRY_MASTER_IMAGE }} tags: | type=raw,value=latest + type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} flavor: | latest=true prefix= suffix= - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - name: Login to GHCR uses: docker/login-action@v3 @@ -624,43 +513,27 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} logout: false - - name: Prune pre-loaded GHA docker images + name: Create manifest list and push + working-directory: /tmp/digests run: | - docker images - docker image prune -a -f - docker images - - - name: Build spack-stack and push by digest - id: build - uses: docker/build-push-action@v5 - with: - context: ./node - file: ./node/Dockerfile - platforms: linux/amd64 - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache,mode=max - outputs: type=image,name=${{ env.REGISTRY_NODE_IMAGE }},push-by-digest=true,name-canonical=true,push=true + docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) 
| join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ + ${{ env.REGISTRY_MASTER_IMAGE }}:ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} \ + $(printf '${{ env.REGISTRY_MASTER_IMAGE }}@sha256:%s ' *) - - name: Export digest + name: Inspect image run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: node-digests-linux-amd64 - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 + docker buildx imagetools inspect ${{ env.REGISTRY_MASTER_IMAGE }}:${{ steps.meta.outputs.version }} merge-node: runs-on: ubuntu-latest needs: - - build-node-amd64 + - build-test-push-amd64 - build-node-arm64 + - resolve_versions + permissions: + packages: write + contents: read + id-token: write steps: - name: Checkout repository @@ -683,6 +556,7 @@ jobs: images: ${{ env.REGISTRY_NODE_IMAGE }} tags: | type=raw,value=latest + type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} flavor: | latest=true prefix= @@ -700,60 +574,10 @@ jobs: working-directory: /tmp/digests run: | docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) 
| join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ + ${{ env.REGISTRY_NODE_IMAGE }}:ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} \ $(printf '${{ env.REGISTRY_NODE_IMAGE }}@sha256:%s ' *) - name: Inspect image run: | docker buildx imagetools inspect ${{ env.REGISTRY_NODE_IMAGE }}:${{ steps.meta.outputs.version }} - docker-compose-test: - runs-on: ubuntu2204-8c-32g-300ssd - needs: - - merge-frontend - - merge-master - - merge-node - steps: - - - name: Checkout Repository - uses: actions/checkout@v4 - - - name: Build and start containers - run: docker compose -f docker-compose-test.yml up --build -d - - - - name: Check cluster logs - run: docker compose -f docker-compose-test.yml logs - - - - name: Check status of the cluster containers - run: docker compose -f docker-compose-test.yml ps - - - - name: Check status of Slurm - run: docker exec spack-stack-frontend sinfo - - - - name: Run a Slurm job - run: docker exec spack-stack-frontend srun hostname - - - - name: Test ssh access to Slurm compute nodes - run: | - docker exec spack-stack-frontend timeout 1s ssh slurmnode1 hostname - docker exec spack-stack-frontend timeout 1s ssh slurmnode2 hostname - docker exec spack-stack-frontend timeout 1s ssh slurmnode3 hostname - - - - name: Load spack-stack envs - run: | - docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/install/modulefiles/Core ; module load stack-gcc stack-openmpi stack-python jedi-mpas-env; module list" - docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/install/modulefiles/Core ; module load stack-gcc stack-openmpi stack-python jedi-fv3-env; module list" - - - - name: Compile and run MPI program - run: | - docker exec spack-stack-frontend bash -l -c "cd test; ./test_hello.sh" - - - - name: Shut down Slurm cluster containers - run: docker compose -f docker-compose-test.yml down diff --git a/README.md b/README.md index 
87d0aa8..4f1984d 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,32 @@ sizes. The cluster behaves as if it were running on multiple nodes even if the containers are all running on the same host machine. +# Building the Containers + +To build the containers from source: + +## Master and Node Containers + +```bash +docker build -t ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest -f master/Dockerfile master/ +docker build -t ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest -f node/Dockerfile node/ +``` + +## Frontend Container + +The frontend container requires a GitHub personal access token (PAT) with package write permissions to push built packages to the GitHub Container Registry build cache. Set your token in an environment variable and pass it as a secret during build: + +```bash +export GITHUB_TOKEN=your_github_pat_here +docker build --progress=plain \ + --secret id=github_token,env=GITHUB_TOKEN \ + -t ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest \ + -f frontend/Dockerfile \ + frontend/ +``` + +**Note:** The `--progress=plain` flag shows full build output. The frontend build compiles 355+ scientific software packages from source and can take several hours on first build. Subsequent builds use the cached packages from GHCR. + # Quick Start To start the slurm cluster environment: @@ -69,10 +95,9 @@ docker exec -it spack-stack-frontend bash -l Next, load the spack-stack base environment: ``` -module use /opt/spack-stack/envs/unified-env/install/modulefiles/Core +module use /opt/spack-stack/envs/unified-env/modules/Core module load stack-gcc module load stack-openmpi -module load stack-python ``` Once the basic spack-stack modules are loaded, you can choose from multiple spack-stack environments for different purposes. 
diff --git a/docker-compose-test.yml b/docker-compose-test.yml index 043ad8c..0af8911 100644 --- a/docker-compose-test.yml +++ b/docker-compose-test.yml @@ -3,7 +3,7 @@ services: build: context: ./frontend dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest container_name: spack-stack-frontend hostname: slurmfrontend user: admin @@ -17,7 +17,7 @@ services: build: context: ./master dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest container_name: spack-stack-master hostname: slurmmaster user: admin @@ -35,7 +35,7 @@ services: build: context: ./node dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node1 hostname: slurmnode1 user: admin @@ -49,7 +49,7 @@ services: links: - slurmmaster slurmnode2: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node2 hostname: slurmnode2 user: admin @@ -63,7 +63,7 @@ services: links: - slurmmaster slurmnode3: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node3 hostname: slurmnode3 user: admin @@ -77,7 +77,7 @@ services: links: - slurmmaster slurmnode4: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node4 hostname: slurmnode4 user: admin @@ -91,7 +91,7 @@ services: links: - slurmmaster slurmnode5: - image: 
ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node5 hostname: slurmnode5 user: admin diff --git a/docker-compose.yml b/docker-compose.yml index 2c8f9fe..0da1643 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,7 @@ services: build: context: ./frontend dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest container_name: spack-stack-frontend hostname: slurmfrontend user: admin @@ -16,7 +16,7 @@ services: build: context: ./master dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest container_name: spack-stack-master hostname: slurmmaster user: admin @@ -33,7 +33,7 @@ services: build: context: ./node dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node1 hostname: slurmnode1 user: admin @@ -46,7 +46,7 @@ services: links: - slurmmaster slurmnode2: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node2 hostname: slurmnode2 user: admin @@ -59,7 +59,7 @@ services: links: - slurmmaster slurmnode3: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node3 hostname: slurmnode3 user: admin @@ -72,7 +72,7 @@ services: links: - slurmmaster slurmnode4: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: 
ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node4 hostname: slurmnode4 user: admin @@ -85,7 +85,7 @@ services: links: - slurmmaster slurmnode5: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node5 hostname: slurmnode5 user: admin diff --git a/frontend/Dockerfile b/frontend/Dockerfile index cecd539..aa0a7c5 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -8,16 +8,16 @@ SHELL ["/bin/bash", "-c"] # Install OS packages RUN apt-get -y update \ && apt-get -y install --no-install-recommends \ - awscli \ bc \ build-essential \ ca-certificates \ curl \ - emacs \ + emacs \ file \ - g++ \ - gcc \ - gfortran \ + gcc-13 \ + g++-13 \ + gfortran-13 \ + cpp-13 \ git \ gnupg2 \ iproute2 \ @@ -28,22 +28,21 @@ RUN apt-get -y update \ python3-pip \ python3-setuptools \ subversion \ + lua5.4 \ + liblua5.4-dev \ + lua-posix \ + lua-filesystem \ tcl-dev \ tcsh \ unzip \ zstd \ - && pip3 install boto3 \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 100 \ + --slave /usr/bin/g++ g++ /usr/bin/g++-13 \ + --slave /usr/bin/gfortran gfortran /usr/bin/gfortran-13 -# Install Lua and Lmod -RUN wget https://sourceforge.net/projects/lmod/files/lua-5.1.4.9.tar.bz2 \ - && tar xvfj lua-5.1.4.9.tar.bz2 \ - && pushd lua-5.1.4.9 \ - && ./configure --prefix=/usr \ - && make -j 4 \ - && make install \ - && popd \ - && git clone --recursive https://github.com/TACC/Lmod.git \ +# Install Lmod +RUN git clone --recursive https://github.com/TACC/Lmod.git \ && pushd Lmod \ && ./configure --prefix=/usr \ && make -j 4 \ @@ -51,106 +50,166 @@ RUN wget https://sourceforge.net/projects/lmod/files/lua-5.1.4.9.tar.bz2 \ && echo "source /usr/lmod/lmod/init/bash" >> /etc/bash.bashrc \ && echo "source /usr/lmod/lmod/init/bash" >> 
/etc/profile \ && popd \ - && rm -rf lua* Lmod - -# Copy patch files into /tmp for use when installing spack-stack -COPY cc.patch.aarch64 /tmp -COPY cc.patch.x86_64 /tmp -COPY openmpi.package.py.patch.aarch64 /tmp -COPY openmpi.package.py.patch.x86_64 /tmp + && rm -rf Lmod -# Clone spack-stack and create and configure the unified env +# Clone spack-stack 2.1.0 RUN cd /opt \ - && git clone -b release/1.8.0 --recurse-submodules https://github.com/jcsda/spack-stack.git \ - && pushd spack-stack \ - && . ./setup.sh \ - && pushd spack \ - && mv /tmp/cc.patch.$(uname -m) cc.patch \ - && mv /tmp/openmpi.package.py.patch.$(uname -m) openmpi.package.py.patch \ - && patch -f -p0 < openmpi.package.py.patch \ - && patch -f -p0 < cc.patch \ - && popd \ - && spack stack create env --site linux.default --template unified-dev --name unified-env --compiler gcc \ - && pushd envs/unified-env \ + && git clone -b 2.1.0 --recurse-submodules https://github.com/jcsda/spack-stack.git + +# Create and configure the unified env using the container site +RUN cd /opt/spack-stack \ + && source setup.sh \ + && spack stack create env --site container --template unified-dev --name unified-env --compiler gcc \ + && cd envs/unified-env \ && spack env activate . 
\ - && spack mirror add --s3-access-key-id "" --s3-access-key-secret "" s3_spack_stack_buildcache_ro s3://chiltepin-us-east-2/spack-stack/ \ - && export SPACK_SYSTEM_CONFIG_PATH="$PWD/site" \ - && spack external find --scope system \ + # Fix system external versions for Ubuntu 26.04 \ + && spack external find --scope "env:/opt/spack-stack/envs/unified-env:/opt/spack-stack/envs/unified-env/site" \ --exclude cmake \ --exclude curl \ --exclude openssl \ --exclude openssh \ --exclude python \ - && spack external find --scope system wget \ - && spack compiler find --scope system \ - && unset SPACK_SYSTEM_CONFIG_PATH \ - && spack config add "packages:all:compiler:[gcc@11.4.0]" \ - && spack config add "packages:all:providers:mpi:[openmpi@4.1.6]" \ - && spack config add "packages:fontconfig:variants:+pic" \ - && spack config add "packages:pixman:variants:+pic" \ - && spack config add "packages:cairo:variants:+pic" \ - && spack config --scope env:/opt/spack-stack/envs/unified-env:common add "packages:openmpi:require:~internal-hwloc +two_level_namespace schedulers=slurm +pmi" \ + && spack external find --scope "env:/opt/spack-stack/envs/unified-env:/opt/spack-stack/envs/unified-env/site" wget \ + # Add slurm as an external package \ && echo " slurm:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ && echo " externals:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ - && echo " - spec: slurm@23.11.7" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + && echo " - spec: slurm@25.11.5" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ && echo " prefix: /usr" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ && echo " buildable: false" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ - && if [ "$(uname -m)" == "aarch64" ]; then \ - spack config --scope env:/opt/spack-stack/envs/unified-env:common remove "packages:wgrib2" ; \ - spack config --scope env:/opt/spack-stack/envs/unified-env:common remove 
"modules:default:lmod:wgrib2" ; \ - fi \ + # Add munge as an external package so spack uses the system munge (same one munged starts at boot) \ + # rather than building its own, which would cause LD_LIBRARY_PATH conflicts with system Slurm tools; the version falls back to 0.5.16 when dpkg lists no installed munge \ + && MUNGE_VERSION=$(dpkg -l munge 2>/dev/null | awk '/^ii/{print $3}' | cut -d: -f2 | cut -d- -f1) && MUNGE_VERSION=${MUNGE_VERSION:-0.5.16} \ + && echo " munge:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + && echo " externals:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + && echo " - spec: munge@${MUNGE_VERSION}" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + && echo " prefix: /usr" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + && echo " buildable: false" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + # Configure openmpi to use Slurm scheduler integration (OpenMPI 5 uses PMIx, not +pmi) \ + && spack -e . config add 'packages:openmpi:require:[schedulers=slurm]' \ + # Ensure PMIx includes munge security plugin to avoid psec/munge runtime warnings. + # Use require (not just variants preference) so concretization cannot silently pick ~munge. \ + && spack -e .
config add 'packages:pmix:require:[+munge]' \ + # Configure lmod modules instead of tcl \ && sed -i 's/tcl/lmod/g' site/modules.yaml \ - && sed -i 's/tcl/lmod/g' common/modules.yaml \ - && sed -i 's:{^mpi.name}/{^mpi.version}/{compiler.name}/{compiler.version}/{name}:{name}:' common/modules.yaml \ - && sed -i 's:{compiler.name}/{compiler.version}/{name}:{name}:' common/modules.yaml + # Use unhashed module names like other spack-stack deployments, with Ursa-style suffix rules to avoid naming clashes \ + && printf '%s\n' \ + ' hash_length: 0' \ + ' all:' \ + ' suffixes:' \ + ' +debug: debug' \ + ' build_type=Debug: debug' \ + ' esmf:' \ + ' suffixes:' \ + ' ~openmp: noopenmp' \ + ' ip:' \ + ' suffixes:' \ + ' ~openmp: noopenmp' \ + ' neptune-env:' \ + ' suffixes:' \ + ' ~openmp: noopenmp' \ + ' mapl:' \ + ' suffixes:' \ + ' ^esmf@8.6.1~debug ~openmp snapshot=none: esmf-8.6.1-noopenmp' \ + ' ^esmf@8.6.1+debug ~openmp snapshot=none: esmf-8.6.1-debug-noopenmp' \ + ' ^esmf@8.6.1~debug +openmp snapshot=none: esmf-8.6.1' \ + ' ^esmf@8.6.1+debug +openmp snapshot=none: esmf-8.6.1-debug' \ + ' ^esmf@8.8.0~debug ~openmp snapshot=none: esmf-8.8.0-noopenmp' \ + ' ^esmf@8.8.0+debug ~openmp snapshot=none: esmf-8.8.0-debug-noopenmp' \ + ' ^esmf@8.8.0~debug +openmp snapshot=none: esmf-8.8.0' \ + ' ^esmf@8.8.0+debug +openmp snapshot=none: esmf-8.8.0-debug' \ + ' ^esmf@8.9.1~debug ~openmp snapshot=none: esmf-8.9.1-noopenmp' \ + ' ^esmf@8.9.1+debug ~openmp snapshot=none: esmf-8.9.1-debug-noopenmp' \ + ' ^esmf@8.9.1~debug +openmp snapshot=none: esmf-8.9.1' \ + ' ^esmf@8.9.1+debug +openmp snapshot=none: esmf-8.9.1-debug' \ + ' fms:' \ + ' suffixes:' \ + ' constants=GEOS: geos-constants' \ + ' constants=GFS: gfs-constants' \ + >> site/modules.yaml \ + # Force env module root so final stage can copy a stable path \ + && spack -e . config add 'modules:default:roots:lmod:$env/modules' + # TODO: Check if wgrib2 removal is still needed for aarch64 with spack-stack 2.1.0 and ubuntu 26.04. 
+ # This was needed for old spack-stack and ubuntu 22.04, but may be obsolete now. + # Uncomment below if aarch64 CI build fails with wgrib2 error: + # && if [ "$(uname -m)" == "aarch64" ]; then \ + # spack config --scope common remove "packages:wgrib2" ; \ + # spack config --scope common remove "modules:default:lmod:wgrib2" ; \ + # fi -# Concretize the Spack environment +# Add the build cache mirror and concretize RUN cd /opt/spack-stack \ - && . ./setup.sh \ - && cd /opt/spack-stack/envs/unified-env \ + && source setup.sh \ + && cd envs/unified-env \ && spack env activate . \ - && spack concretize 2>&1 | tee log.concretize + && spack mirror add --unsigned ghcr_buildcache oci://ghcr.io/noaa-gsl/dockerspackstackslurmcluster/buildcache \ + && spack concretize 2>&1 | tee log.concretize \ + && spack spec openmpi | tee log.openmpi.spec \ + && grep -Eq 'schedulers(:=|=)slurm' log.openmpi.spec \ + && spack spec pmix | tee log.pmix.spec \ + && grep -q '+munge' log.pmix.spec \ + && grep -q 'munge' log.pmix.spec # Install the Spack environment -RUN --mount=type=secret,id=access_key_id --mount=type=secret,id=secret_access_key --mount=type=secret,id=session_token <&1 | tee log.install - if [ -f /run/secrets/access_key_id ]; then - spack buildcache update-index s3_spack_stack_buildcache_rw + # Install the environment + spack install --no-check-signature 2>&1 | tee log.install + + # Update the build cache index if credentials were provided + if [ -f /run/secrets/github_token ]; then + spack buildcache update-index ghcr_buildcache fi EOF # Create the modulefiles and cleanup -RUN cd /opt \ - && pushd spack-stack \ - && . ./setup.sh \ - && pushd envs/unified-env \ - && spack env activate . \ +RUN cd /opt/spack-stack \ + && source setup.sh \ + && cd envs/unified-env \ && source /usr/lmod/lmod/init/bash \ - && spack module lmod refresh -y \ - && spack stack setup-meta-modules \ + && spack -e . module lmod refresh -y --delete-tree \ + && spack -e . 
stack setup-meta-modules \ && spack gc -y \ - && find /opt/spack-stack/envs/unified-env/install/gcc/11.4.0 -name .spack -type d -print0 | xargs -0 rm -rf "{}" \ && rm -rf ~/.spack -# Copy installed environment into final images +# Verify module layout before final stage COPY and create a compatibility path if needed +RUN set -euxo pipefail \ + && env_dir=/opt/spack-stack/envs/unified-env \ + && if [ -d "$env_dir/modules" ]; then \ + module_root="$env_dir/modules"; \ + elif [ -d "$env_dir/install/modulefiles" ]; then \ + ln -s "$env_dir/install/modulefiles" "$env_dir/modules"; \ + module_root="$env_dir/modules"; \ + else \ + alt_root=$(find "$env_dir/install" -type d -name modulefiles | head -n 1 || true); \ + if [ -n "$alt_root" ]; then \ + ln -s "$alt_root" "$env_dir/modules"; \ + module_root="$env_dir/modules"; \ + else \ + echo "ERROR: No module directory found under $env_dir"; \ + find "$env_dir" -maxdepth 5 -type d | sort; \ + exit 1; \ + fi; \ + fi \ + && test -d "$module_root/Core" \ + && ls -la "$module_root/Core" + +# Copy installed environment into final image FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-frontend:latest +# Default Slurm MPI plugin so users do not need to pass --mpi=pmix on every srun. 
+ENV SLURM_MPI_TYPE=pmix + COPY --from=builder /usr /usr COPY --from=builder /etc /etc COPY --from=builder /opt/spack-stack/envs/unified-env/install /opt/spack-stack/envs/unified-env/install +COPY --from=builder /opt/spack-stack/envs/unified-env/modules /opt/spack-stack/envs/unified-env/modules diff --git a/master/Dockerfile b/master/Dockerfile index 4d4392a..159e27e 100644 --- a/master/Dockerfile +++ b/master/Dockerfile @@ -11,12 +11,11 @@ RUN < hello.out +srun --mpi=pmix -N 3 --tasks-per-node=2 ./hello.exe | sort > hello.out +diff hello.out hello.baseline + +srun -N 3 --tasks-per-node=2 ./hello.exe | sort > hello.out diff hello.out hello.baseline From 800f7e5f19d582ad333fe410dd1080257356a371 Mon Sep 17 00:00:00 2001 From: Christopher Harrop Date: Mon, 18 May 2026 10:23:41 -0600 Subject: [PATCH 2/5] Use generic x86_64 microarchitecture and add secrets for binary cache use --- .github/workflows/docker.yml | 2 +- docker-compose-test.yml | 6 ++++++ docker-compose.yml | 6 ++++++ frontend/Dockerfile | 7 +++++++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 86bf355..e9217ee 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -58,7 +58,7 @@ jobs: docker images - name: Build and start containers - run: docker compose -f docker-compose-test.yml up --build --pull never -d + run: GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }} docker compose -f docker-compose-test.yml up --build --pull never -d - name: Check cluster logs run: docker compose -f docker-compose-test.yml logs diff --git a/docker-compose-test.yml b/docker-compose-test.yml index 0af8911..5cf6117 100644 --- a/docker-compose-test.yml +++ b/docker-compose-test.yml @@ -3,6 +3,8 @@ services: build: context: ./frontend dockerfile: ./Dockerfile + secrets: + - github_token image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest container_name: spack-stack-frontend hostname: slurmfrontend @@ 
-107,3 +109,7 @@ services: volumes: home-vol: opt-vol: + +secrets: + github_token: + environment: GITHUB_TOKEN diff --git a/docker-compose.yml b/docker-compose.yml index 0da1643..c1343ca 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,6 +3,8 @@ services: build: context: ./frontend dockerfile: ./Dockerfile + secrets: + - github_token image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest container_name: spack-stack-frontend hostname: slurmfrontend @@ -100,3 +102,7 @@ services: volumes: home-vol: opt-vol: + +secrets: + github_token: + environment: GITHUB_TOKEN diff --git a/frontend/Dockerfile b/frontend/Dockerfile index aa0a7c5..37b2e90 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -89,6 +89,13 @@ RUN cd /opt/spack-stack \ # Ensure PMIx includes munge security plugin to avoid psec/munge runtime warnings. # Use require (not just variants preference) so concretization cannot silently pick ~munge. \ && spack -e . config add 'packages:pmix:require:[+munge]' \ + # Normalize target selection so builds on different hosts can share buildcache artifacts. \ + # Use generic targets (x86_64 / aarch64) for maximum portability at the cost of SIMD optimizations. \ + && if [ "$(uname -m)" = "x86_64" ]; then \ + spack -e . config add 'packages:all:target:[x86_64]' ; \ + elif [ "$(uname -m)" = "aarch64" ]; then \ + spack -e . 
config add 'packages:all:target:[aarch64]' ; \ + fi \ # Configure lmod modules instead of tcl \ && sed -i 's/tcl/lmod/g' site/modules.yaml \ # Use unhashed module names like other spack-stack deployments, with Ursa-style suffix rules to avoid naming clashes \ From 2638632bcff9c019a8c7a7031e7769af95795900 Mon Sep 17 00:00:00 2001 From: Christopher Harrop Date: Mon, 18 May 2026 11:03:55 -0600 Subject: [PATCH 3/5] Update CI workflow to build using caches if available --- .github/workflows/docker.yml | 130 +++++++++++++++++++++++++++++------ frontend/Dockerfile | 2 +- 2 files changed, 109 insertions(+), 23 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index e9217ee..2b52c5f 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -42,6 +42,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: network=host - name: Login to GHCR uses: docker/login-action@v3 @@ -57,8 +62,43 @@ jobs: docker image prune -a -f docker images - - name: Build and start containers - run: GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }} docker compose -f docker-compose-test.yml up --build --pull never -d + name: Build frontend image + uses: docker/build-push-action@v5 + with: + context: ./frontend + file: ./frontend/Dockerfile + platforms: linux/amd64 + tags: ${{ env.REGISTRY_FRONTEND_IMAGE }}:latest + secrets: | + "github_token=${{ secrets.GITHUB_TOKEN }}" + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache,mode=max + load: true + - + name: Build master image + uses: docker/build-push-action@v5 + with: + context: ./master + file: ./master/Dockerfile + platforms: linux/amd64 + tags: ${{ env.REGISTRY_MASTER_IMAGE }}:latest + cache-from: 
type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache,mode=max + load: true + - + name: Build node image + uses: docker/build-push-action@v5 + with: + context: ./node + file: ./node/Dockerfile + platforms: linux/amd64 + tags: ${{ env.REGISTRY_NODE_IMAGE }}:latest + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache,mode=max + load: true + - + name: Start containers for testing + run: docker compose -f docker-compose-test.yml up --pull never -d - name: Check cluster logs run: docker compose -f docker-compose-test.yml logs @@ -85,27 +125,76 @@ jobs: - name: Compile and run MPI program run: docker exec spack-stack-frontend bash -l -c "cd test; ./test_hello.sh" - - - name: Tag and push tested amd64 images - run: | - VERSION_TAG="ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }}" - - docker tag ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest ${{ env.REGISTRY_FRONTEND_IMAGE }}:${VERSION_TAG} - - docker tag ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest ${{ env.REGISTRY_MASTER_IMAGE }}:${VERSION_TAG} - - docker tag ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest ${{ env.REGISTRY_NODE_IMAGE }}:${VERSION_TAG} - - docker push ${{ env.REGISTRY_FRONTEND_IMAGE }}:latest - docker push ${{ env.REGISTRY_FRONTEND_IMAGE }}:${VERSION_TAG} - docker push ${{ env.REGISTRY_MASTER_IMAGE }}:latest - docker push ${{ env.REGISTRY_MASTER_IMAGE }}:${VERSION_TAG} - docker push ${{ env.REGISTRY_NODE_IMAGE }}:latest - docker push ${{ env.REGISTRY_NODE_IMAGE }}:${VERSION_TAG} - name: Shut down Slurm cluster containers if: always() run: docker compose -f docker-compose-test.yml 
down + - + name: Push frontend by digest + id: push-frontend + uses: docker/build-push-action@v5 + with: + context: ./frontend + file: ./frontend/Dockerfile + platforms: linux/amd64 + secrets: | + "github_token=${{ secrets.GITHUB_TOKEN }}" + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache + outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true + - + name: Push master by digest + id: push-master + uses: docker/build-push-action@v5 + with: + context: ./master + file: ./master/Dockerfile + platforms: linux/amd64 + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache + outputs: type=image,name=${{ env.REGISTRY_MASTER_IMAGE }},push-by-digest=true,name-canonical=true,push=true + - + name: Push node by digest + id: push-node + uses: docker/build-push-action@v5 + with: + context: ./node + file: ./node/Dockerfile + platforms: linux/amd64 + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache + outputs: type=image,name=${{ env.REGISTRY_NODE_IMAGE }},push-by-digest=true,name-canonical=true,push=true + - + name: Export digests + run: | + mkdir -p /tmp/digests/frontend /tmp/digests/master /tmp/digests/node + frontend_digest="${{ steps.push-frontend.outputs.digest }}" + master_digest="${{ steps.push-master.outputs.digest }}" + node_digest="${{ steps.push-node.outputs.digest }}" + touch "/tmp/digests/frontend/${frontend_digest#sha256:}" + touch "/tmp/digests/master/${master_digest#sha256:}" + touch "/tmp/digests/node/${node_digest#sha256:}" + - + name: Upload frontend digest + uses: actions/upload-artifact@v4 + with: + name: frontend-digests-linux-amd64 + path: /tmp/digests/frontend/* + if-no-files-found: error + retention-days: 1 + - + name: Upload master digest + uses: actions/upload-artifact@v4 + with: + name: master-digests-linux-amd64 + path: /tmp/digests/master/* + 
if-no-files-found: error + retention-days: 1 + - + name: Upload node digest + uses: actions/upload-artifact@v4 + with: + name: node-digests-linux-amd64 + path: /tmp/digests/node/* + if-no-files-found: error + retention-days: 1 build-frontend-arm64: runs-on: LinuxARM64-8core-32G-300Gb @@ -460,7 +549,6 @@ jobs: working-directory: /tmp/digests run: | docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - ${{ env.REGISTRY_FRONTEND_IMAGE }}:ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} \ $(printf '${{ env.REGISTRY_FRONTEND_IMAGE }}@sha256:%s ' *) - name: Inspect image @@ -517,7 +605,6 @@ jobs: working-directory: /tmp/digests run: | docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - ${{ env.REGISTRY_MASTER_IMAGE }}:ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} \ $(printf '${{ env.REGISTRY_MASTER_IMAGE }}@sha256:%s ' *) - name: Inspect image @@ -574,7 +661,6 @@ jobs: working-directory: /tmp/digests run: | docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) 
| join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - ${{ env.REGISTRY_NODE_IMAGE }}:ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} \ $(printf '${{ env.REGISTRY_NODE_IMAGE }}@sha256:%s ' *) - name: Inspect image diff --git a/frontend/Dockerfile b/frontend/Dockerfile index 37b2e90..2ef7445 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -163,7 +163,7 @@ RUN --mount=type=secret,id=github_token < Date: Tue, 19 May 2026 08:28:27 -0600 Subject: [PATCH 4/5] Add tests for arm and cleanup --- .github/workflows/docker.yml | 320 ++++++++-------------- .github/workflows/package-cleanup.yaml | 100 ++++++- README.md | 30 ++ docker-compose-test.yml | 2 + docker-compose.yml | 2 + frontend/Dockerfile | 46 +--- frontend/cc.patch.aarch64 | 11 - frontend/cc.patch.x86_64 | 0 frontend/openmpi.package.py.patch.aarch64 | 10 - frontend/openmpi.package.py.patch.x86_64 | 10 - 10 files changed, 249 insertions(+), 282 deletions(-) delete mode 100644 frontend/cc.patch.aarch64 delete mode 100644 frontend/cc.patch.x86_64 delete mode 100644 frontend/openmpi.package.py.patch.aarch64 delete mode 100644 frontend/openmpi.package.py.patch.x86_64 diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 2b52c5f..d92c171 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -69,6 +69,8 @@ jobs: file: ./frontend/Dockerfile platforms: linux/amd64 tags: ${{ env.REGISTRY_FRONTEND_IMAGE }}:latest + build-args: | + SPACK_BUILD_JOBS=8 secrets: | "github_token=${{ secrets.GITHUB_TOKEN }}" cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache @@ -137,6 +139,8 @@ jobs: context: ./frontend file: ./frontend/Dockerfile platforms: linux/amd64 + build-args: | + SPACK_BUILD_JOBS=8 secrets: | "github_token=${{ secrets.GITHUB_TOKEN }}" cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache @@ -195,12 +199,18 @@ 
jobs: path: /tmp/digests/node/* if-no-files-found: error retention-days: 1 + - + name: Debug session + if: ${{ failure() }} + uses: mxschmitt/action-tmate@v3 + timeout-minutes: 60 + with: + limit-access-to-actor: true - build-frontend-arm64: + build-test-push-arm64: runs-on: LinuxARM64-8core-32G-300Gb needs: - resolve_versions - - build-test-push-amd64 timeout-minutes: 360 permissions: packages: write @@ -208,7 +218,6 @@ jobs: id-token: write steps: - - # Beta ARM runners do not have Docker installed name: Install Docker run: | # Uninstall incompatible packages @@ -237,22 +246,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_FRONTEND_IMAGE }} - tags: | - type=raw,value=latest - type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} - flavor: | - latest=true - prefix= - suffix= - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + with: + driver-opts: network=host - name: Login to GHCR uses: docker/login-action@v3 @@ -268,242 +266,156 @@ jobs: docker image prune -a -f docker images - - name: Build and push by digest - id: build + name: Build frontend image uses: docker/build-push-action@v5 with: context: ./frontend file: ./frontend/Dockerfile platforms: linux/arm64 - labels: ${{ steps.meta.outputs.labels }} + tags: ${{ env.REGISTRY_FRONTEND_IMAGE }}:latest + build-args: | + SPACK_BUILD_JOBS=8 secrets: | "github_token=${{ secrets.GITHUB_TOKEN }}" cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache,mode=max - outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - 
touch "/tmp/digests/${digest#sha256:}" + load: true - - name: Upload digest - uses: actions/upload-artifact@v4 + name: Build master image + uses: docker/build-push-action@v5 with: - name: frontend-digests-linux-arm64 - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 + context: ./master + file: ./master/Dockerfile + platforms: linux/arm64 + tags: ${{ env.REGISTRY_MASTER_IMAGE }}:latest + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache,mode=max + load: true - - name: Debug session - if: ${{ failure() }} - uses: mxschmitt/action-tmate@v3 - timeout-minutes: 60 + name: Build node image + uses: docker/build-push-action@v5 with: - limit-access-to-actor: true - - build-master-arm64: - runs-on: LinuxARM64-8core-32G-300Gb - needs: - - resolve_versions - - build-test-push-amd64 - timeout-minutes: 360 - permissions: - packages: write - contents: read - id-token: write - steps: + context: ./node + file: ./node/Dockerfile + platforms: linux/arm64 + tags: ${{ env.REGISTRY_NODE_IMAGE }}:latest + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache,mode=max + load: true - - # Beta ARM runners do not have Docker installed - name: Install Docker - run: | - # Uninstall incompatible packages - for pkg in docker.io containerd runc; do sudo apt-get remove $pkg; done - # Add Docker's official GPG key: - sudo apt-get update - sudo apt-get install ca-certificates curl - sudo install -m 0755 -d /etc/apt/keyrings - sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc - sudo chmod a+r /etc/apt/keyrings/docker.asc - # Add the repository to Apt sources: - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] 
https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update -y - # Install docker packages - sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - # Allow runner use to run docker without sudo - sudo usermod -aG docker $USER - sudo apt-get install acl - sudo setfacl --modify user:$USER:rw /var/run/docker.sock + name: Start containers for testing + run: docker compose -f docker-compose-test.yml up --pull never -d - - name: Test Docker Installation - run: docker run hello-world + name: Check cluster logs + run: docker compose -f docker-compose-test.yml logs - - name: Checkout repository - uses: actions/checkout@v4 + name: Check status of the cluster containers + run: docker compose -f docker-compose-test.yml ps - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_MASTER_IMAGE }} - tags: | - type=raw,value=latest - type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} - flavor: | - latest=true - prefix= - suffix= + name: Check status of Slurm + run: docker exec spack-stack-frontend sinfo - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + name: Run a Slurm job + run: docker exec spack-stack-frontend srun hostname - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - logout: false + name: Test ssh access to Slurm compute nodes + run: | + docker exec spack-stack-frontend timeout 1s ssh slurmnode1 hostname + docker exec spack-stack-frontend timeout 1s ssh slurmnode2 hostname + docker exec spack-stack-frontend timeout 1s ssh slurmnode3 hostname - - name: Prune pre-loaded GHA docker images + name: Load spack-stack envs run: | - docker images - docker image prune -a 
-f - docker images + docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/modules/Core ; module load stack-gcc stack-openmpi jedi-mpas-env; module list" + docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/modules/Core ; module load stack-gcc stack-openmpi jedi-fv3-env; module list" - - name: Build and push by digest - id: build + name: Compile and run MPI program + run: docker exec spack-stack-frontend bash -l -c "cd test; ./test_hello.sh" + - + name: Shut down Slurm cluster containers + if: always() + run: docker compose -f docker-compose-test.yml down + - + name: Push frontend by digest + id: push-frontend + uses: docker/build-push-action@v5 + with: + context: ./frontend + file: ./frontend/Dockerfile + platforms: linux/arm64 + build-args: | + SPACK_BUILD_JOBS=8 + secrets: | + "github_token=${{ secrets.GITHUB_TOKEN }}" + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache + outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true + - + name: Push master by digest + id: push-master uses: docker/build-push-action@v5 with: context: ./master file: ./master/Dockerfile platforms: linux/arm64 - labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache,mode=max outputs: type=image,name=${{ env.REGISTRY_MASTER_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: master-digests-linux-arm64 - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 - - build-node-arm64: - runs-on: 
LinuxARM64-8core-32G-300Gb - needs: - - resolve_versions - - build-test-push-amd64 - timeout-minutes: 360 - permissions: - packages: write - contents: read - id-token: write - steps: - - - # Beta ARM runners do not have Docker installed - name: Install Docker - run: | - # Uninstall incompatible packages - for pkg in docker.io containerd runc; do sudo apt-get remove $pkg; done - # Add Docker's official GPG key: - sudo apt-get update - sudo apt-get install ca-certificates curl - sudo install -m 0755 -d /etc/apt/keyrings - sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc - sudo chmod a+r /etc/apt/keyrings/docker.asc - # Add the repository to Apt sources: - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update -y - # Install docker packages - sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - # Allow runner use to run docker without sudo - sudo usermod -aG docker $USER - sudo apt-get install acl - sudo setfacl --modify user:$USER:rw /var/run/docker.sock - - - name: Test Docker Installation - run: docker run hello-world - - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_NODE_IMAGE }} - tags: | - type=raw,value=latest - type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} - flavor: | - latest=true - prefix= - suffix= - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - logout: false - - - name: Prune 
pre-loaded GHA docker images - run: | - docker images - docker image prune -a -f - docker images - - - name: Build and push by digest - id: build + name: Push node by digest + id: push-node uses: docker/build-push-action@v5 with: context: ./node file: ./node/Dockerfile platforms: linux/arm64 - labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache,mode=max outputs: type=image,name=${{ env.REGISTRY_NODE_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - name: Export digest + name: Export digests run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" + mkdir -p /tmp/digests/frontend /tmp/digests/master /tmp/digests/node + frontend_digest="${{ steps.push-frontend.outputs.digest }}" + master_digest="${{ steps.push-master.outputs.digest }}" + node_digest="${{ steps.push-node.outputs.digest }}" + touch "/tmp/digests/frontend/${frontend_digest#sha256:}" + touch "/tmp/digests/master/${master_digest#sha256:}" + touch "/tmp/digests/node/${node_digest#sha256:}" + - + name: Upload frontend digest + uses: actions/upload-artifact@v4 + with: + name: frontend-digests-linux-arm64 + path: /tmp/digests/frontend/* + if-no-files-found: error + retention-days: 1 - - name: Upload digest + name: Upload master digest + uses: actions/upload-artifact@v4 + with: + name: master-digests-linux-arm64 + path: /tmp/digests/master/* + if-no-files-found: error + retention-days: 1 + - + name: Upload node digest uses: actions/upload-artifact@v4 with: name: node-digests-linux-arm64 - path: /tmp/digests/* + path: /tmp/digests/node/* if-no-files-found: error retention-days: 1 + - + name: Debug session + if: ${{ failure() }} + uses: mxschmitt/action-tmate@v3 + timeout-minutes: 60 + with: + limit-access-to-actor: true merge-frontend: runs-on: 
ubuntu-latest needs: - build-test-push-amd64 - - build-frontend-arm64 + - build-test-push-arm64 - resolve_versions permissions: packages: write @@ -559,7 +471,7 @@ jobs: runs-on: ubuntu-latest needs: - build-test-push-amd64 - - build-master-arm64 + - build-test-push-arm64 - resolve_versions permissions: packages: write @@ -615,7 +527,7 @@ jobs: runs-on: ubuntu-latest needs: - build-test-push-amd64 - - build-node-arm64 + - build-test-push-arm64 - resolve_versions permissions: packages: write diff --git a/.github/workflows/package-cleanup.yaml b/.github/workflows/package-cleanup.yaml index 5ebf9a9..692ed16 100644 --- a/.github/workflows/package-cleanup.yaml +++ b/.github/workflows/package-cleanup.yaml @@ -1,40 +1,45 @@ name: PackageCleanup on: - push: - branches: [ main ] - pull_request: - branches: [ main ] workflow_dispatch: + inputs: + buildcache_cutoff: + description: 'Delete buildcache entries older than this (ISO date, e.g. 2026-05-18). Leave empty to skip buildcache cleanup.' + required: false + default: '' + dry_run: + description: 'Set to true to only list what would be deleted without actually deleting' + required: false + default: 'true' jobs: - cleanup-packages: + cleanup-untagged: runs-on: ubuntu-latest permissions: packages: write contents: read steps: - - name: Remove untagged versions of dockerspackstackslurmcluster/frontend + name: Remove untagged versions of dockerspackstackslurmcluster/slurm-spack-stack-frontend uses: actions/delete-package-versions@v5 with: - package-name: 'dockerspackstackslurmcluster/frontend' + package-name: 'dockerspackstackslurmcluster/slurm-spack-stack-frontend' package-type: 'container' min-versions-to-keep: 0 delete-only-untagged-versions: 'true' - - name: Remove untagged versions of dockerspackstackslurmcluster/master + name: Remove untagged versions of dockerspackstackslurmcluster/slurm-spack-stack-master uses: actions/delete-package-versions@v5 with: - package-name: 'dockerspackstackslurmcluster/master' + package-name: 
'dockerspackstackslurmcluster/slurm-spack-stack-master' package-type: 'container' min-versions-to-keep: 0 delete-only-untagged-versions: 'true' - - name: Remove untagged versions of dockerspackstackslurmcluster/node + name: Remove untagged versions of dockerspackstackslurmcluster/slurm-spack-stack-node uses: actions/delete-package-versions@v5 with: - package-name: 'dockerspackstackslurmcluster/node' + package-name: 'dockerspackstackslurmcluster/slurm-spack-stack-node' package-type: 'container' min-versions-to-keep: 0 delete-only-untagged-versions: 'true' @@ -86,3 +91,76 @@ jobs: package-type: 'container' min-versions-to-keep: 0 delete-only-untagged-versions: 'true' + + cleanup-stale-buildcache: + if: ${{ github.event.inputs.buildcache_cutoff != '' }} + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + steps: + - + name: Clean stale buildcache entries + uses: actions/github-script@v7 + with: + script: | + const cutoff = new Date('${{ github.event.inputs.buildcache_cutoff }}'); + const dryRun = '${{ github.event.inputs.dry_run }}' === 'true'; + const org = 'noaa-gsl'; + const packageName = 'dockerspackstackslurmcluster/buildcache'; + + console.log(`Cutoff date: ${cutoff.toISOString()}`); + console.log(`Dry run: ${dryRun}`); + + let deleted = 0; + let kept = 0; + let page = 1; + const perPage = 100; + + while (true) { + const versions = await github.rest.packages.getAllPackageVersionsForPackageOwnedByOrg({ + package_type: 'container', + package_name: packageName, + org: org, + per_page: perPage, + page: page, + }); + + if (versions.data.length === 0) break; + + for (const version of versions.data) { + const createdAt = new Date(version.created_at); + const tags = version.metadata?.container?.tags || []; + + // Never delete index entries - they are updated in place by spack buildcache update-index + const isIndex = tags.some(t => t.includes('index') || t.startsWith('_')); + if (isIndex) { + console.log(`Preserving index: ${version.id} (tags: 
${tags.join(', ')})`); + kept++; + continue; + } + + if (createdAt < cutoff) { + if (dryRun) { + console.log(`[DRY RUN] Would delete: ${version.id} (created ${createdAt.toISOString()}, tags: ${tags.join(', ')})`); + } else { + console.log(`Deleting: ${version.id} (created ${createdAt.toISOString()}, tags: ${tags.join(', ')})`); + await github.rest.packages.deletePackageVersionForOrg({ + package_type: 'container', + package_name: packageName, + org: org, + package_version_id: version.id, + }); + } + deleted++; + } else { + console.log(`Keeping: ${version.id} (created ${createdAt.toISOString()}, tags: ${tags.join(', ')})`); + kept++; + } + } + + if (versions.data.length < perPage) break; + page++; + } + + console.log(`\nSummary: ${deleted} ${dryRun ? 'would be ' : ''}deleted, ${kept} kept`); diff --git a/README.md b/README.md index 4f1984d..af74cf7 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,36 @@ docker build --progress=plain \ **Note:** The `--progress=plain` flag shows full build output. The frontend build compiles 355+ scientific software packages from source and can take several hours on first build. Subsequent builds use the cached packages from GHCR. +### Configuring Parallel Build Jobs + +The frontend Dockerfile uses the `SPACK_BUILD_JOBS` build argument to control how many packages Spack compiles in parallel (default: 8). This should match the number of CPU cores available: + +**For 8-core systems (default):** +```bash +docker build --build-arg SPACK_BUILD_JOBS=8 ... +``` + +**For 16-core systems:** +```bash +docker build --build-arg SPACK_BUILD_JOBS=16 ... 
+``` + +**With Docker Compose:** +```bash +docker compose build --build-arg SPACK_BUILD_JOBS=16 +``` + +You can also modify the default in `docker-compose.yml`: +```yaml +services: + slurmfrontend: + build: + args: + SPACK_BUILD_JOBS: 16 # Change from default 8 +``` + +**Performance note:** Increasing from 8 to 16 jobs typically provides 20-40% speedup (not 2x) due to dependency constraints and potential memory pressure. On 32GB RAM systems, 16 parallel jobs leaves only ~2GB per job, which may cause swapping for memory-intensive packages like ESMF or JEDI components. + # Quick Start To start the slurm cluster environment: diff --git a/docker-compose-test.yml b/docker-compose-test.yml index 5cf6117..6ab0d06 100644 --- a/docker-compose-test.yml +++ b/docker-compose-test.yml @@ -3,6 +3,8 @@ services: build: context: ./frontend dockerfile: ./Dockerfile + args: + SPACK_BUILD_JOBS: 8 secrets: - github_token image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest diff --git a/docker-compose.yml b/docker-compose.yml index c1343ca..7ff97c6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,6 +3,8 @@ services: build: context: ./frontend dockerfile: ./Dockerfile + args: + SPACK_BUILD_JOBS: 8 secrets: - github_token image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest diff --git a/frontend/Dockerfile b/frontend/Dockerfile index 2ef7445..801f1b7 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -1,5 +1,8 @@ FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-frontend:latest AS builder +# Default to 8 build jobs; override with --build-arg SPACK_BUILD_JOBS=16 for larger runners +ARG SPACK_BUILD_JOBS=8 + ENV DEBIAN_FRONTEND=noninteractive ENV TZ=Etc/UTC @@ -98,7 +101,7 @@ RUN cd /opt/spack-stack \ fi \ # Configure lmod modules instead of tcl \ && sed -i 's/tcl/lmod/g' site/modules.yaml \ - # Use unhashed module names like other spack-stack deployments, with Ursa-style suffix rules to avoid naming 
clashes \ + # Use unhashed module names like other spack-stack deployments, with suffix rules to avoid naming clashes \ && printf '%s\n' \ ' hash_length: 0' \ ' all:' \ @@ -135,13 +138,6 @@ RUN cd /opt/spack-stack \ >> site/modules.yaml \ # Force env module root so final stage can copy a stable path \ && spack -e . config add 'modules:default:roots:lmod:$env/modules' - # TODO: Check if wgrib2 removal is still needed for aarch64 with spack-stack 2.1.0 and ubuntu 26.04. - # This was needed for old spack-stack and ubuntu 22.04, but may be obsolete now. - # Uncomment below if aarch64 CI build fails with wgrib2 error: - # && if [ "$(uname -m)" == "aarch64" ]; then \ - # spack config --scope common remove "packages:wgrib2" ; \ - # spack config --scope common remove "modules:default:lmod:wgrib2" ; \ - # fi # Add the build cache mirror and concretize RUN cd /opt/spack-stack \ @@ -149,12 +145,9 @@ RUN cd /opt/spack-stack \ && cd envs/unified-env \ && spack env activate . \ && spack mirror add --unsigned ghcr_buildcache oci://ghcr.io/noaa-gsl/dockerspackstackslurmcluster/buildcache \ - && spack concretize 2>&1 | tee log.concretize \ - && spack spec openmpi | tee log.openmpi.spec \ - && grep -Eq 'schedulers(:=|=)slurm' log.openmpi.spec \ - && spack spec pmix | tee log.pmix.spec \ - && grep -q '+munge' log.pmix.spec \ - && grep -q 'munge' log.pmix.spec + && spack concretize > /dev/null 2>&1 \ + && spack spec openmpi 2>&1 | grep -Eq 'schedulers(:=|=)slurm' \ + && spack spec pmix 2>&1 | grep -q '+munge' # Install the Spack environment RUN --mount=type=secret,id=github_token < Date: Tue, 19 May 2026 10:27:38 -0600 Subject: [PATCH 5/5] Fix incorrect message in README, don't run CI on changes to README --- .github/workflows/docker.yml | 8 ++++++++ README.md | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index d92c171..51fed34 100644 --- a/.github/workflows/docker.yml +++ 
b/.github/workflows/docker.yml @@ -3,8 +3,16 @@ name: Docker Slurm on: push: branches: [ main ] + paths-ignore: + - '**.md' + - 'LICENSE' + - '.gitignore' pull_request: branches: [ main ] + paths-ignore: + - '**.md' + - 'LICENSE' + - '.gitignore' workflow_dispatch: env: diff --git a/README.md b/README.md index af74cf7..d048711 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ docker build --progress=plain \ ### Configuring Parallel Build Jobs -The frontend Dockerfile uses the `SPACK_BUILD_JOBS` build argument to control how many packages Spack compiles in parallel (default: 8). This should match the number of CPU cores available: +The frontend Dockerfile uses the `SPACK_BUILD_JOBS` build argument to control the number of parallel make jobs (`-j` flag) used when building each package (default: 8). This should match the number of CPU cores available: **For 8-core systems (default):** ```bash @@ -80,7 +80,7 @@ services: SPACK_BUILD_JOBS: 16 # Change from default 8 ``` -**Performance note:** Increasing from 8 to 16 jobs typically provides 20-40% speedup (not 2x) due to dependency constraints and potential memory pressure. On 32GB RAM systems, 16 parallel jobs leaves only ~2GB per job, which may cause swapping for memory-intensive packages like ESMF or JEDI components. +**Performance note:** Higher values speed up compilation of individual packages, especially large ones like ESMF, JEDI components, and NetCDF. However, on 32GB RAM systems, values above 8 may cause memory pressure during compilation of memory-intensive Fortran packages, potentially leading to swapping or OOM errors. # Quick Start