diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 23d0fbb..51fed34 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -3,142 +3,44 @@ name: Docker Slurm on: push: branches: [ main ] + paths-ignore: + - '**.md' + - 'LICENSE' + - '.gitignore' pull_request: branches: [ main ] + paths-ignore: + - '**.md' + - 'LICENSE' + - '.gitignore' workflow_dispatch: env: - REGISTRY_FRONTEND_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend - REGISTRY_MASTER_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master - REGISTRY_NODE_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node - AWS_REGION: us-east-2 + SPACK_STACK_VERSION: 2.1.0 + REGISTRY_FRONTEND_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend + REGISTRY_MASTER_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master + REGISTRY_NODE_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node jobs: - build-frontend-arm64: - runs-on: LinuxARM64-8core-32G-300Gb - #needs: docker_compose_test - timeout-minutes: 360 - permissions: - packages: write - contents: read - id-token: write + resolve_versions: + name: Resolve Version Tags + runs-on: ubuntu-latest + outputs: + ubuntu_version: ${{ steps.resolve.outputs.ubuntu_version }} steps: - - # Beta ARM runners do not have Docker installed - name: Install Docker + name: Resolve concrete ubuntu:latest version + id: resolve run: | - # Uninstall incompatible packages - for pkg in docker.io containerd runc; do sudo apt-get remove $pkg; done - # Add Docker's official GPG key: - sudo apt-get update - sudo apt-get install ca-certificates curl - sudo install -m 0755 -d /etc/apt/keyrings - sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc - sudo chmod a+r /etc/apt/keyrings/docker.asc - # Add the repository to Apt sources: - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update -y - # Install docker packages - sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - # Allow runner use to run docker without sudo - sudo usermod -aG docker $USER - sudo apt-get install acl - sudo setfacl --modify user:$USER:rw /var/run/docker.sock - - - name: Test Docker Installation - run: docker run hello-world - - - name: Install AWS CLI - run: sudo apt-get install -y --no-install-recommends awscli - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: ${{ env.AWS_REGION }} - role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/${{ secrets.AWS_GITHUB_ROLE }} - role-duration-seconds: 21600 # 6 hours - role-session-name: spackstackslurmcluster-github-actions - - - name: Test authentication - run: | - aws sts get-caller-identity - - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_FRONTEND_IMAGE }} - tags: | - type=raw,value=latest - flavor: | - latest=true - prefix= - suffix= - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - logout: false - - - name: Prune pre-loaded GHA docker images - run: | - docker images - docker image prune -a -f - docker images - - - name: Build and push by digest - id: build - uses: docker/build-push-action@v5 - with: - context: ./frontend - file: ./frontend/Dockerfile - platforms: linux/arm64 - labels: ${{ steps.meta.outputs.labels }} - secrets: | - "access_key_id=${{ env.AWS_ACCESS_KEY_ID }}" - "secret_access_key=${{ env.AWS_SECRET_ACCESS_KEY }}" - "session_token=${{ env.AWS_SESSION_TOKEN }}" - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache,mode=max - outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: frontend-digests-linux-arm64 - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 - - - name: Debug session - if: ${{ failure() }} - uses: mxschmitt/action-tmate@v3 - timeout-minutes: 60 - with: - limit-access-to-actor: true + UBUNTU_VERSION=$(docker run --rm ubuntu:latest bash -lc '. /etc/os-release && echo "$VERSION_ID"') + echo "ubuntu_version=${UBUNTU_VERSION}" >> "$GITHUB_OUTPUT" + echo "Resolved ubuntu:latest to VERSION_ID=${UBUNTU_VERSION}" - build-frontend-amd64: + build-test-push-amd64: runs-on: ubuntu2204-8c-32g-300ssd - #needs: docker_compose_test + needs: + - resolve_versions timeout-minutes: 360 permissions: packages: write @@ -148,36 +50,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - - name: Install AWS CLI - run: sudo apt-get install -y --no-install-recommends awscli - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: ${{ env.AWS_REGION }} - role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/${{ secrets.AWS_GITHUB_ROLE }} - role-duration-seconds: 21600 # 6 hours - role-session-name: spackstackslurmcluster-github-actions - - - name: Test authentication - run: | - aws sts get-caller-identity - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_FRONTEND_IMAGE }} - tags: | - type=raw,value=latest - flavor: | - latest=true - prefix= - suffix= - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + with: + driver-opts: network=host - name: Login to GHCR uses: docker/login-action@v3 @@ -193,309 +70,155 @@ jobs: docker image prune -a -f docker images - - name: Build spack-stack and push by digest - id: build + name: Build frontend image uses: docker/build-push-action@v5 with: context: ./frontend file: ./frontend/Dockerfile platforms: linux/amd64 + tags: ${{ env.REGISTRY_FRONTEND_IMAGE }}:latest + build-args: | + SPACK_BUILD_JOBS=8 secrets: | - "access_key_id=${{ env.AWS_ACCESS_KEY_ID }}" - "secret_access_key=${{ env.AWS_SECRET_ACCESS_KEY }}" - "session_token=${{ env.AWS_SESSION_TOKEN }}" - labels: ${{ steps.meta.outputs.labels }} + "github_token=${{ secrets.GITHUB_TOKEN }}" cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache,mode=max - outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" + load: true - - name: Upload digest - uses: actions/upload-artifact@v4 + name: Build master image + uses: docker/build-push-action@v5 with: - name: frontend-digests-linux-amd64 - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 + context: ./master + file: ./master/Dockerfile + platforms: linux/amd64 + tags: ${{ env.REGISTRY_MASTER_IMAGE }}:latest + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache,mode=max + load: true - - name: Debug session - if: ${{ failure() }} - uses: mxschmitt/action-tmate@v3 - timeout-minutes: 60 + name: Build node image + uses: docker/build-push-action@v5 with: - limit-access-to-actor: true - - merge-frontend: - runs-on: ubuntu-latest - needs: - - build-frontend-amd64 - - build-frontend-arm64 - steps: - - - name: Checkout repository - uses: actions/checkout@v4 + context: ./node + file: ./node/Dockerfile + platforms: linux/amd64 + tags: ${{ env.REGISTRY_NODE_IMAGE }}:latest + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache,mode=max + load: true - - name: Download digests - uses: actions/download-artifact@v4 - with: - path: /tmp/digests - pattern: frontend-digests-* - merge-multiple: true + name: Start containers for testing + run: docker compose -f docker-compose-test.yml up --pull never -d - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + name: Check cluster logs + run: docker compose -f docker-compose-test.yml logs - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_FRONTEND_IMAGE }} - tags: | - type=raw,value=latest - flavor: | - latest=true - prefix= - suffix= + name: Check status of the cluster containers + run: docker compose -f docker-compose-test.yml ps - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - logout: false + name: Check status of Slurm + run: docker exec spack-stack-frontend sinfo - - name: Create manifest list and push - working-directory: /tmp/digests - run: | - docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - $(printf '${{ env.REGISTRY_FRONTEND_IMAGE }}@sha256:%s ' *) + name: Run a Slurm job + run: docker exec spack-stack-frontend srun hostname - - name: Inspect image + name: Test ssh access to Slurm compute nodes run: | - docker buildx imagetools inspect ${{ env.REGISTRY_FRONTEND_IMAGE }}:${{ steps.meta.outputs.version }} - - build-master-arm64: - runs-on: LinuxARM64-8core-32G-300Gb - #needs: docker_compose_test - timeout-minutes: 360 - permissions: - packages: write - contents: read - id-token: write - steps: + docker exec spack-stack-frontend timeout 1s ssh slurmnode1 hostname + docker exec spack-stack-frontend timeout 1s ssh slurmnode2 hostname + docker exec spack-stack-frontend timeout 1s ssh slurmnode3 hostname - - # Beta ARM runners do not have Docker installed - name: Install Docker + name: Load spack-stack envs run: | - # Uninstall incompatible packages - for pkg in docker.io containerd runc; do sudo apt-get remove $pkg; done - # Add Docker's official GPG key: - sudo apt-get update - sudo apt-get install ca-certificates curl - sudo install -m 0755 -d /etc/apt/keyrings - sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc - sudo chmod a+r /etc/apt/keyrings/docker.asc - # Add the repository to Apt sources: - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update -y - # Install docker packages - sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - # Allow runner use to run docker without sudo - sudo usermod -aG docker $USER - sudo apt-get install acl - sudo setfacl --modify user:$USER:rw /var/run/docker.sock + docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/modules/Core ; module load stack-gcc stack-openmpi jedi-mpas-env; module list" + docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/modules/Core ; module load stack-gcc stack-openmpi jedi-fv3-env; module list" - - name: Test Docker Installation - run: docker run hello-world - - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_MASTER_IMAGE }} - tags: | - type=raw,value=latest - flavor: | - latest=true - prefix= - suffix= + name: Compile and run MPI program + run: docker exec spack-stack-frontend bash -l -c "cd test; ./test_hello.sh" - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + name: Shut down Slurm cluster containers + if: always() + run: docker compose -f docker-compose-test.yml down - - name: Login to GHCR - uses: docker/login-action@v3 + name: Push frontend by digest + id: push-frontend + uses: docker/build-push-action@v5 with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - logout: false - - - name: Prune pre-loaded GHA docker images - run: | - docker images - docker image prune -a -f - docker images + context: ./frontend + file: ./frontend/Dockerfile + platforms: linux/amd64 + build-args: | + SPACK_BUILD_JOBS=8 + secrets: | + "github_token=${{ secrets.GITHUB_TOKEN }}" + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache + outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - name: Build and push by digest - id: build + name: Push master by digest + id: push-master uses: docker/build-push-action@v5 with: context: ./master file: ./master/Dockerfile - platforms: linux/arm64 - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache,mode=max + platforms: linux/amd64 + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache outputs: type=image,name=${{ env.REGISTRY_MASTER_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: master-digests-linux-arm64 - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 - - build-master-amd64: - runs-on: ubuntu2204-8c-32g-300ssd - #needs: docker_compose_test - timeout-minutes: 360 - permissions: - packages: write - contents: read - id-token: write - steps: - - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_MASTER_IMAGE }} - tags: | - type=raw,value=latest - flavor: | - latest=true - prefix= - suffix= - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - logout: false - - - name: Prune pre-loaded GHA docker images - run: | - docker images - docker image prune -a -f - docker images - - - name: Build spack-stack and push by digest - id: build + name: Push node by digest + id: push-node uses: docker/build-push-action@v5 with: - context: ./master - file: ./master/Dockerfile + context: ./node + file: ./node/Dockerfile platforms: linux/amd64 - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache,mode=max - outputs: type=image,name=${{ env.REGISTRY_MASTER_IMAGE }},push-by-digest=true,name-canonical=true,push=true + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache + outputs: type=image,name=${{ env.REGISTRY_NODE_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - name: Export digest + name: Export digests run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest + mkdir -p /tmp/digests/frontend /tmp/digests/master /tmp/digests/node + frontend_digest="${{ steps.push-frontend.outputs.digest }}" + master_digest="${{ steps.push-master.outputs.digest }}" + node_digest="${{ steps.push-node.outputs.digest }}" + touch "/tmp/digests/frontend/${frontend_digest#sha256:}" + touch "/tmp/digests/master/${master_digest#sha256:}" + touch "/tmp/digests/node/${node_digest#sha256:}" + - + name: Upload frontend digest uses: actions/upload-artifact@v4 with: - name: master-digests-linux-amd64 - path: /tmp/digests/* + name: frontend-digests-linux-amd64 + path: /tmp/digests/frontend/* if-no-files-found: error - retention-days: 1 - - merge-master: - runs-on: ubuntu-latest - needs: - - build-master-amd64 - - build-master-arm64 - steps: - - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Download digests - uses: actions/download-artifact@v4 - with: - path: /tmp/digests - pattern: master-digests-* - merge-multiple: true - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_MASTER_IMAGE }} - tags: | - type=raw,value=latest - flavor: | - latest=true - prefix= - suffix= + retention-days: 1 - - name: Login to GHCR - uses: docker/login-action@v3 + name: Upload master digest + uses: actions/upload-artifact@v4 with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - logout: false + name: master-digests-linux-amd64 + path: /tmp/digests/master/* + if-no-files-found: error + retention-days: 1 - - name: Create manifest list and push - working-directory: /tmp/digests - run: | - docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - $(printf '${{ env.REGISTRY_MASTER_IMAGE }}@sha256:%s ' *) + name: Upload node digest + uses: actions/upload-artifact@v4 + with: + name: node-digests-linux-amd64 + path: /tmp/digests/node/* + if-no-files-found: error + retention-days: 1 - - name: Inspect image - run: | - docker buildx imagetools inspect ${{ env.REGISTRY_MASTER_IMAGE }}:${{ steps.meta.outputs.version }} + name: Debug session + if: ${{ failure() }} + uses: mxschmitt/action-tmate@v3 + timeout-minutes: 60 + with: + limit-access-to-actor: true - build-node-arm64: + build-test-push-arm64: runs-on: LinuxARM64-8core-32G-300Gb - #needs: docker_compose_test + needs: + - resolve_versions timeout-minutes: 360 permissions: packages: write @@ -503,7 +226,6 @@ jobs: id-token: write steps: - - # Beta ARM runners do not have Docker installed name: Install Docker run: | # Uninstall incompatible packages @@ -532,21 +254,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY_NODE_IMAGE }} - tags: | - type=raw,value=latest - flavor: | - latest=true - prefix= - suffix= - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + with: + driver-opts: network=host - name: Login to GHCR uses: docker/login-action@v3 @@ -562,36 +274,157 @@ jobs: docker image prune -a -f docker images - - name: Build and push by digest - id: build + name: Build frontend image + uses: docker/build-push-action@v5 + with: + context: ./frontend + file: ./frontend/Dockerfile + platforms: linux/arm64 + tags: ${{ env.REGISTRY_FRONTEND_IMAGE }}:latest + build-args: | + SPACK_BUILD_JOBS=8 + secrets: | + "github_token=${{ secrets.GITHUB_TOKEN }}" + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache,mode=max + load: true + - + name: Build master image + uses: docker/build-push-action@v5 + with: + context: ./master + file: ./master/Dockerfile + platforms: linux/arm64 + tags: ${{ env.REGISTRY_MASTER_IMAGE }}:latest + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache,mode=max + load: true + - + name: Build node image uses: docker/build-push-action@v5 with: context: ./node file: ./node/Dockerfile platforms: linux/arm64 - labels: ${{ steps.meta.outputs.labels }} + tags: ${{ env.REGISTRY_NODE_IMAGE }}:latest cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache,mode=max + load: true + - + name: Start containers for testing + run: docker compose -f docker-compose-test.yml up --pull never -d + - + name: Check cluster logs + run: docker compose -f docker-compose-test.yml logs + - + name: Check status of the cluster containers + run: docker compose -f docker-compose-test.yml ps + - + name: Check status of Slurm + run: docker exec spack-stack-frontend sinfo + - + name: Run a Slurm job + run: docker exec spack-stack-frontend srun hostname + - + name: Test ssh access to Slurm compute nodes + run: | + docker exec spack-stack-frontend timeout 1s ssh slurmnode1 hostname + docker exec spack-stack-frontend timeout 1s ssh slurmnode2 hostname + docker exec spack-stack-frontend timeout 1s ssh slurmnode3 hostname + - + name: Load spack-stack envs + run: | + docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/modules/Core ; module load stack-gcc stack-openmpi jedi-mpas-env; module list" + docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/modules/Core ; module load stack-gcc stack-openmpi jedi-fv3-env; module list" + - + name: Compile and run MPI program + run: docker exec spack-stack-frontend bash -l -c "cd test; ./test_hello.sh" + - + name: Shut down Slurm cluster containers + if: always() + run: docker compose -f docker-compose-test.yml down + - + name: Push frontend by digest + id: push-frontend + uses: docker/build-push-action@v5 + with: + context: ./frontend + file: ./frontend/Dockerfile + platforms: linux/arm64 + build-args: | + SPACK_BUILD_JOBS=8 + secrets: | + "github_token=${{ secrets.GITHUB_TOKEN }}" + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache + outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true + - + name: Push master by digest + id: push-master + uses: docker/build-push-action@v5 + with: + context: ./master + file: ./master/Dockerfile + platforms: linux/arm64 + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache + outputs: type=image,name=${{ env.REGISTRY_MASTER_IMAGE }},push-by-digest=true,name-canonical=true,push=true + - + name: Push node by digest + id: push-node + uses: docker/build-push-action@v5 + with: + context: ./node + file: ./node/Dockerfile + platforms: linux/arm64 + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache outputs: type=image,name=${{ env.REGISTRY_NODE_IMAGE }},push-by-digest=true,name-canonical=true,push=true - - name: Export digest + name: Export digests run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" + mkdir -p /tmp/digests/frontend /tmp/digests/master /tmp/digests/node + frontend_digest="${{ steps.push-frontend.outputs.digest }}" + master_digest="${{ steps.push-master.outputs.digest }}" + node_digest="${{ steps.push-node.outputs.digest }}" + touch "/tmp/digests/frontend/${frontend_digest#sha256:}" + touch "/tmp/digests/master/${master_digest#sha256:}" + touch "/tmp/digests/node/${node_digest#sha256:}" + - + name: Upload frontend digest + uses: actions/upload-artifact@v4 + with: + name: frontend-digests-linux-arm64 + path: /tmp/digests/frontend/* + if-no-files-found: error + retention-days: 1 + - + name: Upload master digest + uses: actions/upload-artifact@v4 + with: + name: master-digests-linux-arm64 + path: /tmp/digests/master/* + if-no-files-found: error + retention-days: 1 - - name: Upload digest + name: Upload node digest uses: actions/upload-artifact@v4 with: name: node-digests-linux-arm64 - path: /tmp/digests/* + path: /tmp/digests/node/* if-no-files-found: error retention-days: 1 + - + name: Debug session + if: ${{ failure() }} + uses: mxschmitt/action-tmate@v3 + timeout-minutes: 60 + with: + limit-access-to-actor: true - build-node-amd64: - runs-on: ubuntu2204-8c-32g-300ssd - #needs: docker_compose_test - timeout-minutes: 360 + merge-frontend: + runs-on: ubuntu-latest + needs: + - build-test-push-amd64 + - build-test-push-arm64 + - resolve_versions permissions: packages: write contents: read @@ -600,21 +433,29 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - + name: Download digests + uses: actions/download-artifact@v4 + with: + path: /tmp/digests + pattern: frontend-digests-* + merge-multiple: true + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - name: Docker meta id: meta uses: docker/metadata-action@v5 with: - images: ${{ env.REGISTRY_NODE_IMAGE }} + images: ${{ env.REGISTRY_FRONTEND_IMAGE }} tags: | type=raw,value=latest + type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} flavor: | latest=true prefix= suffix= - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - name: Login to GHCR uses: docker/login-action@v3 @@ -624,43 +465,26 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} logout: false - - name: Prune pre-loaded GHA docker images + name: Create manifest list and push + working-directory: /tmp/digests run: | - docker images - docker image prune -a -f - docker images - - - name: Build spack-stack and push by digest - id: build - uses: docker/build-push-action@v5 - with: - context: ./node - file: ./node/Dockerfile - platforms: linux/amd64 - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache,mode=max - outputs: type=image,name=${{ env.REGISTRY_NODE_IMAGE }},push-by-digest=true,name-canonical=true,push=true + docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ + $(printf '${{ env.REGISTRY_FRONTEND_IMAGE }}@sha256:%s ' *) - - name: Export digest + name: Inspect image run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: node-digests-linux-amd64 - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 + docker buildx imagetools inspect ${{ env.REGISTRY_FRONTEND_IMAGE }}:${{ steps.meta.outputs.version }} - merge-node: + merge-master: runs-on: ubuntu-latest needs: - - build-node-amd64 - - build-node-arm64 + - build-test-push-amd64 + - build-test-push-arm64 + - resolve_versions + permissions: + packages: write + contents: read + id-token: write steps: - name: Checkout repository @@ -670,7 +494,7 @@ jobs: uses: actions/download-artifact@v4 with: path: /tmp/digests - pattern: node-digests-* + pattern: master-digests-* merge-multiple: true - name: Set up Docker Buildx @@ -680,9 +504,10 @@ jobs: id: meta uses: docker/metadata-action@v5 with: - images: ${{ env.REGISTRY_NODE_IMAGE }} + images: ${{ env.REGISTRY_MASTER_IMAGE }} tags: | type=raw,value=latest + type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} flavor: | latest=true prefix= @@ -700,60 +525,65 @@ jobs: working-directory: /tmp/digests run: | docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - $(printf '${{ env.REGISTRY_NODE_IMAGE }}@sha256:%s ' *) + $(printf '${{ env.REGISTRY_MASTER_IMAGE }}@sha256:%s ' *) - name: Inspect image run: | - docker buildx imagetools inspect ${{ env.REGISTRY_NODE_IMAGE }}:${{ steps.meta.outputs.version }} + docker buildx imagetools inspect ${{ env.REGISTRY_MASTER_IMAGE }}:${{ steps.meta.outputs.version }} - docker-compose-test: - runs-on: ubuntu2204-8c-32g-300ssd + merge-node: + runs-on: ubuntu-latest needs: - - merge-frontend - - merge-master - - merge-node + - build-test-push-amd64 + - build-test-push-arm64 + - resolve_versions + permissions: + packages: write + contents: read + id-token: write steps: - - name: Checkout Repository + name: Checkout repository uses: actions/checkout@v4 - - name: Build and start containers - run: docker compose -f docker-compose-test.yml up --build -d - - - - name: Check cluster logs - run: docker compose -f docker-compose-test.yml logs - - - - name: Check status of the cluster containers - run: docker compose -f docker-compose-test.yml ps - + name: Download digests + uses: actions/download-artifact@v4 + with: + path: /tmp/digests + pattern: node-digests-* + merge-multiple: true - - name: Check status of Slurm - run: docker exec spack-stack-frontend sinfo - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - - name: Run a Slurm job - run: docker exec spack-stack-frontend srun hostname - + name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY_NODE_IMAGE }} + tags: | + type=raw,value=latest + type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + flavor: | + latest=true + prefix= + suffix= - - name: Test ssh access to Slurm compute nodes - run: | - docker exec spack-stack-frontend timeout 1s ssh slurmnode1 hostname - docker exec spack-stack-frontend timeout 1s ssh slurmnode2 hostname - docker exec spack-stack-frontend timeout 1s ssh slurmnode3 hostname - + name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + logout: false - - name: Load spack-stack envs + name: Create manifest list and push + working-directory: /tmp/digests run: | - docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/install/modulefiles/Core ; module load stack-gcc stack-openmpi stack-python jedi-mpas-env; module list" - docker exec spack-stack-frontend bash -l -c "module use /opt/spack-stack/envs/unified-env/install/modulefiles/Core ; module load stack-gcc stack-openmpi stack-python jedi-fv3-env; module list" - + docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ + $(printf '${{ env.REGISTRY_NODE_IMAGE }}@sha256:%s ' *) - - name: Compile and run MPI program + name: Inspect image run: | - docker exec spack-stack-frontend bash -l -c "cd test; ./test_hello.sh" + docker buildx imagetools inspect ${{ env.REGISTRY_NODE_IMAGE }}:${{ steps.meta.outputs.version }} - - - name: Shut down Slurm cluster containers - run: docker compose -f docker-compose-test.yml down diff --git a/.github/workflows/package-cleanup.yaml b/.github/workflows/package-cleanup.yaml index 5ebf9a9..692ed16 100644 --- a/.github/workflows/package-cleanup.yaml +++ b/.github/workflows/package-cleanup.yaml @@ -1,40 +1,45 @@ name: PackageCleanup on: - push: - branches: [ main ] - pull_request: - branches: [ main ] workflow_dispatch: + inputs: + buildcache_cutoff: + description: 'Delete buildcache entries older than this (ISO date, e.g. 2026-05-18). Leave empty to skip buildcache cleanup.' + required: false + default: '' + dry_run: + description: 'Set to true to only list what would be deleted without actually deleting' + required: false + default: 'true' jobs: - cleanup-packages: + cleanup-untagged: runs-on: ubuntu-latest permissions: packages: write contents: read steps: - - name: Remove untagged versions of dockerspackstackslurmcluster/frontend + name: Remove untagged versions of dockerspackstackslurmcluster/slurm-spack-stack-frontend uses: actions/delete-package-versions@v5 with: - package-name: 'dockerspackstackslurmcluster/frontend' + package-name: 'dockerspackstackslurmcluster/slurm-spack-stack-frontend' package-type: 'container' min-versions-to-keep: 0 delete-only-untagged-versions: 'true' - - name: Remove untagged versions of dockerspackstackslurmcluster/master + name: Remove untagged versions of dockerspackstackslurmcluster/slurm-spack-stack-master uses: actions/delete-package-versions@v5 with: - package-name: 'dockerspackstackslurmcluster/master' + package-name: 'dockerspackstackslurmcluster/slurm-spack-stack-master' package-type: 'container' min-versions-to-keep: 0 delete-only-untagged-versions: 'true' - - name: Remove untagged versions of dockerspackstackslurmcluster/node + name: Remove untagged versions of dockerspackstackslurmcluster/slurm-spack-stack-node uses: actions/delete-package-versions@v5 with: - package-name: 'dockerspackstackslurmcluster/node' + package-name: 'dockerspackstackslurmcluster/slurm-spack-stack-node' package-type: 'container' min-versions-to-keep: 0 delete-only-untagged-versions: 'true' @@ -86,3 +91,76 @@ jobs: package-type: 'container' min-versions-to-keep: 0 delete-only-untagged-versions: 'true' + + cleanup-stale-buildcache: + if: ${{ github.event.inputs.buildcache_cutoff != '' }} + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + steps: + - + name: Clean stale buildcache entries + uses: actions/github-script@v7 + with: + script: | + const cutoff = new Date('${{ github.event.inputs.buildcache_cutoff }}'); + const dryRun = '${{ github.event.inputs.dry_run }}' === 'true'; + const org = 'noaa-gsl'; + const packageName = 'dockerspackstackslurmcluster/buildcache'; + + console.log(`Cutoff date: ${cutoff.toISOString()}`); + console.log(`Dry run: ${dryRun}`); + + let deleted = 0; + let kept = 0; + let page = 1; + const perPage = 100; + + while (true) { + const versions = await github.rest.packages.getAllPackageVersionsForPackageOwnedByOrg({ + package_type: 'container', + package_name: packageName, + org: org, + per_page: perPage, + page: page, + }); + + if (versions.data.length === 0) break; + + for (const version of versions.data) { + const createdAt = new Date(version.created_at); + const tags = version.metadata?.container?.tags || []; + + // Never delete index entries - they are updated in place by spack buildcache update-index + const isIndex = tags.some(t => t.includes('index') || t.startsWith('_')); + if (isIndex) { + console.log(`Preserving index: ${version.id} (tags: ${tags.join(', ')})`); + kept++; + continue; + } + + if (createdAt < cutoff) { + if (dryRun) { + console.log(`[DRY RUN] Would delete: ${version.id} (created ${createdAt.toISOString()}, tags: ${tags.join(', ')})`); + } else { + console.log(`Deleting: ${version.id} (created ${createdAt.toISOString()}, tags: ${tags.join(', ')})`); + await github.rest.packages.deletePackageVersionForOrg({ + package_type: 'container', + package_name: packageName, + org: org, + package_version_id: version.id, + }); + } + deleted++; + } else { + console.log(`Keeping: ${version.id} (created ${createdAt.toISOString()}, tags: ${tags.join(', ')})`); + kept++; + } + } + + if (versions.data.length < perPage) break; + page++; + } + + console.log(`\nSummary: ${deleted} ${dryRun ? 'would be ' : ''}deleted, ${kept} kept`); diff --git a/README.md b/README.md index 87d0aa8..d048711 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,62 @@ sizes. The cluster behaves as if it were running on multiple nodes even if the containers are all running on the same host machine. +# Building the Containers + +To build the containers from source: + +## Master and Node Containers + +```bash +docker build -t ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest -f master/Dockerfile master/ +docker build -t ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest -f node/Dockerfile node/ +``` + +## Frontend Container + +The frontend container requires a GitHub personal access token (PAT) with package write permissions to push built packages to the GitHub Container Registry build cache. Set your token in an environment variable and pass it as a secret during build: + +```bash +export GITHUB_TOKEN=your_github_pat_here +docker build --progress=plain \ + --secret id=github_token,env=GITHUB_TOKEN \ + -t ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest \ + -f frontend/Dockerfile \ + frontend/ +``` + +**Note:** The `--progress=plain` flag shows full build output. The frontend build compiles 355+ scientific software packages from source and can take several hours on first build. Subsequent builds use the cached packages from GHCR. + +### Configuring Parallel Build Jobs + +The frontend Dockerfile uses the `SPACK_BUILD_JOBS` build argument to control the number of parallel make jobs (`-j` flag) used when building each package (default: 8). This should match the number of CPU cores available: + +**For 8-core systems (default):** +```bash +docker build --build-arg SPACK_BUILD_JOBS=8 ... +``` + +**For 16-core systems:** +```bash +docker build --build-arg SPACK_BUILD_JOBS=16 ... +``` + +**With Docker Compose:** +```bash +docker compose build --build-arg SPACK_BUILD_JOBS=16 +``` + +You can also modify the default in `docker-compose.yml`: +```yaml +services: + slurmfrontend: + build: + args: + SPACK_BUILD_JOBS: 16 # Change from default 8 +``` + +**Performance note:** Higher values speed up compilation of individual packages, especially large ones like ESMF, JEDI components, and NetCDF. However, on 32GB RAM systems, values above 8 may cause memory pressure during compilation of memory-intensive Fortran packages, potentially leading to swapping or OOM errors. + # Quick Start To start the slurm cluster environment: @@ -69,10 +125,9 @@ docker exec -it spack-stack-frontend bash -l Next, load the spack-stack base environment: ``` -module use /opt/spack-stack/envs/unified-env/install/modulefiles/Core +module use /opt/spack-stack/envs/unified-env/modules/Core module load stack-gcc module load stack-openmpi -module load stack-python ``` Once the basic spack-stack modules are loaded, you can choose from multiple spack-stack environments for different purposes. diff --git a/docker-compose-test.yml b/docker-compose-test.yml index 043ad8c..6ab0d06 100644 --- a/docker-compose-test.yml +++ b/docker-compose-test.yml @@ -3,7 +3,11 @@ services: build: context: ./frontend dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend:latest + args: + SPACK_BUILD_JOBS: 8 + secrets: + - github_token + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest container_name: spack-stack-frontend hostname: slurmfrontend user: admin @@ -17,7 +21,7 @@ services: build: context: ./master dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest container_name: spack-stack-master hostname: slurmmaster user: admin @@ -35,7 +39,7 @@ services: build: context: ./node dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node1 hostname: slurmnode1 user: admin @@ -49,7 +53,7 @@ services: links: - slurmmaster slurmnode2: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node2 hostname: slurmnode2 user: admin @@ -63,7 +67,7 @@ services: links: - slurmmaster slurmnode3: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node3 hostname: slurmnode3 user: admin @@ -77,7 +81,7 @@ services: links: - slurmmaster slurmnode4: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node4 hostname: slurmnode4 user: admin @@ -91,7 +95,7 @@ services: links: - slurmmaster slurmnode5: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node5 hostname: slurmnode5 user: admin @@ -107,3 +111,7 @@ services: volumes: home-vol: opt-vol: + +secrets: + github_token: + environment: GITHUB_TOKEN diff --git a/docker-compose.yml b/docker-compose.yml index 2c8f9fe..7ff97c6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,11 @@ services: build: context: ./frontend dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend:latest + args: + SPACK_BUILD_JOBS: 8 + secrets: + - github_token + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest container_name: spack-stack-frontend hostname: slurmfrontend user: admin @@ -16,7 +20,7 @@ services: build: context: ./master dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest container_name: spack-stack-master hostname: slurmmaster user: admin @@ -33,7 +37,7 @@ services: build: context: ./node dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node1 hostname: slurmnode1 user: admin @@ -46,7 +50,7 @@ services: links: - slurmmaster slurmnode2: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node2 hostname: slurmnode2 user: admin @@ -59,7 +63,7 @@ services: links: - slurmmaster slurmnode3: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node3 hostname: slurmnode3 user: admin @@ -72,7 +76,7 @@ services: links: - slurmmaster slurmnode4: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node4 hostname: slurmnode4 user: admin @@ -85,7 +89,7 @@ services: links: - slurmmaster slurmnode5: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node:latest + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest container_name: spack-stack-node5 hostname: slurmnode5 user: admin @@ -100,3 +104,7 @@ services: volumes: home-vol: opt-vol: + +secrets: + github_token: + environment: GITHUB_TOKEN diff --git a/frontend/Dockerfile b/frontend/Dockerfile index cecd539..801f1b7 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -1,5 +1,8 @@ FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-frontend:latest AS builder +# Default to 8 build jobs; override with --build-arg SPACK_BUILD_JOBS=16 for larger runners +ARG SPACK_BUILD_JOBS=8 + ENV DEBIAN_FRONTEND=noninteractive ENV TZ=Etc/UTC @@ -8,16 +11,16 @@ SHELL ["/bin/bash", "-c"] # Install OS packages RUN apt-get -y update \ && apt-get -y install --no-install-recommends \ - awscli \ bc \ build-essential \ ca-certificates \ curl \ - emacs \ + emacs \ file \ - g++ \ - gcc \ - gfortran \ + gcc-13 \ + g++-13 \ + gfortran-13 \ + cpp-13 \ git \ gnupg2 \ iproute2 \ @@ -28,22 +31,21 @@ RUN apt-get -y update \ python3-pip \ python3-setuptools \ subversion \ + lua5.4 \ + liblua5.4-dev \ + lua-posix \ + lua-filesystem \ tcl-dev \ tcsh \ unzip \ zstd \ - && pip3 install boto3 \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 100 \ + --slave /usr/bin/g++ g++ /usr/bin/g++-13 \ + --slave /usr/bin/gfortran gfortran /usr/bin/gfortran-13 -# Install Lua and Lmod -RUN wget https://sourceforge.net/projects/lmod/files/lua-5.1.4.9.tar.bz2 \ - && tar xvfj lua-5.1.4.9.tar.bz2 \ - && pushd lua-5.1.4.9 \ - && ./configure --prefix=/usr \ - && make -j 4 \ - && make install \ - && popd \ - && git clone --recursive https://github.com/TACC/Lmod.git \ +# Install Lmod +RUN git clone --recursive https://github.com/TACC/Lmod.git \ && pushd Lmod \ && ./configure --prefix=/usr \ && make -j 4 \ @@ -51,106 +53,144 @@ RUN wget https://sourceforge.net/projects/lmod/files/lua-5.1.4.9.tar.bz2 \ && echo "source /usr/lmod/lmod/init/bash" >> /etc/bash.bashrc \ && echo "source /usr/lmod/lmod/init/bash" >> /etc/profile \ && popd \ - && rm -rf lua* Lmod + && rm -rf Lmod -# Copy patch files into /tmp for use when installing spack-stack -COPY cc.patch.aarch64 /tmp -COPY cc.patch.x86_64 /tmp -COPY openmpi.package.py.patch.aarch64 /tmp -COPY openmpi.package.py.patch.x86_64 /tmp - -# Clone spack-stack and create and configure the unified env +# Clone spack-stack 2.1.0 RUN cd /opt \ - && git clone -b release/1.8.0 --recurse-submodules https://github.com/jcsda/spack-stack.git \ - && pushd spack-stack \ - && . ./setup.sh \ - && pushd spack \ - && mv /tmp/cc.patch.$(uname -m) cc.patch \ - && mv /tmp/openmpi.package.py.patch.$(uname -m) openmpi.package.py.patch \ - && patch -f -p0 < openmpi.package.py.patch \ - && patch -f -p0 < cc.patch \ - && popd \ - && spack stack create env --site linux.default --template unified-dev --name unified-env --compiler gcc \ - && pushd envs/unified-env \ + && git clone -b 2.1.0 --recurse-submodules https://github.com/jcsda/spack-stack.git + +# Create and configure the unified env using the container site +RUN cd /opt/spack-stack \ + && source setup.sh \ + && spack stack create env --site container --template unified-dev --name unified-env --compiler gcc \ + && cd envs/unified-env \ && spack env activate . \ - && spack mirror add --s3-access-key-id "" --s3-access-key-secret "" s3_spack_stack_buildcache_ro s3://chiltepin-us-east-2/spack-stack/ \ - && export SPACK_SYSTEM_CONFIG_PATH="$PWD/site" \ - && spack external find --scope system \ + # Fix system external versions for Ubuntu 26.04 \ + && spack external find --scope "env:/opt/spack-stack/envs/unified-env:/opt/spack-stack/envs/unified-env/site" \ --exclude cmake \ --exclude curl \ --exclude openssl \ --exclude openssh \ --exclude python \ - && spack external find --scope system wget \ - && spack compiler find --scope system \ - && unset SPACK_SYSTEM_CONFIG_PATH \ - && spack config add "packages:all:compiler:[gcc@11.4.0]" \ - && spack config add "packages:all:providers:mpi:[openmpi@4.1.6]" \ - && spack config add "packages:fontconfig:variants:+pic" \ - && spack config add "packages:pixman:variants:+pic" \ - && spack config add "packages:cairo:variants:+pic" \ - && spack config --scope env:/opt/spack-stack/envs/unified-env:common add "packages:openmpi:require:~internal-hwloc +two_level_namespace schedulers=slurm +pmi" \ + && spack external find --scope "env:/opt/spack-stack/envs/unified-env:/opt/spack-stack/envs/unified-env/site" wget \ + # Add slurm as an external package \ && echo " slurm:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ && echo " externals:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ - && echo " - spec: slurm@23.11.7" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + && echo " - spec: slurm@25.11.5" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ && echo " prefix: /usr" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ && echo " buildable: false" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ - && if [ "$(uname -m)" == "aarch64" ]; then \ - spack config --scope env:/opt/spack-stack/envs/unified-env:common remove "packages:wgrib2" ; \ - spack config --scope env:/opt/spack-stack/envs/unified-env:common remove "modules:default:lmod:wgrib2" ; \ + # Add munge as an external package so spack uses the system munge (same one munged starts at boot) \ + # rather than building its own, which would cause LD_LIBRARY_PATH conflicts with system Slurm tools \ + && MUNGE_VERSION=$(dpkg -l munge 2>/dev/null | awk '/^ii/{print $3}' | cut -d: -f2 | cut -d- -f1 || echo "0.5.16") \ + && echo " munge:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + && echo " externals:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + && echo " - spec: munge@${MUNGE_VERSION}" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + && echo " prefix: /usr" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + && echo " buildable: false" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + # Configure openmpi to use Slurm scheduler integration (OpenMPI 5 uses PMIx, not +pmi) \ + && spack -e . config add 'packages:openmpi:require:[schedulers=slurm]' \ + # Ensure PMIx includes munge security plugin to avoid psec/munge runtime warnings. + # Use require (not just variants preference) so concretization cannot silently pick ~munge. \ + && spack -e . config add 'packages:pmix:require:[+munge]' \ + # Normalize target selection so builds on different hosts can share buildcache artifacts. \ + # Use generic targets (x86_64 / aarch64) for maximum portability at the cost of SIMD optimizations. \ + && if [ "$(uname -m)" = "x86_64" ]; then \ + spack -e . config add 'packages:all:target:[x86_64]' ; \ + elif [ "$(uname -m)" = "aarch64" ]; then \ + spack -e . config add 'packages:all:target:[aarch64]' ; \ fi \ + # Configure lmod modules instead of tcl \ && sed -i 's/tcl/lmod/g' site/modules.yaml \ - && sed -i 's/tcl/lmod/g' common/modules.yaml \ - && sed -i 's:{^mpi.name}/{^mpi.version}/{compiler.name}/{compiler.version}/{name}:{name}:' common/modules.yaml \ - && sed -i 's:{compiler.name}/{compiler.version}/{name}:{name}:' common/modules.yaml + # Use unhashed module names like other spack-stack deployments, with suffix rules to avoid naming clashes \ + && printf '%s\n' \ + ' hash_length: 0' \ + ' all:' \ + ' suffixes:' \ + ' +debug: debug' \ + ' build_type=Debug: debug' \ + ' esmf:' \ + ' suffixes:' \ + ' ~openmp: noopenmp' \ + ' ip:' \ + ' suffixes:' \ + ' ~openmp: noopenmp' \ + ' neptune-env:' \ + ' suffixes:' \ + ' ~openmp: noopenmp' \ + ' mapl:' \ + ' suffixes:' \ + ' ^esmf@8.6.1~debug ~openmp snapshot=none: esmf-8.6.1-noopenmp' \ + ' ^esmf@8.6.1+debug ~openmp snapshot=none: esmf-8.6.1-debug-noopenmp' \ + ' ^esmf@8.6.1~debug +openmp snapshot=none: esmf-8.6.1' \ + ' ^esmf@8.6.1+debug +openmp snapshot=none: esmf-8.6.1-debug' \ + ' ^esmf@8.8.0~debug ~openmp snapshot=none: esmf-8.8.0-noopenmp' \ + ' ^esmf@8.8.0+debug ~openmp snapshot=none: esmf-8.8.0-debug-noopenmp' \ + ' ^esmf@8.8.0~debug +openmp snapshot=none: esmf-8.8.0' \ + ' ^esmf@8.8.0+debug +openmp snapshot=none: esmf-8.8.0-debug' \ + ' ^esmf@8.9.1~debug ~openmp snapshot=none: esmf-8.9.1-noopenmp' \ + ' ^esmf@8.9.1+debug ~openmp snapshot=none: esmf-8.9.1-debug-noopenmp' \ + ' ^esmf@8.9.1~debug +openmp snapshot=none: esmf-8.9.1' \ + ' ^esmf@8.9.1+debug +openmp snapshot=none: esmf-8.9.1-debug' \ + ' fms:' \ + ' suffixes:' \ + ' constants=GEOS: geos-constants' \ + ' constants=GFS: gfs-constants' \ + >> site/modules.yaml \ + # Force env module root so final stage can copy a stable path \ + && spack -e . config add 'modules:default:roots:lmod:$env/modules' -# Concretize the Spack environment +# Add the build cache mirror and concretize RUN cd /opt/spack-stack \ - && . ./setup.sh \ - && cd /opt/spack-stack/envs/unified-env \ + && source setup.sh \ + && cd envs/unified-env \ && spack env activate . \ - && spack concretize 2>&1 | tee log.concretize + && spack mirror add --unsigned ghcr_buildcache oci://ghcr.io/noaa-gsl/dockerspackstackslurmcluster/buildcache \ + && spack concretize > /dev/null 2>&1 \ + && spack spec openmpi 2>&1 | grep -Eq 'schedulers(:=|=)slurm' \ + && spack spec pmix 2>&1 | grep -q '+munge' # Install the Spack environment -RUN --mount=type=secret,id=access_key_id --mount=type=secret,id=secret_access_key --mount=type=secret,id=session_token <&1 | tee log.install - if [ -f /run/secrets/access_key_id ]; then - spack buildcache update-index s3_spack_stack_buildcache_rw + # Install the environment + spack install --no-check-signature 2>&1 | tee log.install + + # Update the build cache index if credentials were provided + if [ -f /run/secrets/github_token ]; then + spack buildcache update-index ghcr_buildcache fi EOF # Create the modulefiles and cleanup -RUN cd /opt \ - && pushd spack-stack \ - && . ./setup.sh \ - && pushd envs/unified-env \ - && spack env activate . \ +RUN cd /opt/spack-stack \ + && source setup.sh \ + && cd envs/unified-env \ && source /usr/lmod/lmod/init/bash \ - && spack module lmod refresh -y \ - && spack stack setup-meta-modules \ + && spack -e . module lmod refresh -y --delete-tree \ + && spack -e . stack setup-meta-modules \ && spack gc -y \ - && find /opt/spack-stack/envs/unified-env/install/gcc/11.4.0 -name .spack -type d -print0 | xargs -0 rm -rf "{}" \ && rm -rf ~/.spack -# Copy installed environment into final images +# Copy installed environment into final image FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-frontend:latest +# Default Slurm MPI plugin so users do not need to pass --mpi=pmix on every srun. +ENV SLURM_MPI_TYPE=pmix + COPY --from=builder /usr /usr COPY --from=builder /etc /etc COPY --from=builder /opt/spack-stack/envs/unified-env/install /opt/spack-stack/envs/unified-env/install +COPY --from=builder /opt/spack-stack/envs/unified-env/modules /opt/spack-stack/envs/unified-env/modules diff --git a/frontend/cc.patch.aarch64 b/frontend/cc.patch.aarch64 deleted file mode 100644 index 5366919..0000000 --- a/frontend/cc.patch.aarch64 +++ /dev/null @@ -1,11 +0,0 @@ ---- lib/spack/env/cc.orig 2024-06-21 20:02:54.496630733 +0000 -+++ lib/spack/env/cc 2024-06-21 19:46:13.019268774 +0000 -@@ -724,7 +724,7 @@ - esac - - # prepend target args -- preextend flags_list SPACK_TARGET_ARGS -+ #preextend flags_list SPACK_TARGET_ARGS - ;; - esac - diff --git a/frontend/cc.patch.x86_64 b/frontend/cc.patch.x86_64 deleted file mode 100644 index e69de29..0000000 diff --git a/frontend/openmpi.package.py.patch.aarch64 b/frontend/openmpi.package.py.patch.aarch64 deleted file mode 100644 index 0f81f6f..0000000 --- a/frontend/openmpi.package.py.patch.aarch64 +++ /dev/null @@ -1,10 +0,0 @@ ---- var/spack/repos/builtin/packages/openmpi/package.py 2024-06-26 09:58:06 -+++ var/spack/repos/builtin/packages/openmpi/package.py.aarch64 2024-06-26 10:07:39 -@@ -971,6 +971,7 @@ - - if spec.satisfies("+pmi"): - config_args.append("--with-pmi={0}".format(spec["slurm"].prefix)) -+ config_args.append("--with-pmi-libdir=/usr/lib/aarch64-linux-gnu") - else: - config_args.extend(self.with_or_without("pmi")) - diff --git a/frontend/openmpi.package.py.patch.x86_64 b/frontend/openmpi.package.py.patch.x86_64 deleted file mode 100644 index 755e4b3..0000000 --- a/frontend/openmpi.package.py.patch.x86_64 +++ /dev/null @@ -1,10 +0,0 @@ ---- var/spack/repos/builtin/packages/openmpi/package.py 2024-06-26 09:58:06 -+++ var/spack/repos/builtin/packages/openmpi/package.py.x86_64 2024-06-26 10:08:39 -@@ -971,6 +971,7 @@ - - if spec.satisfies("+pmi"): - config_args.append("--with-pmi={0}".format(spec["slurm"].prefix)) -+ config_args.append("--with-pmi-libdir=/usr/lib/x86_64-linux-gnu") - else: - config_args.extend(self.with_or_without("pmi")) - diff --git a/master/Dockerfile b/master/Dockerfile index 4d4392a..159e27e 100644 --- a/master/Dockerfile +++ b/master/Dockerfile @@ -11,12 +11,11 @@ RUN < hello.out +srun --mpi=pmix -N 3 --tasks-per-node=2 ./hello.exe | sort > hello.out +diff hello.out hello.baseline + +srun -N 3 --tasks-per-node=2 ./hello.exe | sort > hello.out diff hello.out hello.baseline