diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml new file mode 100644 index 00000000000..d3b0d1002dc --- /dev/null +++ b/.github/workflows/container.yml @@ -0,0 +1,60 @@ +name: container build/publish + +on: + push: + branches: + - master + paths: + - 'docker/**' + + pull_request: + branches: + - master + paths: + - 'docker/**' + + workflow_dispatch: + +concurrency: + group: ${{ github.ref }} + cancel-in-progress: true + +jobs: + # Only build container if there has been a change. + build-containers: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Docker meta + id: meta + uses: docker/metadata-action@v4 + with: + images: ghcr.io/ESMCI/cime + flavor: | + latest=auto + tags: | + type=sha + - name: Build and push + uses: docker/build-push-action@v3 + with: + target: base + context: docker/ + push: ${{ github.event_name == 'push' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 0b855e6ab76..098cd4c8c01 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -4,36 +4,55 @@ on: push: branches: - master + paths: + - 'doc/**' pull_request: branches: - master + paths: + - 'doc/**' + + workflow_dispatch: permissions: contents: read jobs: - check-changes: - name: Check for changes to documentation + cleanup: + permissions: + contents: write # for git push + name: Cleanup branch previews runs-on: ubuntu-latest - outputs: - any_changed: ${{ steps.changed-check.outputs.any_changed }} + if: 
${{ github.event_name == 'push' }} steps: - uses: actions/checkout@v3 with: + ref: 'gh-pages' fetch-depth: 0 lfs: true - - uses: tj-actions/changed-files@v32 - id: changed-check - with: - files: doc + path: gh-pages + - name: Remove branch previews + run: | + pushd $GITHUB_WORKSPACE/gh-pages + + for name in `ls branch/` + do + if [[ -z "$(git show-ref --quiet ${name})" ]] + then + git rm -rf branch/${name} + fi + done + + git config user.name github-actions[bot] + git config user.email github-actions[bot]@users.noreply.github.com + git commit -m "Clean up branch previews" + git push build-and-deploy: permissions: contents: write # for peaceiris/actions-gh-pages to push pull-requests: write # to comment on pull requests - needs: check-changes - if: | - needs.check-changes.outputs.any_changed == 'true' && - github.event.pull_request.head.repo.full_name == github.repository + needs: cleanup + if: ${{ always() }} name: Build and deploy documentation runs-on: ubuntu-latest steps: @@ -61,7 +80,9 @@ jobs: run: | make BUILDDIR=${PWD}/_build -C doc/ html - name: Push PR preview - if: ${{ github.event_name == 'pull_request' }} + if: | + github.event_name == 'pull_request' && + github.event.pull_request.head.repo.full_name == github.repository uses: peaceiris/actions-gh-pages@v3 with: github_token: ${{secrets.GITHUB_TOKEN}} @@ -99,31 +120,3 @@ jobs: destination_dir: './versions/master/html' user_name: 'github-actions[bot]' user_email: 'github-actions[bot]@users.noreply.github.com' - cleanup: - permissions: - contents: write # for git push - needs: build-and-deploy - name: Cleanup branch previews - runs-on: ubuntu-latest - if: ${{ github.event_name == 'push' }} - steps: - - uses: actions/checkout@v3 - with: - ref: 'gh-pages' - fetch-depth: 0 - lfs: true - - name: Remove branch previews - run: | - for name in `ls branch/` - do - if [[ -z "$(git show-ref --quiet ${name})" ]] - then - git rm -rf branch/${name} - fi - done - - name: Commit and push local changes to gh-pages - run: 
| - git config user.name github-actions[bot] - git config user.email github-actions[bot]@users.noreply.github.com - git commit -m "Clean up branch previews" - git push diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 00000000000..ce0933e7515 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,24 @@ +name: 'Close stale issues and PRs' +on: + schedule: + # Run every day at 1:30AM + - cron: '30 1 * * *' +jobs: + stale: + permissions: + issues: write + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v8 + with: + stale-issue-message: 'This issue is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 5 days.' + close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.' + days-before-stale: 90 + days-before-close: 5 + days-before-pr-close: -1 + # Issues with this label are exempt from being checked if they are stale... + exempt-issue-labels: Low Priority + # Below are currently defaults, but given in case we decide to change + operations-per-run: 30 + stale-issue-label: Stale + close-issue-reason: not_planned diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 8ffa63c038b..6af6ec139d3 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -4,10 +4,22 @@ on: push: branches: - master + paths: + - 'CIME/**' + - 'scripts/**' + - 'tools/**' + - 'utils/**' pull_request: branches: - master + paths: + - 'CIME/**' + - 'scripts/**' + - 'tools/**' + - 'utils/**' + + workflow_dispatch: concurrency: group: ${{ github.ref }} @@ -15,6 +27,7 @@ concurrency: permissions: contents: read # to fetch code (actions/checkout) + packages: read jobs: pre-commit: @@ -39,67 +52,18 @@ jobs: pre-commit run -a - # Check if there has been a change to any file under docker/ - get-docker-changes: - runs-on: ubuntu-latest - outputs: - any_changed: ${{ 
steps.get-changed-files.outputs.any_changed }} - steps: - - name: Checkout code - uses: actions/checkout@v3 - with: - fetch-depth: 2 - - name: Get changed files - id: get-changed-files - uses: tj-actions/changed-files@v29 - with: - files: docker - - # Only build container if there has been a change. - build-containers: - runs-on: ubuntu-latest - needs: get-docker-changes - if: ${{ needs.get-docker-changes.outputs.any_changed == 'true' }} - steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Docker meta - id: meta - uses: docker/metadata-action@v4 - with: - images: jasonb87/cime - tags: | - type=raw,value=latest - type=sha,prefix={{ date 'YYYYMMDD' }}_,format=short - - name: Build and push - uses: docker/build-push-action@v3 - with: - target: base - context: docker/ - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=registry,ref=jasonb87/cime:buildcache - cache-to: type=registry,ref=jasonb87/cime:buildcache,mode=max - # Runs unit testing under different python versions. unit-testing: runs-on: ubuntu-latest - needs: build-containers if: ${{ always() && ! cancelled() }} - container: jasonb87/cime:latest + container: + image: ghcr.io/esmci/cime:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} strategy: matrix: - python-version: [3.7, 3.8, 3.9] + python-version: ['3.8', '3.9', '3.10'] steps: - name: Checkout code uses: actions/checkout@v2 @@ -130,9 +94,12 @@ jobs: # Run system tests system-testing: runs-on: ubuntu-latest - needs: build-containers if: ${{ always() && ! 
cancelled() }} - container: jasonb87/cime:latest + container: + image: ghcr.io/esmci/cime:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} strategy: matrix: model: ["e3sm", "cesm"] @@ -183,6 +150,8 @@ jobs: if: ${{ failure() }} shell: bash run: tar -czvf /testing-logs-${GITHUB_RUN_NUMBER}.tar.gz /storage/cases/ + # How to download artifacts: + # https://docs.github.com/en/actions/managing-workflow-runs/downloading-workflow-artifacts - name: Upload testing logs if: ${{ failure() }} uses: actions/upload-artifact@v3 diff --git a/CIME/SystemTests/README b/CIME/SystemTests/README index 31ee7c4f3e3..61d0eec7f40 100644 --- a/CIME/SystemTests/README +++ b/CIME/SystemTests/README @@ -47,7 +47,7 @@ ERP pes counts hybrid (open-MP/MPI) restart bfb test from startup, default 6 do an 11 day initial test - write a restart at day 6 (suffix base) half the number of tasks and threads for each component do a 5 day restart test starting from restart at day 6 (suffix rest) - this is just like an ERS test but the pe-counts/threading count are modified on retart + this is just like an ERS test but the pe-counts/threading count are modified on restart ERI hybrid/branch/exact restart test, default (by default STOP_N is 22 days) (1) ref1case diff --git a/CIME/SystemTests/dae.py b/CIME/SystemTests/dae.py index 2b0b58c4b4e..175254d2d1b 100644 --- a/CIME/SystemTests/dae.py +++ b/CIME/SystemTests/dae.py @@ -27,7 +27,7 @@ class DAE(SystemTestsCompareTwo): """ ########################################################################### - def __init__(self, case): + def __init__(self, case, **kwargs): ########################################################################### SystemTestsCompareTwo.__init__( self, @@ -36,6 +36,7 @@ def __init__(self, case): run_two_suffix="da", run_one_description="no data assimilation", run_two_description="data assimilation", + **kwargs, ) ########################################################################### diff 
--git a/CIME/SystemTests/eri.py b/CIME/SystemTests/eri.py index 0bcf4466646..272a3881add 100644 --- a/CIME/SystemTests/eri.py +++ b/CIME/SystemTests/eri.py @@ -38,11 +38,11 @@ def _helper(dout_sr, refdate, refsec, rundir): class ERI(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the ERI system test """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) self._testname = "ERI" def run_phase(self): diff --git a/CIME/SystemTests/erio.py b/CIME/SystemTests/erio.py index f9de01a8b27..a1e7b041cc6 100644 --- a/CIME/SystemTests/erio.py +++ b/CIME/SystemTests/erio.py @@ -10,11 +10,11 @@ class ERIO(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to file env_test.xml in the case directory """ - SystemTestsCommon.__init__(self, case, expected=["TEST"]) + SystemTestsCommon.__init__(self, case, expected=["TEST"], **kwargs) self._pio_types = self._case.get_env("run").get_valid_values("PIO_TYPENAME") self._stop_n = self._case.get_value("STOP_N") diff --git a/CIME/SystemTests/erp.py b/CIME/SystemTests/erp.py index 8f347fe6eee..f549f9e116e 100644 --- a/CIME/SystemTests/erp.py +++ b/CIME/SystemTests/erp.py @@ -15,7 +15,7 @@ class ERP(RestartTest): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize a test object """ @@ -26,6 +26,7 @@ def __init__(self, case): run_two_suffix="rest", run_one_description="initial", run_two_description="restart", + **kwargs ) def _case_two_setup(self): diff --git a/CIME/SystemTests/err.py b/CIME/SystemTests/err.py index 4dd79a85aae..355ddd5d390 100644 --- a/CIME/SystemTests/err.py +++ b/CIME/SystemTests/err.py @@ -11,7 +11,7 @@ class ERR(RestartTest): - def __init__(self, case): # pylint: disable=super-init-not-called + def __init__(self, case, **kwargs): # pylint: disable=super-init-not-called """ initialize an object interface to the ERR 
system test """ @@ -22,6 +22,7 @@ def __init__(self, case): # pylint: disable=super-init-not-called run_one_description="initial", run_two_description="restart", multisubmit=True, + **kwargs ) def _case_one_setup(self): diff --git a/CIME/SystemTests/erri.py b/CIME/SystemTests/erri.py index 8cec2b149ce..7851bd4bb66 100644 --- a/CIME/SystemTests/erri.py +++ b/CIME/SystemTests/erri.py @@ -12,11 +12,11 @@ class ERRI(ERR): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the ERU system test """ - ERR.__init__(self, case) + ERR.__init__(self, case, **kwargs) def _case_two_custom_postrun_action(self): rundir = self._case.get_value("RUNDIR") diff --git a/CIME/SystemTests/ers.py b/CIME/SystemTests/ers.py index df5daea488c..bebed8f04c4 100644 --- a/CIME/SystemTests/ers.py +++ b/CIME/SystemTests/ers.py @@ -9,11 +9,11 @@ class ERS(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the ERS system test """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) def _ers_first_phase(self): stop_n = self._case.get_value("STOP_N") diff --git a/CIME/SystemTests/ers2.py b/CIME/SystemTests/ers2.py index e65f703e36e..63a10399b49 100644 --- a/CIME/SystemTests/ers2.py +++ b/CIME/SystemTests/ers2.py @@ -8,11 +8,11 @@ class ERS2(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the ERS2 system test """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) def _ers2_first_phase(self): stop_n = self._case.get_value("STOP_N") diff --git a/CIME/SystemTests/ert.py b/CIME/SystemTests/ert.py index 36366395190..b912f7248b7 100644 --- a/CIME/SystemTests/ert.py +++ b/CIME/SystemTests/ert.py @@ -10,11 +10,11 @@ class ERT(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object 
interface to the ERT system test """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) def _ert_first_phase(self): diff --git a/CIME/SystemTests/funit.py b/CIME/SystemTests/funit.py index 193c485433e..1ebaf720604 100644 --- a/CIME/SystemTests/funit.py +++ b/CIME/SystemTests/funit.py @@ -12,11 +12,11 @@ class FUNIT(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the FUNIT system test """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) case.load_env() def build_phase(self, sharedlib_only=False, model_only=False): diff --git a/CIME/SystemTests/homme.py b/CIME/SystemTests/homme.py index 6161c2e46be..597be0b9a09 100644 --- a/CIME/SystemTests/homme.py +++ b/CIME/SystemTests/homme.py @@ -2,6 +2,6 @@ class HOMME(HommeBase): - def __init__(self, case): - HommeBase.__init__(self, case) + def __init__(self, case, **kwargs): + HommeBase.__init__(self, case, **kwargs) self.cmakesuffix = "" diff --git a/CIME/SystemTests/hommebaseclass.py b/CIME/SystemTests/hommebaseclass.py index 5c29fce7533..bad27d4aa56 100644 --- a/CIME/SystemTests/hommebaseclass.py +++ b/CIME/SystemTests/hommebaseclass.py @@ -14,11 +14,11 @@ class HommeBase(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the SMS system test """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) case.load_env() self.csnd = "not defined" self.cmakesuffix = self.csnd diff --git a/CIME/SystemTests/hommebfb.py b/CIME/SystemTests/hommebfb.py index 7cd6b370222..87e566bf918 100644 --- a/CIME/SystemTests/hommebfb.py +++ b/CIME/SystemTests/hommebfb.py @@ -2,6 +2,6 @@ class HOMMEBFB(HommeBase): - def __init__(self, case): - HommeBase.__init__(self, case) + def __init__(self, case, **kwargs): + HommeBase.__init__(self, case, **kwargs) self.cmakesuffix = "-bfb" 
diff --git a/CIME/SystemTests/icp.py b/CIME/SystemTests/icp.py index f0e3988774c..8d8c5e0ea59 100644 --- a/CIME/SystemTests/icp.py +++ b/CIME/SystemTests/icp.py @@ -6,11 +6,11 @@ class ICP(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to file env_test.xml in the case directory """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) def build_phase(self, sharedlib_only=False, model_only=False): self._case.set_value("CICE_AUTO_DECOMP", "false") diff --git a/CIME/SystemTests/irt.py b/CIME/SystemTests/irt.py index adda8b235ff..1f3637eb5a0 100644 --- a/CIME/SystemTests/irt.py +++ b/CIME/SystemTests/irt.py @@ -19,7 +19,7 @@ class IRT(RestartTest): - def __init__(self, case): + def __init__(self, case, **kwargs): RestartTest.__init__( self, case, @@ -28,6 +28,7 @@ def __init__(self, case): run_one_description="initial", run_two_description="restart", multisubmit=False, + **kwargs ) self._skip_pnl = False diff --git a/CIME/SystemTests/ldsta.py b/CIME/SystemTests/ldsta.py index f7a4a2b4729..a5f7c9196d5 100644 --- a/CIME/SystemTests/ldsta.py +++ b/CIME/SystemTests/ldsta.py @@ -30,11 +30,11 @@ def _date_to_datetime(date_obj): class LDSTA(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the SMS system test """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) def run_phase(self): archive_dir = self._case.get_value("DOUT_S_ROOT") diff --git a/CIME/SystemTests/mcc.py b/CIME/SystemTests/mcc.py index 4d47bf2c318..a4b839cf1e9 100644 --- a/CIME/SystemTests/mcc.py +++ b/CIME/SystemTests/mcc.py @@ -11,7 +11,7 @@ class MCC(SystemTestsCompareTwo): - def __init__(self, case): + def __init__(self, case, **kwargs): self._comp_classes = [] self._test_instances = 3 SystemTestsCompareTwo.__init__( @@ -21,6 +21,7 @@ def __init__(self, case): 
run_two_suffix="single_instance", run_two_description="single instance", run_one_description="multi driver", + **kwargs ) def _case_one_setup(self): diff --git a/CIME/SystemTests/mvk.py b/CIME/SystemTests/mvk.py index 39b4fcb6539..2ab2f72cd33 100644 --- a/CIME/SystemTests/mvk.py +++ b/CIME/SystemTests/mvk.py @@ -28,11 +28,11 @@ class MVK(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the MVK test """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) if self._case.get_value("MODEL") == "e3sm": self.component = "eam" diff --git a/CIME/SystemTests/nck.py b/CIME/SystemTests/nck.py index af0a2d0c5e6..5a391b5ecf7 100644 --- a/CIME/SystemTests/nck.py +++ b/CIME/SystemTests/nck.py @@ -15,7 +15,7 @@ class NCK(SystemTestsCompareTwo): - def __init__(self, case): + def __init__(self, case, **kwargs): self._comp_classes = [] SystemTestsCompareTwo.__init__( self, @@ -24,6 +24,7 @@ def __init__(self, case): run_two_suffix="multiinst", run_one_description="one instance", run_two_description="two instances", + **kwargs, ) def _common_setup(self): diff --git a/CIME/SystemTests/ncr.py b/CIME/SystemTests/ncr.py index a1cc7d3bad5..f0de168ac13 100644 --- a/CIME/SystemTests/ncr.py +++ b/CIME/SystemTests/ncr.py @@ -15,7 +15,7 @@ class NCR(SystemTestsCompareTwo): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an NCR test """ @@ -26,6 +26,7 @@ def __init__(self, case): run_two_suffix="singleinst", run_one_description="two instances, each with the same number of tasks", run_two_description="default build", + **kwargs ) def _comp_classes(self): diff --git a/CIME/SystemTests/nodefail.py b/CIME/SystemTests/nodefail.py index c770fc292bc..35f0ca3c8a6 100644 --- a/CIME/SystemTests/nodefail.py +++ b/CIME/SystemTests/nodefail.py @@ -9,11 +9,11 @@ class NODEFAIL(ERS): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an 
object interface to the ERS system test """ - ERS.__init__(self, case) + ERS.__init__(self, case, **kwargs) self._fail_sentinel = os.path.join(case.get_value("RUNDIR"), "FAIL_SENTINEL") self._fail_str = case.get_value("NODE_FAIL_REGEX") diff --git a/CIME/SystemTests/pea.py b/CIME/SystemTests/pea.py index b20c3abd4e7..cc9509e4e5b 100644 --- a/CIME/SystemTests/pea.py +++ b/CIME/SystemTests/pea.py @@ -14,7 +14,7 @@ class PEA(SystemTestsCompareTwo): - def __init__(self, case): + def __init__(self, case, **kwargs): SystemTestsCompareTwo.__init__( self, case, @@ -22,6 +22,7 @@ def __init__(self, case): run_two_suffix="mpi-serial", run_one_description="default mpi library", run_two_description="mpi-serial", + **kwargs, ) def _common_setup(self): diff --git a/CIME/SystemTests/pem.py b/CIME/SystemTests/pem.py index f74f3f93e55..fc8317f432f 100644 --- a/CIME/SystemTests/pem.py +++ b/CIME/SystemTests/pem.py @@ -15,14 +15,21 @@ class PEM(SystemTestsCompareTwo): - def __init__(self, case): + def __init__(self, case, **kwargs): + build_separately = False + # cice, pop require separate builds + comps = case.get_compset_components() + if "cice" in comps or "pop" in comps: + build_separately = True + SystemTestsCompareTwo.__init__( self, case, - separate_builds=True, + separate_builds=build_separately, run_two_suffix="modpes", run_one_description="default pe counts", run_two_description="halved pe counts", + **kwargs ) def _case_one_setup(self): diff --git a/CIME/SystemTests/pet.py b/CIME/SystemTests/pet.py index fcf108bd28c..7dbaa9af79c 100644 --- a/CIME/SystemTests/pet.py +++ b/CIME/SystemTests/pet.py @@ -13,7 +13,7 @@ class PET(SystemTestsCompareTwo): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize a test object """ @@ -25,6 +25,7 @@ def __init__(self, case): run_two_suffix="single_thread", run_one_description="default threading", run_two_description="threads set to 1", + **kwargs ) def _case_one_setup(self): diff --git a/CIME/SystemTests/pfs.py 
b/CIME/SystemTests/pfs.py index 32bdbe08002..ed61d204e8a 100644 --- a/CIME/SystemTests/pfs.py +++ b/CIME/SystemTests/pfs.py @@ -11,11 +11,11 @@ class PFS(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the PFS system test """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) def run_phase(self): logger.info("doing an 20 day initial test, no restarts written") diff --git a/CIME/SystemTests/pgn.py b/CIME/SystemTests/pgn.py index e2cd66b7abd..a597cc71f97 100644 --- a/CIME/SystemTests/pgn.py +++ b/CIME/SystemTests/pgn.py @@ -50,11 +50,11 @@ class PGN(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the PGN test """ - super(PGN, self).__init__(case) + super(PGN, self).__init__(case, **kwargs) if self._case.get_value("MODEL") == "e3sm": self.atmmod = "eam" self.lndmod = "elm" diff --git a/CIME/SystemTests/pre.py b/CIME/SystemTests/pre.py index 54512a00660..23547d46430 100644 --- a/CIME/SystemTests/pre.py +++ b/CIME/SystemTests/pre.py @@ -25,7 +25,7 @@ class PRE(SystemTestsCompareTwo): """ ########################################################################### - def __init__(self, case): + def __init__(self, case, **kwargs): ########################################################################### SystemTestsCompareTwo.__init__( self, @@ -34,6 +34,7 @@ def __init__(self, case): run_two_suffix="pr", run_one_description="no pause/resume", run_two_description="pause/resume", + **kwargs ) self._stopopt = "" self._stopn = 0 diff --git a/CIME/SystemTests/rep.py b/CIME/SystemTests/rep.py index 60f6b473ea9..367409ac3fa 100644 --- a/CIME/SystemTests/rep.py +++ b/CIME/SystemTests/rep.py @@ -8,9 +8,9 @@ class REP(SystemTestsCompareTwo): - def __init__(self, case): + def __init__(self, case, **kwargs): SystemTestsCompareTwo.__init__( - self, case, separate_builds=False, 
run_two_suffix="rep2" + self, case, separate_builds=False, run_two_suffix="rep2", **kwargs ) def _case_one_setup(self): diff --git a/CIME/SystemTests/restart_tests.py b/CIME/SystemTests/restart_tests.py index 31d1be32181..5faf2252d1b 100644 --- a/CIME/SystemTests/restart_tests.py +++ b/CIME/SystemTests/restart_tests.py @@ -18,6 +18,7 @@ def __init__( run_one_description="initial", run_two_description="restart", multisubmit=False, + **kwargs ): SystemTestsCompareTwo.__init__( self, @@ -27,6 +28,7 @@ def __init__( run_one_description=run_one_description, run_two_description=run_two_description, multisubmit=multisubmit, + **kwargs ) def _case_one_setup(self): diff --git a/CIME/SystemTests/reuseinitfiles.py b/CIME/SystemTests/reuseinitfiles.py index 5f2567f6c70..76d8bb0522e 100644 --- a/CIME/SystemTests/reuseinitfiles.py +++ b/CIME/SystemTests/reuseinitfiles.py @@ -20,7 +20,7 @@ class REUSEINITFILES(SystemTestsCompareTwo): - def __init__(self, case): + def __init__(self, case, **kwargs): SystemTestsCompareTwo.__init__( self, case, @@ -32,6 +32,7 @@ def __init__(self, case): # init_generated_files from case1 and then need to make sure they are NOT # deleted like is normally done for tests: case_two_keep_init_generated_files=True, + **kwargs ) def _case_one_setup(self): diff --git a/CIME/SystemTests/seq.py b/CIME/SystemTests/seq.py index 0a51d50d283..304932d7d14 100644 --- a/CIME/SystemTests/seq.py +++ b/CIME/SystemTests/seq.py @@ -8,7 +8,7 @@ class SEQ(SystemTestsCompareTwo): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to file env_test.xml in the case directory """ @@ -19,6 +19,7 @@ def __init__(self, case): run_two_suffix="seq", run_one_description="base", run_two_description="sequence", + **kwargs ) def _case_one_setup(self): diff --git a/CIME/SystemTests/sms.py b/CIME/SystemTests/sms.py index 09722caa3d5..17672b47052 100644 --- a/CIME/SystemTests/sms.py +++ b/CIME/SystemTests/sms.py @@ -10,8 +10,8 @@ class 
SMS(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the SMS system test """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) diff --git a/CIME/SystemTests/system_tests_common.py b/CIME/SystemTests/system_tests_common.py index 5eaa4ef02d9..f15fbe959e0 100644 --- a/CIME/SystemTests/system_tests_common.py +++ b/CIME/SystemTests/system_tests_common.py @@ -29,6 +29,7 @@ import CIME.build as build import glob, gzip, time, traceback, os +from contextlib import ExitStack logger = logging.getLogger(__name__) @@ -36,8 +37,55 @@ INIT_GENERATED_FILES_DIRNAME = "init_generated_files" +def fix_single_exe_case(case): + """Fixes cases created with --single-exe. + + When tests are created using --single-exe, the test_scheduler will set + `BUILD_COMPLETE` to True, but some tests require calls to `case.case_setup` + which can reset `BUILD_COMPLETE` to false. This function will check if a + case was created with `--single-exe` and ensure `BUILD_COMPLETE` is True. + + Returns: + True when case required modification otherwise False. + """ + if is_single_exe_case(case): + with ExitStack() as stack: + # enter context if case is still read-only, entering the context + # multiple times can cause side effects for later calls to + # `set_value` when it's assumed the case is writeable. + if case._read_only_mode: + stack.enter_context(case) + + case.set_value("BUILD_COMPLETE", True) + + return True + + return False + + +def is_single_exe_case(case): + """Determines if the case was created with the --single-exe option. + + If `CASEROOT` is not part of `EXEROOT` and the `TEST` variable is True, + then it's safe to assume the case was created with `./create_test` + and the `--single-exe` option. + + Returns: + True when the case was created with `--single-exe` otherwise false. 
+ """ + caseroot = case.get_value("CASEROOT") + + exeroot = case.get_value("EXEROOT") + + test = case.get_value("TEST") + + return caseroot not in exeroot and test + + class SystemTestsCommon(object): - def __init__(self, case, expected=None): + def __init__( + self, case, expected=None, **kwargs + ): # pylint: disable=unused-argument """ initialize a CIME system test object, if the locked env_run.orig.xml does not exist copy the current env_run.xml file. If it does exist restore values @@ -97,6 +145,7 @@ def _resetup_case(self, phase, reset=False): ) self._case.set_initial_test_values() self._case.case_setup(reset=True, test_mode=True) + fix_single_exe_case(self._case) def build( self, @@ -105,6 +154,7 @@ def build( ninja=False, dry_run=False, separate_builds=False, + skip_submit=False, ): """ Do NOT override this method, this method is the framework that @@ -115,6 +165,9 @@ def build( self._ninja = ninja self._dry_run = dry_run self._user_separate_builds = separate_builds + + was_run_pend = self._test_status.current_is(RUN_PHASE, TEST_PEND_STATUS) + for phase_name, phase_bool in [ (SHAREDLIB_BUILD_PHASE, not model_only), (MODEL_BUILD_PHASE, not sharedlib_only), @@ -153,6 +206,15 @@ def build( comments=("time={:d}".format(int(time_taken))), ) + # Building model while job is queued and awaiting run + if ( + skip_submit + and was_run_pend + and self._test_status.current_is(SUBMIT_PHASE, TEST_PEND_STATUS) + ): + with self._test_status: + self._test_status.set_status(SUBMIT_PHASE, TEST_PASS_STATUS) + return success def build_phase(self, sharedlib_only=False, model_only=False): @@ -379,6 +441,15 @@ def run_indv( stop_option = self._case.get_value("STOP_OPTION") run_type = self._case.get_value("RUN_TYPE") rundir = self._case.get_value("RUNDIR") + try: + self._case.check_all_input_data() + except CIMEError: + caseroot = self._case.get_value("CASEROOT") + raise CIMEError( + "Could not find all inputdata on any server, try " + "manually running `./check_input_data 
--download " + f"--verbose` from {caseroot!r}." + ) from None if submit_resubmits is None: do_resub = self._case.get_value("BATCH_SYSTEM") != "none" else: do_resub = submit_resubmits @@ -440,7 +511,8 @@ def _coupler_log_indicates_run_complete(self): return allgood == 0 def _component_compare_copy(self, suffix): - comments, num_copied = copy_histfiles(self._case, suffix) + # Only match .nc files + comments, num_copied = copy_histfiles(self._case, suffix, match_suffix="nc") self._expected_num_cmp = num_copied append_testlog(comments, self._orig_caseroot) @@ -834,8 +906,8 @@ class FakeTest(SystemTestsCommon): in utils.py will work with these classes. """ - def __init__(self, case, expected=None): - super(FakeTest, self).__init__(case, expected=expected) + def __init__(self, case, expected=None, **kwargs): + super(FakeTest, self).__init__(case, expected=expected, **kwargs) self._script = None self._requires_exe = False self._case._non_local = True @@ -1053,8 +1125,8 @@ def build_phase(self, sharedlib_only=False, model_only=False): class TESTBUILDFAILEXC(FakeTest): - def __init__(self, case): - FakeTest.__init__(self, case) + def __init__(self, case, **kwargs): + FakeTest.__init__(self, case, **kwargs) raise RuntimeError("Exception from init") diff --git a/CIME/SystemTests/system_tests_compare_n.py b/CIME/SystemTests/system_tests_compare_n.py index b9b53c8c561..5d7dc405304 100644 --- a/CIME/SystemTests/system_tests_compare_n.py +++ b/CIME/SystemTests/system_tests_compare_n.py @@ -40,7 +40,7 @@ """ from CIME.XML.standard_module_setup import * -from CIME.SystemTests.system_tests_common import SystemTestsCommon +from CIME.SystemTests.system_tests_common import SystemTestsCommon, fix_single_exe_case from CIME.case import Case from CIME.config import Config from CIME.test_status import * @@ -60,6 +60,8 @@ def __init__( run_descriptions=None, multisubmit=False, ignore_fieldlist_diffs=False, + dry_run=False, + **kwargs ): """ Initialize a SystemTestsCompareN object. 
Individual test cases that @@ -84,7 +86,7 @@ def __init__( the cases as identical. (This is needed for tests where one case exercises an option that produces extra diagnostic fields.) """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) self._separate_builds = separate_builds self._ignore_fieldlist_diffs = ignore_fieldlist_diffs @@ -129,7 +131,8 @@ def __init__( self._cases[0] = self._case self._caseroots = self._get_caseroots() - self._setup_cases_if_not_yet_done() + if not dry_run: + self._setup_cases_if_not_yet_done() self._multisubmit = ( multisubmit and self._cases[0].get_value("BATCH_SYSTEM") != "none" @@ -504,6 +507,7 @@ def _setup_case(self, i): self._activate_case(i) self._common_setup() self._case_setup(i) + fix_single_exe_case(self._cases[i]) if i == 0: # Flush the case so that, if errors occur later, then at least base case is # in a correct, post-setup state. This is important because the mere @@ -516,6 +520,7 @@ def _setup_case(self, i): # This assures that case one namelists are populated # and creates the case.test script self._case.case_setup(test_mode=False, reset=True) + fix_single_exe_case(self._case) else: # Go back to base case to ensure that's where we are for any following code self._activate_case(0) diff --git a/CIME/SystemTests/system_tests_compare_two.py b/CIME/SystemTests/system_tests_compare_two.py index bdbe47ce6db..c58cfa372ba 100644 --- a/CIME/SystemTests/system_tests_compare_two.py +++ b/CIME/SystemTests/system_tests_compare_two.py @@ -45,7 +45,7 @@ """ from CIME.XML.standard_module_setup import * -from CIME.SystemTests.system_tests_common import SystemTestsCommon +from CIME.SystemTests.system_tests_common import SystemTestsCommon, fix_single_exe_case from CIME.case import Case from CIME.config import Config from CIME.test_status import * @@ -66,6 +66,8 @@ def __init__( multisubmit=False, ignore_fieldlist_diffs=False, case_two_keep_init_generated_files=False, + dry_run=False, + **kwargs ): 
""" Initialize a SystemTestsCompareTwo object. Individual test cases that @@ -98,7 +100,7 @@ def __init__( is provided for the sake of specific tests, e.g., a test of the behavior of running with init_generated_files in place. """ - SystemTestsCommon.__init__(self, case) + SystemTestsCommon.__init__(self, case, **kwargs) self._separate_builds = separate_builds self._ignore_fieldlist_diffs = ignore_fieldlist_diffs @@ -136,7 +138,9 @@ def __init__( # _setup_cases_if_not_yet_done self._case2 = None - self._setup_cases_if_not_yet_done() + # Prevent additional setup_case calls when detecting support for `--single-exe` + if not dry_run: + self._setup_cases_if_not_yet_done() self._multisubmit = ( multisubmit and self._case1.get_value("BATCH_SYSTEM") != "none" @@ -548,6 +552,7 @@ def _setup_cases(self): # This assures that case one namelists are populated # and creates the case.test script self._case.case_setup(test_mode=False, reset=True) + fix_single_exe_case(self._case) # Set up case 2 with self._case2: @@ -555,6 +560,8 @@ def _setup_cases(self): self._common_setup() self._case_two_setup() + fix_single_exe_case(self._case2) + # Go back to case 1 to ensure that's where we are for any following code self._activate_case1() diff --git a/CIME/SystemTests/test_utils/user_nl_utils.py b/CIME/SystemTests/test_utils/user_nl_utils.py index eab45921c95..930d683666b 100644 --- a/CIME/SystemTests/test_utils/user_nl_utils.py +++ b/CIME/SystemTests/test_utils/user_nl_utils.py @@ -8,7 +8,7 @@ def append_to_user_nl_files(caseroot, component, contents): """ - Append the string given by 'contents' to the end of each user_nl file for + Append the string(s) given by 'contents' to the end of each user_nl file for the given component (there may be multiple such user_nl files in the case of a multi-instance test). @@ -25,9 +25,13 @@ def append_to_user_nl_files(caseroot, component, contents): matching the pattern 'user_nl_clm*'. (We do a wildcard match to handle multi-instance tests.) 
- contents (str): Contents to append to the end of each user_nl file + contents (str or list-like): Contents to append to the end of each user_nl + file. If list-like, each item will be appended on its own line. """ + if isinstance(contents, str): + contents = [contents] + files = _get_list_of_user_nl_files(caseroot, component) if len(files) == 0: @@ -35,7 +39,9 @@ def append_to_user_nl_files(caseroot, component, contents): for one_file in files: with open(one_file, "a") as user_nl_file: - user_nl_file.write("\n" + contents + "\n") + user_nl_file.write("\n") + for c in contents: + user_nl_file.write(c + "\n") def _get_list_of_user_nl_files(path, component): diff --git a/CIME/SystemTests/tsc.py b/CIME/SystemTests/tsc.py index d95e8b03ea3..3ecaefe75d0 100644 --- a/CIME/SystemTests/tsc.py +++ b/CIME/SystemTests/tsc.py @@ -49,11 +49,11 @@ class TSC(SystemTestsCommon): - def __init__(self, case): + def __init__(self, case, **kwargs): """ initialize an object interface to the TSC test """ - super(TSC, self).__init__(case) + super(TSC, self).__init__(case, **kwargs) if self._case.get_value("MODEL") == "e3sm": self.atmmod = "eam" self.lndmod = "elm" diff --git a/CIME/Tools/Makefile b/CIME/Tools/Makefile index 1e6d0f1896c..ff8bb42ce53 100644 --- a/CIME/Tools/Makefile +++ b/CIME/Tools/Makefile @@ -541,7 +541,7 @@ cospsimulator_intr.o: $(COSP_LIBDIR)/libcosp.a endif ifdef FV3CORE_LIBDIR -$(FV3CORE_LIBDIR)/libfv3core.a: $(EXEROOT)/FMS/libfms.a +$(FV3CORE_LIBDIR)/libfv3core.a: $(LIBROOT)/libfms.a $(MAKE) -C $(FV3CORE_LIBDIR) complib COMPLIB='$(FV3CORE_LIBDIR)/libfv3core.a' F90='$(FC)' CC='$(CC)' FFLAGS='$(FFLAGS) $(FC_AUTO_R8)' CFLAGS='$(CFLAGS)' INCLDIR='$(INCLDIR)' FC_TYPE='$(COMPILER)' dyn_grid.o: $(FV3CORE_LIBDIR)/libfv3core.a @@ -613,6 +613,9 @@ endif # Remove arch flag if it exists F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS)) +ifdef GPUFLAGS + F90_LDFLAGS += $(GPUFLAGS) +endif # Machine stuff to appear last on the link step ifndef MLIBS @@ -875,7 +878,7 @@ endif ifdef 
FV3CORE_LIBDIR ULIBDEP += $(FV3CORE_LIBDIR)/libfv3core.a - ULIBDEP += $(EXEROOT)/FMS/libfms.a + ULIBDEP += $(LIBROOT)/libfms.a endif ifdef MPAS_LIBDIR @@ -911,7 +914,7 @@ GENF90 ?= $(CIMEROOT)/CIME/non_py/externals/genf90/genf90.pl ifeq ($(MPILIB),mpi-serial) MPISERIAL = $(INSTALL_SHAREDPATH)/lib/libmpi-serial.a - MLIBS += $(MPISERIAL) + MLIBS += -L$(INSTALL_SHAREDPATH)/lib -lmpi-serial CMAKE_OPTS += -DMPI_C_INCLUDE_PATH=$(INSTALL_SHAREDPATH)/include \ -DMPI_Fortran_INCLUDE_PATH=$(INSTALL_SHAREDPATH)/include \ -DMPI_C_LIBRARIES=$(INSTALL_SHAREDPATH)/lib/libmpi-serial.a \ diff --git a/CIME/Tools/case.build b/CIME/Tools/case.build index c8e4d54c467..4edf177198e 100755 --- a/CIME/Tools/case.build +++ b/CIME/Tools/case.build @@ -80,6 +80,14 @@ def parse_command_line(args, description): help="Build each component one at a time, separately, with output going to separate logs", ) + parser.add_argument( + "--skip-submit", + action="store_true", + help="Sets the current test phase to RUN, skipping the SUBMIT phase. This " + "may be useful if rebuilding the model while this test is in the batch queue. 
" + "ONLY USE IF A TEST CASE, OTHERWISE IGNORED.", + ) + parser.add_argument( "--dry-run", action="store_true", @@ -173,6 +181,7 @@ def parse_command_line(args, description): args.separate_builds, args.ninja, args.dry_run, + args.skip_submit, ) @@ -191,6 +200,7 @@ def _main_func(description): separate_builds, ninja, dry_run, + skip_submit, ) = parse_command_line(sys.argv, description) success = True @@ -234,6 +244,7 @@ def _main_func(description): ninja=ninja, dry_run=dry_run, separate_builds=separate_builds, + skip_submit=skip_submit, ) else: diff --git a/CIME/Tools/cs.status b/CIME/Tools/cs.status index 3db5402b741..4ab7b7e8ecd 100755 --- a/CIME/Tools/cs.status +++ b/CIME/Tools/cs.status @@ -103,6 +103,13 @@ def parse_command_line(args, description): help="Test root used when --test-id is given", ) + parser.add_argument( + "--force-rebuild", + action="store_true", + help="When used with 'test-id', the" + "tests will have their 'BUILD_SHAREDLIB' phase reset to 'PEND'.", + ) + args = parser.parse_args(args[1:]) _validate_args(args) @@ -120,10 +127,17 @@ def parse_command_line(args, description): args.expected_fails_file, args.test_id, args.test_root, + args.force_rebuild, ) def _validate_args(args): + if args.force_rebuild: + expect( + args.test_id != [], + "Cannot force a rebuild without 'test-id'", + ) + expect( not (args.summary and args.count_fails), "--count-fails cannot be specified with --summary", @@ -158,6 +172,7 @@ def _main_func(description): expected_fails_file, test_ids, test_root, + force_rebuild, ) = parse_command_line(sys.argv, description) for test_id in test_ids: test_paths.extend( @@ -172,6 +187,7 @@ def _main_func(description): check_throughput=check_throughput, check_memory=check_memory, expected_fails_filepath=expected_fails_file, + force_rebuild=force_rebuild, ) diff --git a/CIME/Tools/jenkins_generic_job b/CIME/Tools/jenkins_generic_job index 210c557edc2..66dbbdc6e31 100755 --- a/CIME/Tools/jenkins_generic_job +++ 
b/CIME/Tools/jenkins_generic_job @@ -174,6 +174,12 @@ OR help="Fail if memory check fails (fail if tests footprint grows)", ) + parser.add_argument( + "--ignore-memleak", + action="store_true", + help="Do not fail if there are memleaks", + ) + parser.add_argument( "--pes-file", help="Full pathname of an optional pes specification file. The file" @@ -252,6 +258,7 @@ OR args.update_success, args.check_throughput, args.check_memory, + args.ignore_memleak, args.pes_file, args.jenkins_id, args.queue, @@ -281,6 +288,7 @@ def _main_func(description): update_success, check_throughput, check_memory, + ignore_memleak, pes_file, jenkins_id, queue, @@ -308,6 +316,7 @@ def _main_func(description): update_success, check_throughput, check_memory, + ignore_memleak, pes_file, jenkins_id, queue, diff --git a/CIME/Tools/jenkins_script b/CIME/Tools/jenkins_script deleted file mode 100755 index c1d1728c2cf..00000000000 --- a/CIME/Tools/jenkins_script +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# -# Wrapper around jenkins_generic_job that will allow output -# from that script to always be printed to the screen and -# recoverable if Jenkins is forced to kill the job. This is the -# script that should be used from Jenkins. 
-# - -SCRIPT_DIR=$( cd "$( dirname "$0" )" && pwd ) -DATE_STAMP=$(date "+%Y-%m-%d_%H%M%S") -export JENKINS_START_TIME=$(date "+%s") - -umask 002 - -$SCRIPT_DIR/jenkins_generic_job --submit-to-cdash --update-success "$@" >& JENKINS_$DATE_STAMP diff --git a/CIME/Tools/standard_script_setup.py b/CIME/Tools/standard_script_setup.py index 74ad6ae39ea..1faab6f0a89 100644 --- a/CIME/Tools/standard_script_setup.py +++ b/CIME/Tools/standard_script_setup.py @@ -41,6 +41,5 @@ def check_minimum_python_version(major, minor): import CIME.utils - CIME.utils.stop_buffering_output() import logging, argparse diff --git a/CIME/XML/archive_base.py b/CIME/XML/archive_base.py index 01297da9f0e..fa42e186937 100644 --- a/CIME/XML/archive_base.py +++ b/CIME/XML/archive_base.py @@ -3,11 +3,39 @@ """ from CIME.XML.standard_module_setup import * from CIME.XML.generic_xml import GenericXML +from CIME.utils import convert_to_type logger = logging.getLogger(__name__) class ArchiveBase(GenericXML): + def exclude_testing(self, compname): + """ + Checks if component should be excluded from testing. + """ + value = self._get_attribute(compname, "exclude_testing") + + if value is None: + return False + + return convert_to_type(value, "logical") + + def _get_attribute(self, compname, attr_name): + attrib = self.get_entry_attributes(compname) + + if attrib is None: + return None + + return attrib.get(attr_name, None) + + def get_entry_attributes(self, compname): + entry = self.get_entry(compname) + + if entry is None: + return None + + return self.attrib(entry) + def get_entry(self, compname): """ Returns an xml node corresponding to compname in comp_archive_spec @@ -117,7 +145,11 @@ def get_all_hist_files(self, casename, model, from_dir, suffix="", ref_case=None ext = ext[:-1] string = model + r"\d?_?(\d{4})?\." + ext if has_suffix: - string += "." + suffix + "$" + if not suffix in string: + string += r"\." 
+ suffix + "$" + + if not string.endswith("$"): + string += "$" logger.debug("Regex is {}".format(string)) pfile = re.compile(string) diff --git a/CIME/XML/env_batch.py b/CIME/XML/env_batch.py index d20c67c5ab3..7682ab6e1ee 100644 --- a/CIME/XML/env_batch.py +++ b/CIME/XML/env_batch.py @@ -15,6 +15,7 @@ get_batch_script_for_job, get_logging_options, format_time, + add_flag_to_cmd, ) from CIME.locked_files import lock_file, unlock_file from collections import OrderedDict @@ -555,7 +556,7 @@ def get_batch_directives(self, case, job, overrides=None, output_format="default return "\n".join(result) - def get_submit_args(self, case, job): + def get_submit_args(self, case, job, resolve=True): """ return a list of touples (flag, name) """ @@ -563,7 +564,7 @@ def get_submit_args(self, case, job): submit_arg_nodes = self._get_arg_nodes(case, bs_nodes) - submitargs = self._process_args(case, submit_arg_nodes, job) + submitargs = self._process_args(case, submit_arg_nodes, job, resolve=resolve) return submitargs @@ -597,7 +598,7 @@ def _get_arg_nodes(self, case, bs_nodes): return submit_arg_nodes - def _process_args(self, case, submit_arg_nodes, job): + def _process_args(self, case, submit_arg_nodes, job, resolve=True): submitargs = " " for arg in submit_arg_nodes: @@ -619,19 +620,25 @@ def _process_args(self, case, submit_arg_nodes, job): if " " in flag: flag, name = flag.split() if name: - if "$" in name: + if resolve and "$" in name: rflag = self._resolve_argument(case, flag, name, job) + # This is to prevent -gpu_type=none in qsub args + if rflag.endswith("=none"): + continue if len(rflag) > len(flag): submitargs += " {}".format(rflag) else: - submitargs += " {} {}".format(flag, name) + submitargs += " " + add_flag_to_cmd(flag, name) else: submitargs += " {}".format(flag) else: - try: - submitargs += self._resolve_argument(case, flag, name, job) - except ValueError: - continue + if resolve: + try: + submitargs += self._resolve_argument(case, flag, name, job) + except 
ValueError: + continue + else: + submitargs += " " + add_flag_to_cmd(flag, name) return submitargs @@ -697,13 +704,8 @@ def _resolve_argument(self, case, flag, name, job): if flag == "-q" and rval == "batch" and case.get_value("MACH") == "blues": # Special case. Do not provide '-q batch' for blues raise ValueError() - if ( - flag.rfind("=", len(flag) - 1, len(flag)) >= 0 - or flag.rfind(":", len(flag) - 1, len(flag)) >= 0 - ): - submitargs = " {}{}".format(flag, str(rval).strip()) - else: - submitargs = " {} {}".format(flag, str(rval).strip()) + + submitargs = " " + add_flag_to_cmd(flag, rval) return submitargs @@ -793,20 +795,10 @@ def submit_jobs( batch_job_id = None for _ in range(num_submit): for job, dependency in jobs: - if dependency is not None: - deps = dependency.split() - else: - deps = [] - dep_jobs = [] - if user_prereq is not None: - dep_jobs.append(user_prereq) - for dep in deps: - if dep in depid.keys() and depid[dep] is not None: - dep_jobs.append(str(depid[dep])) - if prev_job is not None: - dep_jobs.append(prev_job) + dep_jobs = get_job_deps(dependency, depid, prev_job, user_prereq) logger.debug("job {} depends on {}".format(job, dep_jobs)) + result = self._submit_single_job( case, job, @@ -961,10 +953,20 @@ def _submit_single_job( return - submitargs = self.get_submit_args(case, job) - args_override = self.get_value("BATCH_COMMAND_FLAGS", subgroup=job) - if args_override: - submitargs = args_override + submitargs = case.get_value("BATCH_COMMAND_FLAGS", subgroup=job, resolved=False) + + project = case.get_value("PROJECT", subgroup=job) + + if not project: + # If there is no project then we need to remove the project flag + if ( + batch_system == "pbs" or batch_system == "cobalt" + ) and " -A " in submitargs: + submitargs = submitargs.replace("-A", "") + elif batch_system == "lsf" and " -P " in submitargs: + submitargs = submitargs.replace("-P", "") + elif batch_system == "slurm" and " --account " in submitargs: + submitargs = 
submitargs.replace("--account", "") if dep_jobs is not None and len(dep_jobs) > 0: logger.debug("dependencies: {}".format(dep_jobs)) @@ -1387,3 +1389,41 @@ def make_all_batch_files(self, case): input_batch_script, job ) ) + + +def get_job_deps(dependency, depid, prev_job=None, user_prereq=None): + """ + Gather list of job batch ids that a job depends on. + + Parameters + ---------- + dependency : str + List of dependent job names. + depid : dict + Lookup where keys are job names and values are the batch id. + user_prereq : str + User requested dependency. + + Returns + ------- + list + List of batch ids that job depends on. + """ + deps = [] + dep_jobs = [] + + if user_prereq is not None: + dep_jobs.append(user_prereq) + + if dependency is not None: + # Match all words, excluding "and" and "or" + deps = re.findall(r"\b(?!and\b|or\b)\w+(?:\.\w+)?\b", dependency) + + for dep in deps: + if dep in depid and depid[dep] is not None: + dep_jobs.append(str(depid[dep])) + + if prev_job is not None: + dep_jobs.append(prev_job) + + return dep_jobs diff --git a/CIME/XML/env_build.py b/CIME/XML/env_build.py index 7bd805b1c0f..fe863e414ef 100644 --- a/CIME/XML/env_build.py +++ b/CIME/XML/env_build.py @@ -18,4 +18,19 @@ def __init__( initialize an object interface to file env_build.xml in the case directory """ schema = os.path.join(utils.get_schema_path(), "env_entry_id.xsd") + self._caseroot = case_root EnvBase.__init__(self, case_root, infile, schema=schema, read_only=read_only) + + def set_value(self, vid, value, subgroup=None, ignore_type=False): + """ + Set the value of an entry-id field to value + Returns the value or None if not found + subgroup is ignored in the general routine and applied in specific methods + """ + # Do not allow any of these to be the same as CASEROOT + if vid in ("EXEROOT", "OBJDIR", "LIBROOT"): + utils.expect(value != self._caseroot, f"Cannot set {vid} to CASEROOT") + + return super(EnvBuild, self).set_value( + vid, value, subgroup=subgroup, 
ignore_type=ignore_type + ) diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py index c7635573f95..76c6588901b 100644 --- a/CIME/XML/env_mach_pes.py +++ b/CIME/XML/env_mach_pes.py @@ -42,6 +42,8 @@ def get_value( resolved=True, subgroup=None, max_mpitasks_per_node=None, + max_cputasks_per_gpu_node=None, + ngpus_per_node=None, ): # pylint: disable=arguments-differ # Special variable NINST_MAX is used to determine the number of # drivers in multi-driver mode. @@ -58,7 +60,13 @@ def get_value( if "NTASKS" in vid or "ROOTPE" in vid: if max_mpitasks_per_node is None: max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") - if value is not None and value < 0: + if max_cputasks_per_gpu_node is None: + max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") + if ngpus_per_node is None: + ngpus_per_node = self.get_value("NGPUS_PER_NODE") + if (ngpus_per_node and value) and value < 0: + value = -1 * value * max_cputasks_per_gpu_node + elif value and value < 0: value = -1 * value * max_mpitasks_per_node # in the nuopc driver there is only one NINST value # so that NINST_{comp} = NINST @@ -154,6 +162,7 @@ def get_total_tasks(self, comp_classes, async_interface=False): tt = rootpe + nthrds * ((ntasks - 1) * pstrid + 1) maxrootpe = max(maxrootpe, rootpe) total_tasks = max(tt, total_tasks) + if asyncio_tasks: total_tasks = total_tasks + len(asyncio_tasks) if self.get_value("MULTI_DRIVER"): @@ -167,13 +176,24 @@ def get_tasks_per_node(self, total_tasks, max_thread_count): "totaltasks > 0 expected, totaltasks = {}".format(total_tasks), ) if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"): - tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") + if self.get_value("NGPUS_PER_NODE") > 0: + tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") + else: + tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") else: - tasks_per_node = min( - self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, - 
self.get_value("MAX_MPITASKS_PER_NODE"), - total_tasks, - ) + ngpus_per_node = self.get_value("NGPUS_PER_NODE") + if ngpus_per_node and ngpus_per_node > 0: + tasks_per_node = min( + self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, + self.get_value("MAX_CPUTASKS_PER_GPU_NODE"), + total_tasks, + ) + else: + tasks_per_node = min( + self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, + self.get_value("MAX_MPITASKS_PER_NODE"), + total_tasks, + ) return tasks_per_node if tasks_per_node > 0 else 1 def get_total_nodes(self, total_tasks, max_thread_count): diff --git a/CIME/XML/env_mach_specific.py b/CIME/XML/env_mach_specific.py index 03e84f0faee..4652f2a7d0a 100644 --- a/CIME/XML/env_mach_specific.py +++ b/CIME/XML/env_mach_specific.py @@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None): def _compute_actions(self, nodes, child_tag, case, job=None): result = [] # list of tuples ("name", "argument") - compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB") + compiler = case.get_value("COMPILER") + mpilib = case.get_value("MPILIB") for node in nodes: if self._match_attribs(self.attrib(node), case, job=job): diff --git a/CIME/XML/generic_xml.py b/CIME/XML/generic_xml.py index c2a8364e090..a45ca766ee7 100644 --- a/CIME/XML/generic_xml.py +++ b/CIME/XML/generic_xml.py @@ -474,7 +474,7 @@ def write(self, outfile=None, force_write=False): # xmllint provides a better format option for the output file xmllint = find_executable("xmllint") - if xmllint is not None: + if xmllint: if isinstance(outfile, str): run_cmd_no_fail( "{} --format --output {} -".format(xmllint, outfile), @@ -691,8 +691,9 @@ def validate_xml_file(self, filename, schema): expect(os.path.isfile(filename), "xml file not found {}".format(filename)) expect(os.path.isfile(schema), "schema file not found {}".format(schema)) xmllint = find_executable("xmllint") + expect( - os.path.isfile(xmllint), + xmllint and os.path.isfile(xmllint), " xmllint not 
found in PATH, xmllint is required for cime. PATH={}".format( os.environ["PATH"] ), diff --git a/CIME/XML/grids.py b/CIME/XML/grids.py index 819838edddd..e34aacf2d01 100644 --- a/CIME/XML/grids.py +++ b/CIME/XML/grids.py @@ -25,6 +25,10 @@ def __init__(self, infile=None, files=None, comp_interface=None): infile = files.get_value("GRIDS_SPEC_FILE") logger.debug(" Grid specification file is {}".format(infile)) schema = files.get_schema("GRIDS_SPEC_FILE") + expect( + os.path.isfile(infile) and os.access(infile, os.R_OK), + f" grid file not found {infile}", + ) try: GenericXML.__init__(self, infile, schema) except: diff --git a/CIME/XML/machines.py b/CIME/XML/machines.py index 25d7841a50e..1b45cf5b580 100644 --- a/CIME/XML/machines.py +++ b/CIME/XML/machines.py @@ -41,9 +41,12 @@ def __init__(self, infile=None, files=None, machine=None, extra_machines_dir=Non logger.debug("Verifying using schema {}".format(schema)) self.machines_dir = os.path.dirname(infile) + if os.path.exists(infile): + checked_files.append(infile) + else: + expect(False, f"file not found {infile}") GenericXML.__init__(self, infile, schema) - checked_files.append(infile) # Append the contents of $HOME/.cime/config_machines.xml if it exists. 
# @@ -326,26 +329,12 @@ def get_default_MPIlib(self, attributes=None): def is_valid_compiler(self, compiler): """ Check the compiler is valid for the current machine - - >>> machobj = Machines(machine="cori-knl") - >>> machobj.get_default_compiler() - 'intel' - >>> machobj.is_valid_compiler("gnu") - True - >>> machobj.is_valid_compiler("nag") - False """ return self.get_field_from_list("COMPILERS", reqval=compiler) is not None def is_valid_MPIlib(self, mpilib, attributes=None): """ Check the MPILIB is valid for the current machine - - >>> machobj = Machines(machine="cori-knl") - >>> machobj.is_valid_MPIlib("mpi-serial") - True - >>> machobj.is_valid_MPIlib("fake-mpi") - False """ return ( mpilib == "mpi-serial" @@ -356,14 +345,6 @@ def is_valid_MPIlib(self, mpilib, attributes=None): def has_batch_system(self): """ Return if this machine has a batch system - - >>> machobj = Machines(machine="cori-knl") - >>> machobj.has_batch_system() - True - >>> machobj.set_machine("melvin") - 'melvin' - >>> machobj.has_batch_system() - False """ result = False batch_system = self.get_optional_child("BATCH_SYSTEM", root=self.machine_node) diff --git a/CIME/XML/tests.py b/CIME/XML/tests.py index 297659b2c33..4a9eefc0fc4 100644 --- a/CIME/XML/tests.py +++ b/CIME/XML/tests.py @@ -5,6 +5,9 @@ from CIME.XML.generic_xml import GenericXML from CIME.XML.files import Files +from CIME.utils import find_system_test +from CIME.SystemTests.system_tests_compare_two import SystemTestsCompareTwo +from CIME.SystemTests.system_tests_compare_n import SystemTestsCompareN logger = logging.getLogger(__name__) @@ -27,6 +30,33 @@ def __init__(self, infile=None, files=None): if os.path.isfile(infile): self.read(infile) + def support_single_exe(self, case): + """Checks if case supports --single-exe. + + Raises: + Exception: If system test cannot be found. + Exception: If `case` does not support --single-exe. 
+ """ + testname = case.get_value("TESTCASE") + + try: + test = find_system_test(testname, case)(case, dry_run=True) + except Exception as e: + raise e + else: + # valid if subclass is SystemTestsCommon or _separate_builds is false + valid = ( + not issubclass(type(test), SystemTestsCompareTwo) + and not issubclass(type(test), SystemTestsCompareN) + ) or not test._separate_builds + + if not valid: + case_base_id = case.get_value("CASEBASEID") + + raise Exception( + f"{case_base_id} does not support the '--single-exe' option as it requires separate builds" + ) + def get_test_node(self, testname): logger.debug("Get settings for {}".format(testname)) node = self.get_child("test", {"NAME": testname}) diff --git a/CIME/build.py b/CIME/build.py index d512991504d..b8d481b80d8 100644 --- a/CIME/build.py +++ b/CIME/build.py @@ -171,6 +171,7 @@ def generate_makefile_macro(case, caseroot): "gptl", "csm_share", "csm_share_cpl7", + "mpi-serial", ] ) cmake_macro = os.path.join(caseroot, "Macros.cmake") @@ -245,10 +246,24 @@ def get_standard_cmake_args(case, sharedpath): cmake_args += " -Dcompile_threaded={} ".format( stringify_bool(case.get_build_threaded()) ) + # check settings for GPU + gpu_type = case.get_value("GPU_TYPE") + gpu_offload = case.get_value("GPU_OFFLOAD") + if gpu_type != "none": + expect( + gpu_offload != "none", + "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is", + ) + cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}" + else: + expect( + gpu_offload == "none", + "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is", + ) ocn_model = case.get_value("COMP_OCN") - atm_model = case.get_value("COMP_ATM") - if ocn_model == "mom" or atm_model == "fv3gfs": + atm_dycore = case.get_value("CAM_DYCORE") + if ocn_model == "mom" or (atm_dycore and atm_dycore == "fv3"): cmake_args += " -DUSE_FMS=TRUE " cmake_args += " -DINSTALL_SHAREDPATH={} ".format( @@ -265,6 +280,7 @@ def get_standard_cmake_args(case, sharedpath): for var in 
_CMD_ARGS_FOR_BUILD: cmake_args += xml_to_make_variable(case, var, cmake=True) + atm_model = case.get_value("COMP_ATM") if atm_model == "scream": cmake_args += xml_to_make_variable(case, "HOMME_TARGET", cmake=True) @@ -471,59 +487,63 @@ def _build_model_cmake( os.makedirs(build_dir) # Components-specific cmake args. Cmake requires all component inputs to be available - # regardless of requested build list - cmp_cmake_args = "" - all_models = [] - files = Files(comp_interface=comp_interface) - for model, _, _, _, config_dir in complist: - # Create the Filepath and CIME_cppdefs files - if model == "cpl": - config_dir = os.path.join( - files.get_value("COMP_ROOT_DIR_CPL"), "cime_config" - ) - - cmp_cmake_args += _create_build_metadata_for_component( - config_dir, libroot, bldroot, case - ) - all_models.append(model) - - # Call CMake - cmake_args = get_standard_cmake_args(case, sharedpath) - cmake_env = "" - ninja_path = os.path.join(srcroot, "externals/ninja/bin") - if ninja: - cmake_args += " -GNinja " - cmake_env += "PATH={}:$PATH ".format(ninja_path) - - # Glue all pieces together: - # - cmake environment - # - common (i.e. project-wide) cmake args - # - component-specific cmake args - # - path to src folder + # regardless of requested build list. We do not want to re-invoke cmake + # if it has already been called. 
do_timing = "/usr/bin/time -p " if os.path.exists("/usr/bin/time") else "" - cmake_cmd = "{} {}cmake {} {} {}/components".format( - cmake_env, do_timing, cmake_args, cmp_cmake_args, srcroot - ) - stat = 0 - if dry_run: - logger.info("CMake cmd:\ncd {} && {}\n\n".format(bldroot, cmake_cmd)) - else: - logger.info( - "Configuring full {} model with output to file {}".format( - cime_model, bldlog + if not os.path.exists(os.path.join(bldroot, "CMakeCache.txt")): + cmp_cmake_args = "" + all_models = [] + files = Files(comp_interface=comp_interface) + for model, _, _, _, config_dir in complist: + # Create the Filepath and CIME_cppdefs files + if model == "cpl": + config_dir = os.path.join( + files.get_value("COMP_ROOT_DIR_CPL"), "cime_config" + ) + + cmp_cmake_args += _create_build_metadata_for_component( + config_dir, libroot, bldroot, case ) + all_models.append(model) + + # Call CMake + cmake_args = get_standard_cmake_args(case, sharedpath) + cmake_env = "" + ninja_path = os.path.join(srcroot, "externals/ninja/bin") + if ninja: + cmake_args += " -GNinja " + cmake_env += "PATH={}:$PATH ".format(ninja_path) + + # Glue all pieces together: + # - cmake environment + # - common (i.e. 
project-wide) cmake args + # - component-specific cmake args + # - path to src folder + cmake_cmd = "{} {}cmake {} {} {}/components".format( + cmake_env, do_timing, cmake_args, cmp_cmake_args, srcroot ) - logger.info(" Calling cmake directly, see top of log file for specific call") - with open(bldlog, "w") as fd: - fd.write("Configuring with cmake cmd:\n{}\n\n".format(cmake_cmd)) + stat = 0 + if dry_run: + logger.info("CMake cmd:\ncd {} && {}\n\n".format(bldroot, cmake_cmd)) + else: + logger.info( + "Configuring full {} model with output to file {}".format( + cime_model, bldlog + ) + ) + logger.info( + " Calling cmake directly, see top of log file for specific call" + ) + with open(bldlog, "w") as fd: + fd.write("Configuring with cmake cmd:\n{}\n\n".format(cmake_cmd)) - # Add logging before running - cmake_cmd = "({}) >> {} 2>&1".format(cmake_cmd, bldlog) - stat = run_cmd(cmake_cmd, from_dir=bldroot)[0] - expect( - stat == 0, - "BUILD FAIL: cmake config {} failed, cat {}".format(cime_model, bldlog), - ) + # Add logging before running + cmake_cmd = "({}) >> {} 2>&1".format(cmake_cmd, bldlog) + stat = run_cmd(cmake_cmd, from_dir=bldroot)[0] + expect( + stat == 0, + "BUILD FAIL: cmake config {} failed, cat {}".format(cime_model, bldlog), + ) # Set up buildlist if not buildlist: @@ -756,8 +776,9 @@ def _build_libraries( libs.append("CDEPS") ocn_model = case.get_value("COMP_OCN") - atm_model = case.get_value("COMP_ATM") - if ocn_model == "mom" or atm_model == "fv3gfs": + + atm_dycore = case.get_value("CAM_DYCORE") + if ocn_model == "mom" or (atm_dycore and atm_dycore == "fv3"): libs.append("FMS") files = Files(comp_interface=comp_interface) @@ -1111,6 +1132,7 @@ def _case_build_impl( ninst_build = case.get_value("NINST_BUILD") smp_value = case.get_value("SMP_VALUE") clm_use_petsc = case.get_value("CLM_USE_PETSC") + mpaso_use_petsc = case.get_value("MPASO_USE_PETSC") cism_use_trilinos = case.get_value("CISM_USE_TRILINOS") mali_use_albany = 
case.get_value("MALI_USE_ALBANY") mach = case.get_value("MACH") @@ -1135,7 +1157,7 @@ def _case_build_impl( # the future there may be others -- so USE_PETSC will be true if # ANY of those are true. - use_petsc = clm_use_petsc + use_petsc = bool(clm_use_petsc) or bool(mpaso_use_petsc) case.set_value("USE_PETSC", use_petsc) # Set the overall USE_TRILINOS variable to TRUE if any of the diff --git a/CIME/case/case.py b/CIME/case/case.py index 4924baf8cda..6de8bb2a217 100644 --- a/CIME/case/case.py +++ b/CIME/case/case.py @@ -74,6 +74,7 @@ class Case(object): This class extends across multiple files, class members external to this file are listed in the following imports + """ from CIME.case.case_setup import case_setup @@ -123,6 +124,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False self._env_generic_files = [] self._files = [] self._comp_interface = None + self.gpu_enabled = False self._non_local = non_local self.read_xml() @@ -275,6 +277,9 @@ def initialize_derived_attributes(self): if max_gpus_per_node: self.ngpus_per_node = self.get_value("NGPUS_PER_NODE") + # update the maximum MPI tasks for a GPU node (could differ from a pure-CPU node) + if self.ngpus_per_node > 0: + max_mpitasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0)) smt_factor = max( @@ -451,6 +456,15 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None): return [] def get_value(self, item, attribute=None, resolved=True, subgroup=None): + if item == "GPU_ENABLED": + if not self.gpu_enabled: + if ( + self.get_value("GPU_TYPE") != "none" + and self.get_value("NGPUS_PER_NODE") > 0 + ): + self.gpu_enabled = True + return "true" if self.gpu_enabled else "false" + result = None for env_file in self._files: # Wait and resolve in self rather than in env_file @@ -1141,7 +1155,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib): comment = None 
force_tasks = None force_thrds = None - if match1: opti_tasks = match1.group(1) if opti_tasks.isdigit(): @@ -1211,7 +1224,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib): pstrid = pes_pstrid[pstrid_str] if pstrid_str in pes_pstrid else 1 totaltasks.append((ntasks + rootpe) * nthrds) - mach_pes_obj.set_value(ntasks_str, ntasks) mach_pes_obj.set_value(nthrds_str, nthrds) mach_pes_obj.set_value(rootpe_str, rootpe) @@ -1262,6 +1274,8 @@ def configure( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ): expect( @@ -1344,6 +1358,7 @@ def configure( and "MPILIB" not in x and "MAX_MPITASKS_PER_NODE" not in x and "MAX_TASKS_PER_NODE" not in x + and "MAX_CPUTASKS_PER_GPU_NODE" not in x and "MAX_GPUS_PER_NODE" not in x ] @@ -1378,6 +1393,7 @@ def configure( for name in ( "MAX_TASKS_PER_NODE", "MAX_MPITASKS_PER_NODE", + "MAX_CPUTASKS_PER_GPU_NODE", "MAX_GPUS_PER_NODE", ): dmax = machobj.get_value(name, {"compiler": compiler}) @@ -1385,13 +1401,23 @@ def configure( dmax = machobj.get_value(name) if dmax: self.set_value(name, dmax) + elif name == "MAX_CPUTASKS_PER_GPU_NODE": + logger.debug( + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) + ) elif name == "MAX_GPUS_PER_NODE": logger.debug( - "Variable {} not defined for machine {}".format(name, machine_name) + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) ) else: logger.warning( - "Variable {} not defined for machine {}".format(name, machine_name) + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) ) machdir = machobj.get_machines_dir() @@ -1509,47 +1535,62 @@ def configure( self.set_value("TEST", True) # ---------------------------------------------------------------------------------------------------------- - # Sanity check: - # 1. 
We assume that there is always a string "gpu" in the compiler name if we want to enable GPU - # 2. For compilers without the string "gpu" in the name: - # 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as - # the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect). - # 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument - # must be set to 0. Otherwise, an error will be triggered. - # 3. For compilers with the string "gpu" in the name: - # 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered. - # 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE + # Sanity check for a GPU run: + # 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS + # 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE # XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically. - # 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically. + # 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically. 
# ---------------------------------------------------------------------------------------------------------- max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE") - if max_gpus_per_node: - if "gpu" in compiler: - if not ngpus_per_node: - ngpus_per_node = 1 - logger.warning( - "Setting ngpus_per_node to 1 for compiler {}".format(compiler) - ) - expect( - ngpus_per_node > 0, - " ngpus_per_node is expected > 0 for compiler {}; current value is {}".format( - compiler, ngpus_per_node - ), - ) - else: - expect( - ngpus_per_node == 0, - " ngpus_per_node is expected = 0 for compiler {}; current value is {}".format( - compiler, ngpus_per_node - ), - ) + if gpu_type and str(gpu_type).lower() != "none": + expect( + max_gpus_per_node, + f"GPUS are not defined for machine={machine_name} and compiler={compiler}", + ) + expect( + gpu_offload, + "Both gpu-type and gpu-offload must be defined if either is defined", + ) + expect( + compiler in ["nvhpc", "cray"], + f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ", + ) + valid_gpu_type = self.get_value("GPU_TYPE").split(",") + valid_gpu_type.remove("none") + expect( + gpu_type in valid_gpu_type, + f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}", + ) + valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",") + valid_gpu_offload.remove("none") + expect( + gpu_offload in valid_gpu_offload, + f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}", + ) + self.gpu_enabled = True if ngpus_per_node >= 0: self.set_value( "NGPUS_PER_NODE", - ngpus_per_node + max(1, ngpus_per_node) if ngpus_per_node <= max_gpus_per_node else max_gpus_per_node, ) + elif gpu_offload and str(gpu_offload).lower() != "none": + expect( + False, + "Both gpu-type and gpu-offload must be defined if either is defined", + ) + elif ngpus_per_node != 0: + expect( + False, + f"ngpus_per_node is expected to be 0 for a pure CPU run ; 
{ngpus_per_node} is provided instead ;", + ) + + # Set these two GPU XML variables here to overwrite the default values + # Only set them for "cesm" model + if self._cime_model == "cesm": + self.set_value("GPU_TYPE", str(gpu_type).lower()) + self.set_value("GPU_OFFLOAD", str(gpu_offload).lower()) self.initialize_derived_attributes() @@ -1586,6 +1627,13 @@ def configure( ) env_batch.set_job_defaults(bjobs, self) + # Set BATCH_COMMAND_FLAGS to the default values + + for job in bjobs: + if test and job[0] == "case.run" or not test and job[0] == "case.test": + continue + submitargs = env_batch.get_submit_args(self, job[0], resolve=False) + self.set_value("BATCH_COMMAND_FLAGS", submitargs, subgroup=job[0]) # Make sure that parallel IO is not specified if total_tasks==1 if self.total_tasks == 1: @@ -1723,7 +1771,10 @@ def _create_caseroot_sourcemods(self): if self._comp_interface == "nuopc": components.extend(["cdeps"]) - readme_message = """Put source mods for the {component} library in this directory. + readme_message_start = ( + "Put source mods for the {component} library in this directory." + ) + readme_message_end = """ WARNING: SourceMods are not kept under version control, and can easily become out of date if changes are made to the source code on which they @@ -1757,7 +1808,18 @@ def _create_caseroot_sourcemods(self): # to fail). readme_file = os.path.join(directory, "README") with open(readme_file, "w") as fd: - fd.write(readme_message.format(component=component)) + fd.write(readme_message_start.format(component=component)) + + if component == "cdeps": + readme_message_extra = """ + +Note that this subdirectory should only contain files from CDEPS's +dshr and streams source code directories. 
+Files related to specific data models should go in SourceMods subdirectories +for those data models (e.g., src.datm).""" + fd.write(readme_message_extra) + + fd.write(readme_message_end) if config.copy_cism_source_mods: # Note: this is CESM specific, given that we are referencing cism explitly @@ -2052,12 +2114,10 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None mpi_arg_string += " : " ngpus_per_node = self.get_value("NGPUS_PER_NODE") - if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank: - # 1. this setting is tested on Casper only and may not work on other machines - # 2. need to be revisited in the future for a more adaptable implementation - rundir = self.get_value("RUNDIR") - output_name = rundir + "/set_device_rank.sh" - mpi_arg_string = mpi_arg_string + " " + output_name + " " + if ngpus_per_node and ngpus_per_node > 0: + mpi_gpu_run_script = self.get_value("MPI_GPU_WRAPPER_SCRIPT") + if mpi_gpu_run_script: + mpi_arg_string = mpi_arg_string + " " + mpi_gpu_run_script return self.get_resolved_value( "{} {} {} {}".format( @@ -2354,6 +2414,8 @@ def create( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ): try: # Set values for env_case.xml @@ -2427,6 +2489,8 @@ def create( extra_machines_dir=extra_machines_dir, case_group=case_group, ngpus_per_node=ngpus_per_node, + gpu_type=gpu_type, + gpu_offload=gpu_offload, ) self.create_caseroot() diff --git a/CIME/case/case_clone.py b/CIME/case/case_clone.py index f829e13993c..737d26564b3 100644 --- a/CIME/case/case_clone.py +++ b/CIME/case/case_clone.py @@ -54,7 +54,10 @@ def create_clone( if os.path.isdir(os.path.join(newcase_cimeroot, "share")) and get_model() == "cesm": srcroot = newcase_cimeroot else: - srcroot = os.path.join(newcase_cimeroot, "..") + srcroot = self.get_value("SRCROOT") + if not srcroot: + srcroot = os.path.join(newcase_cimeroot, "..") + newcase = self.copy(newcasename, newcaseroot, 
newsrcroot=srcroot) with newcase: newcase.set_value("CIMEROOT", newcase_cimeroot) diff --git a/CIME/case/case_setup.py b/CIME/case/case_setup.py index aa8fb8b6b6c..841e90391d0 100644 --- a/CIME/case/case_setup.py +++ b/CIME/case/case_setup.py @@ -21,7 +21,6 @@ copy_local_macros_to_dir, ) from CIME.utils import batch_jobid -from CIME.utils import transform_vars from CIME.test_status import * from CIME.locked_files import unlock_file, lock_file @@ -142,12 +141,29 @@ def _create_macros_cmake( ############################################################################### if not os.path.isfile(os.path.join(caseroot, "Macros.cmake")): safe_copy(os.path.join(cmake_macros_dir, "Macros.cmake"), caseroot) - if not os.path.exists(os.path.join(caseroot, "cmake_macros")): - shutil.copytree(cmake_macros_dir, case_cmake_path) - copy_depends_files( - mach_obj.get_machine_name(), mach_obj.machines_dir, caseroot, compiler - ) + if not os.path.exists(case_cmake_path): + os.mkdir(case_cmake_path) + + # This impl is coupled to contents of Macros.cmake + os_ = mach_obj.get_value("OS") + mach = mach_obj.get_machine_name() + macros = [ + "universal.cmake", + os_ + ".cmake", + compiler + ".cmake", + "{}_{}.cmake".format(compiler, os), + mach + ".cmake", + "{}_{}.cmake".format(compiler, mach), + "CMakeLists.txt", + ] + for macro in macros: + repo_macro = os.path.join(cmake_macros_dir, macro) + case_macro = os.path.join(case_cmake_path, macro) + if not os.path.exists(case_macro) and os.path.exists(repo_macro): + safe_copy(repo_macro, case_cmake_path) + + copy_depends_files(mach, mach_obj.machines_dir, caseroot, compiler) ############################################################################### @@ -482,31 +498,3 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None): caseroot=caseroot, is_batch=is_batch, ) - - # put the following section here to make sure the rundir is generated first - machdir = self.get_value("MACHDIR") - mach = self.get_value("MACH") - 
ngpus_per_node = self.get_value("NGPUS_PER_NODE") - overrides = {} - overrides["ngpus_per_node"] = ngpus_per_node - input_template = os.path.join(machdir, "mpi_run_gpu.{}".format(mach)) - if os.path.isfile(input_template): - # update the wrapper script that sets the device id for each MPI rank - output_text = transform_vars( - open(input_template, "r").read(), case=self, overrides=overrides - ) - - # write it out to the run dir - rundir = self.get_value("RUNDIR") - output_name = os.path.join(rundir, "set_device_rank.sh") - logger.info("Creating file {}".format(output_name)) - with open(output_name, "w") as f: - f.write(output_text) - - # make the wrapper script executable - if os.path.isfile(output_name): - os.system("chmod +x " + output_name) - else: - expect( - False, "The file {} is not written out correctly.".format(output_name) - ) diff --git a/CIME/case/case_st_archive.py b/CIME/case/case_st_archive.py index 64005b13d09..8238cf2f912 100644 --- a/CIME/case/case_st_archive.py +++ b/CIME/case/case_st_archive.py @@ -364,8 +364,10 @@ def get_histfiles_for_restarts( histfiles = set() rest_hist_varname = archive.get_entry_value("rest_history_varname", archive_entry) if rest_hist_varname != "unset": - cmd = "ncdump -v {} {} ".format( - rest_hist_varname, os.path.join(rundir, restfile) + ncdump = shutil.which("ncdump") + expect(ncdump, "ncdump not found in path") + cmd = "{} -v {} {} ".format( + ncdump, rest_hist_varname, os.path.join(rundir, restfile) ) if testonly: out = "{} =".format(rest_hist_varname) @@ -1184,7 +1186,9 @@ def test_env_archive(self, testdir="env_archive_test"): for comp_archive_spec in comp_archive_specs: comp_expected = archive.get(comp_archive_spec, "compname") - if comp_expected == "ww3": + # Rename ww3 component when case and archive names don't match, + # specific to CESM. 
+ if comp_expected == "ww3" and "ww" in comps_in_case: comp_expected = "ww" comp_class = archive.get(comp_archive_spec, "compclass").upper() if comp_class in components: diff --git a/CIME/case/case_submit.py b/CIME/case/case_submit.py index cc996f2f50b..71484c9dea0 100644 --- a/CIME/case/case_submit.py +++ b/CIME/case/case_submit.py @@ -287,7 +287,8 @@ def check_case(self, skip_pnl=False, chksum=False): if not skip_pnl: self.create_namelists() # Must be called before check_all_input_data logger.info("Checking that inputdata is available as part of case submission") - self.check_all_input_data(chksum=chksum) + if not self.get_value("TEST"): + self.check_all_input_data(chksum=chksum) if self.get_value("COMP_WAV") == "ww": # the ww3 buildnml has dependencies on inputdata so we must run it again diff --git a/CIME/config.py b/CIME/config.py index 8491b2f3f2e..3cef6cc0530 100644 --- a/CIME/config.py +++ b/CIME/config.py @@ -177,11 +177,6 @@ def __init__(self): False, desc="If set to `True` then COMP_ROOT_DIR_CPL is set using UFS_DRIVER if defined.", ) - self._set_attribute( - "gpus_use_set_device_rank", - True, - desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` is appended when the MPI run command is generated.", - ) self._set_attribute( "test_custom_project_machine", "melvin", diff --git a/CIME/cs_status.py b/CIME/cs_status.py index 8b4c479b93d..6a65ca4da71 100644 --- a/CIME/cs_status.py +++ b/CIME/cs_status.py @@ -6,7 +6,7 @@ from __future__ import print_function from CIME.XML.standard_module_setup import * from CIME.XML.expected_fails_file import ExpectedFailsFile -from CIME.test_status import TestStatus +from CIME.test_status import TestStatus, SHAREDLIB_BUILD_PHASE, TEST_PEND_STATUS import os import sys from collections import defaultdict @@ -20,6 +20,7 @@ def cs_status( check_throughput=False, check_memory=False, expected_fails_filepath=None, + force_rebuild=False, out=sys.stdout, ): """Print the test statuses of all tests in 
test_paths. The default @@ -56,6 +57,11 @@ def cs_status( for test_path in test_paths: test_dir = os.path.dirname(test_path) ts = TestStatus(test_dir=test_dir) + + if force_rebuild: + with ts: + ts.set_status(SHAREDLIB_BUILD_PHASE, TEST_PEND_STATUS) + test_id = os.path.basename(test_dir).split(".")[-1] if summary: output = _overall_output( diff --git a/CIME/data/config/cesm/config_files.xml b/CIME/data/config/cesm/config_files.xml index a83773d2335..3268ccb33f0 100644 --- a/CIME/data/config/cesm/config_files.xml +++ b/CIME/data/config/cesm/config_files.xml @@ -413,7 +413,7 @@ $COMP_ROOT_DIR_ATM/cime_config/testdefs/testlist_cam.xml $COMP_ROOT_DIR_GLC/cime_config/testdefs/testlist_cism.xml $COMP_ROOT_DIR_LND/cime_config/testdefs/testlist_clm.xml - $COMP_ROOT_DIR_LND/cime_config/testdefs/testlist_clm.xml + $COMP_ROOT_DIR_LND/cime_config/testdefs/testlist_slim.xml $COMP_ROOT_DIR_ICE/cime_config/testdefs/testlist_cice.xml $COMP_ROOT_DIR_ICE/cime_config/testdefs/testlist_cice.xml $COMP_ROOT_DIR_OCN/cime_config/testdefs/testlist_pop.xml diff --git a/CIME/data/config/config_tests.xml b/CIME/data/config/config_tests.xml index 045029255ec..0352b5207ca 100644 --- a/CIME/data/config/config_tests.xml +++ b/CIME/data/config/config_tests.xml @@ -66,7 +66,7 @@ ERP pes counts hybrid (open-MP/MPI) restart bfb test from startup, default 6 do an 11 day initial test - write a restart at day 6 (suffix base) half the number of tasks and threads for each component do a 5 day restart test starting from restart at day 6 (suffix rest) - this is just like an ERS test but the pe-counts/threading count are modified on retart + this is just like an ERS test but the pe-counts/threading count are modified on restart ERI hybrid/branch/exact restart test, default (by default STOP_N is 22 days) (1) ref1case diff --git a/CIME/data/config/xml_schemas/config_archive.xsd b/CIME/data/config/xml_schemas/config_archive.xsd index bc366e6178a..cc7fe137ab8 100644 --- 
a/CIME/data/config/xml_schemas/config_archive.xsd +++ b/CIME/data/config/xml_schemas/config_archive.xsd @@ -6,7 +6,7 @@ - + @@ -50,6 +50,7 @@ + diff --git a/CIME/data/config/xml_schemas/config_machines.xsd b/CIME/data/config/xml_schemas/config_machines.xsd index d6e3c280a93..b025c4039e0 100644 --- a/CIME/data/config/xml_schemas/config_machines.xsd +++ b/CIME/data/config/xml_schemas/config_machines.xsd @@ -6,6 +6,8 @@ + + @@ -56,6 +58,10 @@ + + + + @@ -166,6 +172,16 @@ + + + + + + + +