
Merge branch 'master' into bugfix/fs_cache
Borda authored Feb 6, 2024
2 parents 3a019cb + b187bfd commit 0645248
Showing 21 changed files with 96 additions and 113 deletions.
4 changes: 3 additions & 1 deletion .azure/gpu-integrations.yml
@@ -33,7 +33,9 @@ jobs:
DEVICES: $( python -c 'name = "$(Agent.Name)" ; gpus = name.split("_")[-1] if "_" in name else "0,1"; print(gpus)' )
# these two caches assume to run repetitively on the same set of machines
TORCH_HOME: "/var/tmp/torch"
HF_HOME: "/var/tmp/huggingface"
TRANSFORMERS_CACHE: "/var/tmp/hf/transformers"
HF_HOME: "/var/tmp/hf/home"
HF_HUB_CACHE: "/var/tmp/hf/hub"
PIP_CACHE_DIR: "/var/tmp/pip"
container:
image: "$(docker-image)"
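The change above replaces the single HF_HOME entry with a split layout under /var/tmp/hf: a dedicated TRANSFORMERS_CACHE, HF_HOME, and HF_HUB_CACHE. As a rough sketch of how those variables resolve at runtime (the fallback defaults shown here are an assumption based on huggingface_hub's documented behaviour, not part of this diff):

import os

# Sketch: how the split Hugging Face cache layout resolves at runtime.
# HF_HOME is the umbrella directory, HF_HUB_CACHE holds downloaded hub
# snapshots (defaulting to "<HF_HOME>/hub" when unset), and TRANSFORMERS_CACHE
# is only consulted by older transformers releases.
hf_home = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
hub_cache = os.environ.get("HF_HUB_CACHE", os.path.join(hf_home, "hub"))
transformers_cache = os.environ.get("TRANSFORMERS_CACHE", hub_cache)
print(f"HF_HOME            -> {hf_home}")
print(f"HF_HUB_CACHE       -> {hub_cache}")
print(f"TRANSFORMERS_CACHE -> {transformers_cache}")

With the values set in this job, all three paths land under /var/tmp/hf, so a single directory can be kept warm across repeated runs on the same machines.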
28 changes: 13 additions & 15 deletions .azure/gpu-unittests.yml
@@ -42,7 +42,10 @@ jobs:
# these two caches assume to run repetitively on the same set of machines
# see: https://github.com/microsoft/azure-pipelines-agent/issues/4113#issuecomment-1439241481
TORCH_HOME: "/var/tmp/torch"
HF_HOME: "/var/tmp/huggingface"
TOKENIZERS_PARALLELISM: "false"
TRANSFORMERS_CACHE: "/var/tmp/hf/transformers"
HF_HOME: "/var/tmp/hf/home"
HF_HUB_CACHE: "/var/tmp/hf/hub"
PIP_CACHE_DIR: "/var/tmp/pip"
# MKL_THREADING_LAYER: "GNU"
MKL_SERVICE_FORCE_INTEL: 1
@@ -104,20 +107,17 @@ jobs:
displayName: "Sanity check"
- bash: |
printf "cache location: $(TORCH_HOME)\n"
mkdir -p $(TORCH_HOME) # in case cache was void
ls -lh $(TORCH_HOME)
printf "cache location: $(HF_HOME)\n"
mkdir -p $(HF_HOME) # in case cache was void
ls -lh $(HF_HOME)
pip install -q py-tree
py-tree /var/tmp/torch
py-tree /var/tmp/hf
displayName: "Show caches"
- bash: |
python -m pytest torchmetrics --cov=torchmetrics \
--timeout=240 --durations=50 \
--numprocesses=5 --dist=loadfile
python -m pytest torchmetrics -s --cov=torchmetrics \
--timeout=240 --durations=50
# --numprocesses=5 --dist=loadfile
env:
DOCTEST_DOWNLOAD_TIMEOUT: "240"
DOCTEST_DOWNLOAD_TIMEOUT: "180"
SKIP_SLOW_DOCTEST: "1"
workingDirectory: src
displayName: "DocTesting"
@@ -132,9 +132,8 @@ jobs:
- bash: |
python -m pytest unittests -v \
-m "not DDP" --numprocesses=5 --dist=loadfile \
--cov=torchmetrics --timeout=240 --durations=500
env:
CUDA_LAUNCH_BLOCKING: "1"
--cov=torchmetrics --timeout=240 --durations=200 \
--reruns 3 --reruns-delay 1
workingDirectory: tests
displayName: "UnitTesting common"
@@ -144,7 +143,6 @@ jobs:
--cov=torchmetrics --timeout=240 --durations=500
env:
USE_PYTEST_POOL: "1"
CUDA_LAUNCH_BLOCKING: "1"
workingDirectory: tests
displayName: "UnitTesting DDP"
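The common-unittest step now passes --reruns 3 --reruns-delay 1, which is provided by the pytest-rerunfailures plugin pinned in requirements/_tests.txt. The same behaviour can also be applied per test via the plugin's marker; a minimal sketch (the test body is hypothetical and only illustrates the API):

import random
import pytest

# Per-test equivalent of "--reruns 3 --reruns-delay 1": rerun a failing test
# up to three times, waiting one second between attempts.
@pytest.mark.flaky(reruns=3, reruns_delay=1)
def test_occasionally_flaky_metric():
    # Hypothetical flaky assertion, standing in for a GPU test that can fail
    # transiently (busy device, download hiccup, ...).
    assert random.random() > 0.2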
49 changes: 21 additions & 28 deletions .github/actions/pull-caches/action.yml
@@ -14,6 +14,10 @@ inputs:
description: location of local PyPI cache
required: false
default: "_ci-cache_PyPI"
pypi-key:
description: cache restore/dump key
required: false
default: "pypi-packages"

runs:
using: "composite"
@@ -38,18 +42,22 @@ runs:
- name: Define caches
id: cache_dirs
run: |
torch_cache=$(python -c "import os ; print(os.path.join(os.getcwd(), '_ci-cache_pytorch'))")
echo "TORCH_HOME=$torch_cache" >> $GITHUB_ENV
hf_cache=$(python -c "import os ; print(os.path.join(os.getcwd(), '_ci-cache_huggingface'))")
echo "HF_HOME=$hf_cache" >> $GITHUB_ENV
cache_dir=$(python -c "import os ; print(os.path.join(os.getcwd(), '_ci-cache'))")
echo "CACHES_DIR=${cache_dir}" >> $GITHUB_ENV
dir_sep=$(python -c "import os ; print(os.path.sep)")
echo "TORCH_HOME=${cache_dir}${dir_sep}torch" >> $GITHUB_ENV
echo "TRANSFORMERS_CACHE=${cache_dir}${dir_sep}transformers" >> $GITHUB_ENV
echo "HF_HOME=${cache_dir}${dir_sep}hf-home" >> $GITHUB_ENV
echo "HF_HUB_CACHE=${cache_dir}${dir_sep}hf-hub" >> $GITHUB_ENV
shell: bash

- name: Cache pip
continue-on-error: true
uses: actions/cache/restore@v3
with:
enableCrossOsArchive: true
path: ${{ inputs.pypi-dir }}
key: pypi-packages
key: ${{ inputs.pypi-key }}

- name: Restored Packages
run: |
@@ -58,32 +66,17 @@ runs:
ls -lh ${{ inputs.pypi-dir }}
shell: bash

- name: Cache Torch
continue-on-error: true
uses: actions/cache/restore@v3
with:
path: ${{ env.TORCH_HOME }}
key: cache-pytorch

- name: Restored PT
if: ${{ runner.os == 'Linux' }}
run: |
mkdir -p $TORCH_HOME
printf "list $TORCH_HOME:\n"
sudo apt install -q -y tree
tree -h $TORCH_HOME
shell: bash

- name: Cache HF
- name: Cache Torch & HF
continue-on-error: true
uses: actions/cache/restore@v3
with:
path: ${{ env.HF_HOME }}
key: cache-transformers
enableCrossOsArchive: true
path: ${{ env.CACHES_DIR }}
key: ci-caches

- name: Restored HF
- name: Restored Torch & HF
run: |
mkdir -p $HF_HOME
printf "list $HF_HOME:\n"
ls -lh $HF_HOME
mkdir -p $CACHES_DIR
pip install -q py-tree
py-tree $CACHES_DIR
shell: bash
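The "Define caches" step now derives every cache path from a single CACHES_DIR via inline python -c calls and exports them through $GITHUB_ENV, so later steps (and the matching push-caches action) see the same locations. A consolidated sketch of that logic as one script — the variable names match the step above, everything else is illustrative:

import os

# Mirror of the "Define caches" step: one cache root with per-tool
# subdirectories, joined with os.path so the same composite action works on
# Linux, macOS and Windows runners.
cache_dir = os.path.join(os.getcwd(), "_ci-cache")
env_updates = {
    "CACHES_DIR": cache_dir,
    "TORCH_HOME": os.path.join(cache_dir, "torch"),
    "TRANSFORMERS_CACHE": os.path.join(cache_dir, "transformers"),
    "HF_HOME": os.path.join(cache_dir, "hf-home"),
    "HF_HUB_CACHE": os.path.join(cache_dir, "hf-hub"),
}

# GitHub Actions applies "NAME=value" lines appended to the file named by
# GITHUB_ENV to the environment of all subsequent steps.
with open(os.environ["GITHUB_ENV"], "a") as fh:
    for name, value in env_updates.items():
        fh.write(f"{name}={value}\n")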
33 changes: 10 additions & 23 deletions .github/actions/push-caches/action.yml
@@ -47,9 +47,10 @@ runs:
ls -lh .pip-wheels
shell: bash

- name: Cache pull
- name: Cache pull packages
uses: actions/cache/restore@v3
with:
enableCrossOsArchive: true
path: ${{ inputs.pypi-dir }}
key: ${{ inputs.pypi-key }}

@@ -69,36 +70,22 @@ runs:
if: ${{ steps.wheels-diff.outputs.count-new != 0 }}
shell: bash

- name: Cache push
- name: Cache push packages
if: ${{ steps.wheels-diff.outputs.count-new != 0 }}
uses: actions/cache/save@v3
with:
enableCrossOsArchive: true
path: ${{ inputs.pypi-dir }}
key: ${{ inputs.pypi-key }}

- name: Post PT
if: ${{ runner.os == 'Linux' }}
run: |
printf "list $TORCH_HOME:\n"
tree -h $TORCH_HOME
shell: bash

- name: Cache Torch
continue-on-error: true
uses: actions/cache/save@v3
with:
path: ${{ env.TORCH_HOME }}
key: cache-pytorch

- name: Post HF
run: |
printf "list $HF_HOME:\n"
ls -lh $HF_HOME
- name: Post Torch & HF
run: py-tree $CACHES_DIR
shell: bash

- name: Cache HF
- name: Cache Torch & HF
continue-on-error: true
uses: actions/cache/save@v3
with:
path: ${{ env.HF_HOME }}
key: cache-transformers
enableCrossOsArchive: true
path: ${{ env.CACHES_DIR }}
key: ci-caches
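Both composite actions now list the shared cache with py-tree (installed in the restore step via pip install -q py-tree). If that package is ever unavailable, a rough stdlib stand-in for the listing could look like this — the output format only approximates py-tree's rendering:

import os

def print_tree(root: str, indent: str = "") -> None:
    """Recursively print a directory tree, roughly like running `py-tree <root>`."""
    if not os.path.isdir(root):
        print(f"{root} (missing)")
        return
    for name in sorted(os.listdir(root)):
        path = os.path.join(root, name)
        print(f"{indent}{name}{'/' if os.path.isdir(path) else ''}")
        if os.path.isdir(path):
            print_tree(path, indent + "    ")

# e.g. inspect the restored CI cache root defined by the pull-caches action
print_tree(os.environ.get("CACHES_DIR", "_ci-cache"))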
2 changes: 1 addition & 1 deletion .github/workflows/ci-integrate.yml
@@ -46,7 +46,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

4 changes: 2 additions & 2 deletions .github/workflows/ci-tests.yml
@@ -64,7 +64,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

@@ -177,7 +177,7 @@ jobs:
coverage report
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: tests/coverage.xml
2 changes: 1 addition & 1 deletion .github/workflows/cmd-help.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Update with comment
uses: peter-evans/create-or-update-comment@v3.1.0
uses: peter-evans/create-or-update-comment@v4.0.0
with:
token: ${{ secrets.PAT_GHOST }}
reaction-token: ${{ secrets.PAT_GHOST }}
4 changes: 2 additions & 2 deletions .github/workflows/cmd-rebase.yml
@@ -30,7 +30,7 @@ jobs:
git push --force-with-lease
- name: Update comment
uses: peter-evans/create-or-update-comment@v3.1.0
uses: peter-evans/create-or-update-comment@v4.0.0
with:
token: ${{ secrets.PAT_GHOST }}
repository: ${{ github.event.client_payload.github.payload.repository.full_name }}
@@ -43,7 +43,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Update comment
uses: peter-evans/create-or-update-comment@v3.1.0
uses: peter-evans/create-or-update-comment@v4.0.0
with:
token: ${{ secrets.PAT_GHOST }}
repository: ${{ github.event.client_payload.github.payload.repository.full_name }}
2 changes: 1 addition & 1 deletion .github/workflows/docs-build.yml
@@ -36,7 +36,7 @@ jobs:
timeout-minutes: 30
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: "3.9"

2 changes: 1 addition & 1 deletion .github/workflows/focus-diff.yml
@@ -17,7 +17,7 @@ jobs:
focus: ${{ steps.diff-domains.outputs.focus }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
#with:
# python-version: 3.8

4 changes: 2 additions & 2 deletions .github/workflows/publish-pkg.yml
@@ -18,7 +18,7 @@ jobs:
timeout-minutes: 10
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: 3.8

@@ -78,7 +78,7 @@ jobs:
needs: publish-pypi-test
runs-on: ubuntu-latest
steps:
- uses: juliangruber/sleep-action@v1
- uses: juliangruber/sleep-action@v2
with:
time: 5m

2 changes: 1 addition & 1 deletion .github/workflows/slash-cmd-dispatch.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Slash Command Dispatch
uses: peter-evans/slash-command-dispatch@v3.0.2
uses: peter-evans/slash-command-dispatch@v4.0.0
with:
token: ${{ secrets.PAT_GHOST }}
reaction-token: ${{ secrets.PAT_GHOST }}
4 changes: 2 additions & 2 deletions requirements/_tests.txt
@@ -1,13 +1,13 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

coverage ==7.4.0
coverage ==7.4.1
pytest ==7.4.4
pytest-cov ==4.1.0
pytest-doctestplus ==1.1.0
pytest-rerunfailures ==13.0
pytest-timeout ==2.2.0
pytest-xdist ==3.3.1
pytest-xdist ==3.5.0
phmdoctest ==1.4.0

psutil <5.10.0
8 changes: 4 additions & 4 deletions src/torchmetrics/functional/multimodal/clip_iqa.py
@@ -28,11 +28,11 @@
from transformers import CLIPModel as _CLIPModel
from transformers import CLIPProcessor as _CLIPProcessor

def _download_clip() -> None:
_CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
_CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
def _download_clip_for_iqa_metric() -> None:
_CLIPModel.from_pretrained("openai/clip-vit-base-patch16", resume_download=True)
_CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16", resume_download=True)

if not _try_proceed_with_timeout(_download_clip):
if not _try_proceed_with_timeout(_download_clip_for_iqa_metric):
__doctest_skip__ = ["clip_image_quality_assessment"]
else:
__doctest_skip__ = ["clip_image_quality_assessment"]
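This helper (and the similarly renamed ones in clip_score.py, bert.py, and the modular clip_iqa.py below) is wrapped by torchmetrics' internal _try_proceed_with_timeout, which gives up on a slow download after a deadline so the affected doctests are skipped instead of hanging the job; resume_download=True is meant to let from_pretrained pick up a partially fetched checkpoint on the next attempt. A minimal standard-library stand-in for such a guard (the real torchmetrics helper may differ in details):

import multiprocessing
from typing import Callable

def try_proceed_with_timeout(fn: Callable[[], None], timeout: float = 180.0) -> bool:
    """Run `fn` in a subprocess and report whether it finished within `timeout` seconds.

    If the download does not complete in time, the subprocess is terminated and
    False is returned, so the caller can mark the related doctests as skipped
    rather than blocking CI. `fn` must be a top-level (picklable) callable.
    """
    proc = multiprocessing.Process(target=fn)
    proc.start()
    proc.join(timeout)
    if proc.is_alive():
        proc.terminate()
        proc.join()
        return False
    return proc.exitcode == 0

Used as in the module above: when the guarded download fails or times out, __doctest_skip__ is populated so the expensive doctests are not executed.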
8 changes: 4 additions & 4 deletions src/torchmetrics/functional/multimodal/clip_score.py
@@ -29,11 +29,11 @@
from transformers import CLIPModel as _CLIPModel
from transformers import CLIPProcessor as _CLIPProcessor

def _download_clip() -> None:
_CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
_CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
def _download_clip_for_clip_score() -> None:
_CLIPModel.from_pretrained("openai/clip-vit-large-patch14", resume_download=True)
_CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14", resume_download=True)

if not _try_proceed_with_timeout(_download_clip):
if not _try_proceed_with_timeout(_download_clip_for_clip_score):
__doctest_skip__ = ["clip_score"]
else:
__doctest_skip__ = ["clip_score"]
8 changes: 4 additions & 4 deletions src/torchmetrics/functional/text/bert.py
@@ -39,12 +39,12 @@
if _TRANSFORMERS_GREATER_EQUAL_4_4:
from transformers import AutoModel, AutoTokenizer

def _download_model() -> None:
def _download_model_for_bert_score() -> None:
"""Download intensive operations."""
AutoTokenizer.from_pretrained(_DEFAULT_MODEL)
AutoModel.from_pretrained(_DEFAULT_MODEL)
AutoTokenizer.from_pretrained(_DEFAULT_MODEL, resume_download=True)
AutoModel.from_pretrained(_DEFAULT_MODEL, resume_download=True)

if _SKIP_SLOW_DOCTEST and not _try_proceed_with_timeout(_download_model):
if _SKIP_SLOW_DOCTEST and not _try_proceed_with_timeout(_download_model_for_bert_score):
__doctest_skip__ = ["bert_score"]
else:
__doctest_skip__ = ["bert_score"]
8 changes: 4 additions & 4 deletions src/torchmetrics/multimodal/clip_iqa.py
@@ -43,11 +43,11 @@
from transformers import CLIPModel as _CLIPModel
from transformers import CLIPProcessor as _CLIPProcessor

def _download_clip() -> None:
_CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
_CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
def _download_clip_iqa_metric() -> None:
_CLIPModel.from_pretrained("openai/clip-vit-large-patch14", resume_download=True)
_CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14", resume_download=True)

if not _try_proceed_with_timeout(_download_clip):
if not _try_proceed_with_timeout(_download_clip_iqa_metric):
__doctest_skip__ = ["CLIPImageQualityAssessment", "CLIPImageQualityAssessment.plot"]
else:
__doctest_skip__ = ["CLIPImageQualityAssessment", "CLIPImageQualityAssessment.plot"]
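In all four modules the outcome of the guarded download is surfaced through __doctest_skip__, a module-level list that pytest-doctestplus (also pinned in requirements/_tests.txt) reads at collection time to skip the named doctests. A tiny illustration of the hook — the module and function names here are made up:

# hypothetical_module.py -- illustrates the pytest-doctestplus skip hook
__doctest_skip__ = ["heavy_metric"]  # doctests of this function are skipped at collection

def heavy_metric(x: float) -> float:
    """Pretend this needs a large pretrained model that may not be downloadable.

    >>> heavy_metric(1.0)
    0.5
    """
    return x / 2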