Harden "Add annotations" workflow (pytorch#56071)

Summary: Resolves pytorch#55810 by closing some possible security holes due to using [GitHub Actions `${{ <expressions> }}`](https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#about-contexts-and-expressions) in `.github/workflows/add_annotations.yml` and also patching a few other possible scenarios that could cause the workflow to fail by a PR passing a malformed artifact. - [x] flag and remove GitHub Actions expressions in JS scripts - [x] don't fail the workflow if the artifact doesn't look as expected - [x] write unit tests for `tools/extract_scripts.py` Pull Request resolved: pytorch#56071 Test Plan: I tested the end-to-end "Lint" and "Add annotations" system in a separate sandbox repo, including the following cases: - well-formed artifact - missing artifact - artifact containing a file named `linter-output.zip` (name clash) - artifact whose `commit-sha.txt` doesn't contain a 40-digit hex string - artifact whose `commit-sha.txt` contains a 40-digit hex string that isn't a valid Git hash for the current repo - in this last case, the workflow does fail, but handling that is the responsibility of [pytorch/add-annotations-github-action](https://github.com/pytorch/add-annotations-github-action), not pytorch/pytorch To run the new unit tests added in this PR: ``` python tools/test/test_extract_scripts.py ``` Reviewed By: seemethere Differential Revision: D27807074 Pulled By: samestep fbshipit-source-id: e2d3cc5437fe80ff03d46237ebba289901bc567c
atalman · Apr 16, 2021 · c5e80d3 · c5e80d3
1 parent e387bd7
commit c5e80d3
Show file tree

Hide file tree

Showing 7 changed files with 156 additions and 44 deletions.
diff --git a/.github/workflows/add_annotations.yml b/.github/workflows/add_annotations.yml
@@ -15,7 +15,6 @@ jobs:
         name:
           - flake8-py3
           - clang-tidy
-    if: github.event.workflow_run.event == 'pull_request'
     runs-on: ubuntu-18.04
     steps:
       - name: Download artifact
@@ -26,10 +25,10 @@ jobs:
             const artifacts = await github.actions.listWorkflowRunArtifacts({
               owner: context.repo.owner,
               repo: context.repo.repo,
-              run_id: ${{ github.event.workflow_run.id }},
+              run_id: process.env.RUN_ID,
             });
             const filteredArtifacts = artifacts.data.artifacts.filter(artifact => {
-              return artifact.name == '${{ matrix.name }}';
+              return artifact.name == process.env.LINT_NAME;
             });
             if (filteredArtifacts.length > 0) {
               const matchArtifact = filteredArtifacts[0];
@@ -41,27 +40,27 @@ jobs:
               });
               const fs = require('fs');
               fs.writeFileSync(
-                '${{ github.workspace }}/linter-output.zip',
+                `${process.env.GITHUB_WORKSPACE}/linter-output.zip`,
                 Buffer.from(download.data),
               );
             }
+        env:
+          RUN_ID: ${{ github.event.workflow_run.id }}
+          LINT_NAME: ${{ matrix.name }}
       - name: Unzip artifact
         id: unzip
         run: |
-          FILENAME=linter-output.zip
-          EXISTS=$([ -f $FILENAME ]; echo $?)
-          echo ::set-output name=exists::"$EXISTS"
-          if [ "$EXISTS" -eq 0 ]; then
-            unzip $FILENAME
-            echo ::set-output name=commit-sha::"$(cat commit-sha.txt)"
+          if unzip linter-output.zip annotations.json commit-sha.txt; then
+            echo ::set-output \
+              name=sha::"$(grep -Em1 '^[[:xdigit:]]{40}$' commit-sha.txt)"
           fi
-      - if: steps.unzip.outputs.exists == '0' # i.e. true
+      - if: steps.unzip.outputs.sha
         name: Add annotations
         uses: pytorch/add-annotations-github-action@master
         with:
           check_name: ${{ matrix.name }}
           linter_output_path: annotations.json
-          commit_sha: ${{ steps.unzip.outputs.commit-sha }}
+          commit_sha: ${{ steps.unzip.outputs.sha }}
           mode: json
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -31,6 +31,8 @@ jobs:
         run: |
           pip install ruamel.yaml==0.17.4
           .github/scripts/lint_native_functions.py
+      - name: Extract scripts from GitHub Actions workflows
+        run: tools/extract_scripts.py --out=.extracted_scripts
       - name: ShellCheck
         # https://github.com/koalaman/shellcheck/tree/v0.7.1#installing-a-pre-compiled-binary
         run: |
@@ -40,9 +42,7 @@ jobs:
           sudo cp "shellcheck-${scversion}/shellcheck" /usr/bin/
           rm -r "shellcheck-${scversion}"
           shellcheck --version
-          EXTRACT_DIR=.shellcheck_generated
-          tools/extract_scripts.py --out=$EXTRACT_DIR
-          tools/run_shellcheck.sh .jenkins/pytorch $EXTRACT_DIR
+          tools/run_shellcheck.sh .jenkins/pytorch .extracted_scripts
       - name: Ensure correct trailing newlines
         run: |
           (! git grep -Il '' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude)**.expect' ':(exclude)tools/clang_format_hash' | tools/trailing_newlines.py || (echo "The above files do not have correct trailing newlines; please normalize them"; false))

diff --git a/.gitignore b/.gitignore
@@ -14,7 +14,7 @@ coverage.xml
 .gradle
 .hypothesis
 .mypy_cache
-/.shellcheck_generated/
+/.extracted_scripts/
 **/.pytorch-test-times
 **/.pytorch-slow-tests
 */*.pyc

diff --git a/mypy-strict.ini b/mypy-strict.ini
@@ -46,6 +46,7 @@ files =
     tools/pyi/*.py,
     tools/stats_utils/*.py,
     tools/test_history.py,
+    tools/test/test_extract_scripts.py,
     tools/test/test_mypy_wrapper.py,
     tools/test/test_test_history.py,
     tools/test/test_trailing_newlines.py,

diff --git a/tools/README.md b/tools/README.md
@@ -39,11 +39,14 @@ Developer tools which you might find useful:
 
 * [clang_tidy.py](clang_tidy.py) - Script for running clang-tidy
   on lines of your script which you changed.
-* [extract_scripts.py](extract_scripts.py) - Extract shell scripts from
-  `.github/workflows/*.yml` into a specified dir, on which
-  [run_shellcheck.sh](run_shellcheck.sh) can be run. Exits with nonzero status
-  if any of the extracted scripts contain [GitHub Actions expressions][]: `${{
-  <expression> }}`
+* [extract_scripts.py](extract_scripts.py) - Extract scripts from
+  `.github/workflows/*.yml` into a specified dir, on which linters such as
+  [run_shellcheck.sh](run_shellcheck.sh) can be run. Assumes that every `run`
+  script has `shell: bash` unless a different shell is explicitly listed on that
+  specific step (so `defaults` doesn't currently work), but also has some rules
+  for other situations such as [actions/github-script][]. Exits with nonzero
+  status if any of the extracted scripts contain [GitHub Actions expressions][]:
+  `${{<expression> }}`
 * [git_add_generated_dirs.sh](git_add_generated_dirs.sh) and
   [git_reset_generated_dirs.sh](git_reset_generated_dirs.sh) -
   Use this to force add generated files to your Git index, so that you
@@ -85,6 +88,7 @@ Tools which are only situationally useful:
 * [run-clang-tidy-in-ci.sh](run-clang-tidy-in-ci.sh) - Responsible
   for checking that C++ code is clang-tidy clean in CI on Travis
 
+[actions/github-script]: https://github.com/actions/github-script
 [clang-tidy]: https://clang.llvm.org/extra/clang-tidy/
 [flake8]: https://flake8.pycqa.org/en/latest/
 [github actions expressions]: https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#about-contexts-and-expressions

diff --git a/tools/extract_scripts.py b/tools/extract_scripts.py
@@ -1,24 +1,49 @@
 #!/usr/bin/env python3
 
-# these two pages have some relevant information:
-# https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions
-# https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners
-
 import argparse
 import re
 import sys
 from pathlib import Path
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import yaml
+from typing_extensions import TypedDict
+
+Step = Dict[str, Any]
+
 
-Job = Dict[str, Any]
+class Script(TypedDict):
+    extension: str
+    script: str
 
-windows_labels = {'windows-latest', 'windows-2019'}
 
+def extract(step: Step) -> Optional[Script]:
+    run = step.get('run')
 
-def get_default_shell(job: Job) -> str:
-    return 'pwsh' if job['runs-on'] in windows_labels else 'bash'
+    # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#using-a-specific-shell
+    shell = step.get('shell', 'bash')
+    extension = {
+        'bash': '.sh',
+        'pwsh': '.ps1',
+        'python': '.py',
+        'sh': '.sh',
+        'cmd': '.cmd',
+        'powershell': '.ps1',
+    }.get(shell)
+
+    is_gh_script = step.get('uses', '').startswith('actions/github-script@')
+    gh_script = step.get('with', {}).get('script')
+
+    if run is not None and extension is not None:
+        script = {
+            'bash': f'#!/usr/bin/env bash\nset -eo pipefail\n{run}',
+            'sh': f'#!/usr/bin/env sh\nset -e\n{run}',
+        }.get(shell, run)
+        return {'extension': extension, 'script': script}
+    elif is_gh_script and gh_script is not None:
+        return {'extension': '.js', 'script': gh_script}
+    else:
+        return None
 
 
 def main() -> None:
@@ -38,31 +63,29 @@ def main() -> None:
 
         for job_name, job in workflow['jobs'].items():
             job_dir = out / p / job_name
-            default_shell = get_default_shell(job)
             steps = job['steps']
             index_chars = len(str(len(steps) - 1))
             for i, step in enumerate(steps, start=1):
-                script = step.get('run')
-                if script:
-                    step_name = step['name']
+                extracted = extract(step)
+                if extracted:
+                    script = extracted['script']
+                    step_name = step.get('name', '')
                     if '${{' in script:
                         gha_expressions_found = True
                         print(
                             f'{p} job `{job_name}` step {i}: {step_name}',
                             file=sys.stderr
                         )
 
-                    if step.get('shell', default_shell) == 'bash':
-                        job_dir.mkdir(parents=True, exist_ok=True)
+                    job_dir.mkdir(parents=True, exist_ok=True)
 
-                        sanitized = re.sub(
-                            '[^a-zA-Z_]+', '_',
-                            f'_{step_name}',
-                        ).rstrip('_')
-                        filename = f'{i:0{index_chars}}{sanitized}.sh'
-                        (job_dir / filename).write_text(
-                            f'#!/usr/bin/env bash\nset -eo pipefail\n{script}'
-                        )
+                    sanitized = re.sub(
+                        '[^a-zA-Z_]+', '_',
+                        f'_{step_name}',
+                    ).rstrip('_')
+                    extension = extracted['extension']
+                    filename = f'{i:0{index_chars}}{sanitized}{extension}'
+                    (job_dir / filename).write_text(script)
 
     if gha_expressions_found:
         sys.exit(

diff --git a/tools/test/test_extract_scripts.py b/tools/test/test_extract_scripts.py
@@ -0,0 +1,85 @@
+import unittest
+
+from tools import extract_scripts
+
+requirements_sh = '''
+#!/usr/bin/env bash
+set -eo pipefail
+pip install -r requirements.txt
+'''.strip()
+
+hello_sh = '''
+#!/usr/bin/env sh
+set -e
+echo hello world
+'''.strip()
+
+
+class TestExtractScripts(unittest.TestCase):
+    def test_extract_none(self) -> None:
+        self.assertEqual(
+            extract_scripts.extract({
+                'name': 'Checkout PyTorch',
+                'uses': 'actions/checkout@v2',
+            }),
+            None,
+        )
+
+    def test_extract_run_default_bash(self) -> None:
+        self.assertEqual(
+            extract_scripts.extract({
+                'name': 'Install requirements',
+                'run': 'pip install -r requirements.txt',
+            }),
+            {
+                'extension': '.sh',
+                'script': requirements_sh,
+            },
+        )
+
+    def test_extract_run_sh(self) -> None:
+        self.assertEqual(
+            extract_scripts.extract({
+                'name': 'Hello world',
+                'run': 'echo hello world',
+                'shell': 'sh',
+            }),
+            {
+                'extension': '.sh',
+                'script': hello_sh,
+            },
+        )
+
+    def test_extract_run_py(self) -> None:
+        self.assertEqual(
+            extract_scripts.extract({
+                'name': 'Hello world',
+                'run': 'print("Hello!")',
+                'shell': 'python',
+            }),
+            {
+                'extension': '.py',
+                'script': 'print("Hello!")',
+            },
+        )
+
+    def test_extract_github_script(self) -> None:
+        self.assertEqual(
+            # https://github.com/actions/github-script/tree/v3.1.1#reading-step-results
+            extract_scripts.extract({
+                'uses': 'actions/github-script@v3',
+                'id': 'set-result',
+                'with': {
+                    'script': 'return "Hello!"',
+                    'result-encoding': 'string',
+                },
+            }),
+            {
+                'extension': '.js',
+                'script': 'return "Hello!"',
+            },
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()