Oneflow-Inc · strint · Nov 22, 2022 · Sep 15, 2022 · Sep 15, 2022 · Sep 15, 2022
diff --git a/.github/workflows/canary.yml b/.github/workflows/canary.yml
@@ -55,7 +55,7 @@ jobs:
       - name: Checkout Oneflow-Inc/oneflow
         if: ${{ github.event.inputs.oneflow-ref == '' }}
         uses: actions/checkout@v2
-      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow@support-cu118
         name: Build manylinux
         id: build-cuda
         with:

diff --git a/.github/workflows/on_merge.yml b/.github/workflows/on_merge.yml
@@ -15,6 +15,6 @@ jobs:
     if: github.event.pull_request.merged == true
     runs-on: ubuntu-latest
     steps:
-      - uses: Oneflow-Inc/get-oneflow/update-benchmark-history@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow/update-benchmark-history@support-cu118
         name: Update benchmark history
         timeout-minutes: 10
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -33,7 +33,7 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
-      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-cu118
         name: find cache
         id: find-cache
         timeout-minutes: 5
@@ -45,6 +45,7 @@ jobs:
             release
           oneflow-src: ${{ env.ONEFLOW_SRC }}
           entries: |
+            cu118
             cu116
             cu112
             cu102
@@ -74,7 +75,7 @@ jobs:
           python3 -m pip install -U setuptools wheel --user
           python3 -m pip install oss2  --user
       - uses: actions/checkout@v2
-      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow@support-cu118
         name: Build ${{ matrix.entry }}
         if: ${{ matrix.entry !='cpu' }}
         with:
@@ -97,7 +98,7 @@ jobs:
             3.8
             3.9
             3.10
-      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow@support-cu118
         name: Build ${{ matrix.entry }}
         if: ${{ matrix.entry =='cpu' }}
         with:

diff --git a/.github/workflows/simple.yml b/.github/workflows/simple.yml
@@ -245,7 +245,7 @@ jobs:
           repository: Oneflow-Inc/conda-env
           ref: 30a7f00eb48ee9009d85a848e720823e5054c66b
           path: conda-env
-      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow@support-cu118
         name: Build with gcc7
         if: ${{ matrix.build-type == 'gcc7'}}
         with:
@@ -254,7 +254,7 @@ jobs:
           oneflow-build-env: conda
           conda-env-file: conda-env/dev/gcc7/environment-v2.yml
           conda-env-name: oneflow-dev-gcc7-v2
-      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow@support-cu118
         name: Build with clang10
         if: ${{ matrix.build-type == 'clang10'}}
         with:

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -15,7 +15,7 @@ env:
   FLOW_VISION_SRC: flow_vision
   FLOW_VISION_COMMIT: ca8ebc663b58667cf8cd1b6ef0c861522780b7bb
   LIBAI_SRC: libai
-  LIBAI_COMMIT: 7d31d9781e5f2d559dc0820f599e0bed798488ca
+  LIBAI_COMMIT: 94eb85ff0131e8dfce953a3a916de7a4f897c647
   ONEFLOW_FACE_SRC: oneflow_face
   ONEFLOW_FACE_COMMIT: 110a97e8d5737a1f1856281a7df556a5ac8f06de
   ONEFLOW_IREE_SRC: oneflow_iree
@@ -29,7 +29,7 @@ jobs:
     runs-on: ubuntu-latest
     if: github.event.pull_request.draft == false && github.base_ref == 'master' && contains(github.event.pull_request.requested_reviewers.*.login, 'oneflow-ci-bot')
     steps:
-      - uses: Oneflow-Inc/get-oneflow/priority-pr@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow/priority-pr@support-cu118
         name: Check priority PR closed
         id: save-cache
         timeout-minutes: 5
@@ -163,7 +163,7 @@ jobs:
           fi
           echo "is_secrets_accessible=1" >> $GITHUB_ENV
       - name: Wait for GPU slot
-        uses: Oneflow-Inc/get-oneflow/wait-for-gpu@support-iree-ci
+        uses: Oneflow-Inc/get-oneflow/wait-for-gpu@support-cu118
         if: env.is_secrets_accessible == '1'
         timeout-minutes: 90
         continue-on-error: true
@@ -187,7 +187,7 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
-      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-cu118
         name: find cache
         id: find-cache
         timeout-minutes: 5
@@ -201,6 +201,8 @@ jobs:
           entries: |
             cu102
             cpu
+            cpu-asan-ubsan
+            cpu-tsan
             llvm13
 
   build-oneflow:
@@ -234,7 +236,7 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
-      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cu118
         name: Save cache if successful
         id: save-cache
         timeout-minutes: 5
@@ -248,7 +250,7 @@ jobs:
         run: |
           echo "::error file=test.yml,line=204,col=10::steps.save-cache.outputs.cache-hit != matrix.cache-hit"
           exit 1
-      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow@support-cu118
         name: Build manylinux ${{ matrix.entry }}
         id: build-cpu
         if: ${{ matrix.entry =='cpu' && !matrix.cache-hit }}
@@ -270,7 +272,28 @@ jobs:
           python-versions: |
             3.7
             3.8
-      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow@support-cu118  # build step for the two CPU sanitizer entries (cpu-asan-ubsan, cpu-tsan)
+        name: Build manylinux ${{ matrix.entry }}
+        id: build-cpu-sanitizers
+        if: ${{ (matrix.entry == 'cpu-asan-ubsan' || matrix.entry == 'cpu-tsan') && !matrix.cache-hit }}  # one step serves both sanitizer entries; skipped on a cache hit
+        with:
+          cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/${{ matrix.entry }}.cmake  # the CMake cache file is selected by the entry name
+          build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build.sh
+          run-lit: false
+          oneflow-src: ${{ env.ONEFLOW_SRC }}
+          oneflow-build-env: manylinux
+          wheelhouse-dir: ${{ env.WHEELHOUSE_DIR }}
+          clear-wheelhouse-dir: true
+          self-hosted: ${{ contains(matrix.runs-on, 'self-hosted') }}
+          cuda-version: none  # both entries handled here are cpu-* variants
+          manylinux-cache-dir: ${{ env.MANYLINUX_CACHE_DIR }}
+          docker-run-use-system-http-proxy: false
+          docker-run-use-lld: true
+          retry-failed-build: true
+          clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }}  # driven by the 'need-clean-ccache' PR label
+          python-versions: |
+            3.8
+      - uses: Oneflow-Inc/get-oneflow@support-cu118
         name: Build manylinux ${{ matrix.entry }}
         id: build-cuda
         if: ${{ matrix.entry =='cu102' && !matrix.cache-hit }}
@@ -290,7 +313,7 @@ jobs:
           clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }}
           python-versions: |
             3.7
-      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow@support-cu118
         name: Build ${{ matrix.entry }}
         if: ${{ matrix.entry == 'llvm13' && !matrix.cache-hit }}
         with:
@@ -329,7 +352,7 @@ jobs:
             })
       - name: Upload packed liboneflow
         if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm13' && matrix.entry != 'cu102_xla' }}
-        uses: Oneflow-Inc/get-oneflow/digest/upload@support-iree-ci
+        uses: Oneflow-Inc/get-oneflow/digest/upload@support-cu118
         timeout-minutes: 10
         with:
           digest: ${{ steps.save-cache.outputs.build-digest }}
@@ -340,7 +363,7 @@ jobs:
           dst-dir: cpack
       - name: Upload whl
         if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm13' && matrix.entry != 'cu102_xla' }}
-        uses: Oneflow-Inc/get-oneflow/digest/upload@support-iree-ci
+        uses: Oneflow-Inc/get-oneflow/digest/upload@support-cu118
         timeout-minutes: 10
         with:
           digest: ${{ steps.save-cache.outputs.build-digest }}
@@ -365,7 +388,7 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
-      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-cu118
         name: find cache
         id: find-cache
         timeout-minutes: 5
@@ -396,7 +419,7 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
-      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-cu118
         name: find cache
         id: find-cache
         timeout-minutes: 5
@@ -472,7 +495,7 @@ jobs:
         if: ${{ contains(matrix.runs-on, 'self-hosted') }}
         run: |
           docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true
-      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cu118
         name: Save cache if successful
         id: save-cache
         timeout-minutes: 5
@@ -488,7 +511,7 @@ jobs:
           exit 1
       - name: Download wheel and packed liboneflow
         if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
-        uses: Oneflow-Inc/get-oneflow/digest/download@support-iree-ci
+        uses: Oneflow-Inc/get-oneflow/digest/download@support-cu118
         id: download-digest
         timeout-minutes: 10
         with:
@@ -498,7 +521,7 @@ jobs:
           ssh-tank-path: ${{ env.SSH_TANK_PATH }}
       - name: Get primary node
         if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
-        uses: Oneflow-Inc/get-oneflow/master-address@support-iree-ci
+        uses: Oneflow-Inc/get-oneflow/master-address@support-cu118
         id: get-primary-node
         with:
           rank: ${{ matrix.rank }}
@@ -631,7 +654,7 @@ jobs:
       TEST_CONTAINER_NAME: "pr-${{ github.event.pull_request.number }}-run-id-${{ github.run_id }}-${{ matrix.entry }}-test"
       TEST_MANYLINUX_CONTAINER_NAME: "pr-${{ github.event.pull_request.number }}-run-id-${{ github.run_id }}-${{ matrix.entry }}-test-manylinux"
       TEST_WITH_TF_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-tf-2.3.0:2f831e9354298a11447578e869d983959feb046f
-      TEST_MANYLINUX_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/manylinux2014_x86_64_cuda10.2:4fd9cc268bbe59c6245ca3941b8264fd256a8670
+      TEST_MANYLINUX_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/manylinux2014_x86_64_cuda10.2:190c92408855fe17ae664f2de1a9d6f484b2da2b
       SSH_TANK_HOST: 192.168.1.13
       SSH_TANK_PATH: /tank
       METRICS_DIR: metrics
@@ -689,7 +712,7 @@ jobs:
         if: ${{ contains(matrix.runs-on, 'self-hosted') }}
         run: |
           docker rm -f ${{ env.TEST_MANYLINUX_CONTAINER_NAME }} || true
-      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cu118
         name: Save cache if successful
         id: save-cache
         timeout-minutes: 5
@@ -705,14 +728,34 @@ jobs:
           exit 1
       - name: Download wheel and packed liboneflow
         if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
-        uses: Oneflow-Inc/get-oneflow/digest/download@support-iree-ci
+        uses: Oneflow-Inc/get-oneflow/digest/download@support-cu118
         id: download-digest
         timeout-minutes: 10
         with:
           digest: ${{ steps.save-cache.outputs.build-digest }}
           entry: ${{ matrix.compute-platform }}
           ssh-tank-host: ${{ env.SSH_TANK_HOST }}
           ssh-tank-path: ${{ env.SSH_TANK_PATH }}
+      - name: Download ASAN and UBSAN wheel and packed liboneflow
+        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') && matrix.device == 'cpu' }}  # guard of the regular download step plus a cpu-device restriction
+        uses: Oneflow-Inc/get-oneflow/digest/download@support-cu118
+        id: asan-ubsan-download-digest  # outputs.entry-dir is read by the "Unzip packed sanitized liboneflow" step
+        timeout-minutes: 10
+        with:
+          digest: ${{ steps.save-cache.outputs.build-digest }}
+          entry: cpu-asan-ubsan  # fixed entry name; the regular download step uses matrix.compute-platform instead
+          ssh-tank-host: ${{ env.SSH_TANK_HOST }}
+          ssh-tank-path: ${{ env.SSH_TANK_PATH }}
+      - name: Download TSAN wheel and packed liboneflow
+        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') && matrix.device == 'cpu' }}  # same guard as the ASAN/UBSAN download above
+        uses: Oneflow-Inc/get-oneflow/digest/download@support-cu118
+        id: tsan-download-digest  # outputs.entry-dir is read by the "Unzip packed sanitized liboneflow" step
+        timeout-minutes: 10
+        with:
+          digest: ${{ steps.save-cache.outputs.build-digest }}
+          entry: cpu-tsan  # fixed entry name for the TSan build
+          ssh-tank-host: ${{ env.SSH_TANK_HOST }}
+          ssh-tank-path: ${{ env.SSH_TANK_PATH }}
       - name: Enable TF container
         if: ${{ fromJSON(matrix.is-single-client) }}
         run: |
@@ -765,6 +808,11 @@ jobs:
         if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') && !fromJson(matrix.is-xla) }}
         run: |
           unzip ${{ env.ONEFLOW_CPACK_PATH }}/liboneflow-ci-linux.zip
+      - name: Unzip packed sanitized liboneflow  # unpacks both sanitizer archives into asan-ubsan/ and tsan/
+        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') && !fromJson(matrix.is-xla) && matrix.device == 'cpu' }}  # guard of the regular unzip step plus the cpu-device restriction
+        run: |
+          unzip ${{ steps.asan-ubsan-download-digest.outputs.entry-dir }}/cpack/liboneflow-ci-linux.zip -d asan-ubsan
+          unzip ${{ steps.tsan-download-digest.outputs.entry-dir }}/cpack/liboneflow-ci-linux.zip -d tsan
       - name: Start container
         if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
         working-directory: ${{ env.ONEFLOW_SRC }}
@@ -825,6 +873,13 @@ jobs:
         timeout-minutes: 20
         run: |
           docker exec -e ONEFLOW_SERVING_DEBUG=1 ${{ env.TEST_MANYLINUX_CONTAINER_NAME }} ./liboneflow-ci-linux/bin/oneflow_cpp_api_testexe --gtest_filter=-Api.embedding*
+      - name: Exe test (C++ API with sanitizers)
+        if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'misc' && matrix.device == 'cpu' }}  # runs only for test-type 'misc' on device 'cpu'
+        timeout-minutes: 10
+        run: |
+          docker exec -e UBSAN_OPTIONS=suppressions=.ubsan-suppressions -e ASAN_OPTIONS=strict_string_checks=1:detect_stack_use_after_return=1 -e LSAN_OPTIONS=suppressions=.lsan-suppressions ${{ env.TEST_MANYLINUX_CONTAINER_NAME }} ./asan-ubsan/liboneflow-ci-linux/bin/oneflow_cpp_api_testexe --gtest_filter=Api.graph_\*
+          # Retry up to 5 times (the || chain stops at the first passing run) to avoid false positives caused by occasional lack of stack info
+          docker exec -e TSAN_OPTIONS="history_size=7 suppressions=.tsan-suppressions" ${{ env.TEST_MANYLINUX_CONTAINER_NAME }} bash -c "./tsan/liboneflow-ci-linux/bin/oneflow_cpp_api_testexe || ./tsan/liboneflow-ci-linux/bin/oneflow_cpp_api_testexe || ./tsan/liboneflow-ci-linux/bin/oneflow_cpp_api_testexe || ./tsan/liboneflow-ci-linux/bin/oneflow_cpp_api_testexe || ./tsan/liboneflow-ci-linux/bin/oneflow_cpp_api_testexe"
       - name: Test container
         if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
         run: |
@@ -950,7 +1005,7 @@ jobs:
         timeout-minutes: 30
         if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'misc' && matrix.device == 'cuda' }}
         run: |
-          docker exec -e ONEFLOW_TEST_DEVICE_NUM=4 -w $PWD/${{ env.ONEFLOW_FACE_SRC }} ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow.distributed.launch --nproc_per_node 4 -m unittest -f tests/train/test_train.py
+          docker exec -e ONEFLOW_TEST_DEVICE_NUM=4 -w $PWD/${{ env.ONEFLOW_FACE_SRC }} ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest tests/train/test_train.py
       - name: oneflow_iree test
         timeout-minutes: 45
         if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'misc' }}
@@ -978,10 +1033,20 @@ jobs:
         if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'misc' }}
         run: |
           docker exec -e ONEFLOW_TEST_DIR=$PWD/python/oneflow/test/tensor ${{ env.TEST_CONTAINER_NAME }} bash ci/test/generic_test_multi_client.sh
+      # The two mock-torch steps exec into the test container, so they carry the
+      # same cache-hit guard as the neighbouring docker-exec steps in this job.
+      - name: Test mocking torch by script
+        if: ${{ !fromJson(matrix.cache-hit) }}
+        run: |
+          docker exec ${{ env.TEST_CONTAINER_NAME }} bash -x ci/test/test_mock_script.sh
+      - name: Test mocking torch by function
+        if: ${{ !fromJson(matrix.cache-hit) }}
+        run: |
+          docker exec ${{ env.TEST_CONTAINER_NAME }} bash -x ci/test/test_mock_function.sh
       - name: Benchmark Test
         timeout-minutes: 100
         if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'benchmark' && matrix.device == 'cuda' }}
-        uses: Oneflow-Inc/get-oneflow/pytest-benchmark@support-iree-ci
+        uses: Oneflow-Inc/get-oneflow/pytest-benchmark@support-cu118
         with:
           collect-path: ${{ env.FLOW_VISION_SRC }}/benchmark
           container-name: ${{ env.TEST_CONTAINER_NAME }}
@@ -1043,7 +1104,7 @@ jobs:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
           fetch-depth: 0
-      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci
+      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cu118
         name: Save cache if successful
         id: save-cache
         timeout-minutes: 5

diff --git a/.lsan-suppressions b/.lsan-suppressions
@@ -0,0 +1 @@
+leak:CommandT
diff --git a/.tsan-suppressions b/.tsan-suppressions
@@ -0,0 +1,9 @@
+# These four groups of functions are intentionally thread-unsafe;
+# it is the user's responsibility to use them correctly.
+race:ThreadUnsafe
+race:thread_unsafe
+race:flying_instruction_cnt
+race:total_erased_instruction_cnt
+race:ToShape
+# glog
+race:google::
diff --git a/.ubsan-suppressions b/.ubsan-suppressions
@@ -0,0 +1,2 @@
+# llvm
+vptr:Class.cpp