From 47280aaba78b31c0ade1d5de0315657cb84b88ea Mon Sep 17 00:00:00 2001 From: Alex Chronopoulos Date: Mon, 17 Jun 2019 22:56:27 +0000 Subject: [PATCH] Bug 1557528 - Import dav1d from upstream to version 0.4.0. r=TD-Linux Differential Revision: https://phabricator.services.mozilla.com/D34997 --- media/libdav1d/asm/moz.build | 4 +- media/libdav1d/dav1d.rc | 10 +- media/libdav1d/moz.yaml | 2 +- media/libdav1d/vcs_version.h | 2 +- media/libdav1d/version.h | 4 +- third_party/dav1d/.gitlab-ci.yml | 22 + third_party/dav1d/NEWS | 19 + third_party/dav1d/README.md | 2 + third_party/dav1d/THANKS.md | 11 +- third_party/dav1d/dav1d_logo.png | 0 third_party/dav1d/doc/dav1d_logo.svg | 1 + third_party/dav1d/include/common/attributes.h | 4 +- third_party/dav1d/include/dav1d/dav1d.h | 2 + third_party/dav1d/meson.build | 46 +- third_party/dav1d/meson_options.txt | 4 + third_party/dav1d/src/arm/32/mc.S | 17 +- third_party/dav1d/src/arm/64/mc.S | 17 +- third_party/dav1d/src/arm/64/msac.S | 86 +++- third_party/dav1d/src/arm/asm.S | 1 + third_party/dav1d/src/arm/cdef_init_tmpl.c | 3 +- third_party/dav1d/src/arm/cpu.c | 16 +- .../dav1d/src/arm/loopfilter_init_tmpl.c | 2 +- .../dav1d/src/arm/looprestoration_init_tmpl.c | 4 +- third_party/dav1d/src/arm/msac.h | 50 +++ third_party/dav1d/src/cdef_tmpl.c | 2 +- third_party/dav1d/src/cpu.c | 4 +- third_party/dav1d/src/cpu.h | 2 + third_party/dav1d/src/env.h | 35 +- third_party/dav1d/src/internal.h | 1 + third_party/dav1d/src/intra_edge.c | 9 +- third_party/dav1d/src/ipred_prepare_tmpl.c | 25 +- third_party/dav1d/src/ipred_tmpl.c | 2 +- third_party/dav1d/src/itx_tmpl.c | 2 +- third_party/dav1d/src/lib.c | 30 +- third_party/dav1d/src/log.c | 6 +- third_party/dav1d/src/loopfilter_tmpl.c | 2 +- third_party/dav1d/src/looprestoration_tmpl.c | 2 +- third_party/dav1d/src/mc_tmpl.c | 28 +- third_party/dav1d/src/meson.build | 38 +- third_party/dav1d/src/msac.c | 30 +- third_party/dav1d/src/msac.h | 71 ++-- third_party/dav1d/src/obu.c | 13 +- 
third_party/dav1d/src/qm.c | 4 +- third_party/dav1d/src/recon_tmpl.c | 238 +++++++---- third_party/dav1d/src/tables.c | 2 +- third_party/dav1d/src/wedge.c | 26 +- third_party/dav1d/src/win32/thread.c | 16 +- third_party/dav1d/src/x86/cdef_init_tmpl.c | 2 +- third_party/dav1d/src/x86/cpu.c | 4 +- third_party/dav1d/src/x86/ipred_init_tmpl.c | 2 +- third_party/dav1d/src/x86/itx_init_tmpl.c | 2 +- .../dav1d/src/x86/loopfilter_init_tmpl.c | 2 +- .../dav1d/src/x86/looprestoration_init_tmpl.c | 3 +- third_party/dav1d/src/x86/mc.asm | 178 ++++---- third_party/dav1d/src/x86/mc_init_tmpl.c | 2 +- third_party/dav1d/src/x86/mc_ssse3.asm | 36 +- third_party/dav1d/src/x86/msac.asm | 398 ++++++++++++------ third_party/dav1d/src/x86/msac.h | 51 +++ third_party/dav1d/tests/checkasm/mc.c | 37 +- third_party/dav1d/tests/checkasm/msac.c | 90 +++- third_party/dav1d/tests/meson.build | 6 +- third_party/dav1d/tools/dav1d.c | 109 ++++- third_party/dav1d/tools/dav1d_cli_parse.c | 92 +++- third_party/dav1d/tools/dav1d_cli_parse.h | 8 + third_party/dav1d/tools/output/output.c | 2 + 65 files changed, 1350 insertions(+), 591 deletions(-) create mode 100644 third_party/dav1d/dav1d_logo.png create mode 100644 third_party/dav1d/doc/dav1d_logo.svg create mode 100644 third_party/dav1d/src/arm/msac.h create mode 100644 third_party/dav1d/src/x86/msac.h diff --git a/media/libdav1d/asm/moz.build b/media/libdav1d/asm/moz.build index 0f6797c40df46..dd9a2863e1664 100644 --- a/media/libdav1d/asm/moz.build +++ b/media/libdav1d/asm/moz.build @@ -73,6 +73,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'): EXPORTS.dav1d += [ '../../../third_party/dav1d/src/x86/cpu.h', + '../../../third_party/dav1d/src/x86/msac.h', ] # ASM source files @@ -86,7 +87,6 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'): '../../../third_party/dav1d/src/x86/loopfilter.asm', '../../../third_party/dav1d/src/x86/looprestoration.asm', '../../../third_party/dav1d/src/x86/mc.asm', - '../../../third_party/dav1d/src/x86/msac.asm', ] SOURCES += [ 
@@ -96,6 +96,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'): '../../../third_party/dav1d/src/x86/itx_ssse3.asm', '../../../third_party/dav1d/src/x86/looprestoration_ssse3.asm', '../../../third_party/dav1d/src/x86/mc_ssse3.asm', + '../../../third_party/dav1d/src/x86/msac.asm', ] # BITDEPTH @@ -142,6 +143,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64': ] EXPORTS += [ '../../../third_party/dav1d/src/arm/cpu.h', + '../../../third_party/dav1d/src/arm/msac.h', ] # BITDEPTH c file diff --git a/media/libdav1d/dav1d.rc b/media/libdav1d/dav1d.rc index 244cbc326f9a6..0623f861599be 100644 --- a/media/libdav1d/dav1d.rc +++ b/media/libdav1d/dav1d.rc @@ -1,7 +1,7 @@ -#define API_VERSION_NUMBER 1,1,0,0 -#define API_VERSION_NUMBER_STR "1.1.0" -#define PROJECT_VERSION_NUMBER 0,3,0,0 -#define PROJECT_VERSION_NUMBER_STR "0.3.0" +#define API_VERSION_NUMBER 2,0,0,0 +#define API_VERSION_NUMBER_STR "2.0.0" +#define PROJECT_VERSION_NUMBER 0,3,1,0 +#define PROJECT_VERSION_NUMBER_STR "0.3.1" #include @@ -22,7 +22,7 @@ BEGIN VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder" VALUE "InternalName", "dav1d" VALUE "OriginalFilename", "libdav1d.dll" - VALUE "LegalCopyright", "Copyright \251 2019 VideoLAN and dav1d Authors" + VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors" END END BLOCK "VarFileInfo" diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml index ab93f05c3b4ad..8a63a9c30177f 100644 --- a/media/libdav1d/moz.yaml +++ b/media/libdav1d/moz.yaml @@ -20,7 +20,7 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: commit a713643eadcf50c9f7fd2ea22a598127c959a723 (2019-05-09T07:52:54.000Z). + release: commit 3e3855bfb9935e4a3af714a6063c977320dc1acc (2019-06-05T15:12:17.000Z). 
# The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h index 0d5fd0a51142a..53e874e9c5aca 100644 --- a/media/libdav1d/vcs_version.h +++ b/media/libdav1d/vcs_version.h @@ -1,2 +1,2 @@ /* auto-generated, do not edit */ -#define DAV1D_VERSION "0.3.0-13-ga713643" +#define DAV1D_VERSION "0.3.1-40-g3e3855b" diff --git a/media/libdav1d/version.h b/media/libdav1d/version.h index 86b014e7e63d3..97efcb5184171 100644 --- a/media/libdav1d/version.h +++ b/media/libdav1d/version.h @@ -27,8 +27,8 @@ #ifndef DAV1D_VERSION_H #define DAV1D_VERSION_H -#define DAV1D_API_VERSION_MAJOR 1 -#define DAV1D_API_VERSION_MINOR 1 +#define DAV1D_API_VERSION_MAJOR 2 +#define DAV1D_API_VERSION_MINOR 0 #define DAV1D_API_VERSION_PATCH 0 #endif /* DAV1D_VERSION_H */ diff --git a/third_party/dav1d/.gitlab-ci.yml b/third_party/dav1d/.gitlab-ci.yml index e3c5c9876b777..5d12e0b874561 100644 --- a/third_party/dav1d/.gitlab-ci.yml +++ b/third_party/dav1d/.gitlab-ci.yml @@ -384,3 +384,25 @@ test-win64: - ninja -C build - cd build && time meson test -v dependencies: [] + +test-debian-aarch64: + image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457 + stage: test + tags: + - aarch64 + - debian + cache: + key: testdata.git-20190215 + paths: + - cache/dav1d-test-data.git/ + script: + - test -d cache || mkdir cache + - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master + - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git + - git clone cache/dav1d-test-data.git tests/dav1d-test-data + - meson build --buildtype release + -Dtestdata_tests=true + -Dlogging=false + - ninja -C build + - cd build && time meson test -v + dependencies: [] diff --git a/third_party/dav1d/NEWS b/third_party/dav1d/NEWS index 
aeb2d85248442..7489e86038c61 100644 --- a/third_party/dav1d/NEWS +++ b/third_party/dav1d/NEWS @@ -1,3 +1,22 @@ +Changes for 0.4.0 'Cheetah': +---------------------------- + + - Fix playback with unknown OBUs + - Add an option to limit the maximum frame size + - SSE2 and ARM64 optimizations for MSAC + - Improve speed on 32bits systems + - Optimization in obmc blend + + +Changes for 0.3.1 'Sailfish': +------------------------------ + + - Fix a buffer overflow in frame-threading mode on SSSE3 CPUs + - Reduce binary size, notably on Windows + - SSSE3 optimizations for ipred_filter + - ARM optimizations for MSAC + + Changes for 0.3.0 'Sailfish': ------------------------------ diff --git a/third_party/dav1d/README.md b/third_party/dav1d/README.md index bb8f7deb55df8..ed92711c02728 100644 --- a/third_party/dav1d/README.md +++ b/third_party/dav1d/README.md @@ -1,3 +1,5 @@ +![dav1d logo](dav1d_logo.png) + # dav1d **dav1d** is a new **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness. diff --git a/third_party/dav1d/THANKS.md b/third_party/dav1d/THANKS.md index cba0537f8a88b..98f8094832dd0 100644 --- a/third_party/dav1d/THANKS.md +++ b/third_party/dav1d/THANKS.md @@ -16,4 +16,13 @@ The Alliance for Open Media (AOM) for funding this project. And all the dav1d Authors (git shortlog -sn), including: -Janne Grunau, Ronald S. Bultje, Martin Storsjö, James Almer, Henrik Gramner, Marvin Scholz, Luc Trudeau, David Michael Barr, Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Steve Lhomme, Francois Cartegnie, Konstantin Pavlov, Nathan E. 
Egge, Victorien Le Couviour--Tuffet, Derek Buitenhuis, Liwei Wang, Raphaël Zumer, Michael Bradshaw, Niklas Haas, Xuefeng Jiang, Boyuan Xiao, Kyle Siefring, Matthias Dressel, Rupert Swarbrick, Thierry Foucu, Thomas Daede, Jan Beich, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier, Dale Curtis, Fred Barbier, Jean-Yves Avenard, Luca Barbato, Mark Shuttleworth, Nicolas Frattaroli, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Timo Gurr and skal. +Janne Grunau, Ronald S. Bultje, Martin Storsjö, Henrik Gramner, James Almer, +Marvin Scholz, Luc Trudeau, Jean-Baptiste Kempf, Victorien Le Couviour--Tuffet, +David Michael Barr, Hugo Beauzée-Luyssen, Steve Lhomme, Nathan E. Egge, +Francois Cartegnie, Konstantin Pavlov, Liwei Wang, Xuefeng Jiang, +Derek Buitenhuis, Raphaël Zumer, Niklas Haas, Michael Bradshaw, Kyle Siefring, +Raphael Zumer, Boyuan Xiao, Thierry Foucu, Matthias Dressel, Thomas Daede, +Rupert Swarbrick, Jan Beich, Dale Curtis, SmilingWolf, Tristan Laurent, +Vittorio Giovara, Rostislav Pehlivanov, Shiz, skal, Steinar Midtskogen, +Luca Barbato, Justin Bull, Jean-Yves Avenard, Timo Gurr, Fred Barbier, +Anisse Astier, Pablo Stebler, Nicolas Frattaroli, Mark Shuttleworth. 
diff --git a/third_party/dav1d/dav1d_logo.png b/third_party/dav1d/dav1d_logo.png new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/third_party/dav1d/doc/dav1d_logo.svg b/third_party/dav1d/doc/dav1d_logo.svg new file mode 100644 index 0000000000000..2795db8a9368e --- /dev/null +++ b/third_party/dav1d/doc/dav1d_logo.svg @@ -0,0 +1 @@ +dav1d diff --git a/third_party/dav1d/include/common/attributes.h b/third_party/dav1d/include/common/attributes.h index a9fd0faf23c2a..bd5e13d936d18 100644 --- a/third_party/dav1d/include/common/attributes.h +++ b/third_party/dav1d/include/common/attributes.h @@ -34,10 +34,12 @@ #ifdef __GNUC__ #define ATTR_ALIAS __attribute__((may_alias)) -#define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__printf__, fmt, attr))); +#define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__printf__, fmt, attr))) +#define COLD __attribute__((cold)) #else #define ATTR_ALIAS #define ATTR_FORMAT_PRINTF(fmt, attr) +#define COLD #endif #if ARCH_X86_64 diff --git a/third_party/dav1d/include/dav1d/dav1d.h b/third_party/dav1d/include/dav1d/dav1d.h index f56cd15d725f9..9afeae668cd1c 100644 --- a/third_party/dav1d/include/dav1d/dav1d.h +++ b/third_party/dav1d/include/dav1d/dav1d.h @@ -64,6 +64,8 @@ typedef struct Dav1dSettings { int apply_grain; int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31) int all_layers; ///< output all spatial layers of a scalable AV1 biststream + unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited) + uint8_t reserved[32]; ///< reserved for future use Dav1dPicAllocator allocator; Dav1dLogger logger; } Dav1dSettings; diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build index 8864527c162ad..bafe20a4fbd3f 100644 --- a/third_party/dav1d/meson.build +++ b/third_party/dav1d/meson.build @@ -23,14 +23,14 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
project('dav1d', ['c'], - version: '0.3.0', + version: '0.3.1', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', 'b_ndebug=if-release'], meson_version: '>= 0.47.0') -dav1d_soname_version = '1.1.0' +dav1d_soname_version = '2.0.0' dav1d_api_version_array = dav1d_soname_version.split('.') dav1d_api_version_major = dav1d_api_version_array[0] dav1d_api_version_minor = dav1d_api_version_array[1] @@ -81,6 +81,8 @@ cdata.set10('CONFIG_LOG', get_option('logging')) # Arguments in test_args will be used even on feature tests test_args = [] +optional_arguments = [] + # Define _POSIX_C_SOURCE to POSIX.1–2001 (IEEE Std 1003.1-2001) test_args += '-D_POSIX_C_SOURCE=200112L' add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c') @@ -96,10 +98,12 @@ if host_machine.system() == 'windows' cdata.set('fseeko', '_fseeki64') cdata.set('ftello', '_ftelli64') endif -endif -# On Windows, we use a compatibility layer to emulate pthread -if host_machine.system() == 'windows' + if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc') + optional_arguments += '-mcmodel=small' + endif + + # On Windows, we use a compatibility layer to emulate pthread thread_dependency = [] thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c')) else @@ -176,16 +180,23 @@ endif # Compiler flags that should be set # But when the compiler does not supports them # it is not an error and silently tolerated -optional_arguments = [ - '-Wundef', - '-Werror=vla', - '-Wno-maybe-uninitialized', - '-Wno-missing-field-initializers', - '-Wno-unused-parameter', - '-Werror=missing-prototypes', - '-Wshorten-64-to-32', -] -if cc.get_id() == 'msvc' +if cc.get_id() != 'msvc' + optional_arguments += [ + '-Wundef', + '-Werror=vla', + '-Wno-maybe-uninitialized', + '-Wno-missing-field-initializers', + '-Wno-unused-parameter', + '-Werror=missing-prototypes', + '-Wshorten-64-to-32', + ] + if host_machine.cpu_family() == 'x86' + optional_arguments += [ + '-msse2', + 
'-mfpmath=sse', + ] + endif +else optional_arguments += [ '-wd4028', # parameter different from declaration '-wd4996' # use of POSIX functions @@ -328,6 +339,11 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86') # check NASM version if nasm.found() nasm_r = run_command(nasm, '-v') + + if nasm_r.returncode() != 0 + error('failed running nasm to obtain its version') + endif + out = nasm_r.stdout().strip().split() if out[1].to_lower() == 'version' if out[2].version_compare('<2.13.02') diff --git a/third_party/dav1d/meson_options.txt b/third_party/dav1d/meson_options.txt index fe9112cda09a0..55a328b74b0aa 100644 --- a/third_party/dav1d/meson_options.txt +++ b/third_party/dav1d/meson_options.txt @@ -36,6 +36,10 @@ option('fuzzing_engine', value: 'none', description: 'Select the fuzzing engine') +option('fuzzer_ldflags', + type: 'string', + description: 'Extra LDFLAGS used during linking of fuzzing binaries') + option('stack_alignment', type: 'integer', value: 0) diff --git a/third_party/dav1d/src/arm/32/mc.S b/third_party/dav1d/src/arm/32/mc.S index 61dfd40c48b4c..542309a859c4d 100644 --- a/third_party/dav1d/src/arm/32/mc.S +++ b/third_party/dav1d/src/arm/32/mc.S @@ -1112,7 +1112,7 @@ L(\type\()_8tap_v_tbl): vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18 mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9 shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27 - b 48b + bgt 48b 0: vpop {q4} pop {r4-r11,pc} @@ -1145,7 +1145,7 @@ L(\type\()_8tap_v_tbl): 0: pop {r4-r11,pc} -880: // 8x8, 8x16, 8x32 v +880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 
640: @@ -1178,12 +1178,17 @@ L(\type\()_8tap_v_tbl): mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14 shift_store_8 \type, \d_strd, q3, d6, q4, d8 ble 9f - subs \h, \h, #4 - load_reg \sr2, \src, \s_strd, d30, d2, d4, d6 - vmovl_u8 q15, d30, q1, d2, q2, d4, q3, d6 + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, d30, d2 + vmovl_u8 q15, d30, q1, d2 mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1 + shift_store_8 \type, \d_strd, q8, d16, q9, d18 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, d4, d6 + vmovl_u8 q2, d4, q3, d6 mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3 - shift_store_8 \type, \d_strd, q8, d16, q9, d18, q10, d20, q11, d22 + shift_store_8 \type, \d_strd, q10, d20, q11, d22 ble 9f subs \h, \h, #4 load_reg \sr2, \src, \s_strd, d8, d16, d18, d20 diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S index 73cfb16836b0a..c85b89f137a34 100644 --- a/third_party/dav1d/src/arm/64/mc.S +++ b/third_party/dav1d/src/arm/64/mc.S @@ -1119,7 +1119,7 @@ L(\type\()_8tap_v): uxtl_b v18, v19, v20, v21 mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 shift_store_4 \type, \d_strd, v1, v2 - b 48b + b.gt 48b 0: ret @@ -1151,7 +1151,7 @@ L(\type\()_8tap_v): 0: ret -880: // 8x8, 8x16, 8x32 v +880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 
640: @@ -1183,12 +1183,17 @@ L(\type\()_8tap_v): mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v3, v4 b.le 9f - subs \h, \h, #4 - load_8b \sr2, \src, \s_strd, v27, v16, v17, v18 - uxtl_b v27, v16, v17, v18 + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v27, v16 + uxtl_b v27, v16 mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 + shift_store_8 \type, \d_strd, v1, v2 + b.le 9f + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v17, v18 + uxtl_b v17, v18 mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 - shift_store_8 \type, \d_strd, v1, v2, v3, v4 + shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #4 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S index 158638b94056a..b1bff59afe208 100644 --- a/third_party/dav1d/src/arm/64/msac.S +++ b/third_party/dav1d/src/arm/64/msac.S @@ -192,7 +192,7 @@ function msac_decode_symbol_adapt4_neon, export=1 sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i]) dup v6.8h, w4 // -rate - sub w3, w3, w3, lsr #5 // count + (count >= 32) + sub w3, w3, w3, lsr #5 // count - (count >= 32) sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 
1 : 0) sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate add w3, w3, #1 // count + (count < 32) @@ -215,6 +215,7 @@ L(renorm): eor w5, w5, #16 // d = clz(rng) ^ 16 mvn x7, x7 // ~dif add x7, x7, x3, lsl #48 // ~dif + (v << 48) +L(renorm2): lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d lsl x7, x7, x5 // (~dif + (v << 48)) << d @@ -278,3 +279,86 @@ function msac_decode_symbol_adapt16_neon, export=1 decode_update .8h, .16b, 16 b L(renorm) endfunc + +function msac_decode_bool_equi_neon, export=1 + ldp w5, w6, [x0, #RNG] // + CNT + sub sp, sp, #48 + ldr x7, [x0, #DIF] + bic w4, w5, #0xff // r &= 0xff00 + add w4, w4, #8 + subs x8, x7, x4, lsl #47 // dif - vw + lsr w4, w4, #1 // v + sub w5, w5, w4 // r - v + cset w15, lo + csel w4, w5, w4, hs // if (ret) v = r - v; + csel x7, x8, x7, hs // if (ret) dif = dif - vw; + + clz w5, w4 // clz(rng) + mvn x7, x7 // ~dif + eor w5, w5, #16 // d = clz(rng) ^ 16 + b L(renorm2) +endfunc + +function msac_decode_bool_neon, export=1 + ldp w5, w6, [x0, #RNG] // + CNT + sub sp, sp, #48 + ldr x7, [x0, #DIF] + lsr w4, w5, #8 // r >> 8 + bic w1, w1, #0x3f // f &= ~63 + mul w4, w4, w1 + lsr w4, w4, #7 + add w4, w4, #4 // v + subs x8, x7, x4, lsl #48 // dif - vw + sub w5, w5, w4 // r - v + cset w15, lo + csel w4, w5, w4, hs // if (ret) v = r - v; + csel x7, x8, x7, hs // if (ret) dif = dif - vw; + + clz w5, w4 // clz(rng) + mvn x7, x7 // ~dif + eor w5, w5, #16 // d = clz(rng) ^ 16 + b L(renorm2) +endfunc + +function msac_decode_bool_adapt_neon, export=1 + ldr w9, [x1] // cdf[0-1] + ldp w5, w6, [x0, #RNG] // + CNT + sub sp, sp, #48 + ldr x7, [x0, #DIF] + lsr w4, w5, #8 // r >> 8 + and w2, w9, #0xffc0 // f &= ~63 + mul w4, w4, w2 + lsr w4, w4, #7 + add w4, w4, #4 // v + subs x8, x7, x4, lsl #48 // dif - vw + sub w5, w5, w4 // r - v + cset w15, lo + csel w4, w5, w4, hs // if (ret) v = r - v; + csel x7, x8, x7, hs // if (ret) dif = dif - vw; + + ldr w10, [x0, #ALLOW_UPDATE_CDF] + + clz w5, w4 // clz(rng) + mvn x7, x7 
// ~dif + eor w5, w5, #16 // d = clz(rng) ^ 16 + + cbz w10, L(renorm2) + + lsr w2, w9, #16 // count = cdf[1] + and w9, w9, #0xffff // cdf[0] + + sub w3, w2, w2, lsr #5 // count - (count >= 32) + lsr w2, w2, #4 // count >> 4 + add w10, w3, #1 // count + (count < 32) + add w2, w2, #4 // rate = (count >> 4) | 4 + + sub w9, w9, w15 // cdf[0] -= bit + sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769} + asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate + sub w9, w9, w11 // cdf[0] + + strh w9, [x1] + strh w10, [x1, #2] + + b L(renorm2) +endfunc diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S index af96d5195177e..6b1d46fcd8108 100644 --- a/third_party/dav1d/src/arm/asm.S +++ b/third_party/dav1d/src/arm/asm.S @@ -37,6 +37,7 @@ .fpu neon .eabi_attribute 10, 0 // suppress Tag_FP_arch .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch + .section .note.GNU-stack,"",%progbits // Mark stack as non-executable #endif #ifdef _WIN32 diff --git a/third_party/dav1d/src/arm/cdef_init_tmpl.c b/third_party/dav1d/src/arm/cdef_init_tmpl.c index a7d58ff8fa35e..6719163a9f920 100644 --- a/third_party/dav1d/src/arm/cdef_init_tmpl.c +++ b/third_party/dav1d/src/arm/cdef_init_tmpl.c @@ -24,7 +24,6 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "common/attributes.h" #include "src/cpu.h" #include "src/cdef.h" @@ -72,7 +71,7 @@ DEFINE_FILTER(4, 4, 8) #endif -void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) { +COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; diff --git a/third_party/dav1d/src/arm/cpu.c b/third_party/dav1d/src/arm/cpu.c index e2767a04d65e7..e6c461ab5fe55 100644 --- a/third_party/dav1d/src/arm/cpu.c +++ b/third_party/dav1d/src/arm/cpu.c @@ -27,9 +27,13 @@ #include "config.h" +#include "common/attributes.h" + #include "src/arm/cpu.h" -#if defined(HAVE_GETAUXVAL) && ARCH_ARM +#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 +// NEON is always available; runtime tests are not needed. +#elif defined(HAVE_GETAUXVAL) && ARCH_ARM #include #ifndef HWCAP_ARM_NEON @@ -73,11 +77,9 @@ static unsigned parse_proc_cpuinfo(const char *flag) { } #endif -unsigned dav1d_get_cpu_flags_arm(void) { +COLD unsigned dav1d_get_cpu_flags_arm(void) { unsigned flags = 0; -#if ARCH_AARCH64 - flags |= DAV1D_ARM_CPU_FLAG_NEON; -#elif defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 flags |= DAV1D_ARM_CPU_FLAG_NEON; #elif defined(HAVE_GETAUXVAL) && ARCH_ARM unsigned long hw_cap = getauxval(AT_HWCAP); @@ -88,10 +90,6 @@ unsigned dav1d_get_cpu_flags_arm(void) { flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0; #elif defined(__ANDROID__) flags |= parse_proc_cpuinfo("neon") ? 
DAV1D_ARM_CPU_FLAG_NEON : 0; -#elif defined(__APPLE__) - flags |= DAV1D_ARM_CPU_FLAG_NEON; -#elif defined(_WIN32) - flags |= DAV1D_ARM_CPU_FLAG_NEON; #endif return flags; diff --git a/third_party/dav1d/src/arm/loopfilter_init_tmpl.c b/third_party/dav1d/src/arm/loopfilter_init_tmpl.c index de4e8d2ae2cc2..1394b767c4cc6 100644 --- a/third_party/dav1d/src/arm/loopfilter_init_tmpl.c +++ b/third_party/dav1d/src/arm/loopfilter_init_tmpl.c @@ -33,7 +33,7 @@ decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_neon); decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_neon); decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_neon); -void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) { +COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; diff --git a/third_party/dav1d/src/arm/looprestoration_init_tmpl.c b/third_party/dav1d/src/arm/looprestoration_init_tmpl.c index 31cabb724802a..07c56c77d6e6e 100644 --- a/third_party/dav1d/src/arm/looprestoration_init_tmpl.c +++ b/third_party/dav1d/src/arm/looprestoration_init_tmpl.c @@ -27,8 +27,6 @@ #include "src/cpu.h" #include "src/looprestoration.h" - -#include "common/attributes.h" #include "src/tables.h" #if BITDEPTH == 8 @@ -258,7 +256,7 @@ static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride, #endif // ARCH_AARCH64 #endif // BITDEPTH == 8 -void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) { +COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; diff --git a/third_party/dav1d/src/arm/msac.h b/third_party/dav1d/src/arm/msac.h new file mode 100644 index 0000000000000..a243a06295d91 --- /dev/null +++ b/third_party/dav1d/src/arm/msac.h @@ -0,0 +1,50 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Two 
Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_ARM_MSAC_H +#define DAV1D_SRC_ARM_MSAC_H + +unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s); +unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f); + +#if ARCH_AARCH64 +#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon +#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon +#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon +#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon +#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon +#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon +#endif + +#endif /* DAV1D_SRC_ARM_MSAC_H */ diff --git a/third_party/dav1d/src/cdef_tmpl.c b/third_party/dav1d/src/cdef_tmpl.c index d7fde6abdf28b..7a82d8b5c510b 100644 --- a/third_party/dav1d/src/cdef_tmpl.c +++ b/third_party/dav1d/src/cdef_tmpl.c @@ -254,7 +254,7 @@ static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride, return best_dir; } -void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) { +COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) { c->dir = cdef_find_dir_c; c->fb[0] = cdef_filter_block_8x8_c; c->fb[1] = cdef_filter_block_4x8_c; diff --git a/third_party/dav1d/src/cpu.c b/third_party/dav1d/src/cpu.c index 1641b1e88043e..1cacd7228ea3a 100644 --- a/third_party/dav1d/src/cpu.c +++ b/third_party/dav1d/src/cpu.c @@ -32,7 +32,7 @@ static unsigned flags_mask = -1; -unsigned dav1d_get_cpu_flags(void) { +COLD unsigned dav1d_get_cpu_flags(void) { static unsigned flags; static uint8_t checked = 0; @@ -49,6 +49,6 
@@ unsigned dav1d_get_cpu_flags(void) { return flags & flags_mask; } -void dav1d_set_cpu_flags_mask(const unsigned mask) { +COLD void dav1d_set_cpu_flags_mask(const unsigned mask) { flags_mask = mask; } diff --git a/third_party/dav1d/src/cpu.h b/third_party/dav1d/src/cpu.h index 89a64a0db7162..a256b36902918 100644 --- a/third_party/dav1d/src/cpu.h +++ b/third_party/dav1d/src/cpu.h @@ -30,6 +30,8 @@ #include "config.h" +#include "common/attributes.h" + #include "dav1d/common.h" #if ARCH_AARCH64 || ARCH_ARM diff --git a/third_party/dav1d/src/env.h b/third_party/dav1d/src/env.h index 2d4cc267df7c1..bd8ef089314e9 100644 --- a/third_party/dav1d/src/env.h +++ b/third_party/dav1d/src/env.h @@ -609,25 +609,12 @@ static inline int get_coef_skip_ctx(const TxfmInfo *const t_dim, } } -static inline int get_coef_nz_ctx(uint8_t *const levels, const int scan_idx, - const int rc, const int is_eob, +static inline int get_coef_nz_ctx(uint8_t *const levels, const enum RectTxfmSize tx, - const enum TxClass tx_class) + const enum TxClass tx_class, + const int x, const int y, + const ptrdiff_t stride) { - const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx]; - - if (is_eob) { - if (scan_idx == 0) return 0; - const int eighth_sz = imin(t_dim->w, 8) * imin(t_dim->h, 8) * 2; - if (scan_idx <= eighth_sz) return 1; - const int quart_sz = eighth_sz * 2; - if (scan_idx <= quart_sz) return 2; - return 3; - } - - const int x = rc >> (2 + imin(t_dim->lh, 3)); - const int y = rc & (4 * imin(t_dim->h, 8) - 1); - const ptrdiff_t stride = 4 * (imin(t_dim->h, 8) + 1); static const uint8_t offsets[3][5][2 /* x, y */] = { [TX_CLASS_2D] = { { 0, 1 }, { 1, 0 }, { 2, 0 }, { 0, 2 }, { 1, 1 } @@ -643,8 +630,7 @@ static inline int get_coef_nz_ctx(uint8_t *const levels, const int scan_idx, mag += imin(levels[(x + off[i][0]) * stride + (y + off[i][1])], 3); const int ctx = imin((mag + 1) >> 1, 4); if (tx_class == TX_CLASS_2D) { - return !rc ? 
0 : - dav1d_nz_map_ctx_offset[tx][imin(y, 4)][imin(x, 4)] + ctx; + return dav1d_nz_map_ctx_offset[tx][imin(y, 4)][imin(x, 4)] + ctx; } else { return 26 + imin((tx_class == TX_CLASS_V) ? y : x, 2) * 5 + ctx; } @@ -686,13 +672,10 @@ static inline int get_dc_sign_ctx(const TxfmInfo *const t_dim, } static inline int get_br_ctx(const uint8_t *const levels, - const int rc, const enum RectTxfmSize tx, - const enum TxClass tx_class) + const int ac, const enum TxClass tx_class, + const int x, const int y, + const ptrdiff_t stride) { - const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx]; - const int x = rc >> (imin(t_dim->lh, 3) + 2); - const int y = rc & (4 * imin(t_dim->h, 8) - 1); - const int stride = 4 * (imin(t_dim->h, 8) + 1); int mag = 0; static const uint8_t offsets_from_txclass[3][3][2] = { [TX_CLASS_2D] = { { 0, 1 }, { 1, 0 }, { 1, 1 } }, @@ -704,7 +687,7 @@ static inline int get_br_ctx(const uint8_t *const levels, mag += levels[(x + offsets[i][1]) * stride + y + offsets[i][0]]; mag = imin((mag + 1) >> 1, 6); - if (rc == 0) return mag; + if (!ac) return mag; switch (tx_class) { case TX_CLASS_2D: if (y < 2 && x < 2) return mag + 7; diff --git a/third_party/dav1d/src/internal.h b/third_party/dav1d/src/internal.h index f0fe5efed8996..cabfd8f6191b8 100644 --- a/third_party/dav1d/src/internal.h +++ b/third_party/dav1d/src/internal.h @@ -127,6 +127,7 @@ struct Dav1dContext { int operating_point; unsigned operating_point_idc; int all_layers; + unsigned frame_size_limit; int drain; Dav1dLogger logger; diff --git a/third_party/dav1d/src/intra_edge.c b/third_party/dav1d/src/intra_edge.c index 1dea29544cc84..4cac7cd1e76fb 100644 --- a/third_party/dav1d/src/intra_edge.c +++ b/third_party/dav1d/src/intra_edge.c @@ -112,13 +112,11 @@ static void init_mode_node(EdgeBranch *const nwc, const int top_has_right, const int left_has_bottom) { - int n; - init_edges(&nwc->node, bl, (top_has_right ? ALL_FL(TOP_HAS_RIGHT) : 0) | (left_has_bottom ? 
ALL_FL(LEFT_HAS_BOTTOM) : 0)); if (bl == BL_16X16) { - for (n = 0; n < 4; n++) { + for (int n = 0; n < 4; n++) { EdgeTip *const nt = mem->nt++; nwc->split[n] = &nt->node; init_edges(&nt->node, bl + 1, @@ -128,7 +126,7 @@ static void init_mode_node(EdgeBranch *const nwc, ALL_FL(LEFT_HAS_BOTTOM))); } } else { - for (n = 0; n < 4; n++) { + for (int n = 0; n < 4; n++) { EdgeBranch *const nwc_child = mem->nwc[bl]++; nwc->split[n] = &nwc_child->node; init_mode_node(nwc_child, bl + 1, mem, @@ -143,12 +141,12 @@ void dav1d_init_mode_tree(EdgeNode *const root_node, EdgeTip *const nt, { EdgeBranch *const root = (EdgeBranch *) root_node; struct ModeSelMem mem; + mem.nt = nt; if (allow_sb128) { mem.nwc[BL_128X128] = &root[1]; mem.nwc[BL_64X64] = &root[1 + 4]; mem.nwc[BL_32X32] = &root[1 + 4 + 16]; - mem.nt = nt; init_mode_node(root, BL_128X128, &mem, 1, 0); assert(mem.nwc[BL_128X128] == &root[1 + 4]); assert(mem.nwc[BL_64X64] == &root[1 + 4 + 16]); @@ -158,7 +156,6 @@ void dav1d_init_mode_tree(EdgeNode *const root_node, EdgeTip *const nt, mem.nwc[BL_128X128] = NULL; mem.nwc[BL_64X64] = &root[1]; mem.nwc[BL_32X32] = &root[1 + 4]; - mem.nt = nt; init_mode_node(root, BL_64X64, &mem, 1, 0); assert(mem.nwc[BL_64X64] == &root[1 + 4]); assert(mem.nwc[BL_32X32] == &root[1 + 4 + 16]); diff --git a/third_party/dav1d/src/ipred_prepare_tmpl.c b/third_party/dav1d/src/ipred_prepare_tmpl.c index fb4c74a099ee4..5eaabfce07b68 100644 --- a/third_party/dav1d/src/ipred_prepare_tmpl.c +++ b/third_party/dav1d/src/ipred_prepare_tmpl.c @@ -99,17 +99,12 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left, case VERT_LEFT_PRED: { *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle; - if (*angle < 90) { - mode = have_top ? Z1_PRED : VERT_PRED; - } else if (*angle == 90) { - mode = VERT_PRED; - } else if (*angle < 180) { + if (*angle <= 90) + mode = *angle < 90 && have_top ? 
Z1_PRED : VERT_PRED; + else if (*angle < 180) mode = Z2_PRED; - } else if (*angle == 180) { - mode = HOR_PRED; - } else { - mode = have_left ? Z3_PRED : HOR_PRED; - } + else + mode = *angle > 180 && have_left ? Z3_PRED : HOR_PRED; break; } case DC_PRED: @@ -196,14 +191,14 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left, } if (av1_intra_prediction_edges[mode].needs_topleft) { - if (have_left) { + if (have_left) *topleft_out = have_top ? dst_top[-1] : dst[-1]; - } else { + else *topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1; - } + if (mode == Z2_PRED && tw + th >= 6 && filter_edge) - *topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 + - topleft_out[1] * 5 + 8) >> 4; + *topleft_out = ((topleft_out[-1] + topleft_out[1]) * 5 + + topleft_out[0] * 6 + 8) >> 4; } return mode; diff --git a/third_party/dav1d/src/ipred_tmpl.c b/third_party/dav1d/src/ipred_tmpl.c index a6eb999da3591..09be553b7b7fe 100644 --- a/third_party/dav1d/src/ipred_tmpl.c +++ b/third_party/dav1d/src/ipred_tmpl.c @@ -725,7 +725,7 @@ static void pal_pred_c(pixel *dst, const ptrdiff_t stride, } } -void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) { +COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) { c->intra_pred[DC_PRED ] = ipred_dc_c; c->intra_pred[DC_128_PRED ] = ipred_dc_128_c; c->intra_pred[TOP_DC_PRED ] = ipred_dc_top_c; diff --git a/third_party/dav1d/src/itx_tmpl.c b/third_party/dav1d/src/itx_tmpl.c index 9708d9781cf63..4742785379390 100644 --- a/third_party/dav1d/src/itx_tmpl.c +++ b/third_party/dav1d/src/itx_tmpl.c @@ -193,7 +193,7 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, memset(coeff, 0, sizeof(*coeff) * 4 * 4); } -void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) { +COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) { #define assign_itx_all_fn64(w, h, pfx) \ c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \ inv_txfm_add_dct_dct_##w##x##h##_c 
diff --git a/third_party/dav1d/src/lib.c b/third_party/dav1d/src/lib.c index 414e99bc773e6..518ec63933a78 100644 --- a/third_party/dav1d/src/lib.c +++ b/third_party/dav1d/src/lib.c @@ -46,17 +46,17 @@ #include "src/wedge.h" #include "src/film_grain.h" -static void init_internal(void) { +static COLD void init_internal(void) { dav1d_init_wedge_masks(); dav1d_init_interintra_masks(); dav1d_init_qm_tables(); } -const char *dav1d_version(void) { +COLD const char *dav1d_version(void) { return DAV1D_VERSION; } -void dav1d_default_settings(Dav1dSettings *const s) { +COLD void dav1d_default_settings(Dav1dSettings *const s) { s->n_frame_threads = 1; s->n_tile_threads = 1; s->apply_grain = 1; @@ -67,13 +67,12 @@ void dav1d_default_settings(Dav1dSettings *const s) { s->logger.callback = dav1d_log_default_callback; s->operating_point = 0; s->all_layers = 1; // just until the tests are adjusted + s->frame_size_limit = 0; } static void close_internal(Dav1dContext **const c_out, int flush); -int dav1d_open(Dav1dContext **const c_out, - const Dav1dSettings *const s) -{ +COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { static pthread_once_t initted = PTHREAD_ONCE_INIT; pthread_once(&initted, init_internal); @@ -92,7 +91,7 @@ int dav1d_open(Dav1dContext **const c_out, pthread_attr_t thread_attr; if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM); - pthread_attr_setstacksize(&thread_attr, 512 * 1024); + pthread_attr_setstacksize(&thread_attr, 1024 * 1024); Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32); if (!c) goto error; @@ -103,6 +102,19 @@ int dav1d_open(Dav1dContext **const c_out, c->apply_grain = s->apply_grain; c->operating_point = s->operating_point; c->all_layers = s->all_layers; + c->frame_size_limit = s->frame_size_limit; + + /* On 32-bit systems extremely large frame sizes can cause overflows in + * dav1d_decode_frame() malloc size calculations. 
Prevent that from occurring + * by enforcing a maximum frame size limit, chosen to roughly correspond to + * the largest size possible to decode without exhausting virtual memory. */ + if (sizeof(size_t) < 8 && s->frame_size_limit - 1 >= 8192 * 8192) { + c->frame_size_limit = 8192 * 8192; + if (s->frame_size_limit) + dav1d_log(c, "Frame size limit reduced from %u to %u.\n", + s->frame_size_limit, c->frame_size_limit); + } + c->frame_thread.flush = &c->frame_thread.flush_mem; atomic_init(c->frame_thread.flush, 0); c->n_fc = s->n_frame_threads; @@ -432,12 +444,12 @@ void dav1d_flush(Dav1dContext *const c) { c->frame_thread.next = 0; } -void dav1d_close(Dav1dContext **const c_out) { +COLD void dav1d_close(Dav1dContext **const c_out) { validate_input(c_out != NULL); close_internal(c_out, 1); } -static void close_internal(Dav1dContext **const c_out, int flush) { +static COLD void close_internal(Dav1dContext **const c_out, int flush) { Dav1dContext *const c = *c_out; if (!c) return; diff --git a/third_party/dav1d/src/log.c b/third_party/dav1d/src/log.c index 4eb4e913f28ac..999e3a2e8a007 100644 --- a/third_party/dav1d/src/log.c +++ b/third_party/dav1d/src/log.c @@ -36,14 +36,14 @@ #include "src/internal.h" #include "src/log.h" -void dav1d_log_default_callback(void *const cookie, - const char *const format, va_list ap) +COLD void dav1d_log_default_callback(void *const cookie, + const char *const format, va_list ap) { vfprintf(stderr, format, ap); } #if CONFIG_LOG -void dav1d_log(Dav1dContext *const c, const char *const format, ...) +COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) 
{ validate_input(c != NULL); if (!c->logger.callback) diff --git a/third_party/dav1d/src/loopfilter_tmpl.c b/third_party/dav1d/src/loopfilter_tmpl.c index 1a9d7efd3fc6f..6ea744f37bc5d 100644 --- a/third_party/dav1d/src/loopfilter_tmpl.c +++ b/third_party/dav1d/src/loopfilter_tmpl.c @@ -244,7 +244,7 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride, } } -void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) { +COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) { c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c; c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c; c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c; diff --git a/third_party/dav1d/src/looprestoration_tmpl.c b/third_party/dav1d/src/looprestoration_tmpl.c index 08af2b0f13e76..20ed6702be9a2 100644 --- a/third_party/dav1d/src/looprestoration_tmpl.c +++ b/third_party/dav1d/src/looprestoration_tmpl.c @@ -573,7 +573,7 @@ static void selfguided_c(pixel *p, const ptrdiff_t p_stride, } } -void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) { +COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) { c->wiener = wiener_c; c->selfguided = selfguided_c; diff --git a/third_party/dav1d/src/mc_tmpl.c b/third_party/dav1d/src/mc_tmpl.c index b1fe67ac3ef20..9744f24dff921 100644 --- a/third_party/dav1d/src/mc_tmpl.c +++ b/third_party/dav1d/src/mc_tmpl.c @@ -635,10 +635,8 @@ static void mask_c(pixel *dst, const ptrdiff_t dst_stride, } #define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6) -static NOINLINE void -blend_internal_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, - const int w, int h, const uint8_t *mask, - const ptrdiff_t mask_stride) +static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, + const int w, int h, const uint8_t *mask) { do { for (int x = 0; x < w; x++) { @@ -646,26 +644,28 @@ blend_internal_c(pixel *dst, const ptrdiff_t 
dst_stride, const pixel *tmp, } dst += PXSTRIDE(dst_stride); tmp += w; - mask += mask_stride; + mask += w; } while (--h); } -static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, - const int w, const int h, const uint8_t *mask) -{ - blend_internal_c(dst, dst_stride, tmp, w, h, mask, w); -} - static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, - const int w, const int h) + const int w, int h) { - blend_internal_c(dst, dst_stride, tmp, w, h, &dav1d_obmc_masks[w], 0); + const uint8_t *const mask = &dav1d_obmc_masks[w]; + do { + for (int x = 0; x < (w * 3) >> 2; x++) { + dst[x] = blend_px(dst[x], tmp[x], mask[x]); + } + dst += PXSTRIDE(dst_stride); + tmp += w; + } while (--h); } static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, const int w, int h) { const uint8_t *mask = &dav1d_obmc_masks[h]; + h = (h * 3) >> 2; do { const int m = *mask++; for (int x = 0; x < w; x++) { @@ -912,7 +912,7 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride, } while (--h); } -void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) { +COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) { #define init_mc_fns(type, name) do { \ c->mc [type] = put_##name##_c; \ c->mc_scaled [type] = put_##name##_scaled_c; \ diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build index 2487197247aa7..a2baedbd85720 100644 --- a/third_party/dav1d/src/meson.build +++ b/third_party/dav1d/src/meson.build @@ -28,43 +28,43 @@ # libdav1d source files libdav1d_sources = files( - 'picture.c', + 'cdf.c', 'cpu.c', 'data.c', - 'ref.c', - 'log.c', - 'getbits.c', - 'obu.c', 'decode.c', - 'cdf.c', - 'msac.c', - 'tables.c', - 'scan.c', 'dequant_tables.c', + 'getbits.c', 'intra_edge.c', 'lf_mask.c', + 'log.c', + 'msac.c', + 'obu.c', + 'picture.c', + 'qm.c', + 'ref.c', 'ref_mvs.c', + 'scan.c', + 'tables.c', 'warpmv.c', 'wedge.c', - 'qm.c', ) # libdav1d bitdepth source files # These files are 
compiled for each bitdepth with # `BITDEPTH` defined to the currently built bitdepth. libdav1d_tmpl_sources = files( + 'cdef_apply_tmpl.c', + 'cdef_tmpl.c', + 'film_grain_tmpl.c', 'ipred_prepare_tmpl.c', 'ipred_tmpl.c', 'itx_tmpl.c', 'lf_apply_tmpl.c', 'loopfilter_tmpl.c', - 'mc_tmpl.c', - 'cdef_apply_tmpl.c', - 'cdef_tmpl.c', - 'lr_apply_tmpl.c', 'looprestoration_tmpl.c', + 'lr_apply_tmpl.c', + 'mc_tmpl.c', 'recon_tmpl.c', - 'film_grain_tmpl.c', ) # libdav1d entrypoint source files @@ -128,15 +128,15 @@ if is_asm_enabled if dav1d_bitdepths.contains('8') libdav1d_sources_asm += files( 'x86/cdef.asm', - 'x86/cdef_sse.asm', 'x86/ipred.asm', - 'x86/ipred_ssse3.asm', 'x86/itx.asm', - 'x86/itx_ssse3.asm', 'x86/loopfilter.asm', 'x86/looprestoration.asm', - 'x86/looprestoration_ssse3.asm', 'x86/mc.asm', + 'x86/cdef_sse.asm', + 'x86/ipred_ssse3.asm', + 'x86/itx_ssse3.asm', + 'x86/looprestoration_ssse3.asm', 'x86/mc_ssse3.asm', ) endif diff --git a/third_party/dav1d/src/msac.c b/third_party/dav1d/src/msac.c index 31e40047e4a28..91459b300e3b4 100644 --- a/third_party/dav1d/src/msac.c +++ b/third_party/dav1d/src/msac.c @@ -27,7 +27,6 @@ #include "config.h" -#include #include #include "common/intops.h" @@ -68,7 +67,7 @@ static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) { ctx_refill(s); } -unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) { +unsigned dav1d_msac_decode_bool_equi_c(MsacContext *const s) { ec_win vw, dif = s->dif; unsigned ret, v, r = s->rng; assert((dif >> (EC_WIN_SIZE - 16)) < r); @@ -86,7 +85,7 @@ unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) { /* Decode a single binary value. * f: The probability that the bit is one * Return: The value decoded (0 or 1). 
*/ -unsigned dav1d_msac_decode_bool(MsacContext *const s, const unsigned f) { +unsigned dav1d_msac_decode_bool_c(MsacContext *const s, const unsigned f) { ec_win vw, dif = s->dif; unsigned ret, v, r = s->rng; assert((dif >> (EC_WIN_SIZE - 16)) < r); @@ -99,13 +98,6 @@ unsigned dav1d_msac_decode_bool(MsacContext *const s, const unsigned f) { return !ret; } -unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) { - unsigned v = 0; - while (n--) - v = (v << 1) | dav1d_msac_decode_bool_equi(s); - return v; -} - int dav1d_msac_decode_subexp(MsacContext *const s, const int ref, const int n, const unsigned k) { @@ -122,15 +114,6 @@ int dav1d_msac_decode_subexp(MsacContext *const s, const int ref, n - 1 - inv_recenter(n - 1 - ref, v); } -int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) { - assert(n > 0); - const int l = ulog2(n) + 1; - assert(l > 1); - const unsigned m = (1 << l) - n; - const unsigned v = dav1d_msac_decode_bools(s, l - 1); - return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s); -} - /* Decodes a symbol given an inverse cumulative distribution function (CDF) * table in Q15. 
*/ static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf, @@ -172,8 +155,8 @@ unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s, return val; } -unsigned dav1d_msac_decode_bool_adapt(MsacContext *const s, - uint16_t *const cdf) +unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s, + uint16_t *const cdf) { const unsigned bit = dav1d_msac_decode_bool(s, *cdf); @@ -181,11 +164,10 @@ unsigned dav1d_msac_decode_bool_adapt(MsacContext *const s, // update_cdf() specialized for boolean CDFs const unsigned count = cdf[1]; const int rate = (count >> 4) | 4; - if (bit) { + if (bit) cdf[0] += (32768 - cdf[0]) >> rate; - } else { + else cdf[0] -= cdf[0] >> rate; - } cdf[1] = count + (count < 32); } diff --git a/third_party/dav1d/src/msac.h b/third_party/dav1d/src/msac.h index cd04c30847413..779f6236332f1 100644 --- a/third_party/dav1d/src/msac.h +++ b/third_party/dav1d/src/msac.h @@ -28,11 +28,11 @@ #ifndef DAV1D_SRC_MSAC_H #define DAV1D_SRC_MSAC_H +#include #include #include -/* Using uint32_t should be faster on 32 bit systems, in theory, maybe */ -typedef uint64_t ec_win; +typedef size_t ec_win; typedef struct MsacContext { const uint8_t *buf_pos; @@ -43,42 +43,57 @@ typedef struct MsacContext { int allow_update_cdf; } MsacContext; +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/msac.h" +#elif ARCH_X86 +#include "src/x86/msac.h" +#endif +#endif + void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz, int disable_cdf_update_flag); unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf, size_t n_symbols); -unsigned dav1d_msac_decode_bool_equi(MsacContext *s); -unsigned dav1d_msac_decode_bool(MsacContext *s, unsigned f); -unsigned dav1d_msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf); -unsigned dav1d_msac_decode_bools(MsacContext *s, unsigned n); +unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_equi_c(MsacContext *s); 
+unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f); int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k); -int dav1d_msac_decode_uniform(MsacContext *s, unsigned n); /* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */ -#if ARCH_AARCH64 && HAVE_ASM -unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, - size_t n_symbols); -unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf, - size_t n_symbols); -unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf, - size_t n_symbols); -#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon -#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon -#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon -#elif ARCH_X86_64 && HAVE_ASM -unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf, - size_t n_symbols); -unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf, - size_t n_symbols); -unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf, - size_t n_symbols); -#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2 -#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2 -#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2 -#else +#ifndef dav1d_msac_decode_symbol_adapt4 #define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c +#endif +#ifndef dav1d_msac_decode_symbol_adapt8 #define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt_c +#endif +#ifndef dav1d_msac_decode_symbol_adapt16 #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c #endif +#ifndef dav1d_msac_decode_bool_adapt +#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_c +#endif +#ifndef dav1d_msac_decode_bool_equi +#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_c 
+#endif +#ifndef dav1d_msac_decode_bool +#define dav1d_msac_decode_bool dav1d_msac_decode_bool_c +#endif + +static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) { + unsigned v = 0; + while (n--) + v = (v << 1) | dav1d_msac_decode_bool_equi(s); + return v; +} + +static inline int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) { + assert(n > 0); + const int l = ulog2(n) + 1; + assert(l > 1); + const unsigned m = (1 << l) - n; + const unsigned v = dav1d_msac_decode_bools(s, l - 1); + return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s); +} #endif /* DAV1D_SRC_MSAC_H */ diff --git a/third_party/dav1d/src/obu.c b/third_party/dav1d/src/obu.c index cf4e583d7b046..147887bbd5a66 100644 --- a/third_party/dav1d/src/obu.c +++ b/third_party/dav1d/src/obu.c @@ -1306,6 +1306,15 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { goto error; } + if (c->frame_size_limit && (int64_t)c->frame_hdr->width[1] * + c->frame_hdr->height > c->frame_size_limit) + { + dav1d_log(c, "Frame size %dx%d exceeds limit %u\n", c->frame_hdr->width[1], + c->frame_hdr->height, c->frame_size_limit); + c->frame_hdr = NULL; + return DAV1D_ERR(ERANGE); + } + // This is the frame header at the start of a frame OBU. // There's no trailing bit at the end to skip, but we do need // to align to the next byte. 
@@ -1418,6 +1427,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { default: // print a warning but don't fail for unknown types dav1d_log(c, "Unknown Metadata OBU type %d\n", meta_type); + break; } break; @@ -1427,8 +1437,9 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { // ignore OBUs we don't care about break; default: + // print a warning but don't fail for unknown types dav1d_log(c, "Unknown OBU type %d of size %u\n", type, len); - return DAV1D_ERR(EINVAL); + break; } if (c->seq_hdr && c->frame_hdr) { diff --git a/third_party/dav1d/src/qm.c b/third_party/dav1d/src/qm.c index 42c09b51c7887..8d9a0f954ab41 100644 --- a/third_party/dav1d/src/qm.c +++ b/third_party/dav1d/src/qm.c @@ -29,6 +29,8 @@ #include +#include "common/attributes.h" + #include "src/qm.h" static const uint8_t qm_tbl_4x4_t[][2][10] = { @@ -3104,7 +3106,7 @@ static void untriangle(uint8_t *dst, const uint8_t *src, const int sz) { } } -void dav1d_init_qm_tables(void) { +COLD void dav1d_init_qm_tables(void) { // This function is guaranteed to be called only once for (int i = 0; i < 15; i++) diff --git a/third_party/dav1d/src/recon_tmpl.c b/third_party/dav1d/src/recon_tmpl.c index 17b84067742db..751ffabe32160 100644 --- a/third_party/dav1d/src/recon_tmpl.c +++ b/third_party/dav1d/src/recon_tmpl.c @@ -69,19 +69,19 @@ static int decode_coefs(Dav1dTileContext *const t, const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx]; const int dbg = DEBUG_BLOCK_INFO && plane && 0; - if (dbg) printf("Start: r=%d\n", ts->msac.rng); + if (dbg) + printf("Start: r=%d\n", ts->msac.rng); // does this block have any non-zero coefficients const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout); const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.skip[t_dim->ctx][sctx]); if (dbg) - printf("Post-non-zero[%d][%d][%d]: r=%d\n", - t_dim->ctx, sctx, all_skip, ts->msac.rng); + printf("Post-non-zero[%d][%d][%d]: 
r=%d\n", + t_dim->ctx, sctx, all_skip, ts->msac.rng); if (all_skip) { *res_ctx = 0x40; - *txtp = f->frame_hdr->segmentation.lossless[b->seg_id] ? WHT_WHT : - DCT_DCT; + *txtp = f->frame_hdr->segmentation.lossless[b->seg_id] ? WHT_WHT : DCT_DCT; return -1; } @@ -111,9 +111,9 @@ static int decode_coefs(Dav1dTileContext *const t, dav1d_msac_decode_symbol_adapt16)(&ts->msac, txtp_cdf, set_cnt); if (dbg) - printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n", - set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1, - idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng); + printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n", + set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1, + idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng); } *txtp = dav1d_tx_types_per_set[set][idx]; } @@ -140,26 +140,20 @@ static int decode_coefs(Dav1dTileContext *const t, #undef case_sz } if (dbg) - printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n", - 16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng); + printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n", + 16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng); int eob; if (eob_bin > 1) { - eob = 1 << (eob_bin - 1); uint16_t *const eob_hi_bit_cdf = ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin]; - const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, - eob_hi_bit_cdf); + const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf); if (dbg) - printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n", - t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng); - unsigned mask = eob >> 1; - if (eob_hi_bit) eob |= mask; - for (mask >>= 1; mask; mask >>= 1) { - const int eob_bit = dav1d_msac_decode_bool_equi(&ts->msac); - if (eob_bit) eob |= mask; - } + printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n", + t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng); + eob = ((eob_hi_bit | 2) << (eob_bin - 2)) | + dav1d_msac_decode_bools(&ts->msac, eob_bin - 2); if (dbg) - printf("Post-eob[%d]: r=%d\n", eob, 
ts->msac.rng); + printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng); } else { eob = eob_bin; } @@ -168,98 +162,180 @@ static int decode_coefs(Dav1dTileContext *const t, uint16_t (*const br_cdf)[5] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma]; const int16_t *const scan = dav1d_scans[tx][tx_class]; - uint8_t levels[36 * 36]; - ptrdiff_t stride = 4 * (imin(t_dim->h, 8) + 1); - memset(levels, 0, stride * 4 * (imin(t_dim->w, 8) + 1)); - const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * imin(t_dim->h, 8) - 1; - unsigned cul_level = 0; - for (int i = eob, is_last = 1; i >= 0; i--, is_last = 0) { - const int rc = scan[i], x = rc >> shift, y = rc & mask; - - // lo tok - const int ctx = get_coef_nz_ctx(levels, i, rc, is_last, tx, tx_class); - uint16_t *const lo_cdf = is_last ? - ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] : - ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx]; - int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, - 4 - is_last) + is_last; + int dc_tok; + + if (eob) { + ALIGN_STK_16(uint8_t, levels, 36 * 36,); + const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8); + const ptrdiff_t stride = 4 * (sh + 1); + memset(levels, 0, stride * 4 * (sw + 1)); + const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1; + + { // eob + const int rc = scan[eob], x = rc >> shift, y = rc & mask; + + const int ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4); + uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx]; + int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3) + 1; + if (dbg) + printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", + t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); + + if (tok == 3) { + const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride); + do { + const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, + br_cdf[br_ctx], 4); + if (dbg) + printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n", + imin(t_dim->ctx, 3), chroma, br_ctx, + eob, rc, tok_br, tok, ts->msac.rng); 
+ tok += tok_br; + if (tok_br < 3) break; + } while (tok < 15); + } + + cf[rc] = tok; + levels[x * stride + y] = (uint8_t) tok; + } + for (int i = eob - 1; i > 0; i--) { // ac + const int rc = scan[i], x = rc >> shift, y = rc & mask; + + // lo tok + const int ctx = get_coef_nz_ctx(levels, tx, tx_class, x, y, stride); + uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx]; + int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 4); + if (dbg) + printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", + t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); + + // hi tok + if (tok == 3) { + const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride); + do { + const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, + br_cdf[br_ctx], 4); + if (dbg) + printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n", + imin(t_dim->ctx, 3), chroma, br_ctx, + i, rc, tok_br, tok, ts->msac.rng); + tok += tok_br; + if (tok_br < 3) break; + } while (tok < 15); + } + + cf[rc] = tok; + levels[x * stride + y] = (uint8_t) tok; + } + { // dc + int ctx = 0; + if (tx_class != TX_CLASS_2D) + ctx = get_coef_nz_ctx(levels, tx, tx_class, 0, 0, stride); + uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx]; + dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 4); + if (dbg) + printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", + t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); + + if (dc_tok == 3) { + const int br_ctx = get_br_ctx(levels, 0, tx_class, 0, 0, stride); + do { + const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, + br_cdf[br_ctx], 4); + if (dbg) + printf("Post-dc_hi_tok[%d][%d][%d][%d->%d]: r=%d\n", + imin(t_dim->ctx, 3), chroma, br_ctx, + tok_br, dc_tok, ts->msac.rng); + dc_tok += tok_br; + if (tok_br < 3) break; + } while (dc_tok < 15); + } + } + } else { // dc-only + uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][0]; + dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3) + 1; if (dbg) - 
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", - t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); - if (!tok) continue; + printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", + t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng); - // hi tok - if (tok == 3) { - const int br_ctx = get_br_ctx(levels, rc, tx, tx_class); + if (dc_tok == 3) { do { const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, - br_cdf[br_ctx], 4); + br_cdf[0], 4); if (dbg) - printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n", - imin(t_dim->ctx, 3), chroma, br_ctx, - i, rc, tok_br, tok, ts->msac.rng); - tok += tok_br; + printf("Post-dc_hi_tok[%d][%d][%d][%d->%d]: r=%d\n", + imin(t_dim->ctx, 3), chroma, 0, + tok_br, dc_tok, ts->msac.rng); + dc_tok += tok_br; if (tok_br < 3) break; - } while (tok < 15); + } while (dc_tok < 15); } - - cf[rc] = tok; - levels[x * stride + y] = (uint8_t) cf[rc]; } // residual and sign - int dc_sign = 1; + int dc_sign = 1 << 6; const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id]; const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane]; const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane]; const int dq_shift = imax(0, t_dim->ctx - 2); const int bitdepth = BITDEPTH == 8 ? 
8 : f->cur.p.bpc; - const int cf_min = -(1 << (7 + bitdepth)); const int cf_max = (1 << (7 + bitdepth)) - 1; - for (int i = 0; i <= eob; i++) { + unsigned cul_level = 0; + + if (dc_tok) { // dc + const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l); + uint16_t *const dc_sign_cdf = + ts->cdf.coef.dc_sign[chroma][dc_sign_ctx]; + const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); + const unsigned dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5; + if (dbg) + printf("Post-dc_sign[%d][%d][%d]: r=%d\n", + chroma, dc_sign_ctx, sign, ts->msac.rng); + dc_sign = (sign - 1) & (2 << 6); + + if (dc_tok == 15) { + dc_tok += read_golomb(&ts->msac); + if (dbg) + printf("Post-dc_residual[%d->%d]: r=%d\n", + dc_tok - 15, dc_tok, ts->msac.rng); + + dc_tok &= 0xfffff; + } + + cul_level += dc_tok; + dc_tok = ((dq * dc_tok) & 0xffffff) >> dq_shift; + cf[0] = imin(dc_tok - sign, cf_max) ^ -sign; + } + for (int i = 1; i <= eob; i++) { // ac const int rc = scan[i]; int tok = cf[rc]; if (!tok) continue; - int dq; // sign - int sign; - if (i == 0) { - const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l); - uint16_t *const dc_sign_cdf = - ts->cdf.coef.dc_sign[chroma][dc_sign_ctx]; - sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); - if (dbg) - printf("Post-dc_sign[%d][%d][%d]: r=%d\n", - chroma, dc_sign_ctx, sign, ts->msac.rng); - dc_sign = sign ? 
0 : 2; - dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5; - } else { - sign = dav1d_msac_decode_bool_equi(&ts->msac); - if (dbg) + const int sign = dav1d_msac_decode_bool_equi(&ts->msac); + const unsigned dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5; + if (dbg) printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng); - dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5; - } // residual if (tok == 15) { tok += read_golomb(&ts->msac); if (dbg) - printf("Post-residual[%d=%d=%d->%d]: r=%d\n", - i, rc, tok - 15, tok, ts->msac.rng); - } + printf("Post-residual[%d=%d=%d->%d]: r=%d\n", + i, rc, tok - 15, tok, ts->msac.rng); - // coefficient parsing, see 5.11.39 - tok &= 0xfffff; + // coefficient parsing, see 5.11.39 + tok &= 0xfffff; + } // dequant, see 7.12.3 cul_level += tok; - tok = (((int64_t)dq * tok) & 0xffffff) >> dq_shift; - cf[rc] = iclip(sign ? -tok : tok, cf_min, cf_max); + tok = ((dq * tok) & 0xffffff) >> dq_shift; + cf[rc] = imin(tok - sign, cf_max) ^ -sign; } // context - *res_ctx = imin(cul_level, 63) | (dc_sign << 6); + *res_ctx = imin(cul_level, 63) | dc_sign; return eob; } @@ -644,7 +720,7 @@ static int obmc(Dav1dTileContext *const t, if (a_r->ref[0] > 0) { const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]); const int oh4 = imin(b_dim[1], 16) >> 1; - res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, oh4, + res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2, t->bx + x, t->by, pl, a_r->mv[0], &f->refp[a_r->ref[0] - 1], a_r->ref[0] - 1, dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]); diff --git a/third_party/dav1d/src/tables.c b/third_party/dav1d/src/tables.c index 4117a2400602b..31d288c5af8b9 100644 --- a/third_party/dav1d/src/tables.c +++ b/third_party/dav1d/src/tables.c @@ -861,7 +861,7 @@ const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = { } }; -const uint8_t ALIGN(dav1d_obmc_masks[64], 32) = { +const uint8_t dav1d_obmc_masks[64] = { /* Unused */ 0, 0, /* 2 */ diff --git a/third_party/dav1d/src/wedge.c 
b/third_party/dav1d/src/wedge.c index 533d7b726b2d1..2c292836e06c4 100644 --- a/third_party/dav1d/src/wedge.c +++ b/third_party/dav1d/src/wedge.c @@ -155,7 +155,7 @@ static void copy2d(uint8_t *dst, const uint8_t *src, } } -static void init_chroma(uint8_t *chroma, const uint8_t *luma, +static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma, const int sign, const int w, const int h, const int ss_ver) { for (int y = 0; y < h; y += 1 + ss_ver) { @@ -169,12 +169,12 @@ static void init_chroma(uint8_t *chroma, const uint8_t *luma, } } -static void fill2d_16x2(uint8_t *dst, const int w, const int h, - const enum BlockSize bs, - const uint8_t (*const master)[64 * 64], - const wedge_code_type *const cb, - uint8_t *masks_444, uint8_t *masks_422, - uint8_t *masks_420, const unsigned signs) +static COLD void fill2d_16x2(uint8_t *dst, const int w, const int h, + const enum BlockSize bs, + const uint8_t (*const master)[64 * 64], + const wedge_code_type *const cb, + uint8_t *masks_444, uint8_t *masks_422, + uint8_t *masks_420, const unsigned signs) { uint8_t *ptr = dst; for (int n = 0; n < 16; n++) { @@ -222,7 +222,7 @@ static void fill2d_16x2(uint8_t *dst, const int w, const int h, } } -void dav1d_init_wedge_masks(void) { +COLD void dav1d_init_wedge_masks(void) { // This function is guaranteed to be called only once enum WedgeMasterLineType { @@ -304,10 +304,10 @@ const uint8_t *dav1d_ii_masks[N_BS_SIZES][3][N_INTER_INTRA_PRED_MODES] = { #undef set #undef set1 -static void build_nondc_ii_masks(uint8_t *const mask_v, - uint8_t *const mask_h, - uint8_t *const mask_sm, - const int w, const int h, const int step) +static COLD void build_nondc_ii_masks(uint8_t *const mask_v, + uint8_t *const mask_h, + uint8_t *const mask_sm, + const int w, const int h, const int step) { static const uint8_t ii_weights_1d[] = { 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, @@ -323,7 +323,7 @@ static void build_nondc_ii_masks(uint8_t *const mask_v, } } -void 
dav1d_init_interintra_masks(void) { +COLD void dav1d_init_interintra_masks(void) { // This function is guaranteed to be called only once memset(ii_dc_mask, 32, 32 * 32); diff --git a/third_party/dav1d/src/win32/thread.c b/third_party/dav1d/src/win32/thread.c index 2c2a578df4553..c579ba442fe02 100644 --- a/third_party/dav1d/src/win32/thread.c +++ b/third_party/dav1d/src/win32/thread.c @@ -33,17 +33,19 @@ #include #include +#include "common/attributes.h" + #include "src/thread.h" -static unsigned __stdcall thread_entrypoint(void *const data) { +static COLD unsigned __stdcall thread_entrypoint(void *const data) { pthread_t *const t = data; t->arg = t->func(t->arg); return 0; } -int dav1d_pthread_create(pthread_t *const thread, - const pthread_attr_t *const attr, - void *(*const func)(void*), void *const arg) +COLD int dav1d_pthread_create(pthread_t *const thread, + const pthread_attr_t *const attr, + void *(*const func)(void*), void *const arg) { const unsigned stack_size = attr ? attr->stack_size : 0; thread->func = func; @@ -53,7 +55,7 @@ int dav1d_pthread_create(pthread_t *const thread, return !thread->h; } -int dav1d_pthread_join(pthread_t *const thread, void **const res) { +COLD int dav1d_pthread_join(pthread_t *const thread, void **const res) { if (WaitForSingleObject(thread->h, INFINITE)) return 1; @@ -63,8 +65,8 @@ int dav1d_pthread_join(pthread_t *const thread, void **const res) { return !CloseHandle(thread->h); } -int dav1d_pthread_once(pthread_once_t *const once_control, - void (*const init_routine)(void)) +COLD int dav1d_pthread_once(pthread_once_t *const once_control, + void (*const init_routine)(void)) { BOOL pending = FALSE; diff --git a/third_party/dav1d/src/x86/cdef_init_tmpl.c b/third_party/dav1d/src/x86/cdef_init_tmpl.c index 99521aa25e97f..44f43150723a6 100644 --- a/third_party/dav1d/src/x86/cdef_init_tmpl.c +++ b/third_party/dav1d/src/x86/cdef_init_tmpl.c @@ -44,7 +44,7 @@ decl_cdef_dir_fn(dav1d_cdef_dir_avx2); 
decl_cdef_dir_fn(dav1d_cdef_dir_sse4); decl_cdef_dir_fn(dav1d_cdef_dir_ssse3); -void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) { +COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; diff --git a/third_party/dav1d/src/x86/cpu.c b/third_party/dav1d/src/x86/cpu.c index 82c403c47bafd..476e535c0aca0 100644 --- a/third_party/dav1d/src/x86/cpu.c +++ b/third_party/dav1d/src/x86/cpu.c @@ -29,12 +29,14 @@ #include +#include "common/attributes.h" + #include "src/x86/cpu.h" void dav1d_cpu_cpuid(uint32_t *info, int leaf); uint64_t dav1d_cpu_xgetbv(int xcr); -unsigned dav1d_get_cpu_flags_x86(void) { +COLD unsigned dav1d_get_cpu_flags_x86(void) { uint32_t info[4] = {0}, n_ids; unsigned flags = 0; diff --git a/third_party/dav1d/src/x86/ipred_init_tmpl.c b/third_party/dav1d/src/x86/ipred_init_tmpl.c index 427f0fa29c91b..677dfdf548151 100644 --- a/third_party/dav1d/src/x86/ipred_init_tmpl.c +++ b/third_party/dav1d/src/x86/ipred_init_tmpl.c @@ -75,7 +75,7 @@ decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_ssse3); decl_pal_pred_fn(dav1d_pal_pred_ssse3); -void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) { +COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; diff --git a/third_party/dav1d/src/x86/itx_init_tmpl.c b/third_party/dav1d/src/x86/itx_init_tmpl.c index c4987ae8aaa84..7d0c58c8e8090 100644 --- a/third_party/dav1d/src/x86/itx_init_tmpl.c +++ b/third_party/dav1d/src/x86/itx_init_tmpl.c @@ -98,7 +98,7 @@ decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3); decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3); decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3); -void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { +COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { 
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ dav1d_inv_txfm_add_##type##_##w##x##h##_##ext diff --git a/third_party/dav1d/src/x86/loopfilter_init_tmpl.c b/third_party/dav1d/src/x86/loopfilter_init_tmpl.c index 0da2f2b1f7070..7a2efd67abb9d 100644 --- a/third_party/dav1d/src/x86/loopfilter_init_tmpl.c +++ b/third_party/dav1d/src/x86/loopfilter_init_tmpl.c @@ -33,7 +33,7 @@ decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_avx2); decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_avx2); decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_avx2); -void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) { +COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; diff --git a/third_party/dav1d/src/x86/looprestoration_init_tmpl.c b/third_party/dav1d/src/x86/looprestoration_init_tmpl.c index 9068008f347de..2c0f04df9a780 100644 --- a/third_party/dav1d/src/x86/looprestoration_init_tmpl.c +++ b/third_party/dav1d/src/x86/looprestoration_init_tmpl.c @@ -28,7 +28,6 @@ #include "src/cpu.h" #include "src/looprestoration.h" -#include "common/attributes.h" #include "common/intops.h" #include "src/tables.h" @@ -211,7 +210,7 @@ DEF_LR_FILTERS(avx2) # endif #endif -void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) { +COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; diff --git a/third_party/dav1d/src/x86/mc.asm b/third_party/dav1d/src/x86/mc.asm index 26130eeac9930..1027e02681d7c 100644 --- a/third_party/dav1d/src/x86/mc.asm +++ b/third_party/dav1d/src/x86/mc.asm @@ -2639,55 +2639,47 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .hv_w8_loop0 RET -%macro WARP_V 5 ; dst, 01, 23, 45, 67 +%macro WARP_V 5 ; dst, 02, 46, 
13, 57 ; Can be done using gathers, but that's terribly slow on many CPU:s - lea tmp1d, [myq+deltaq*1] - lea tmp2d, [myq+deltaq*2] + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm8, [filterq+myq *8] - movq xm10, [filterq+tmp1q*8] - lea tmp1d, [tmp2q+deltaq*1] - lea myd, [tmp2q+deltaq*2] + vinserti128 m8, [filterq+tmp1q*8], 1 ; a e + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+deltaq*1] shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] - movq xm9, [filterq+tmp1q*8] - lea tmp1d, [myq+deltaq*1] - lea tmp2d, [myq+deltaq*2] + vinserti128 m0, [filterq+tmp1q*8], 1 ; b f + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 - vinserti128 m8, [filterq+myq *8], 1 ; a e - vinserti128 m10, [filterq+tmp1q*8], 1 ; b f - lea tmp1d, [tmp2q+deltaq*1] + movq xm9, [filterq+myq *8] + vinserti128 m9, [filterq+tmp1q*8], 1 ; c g + lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+gammaq] ; my += gamma shr tmp2d, 10 shr tmp1d, 10 - punpcklbw m8, m10 - vpbroadcastq m10, [filterq+tmp2q*8] ; c g - vpblendd m0, m10, 0x30 - vpbroadcastq m10, [filterq+tmp1q*8] ; d h - vpblendd m9, m10, 0x30 - punpcklbw m0, m9 - punpcklwd m9, m8, m0 - punpckhwd m8, m0 - pxor m10, m10 - punpcklbw m0, m9, m8 - punpckhbw m9, m8 - punpcklbw m8, m10, m0 ; a0 a4 b0 b4 c0 c4 d0 d4 << 8 - punpckhbw m0, m10, m0 ; a1 a5 b1 b5 c1 c5 d1 d5 << 8 + punpcklwd m8, m0 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; d h + punpcklwd m0, m9, m0 + punpckldq m9, m8, m0 + punpckhdq m0, m8, m0 + punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 + punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 pmaddwd m%2, m8 - pmaddwd m0, m%3 - punpcklbw m8, m10, m9 ; a2 a6 b2 b6 c2 c6 d2 d6 << 8 - punpckhbw m9, m10, m9 ; a3 a7 b3 b7 c3 c7 d3 d7 << 8 + pmaddwd m9, m%3 + punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 + punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 pmaddwd m8, m%4 - pmaddwd m9, m%5 - paddd m0, m%2 - mova m%2, m%3 + pmaddwd 
m0, m%5 + paddd m%2, m9 paddd m0, m8 - mova m%3, m%4 - paddd m%1, m0, m9 - mova m%4, m%5 + paddd m%1, m0, m%2 %endmacro cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts @@ -2696,13 +2688,13 @@ cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts %endif call mangle(private_prefix %+ _warp_affine_8x8_avx2).main .loop: - psrad m11, 13 + psrad m7, 13 psrad m0, 13 - packssdw m11, m0 - pmulhrsw m11, m14 ; (x + (1 << 6)) >> 7 - vpermq m0, m11, q3120 - mova [tmpq+tsq*0], xm0 - vextracti128 [tmpq+tsq*2], m0, 1 + packssdw m7, m0 + pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7 + vpermq m7, m7, q3120 + mova [tmpq+tsq*0], xm7 + vextracti128 [tmpq+tsq*2], m7, 1 dec r4d jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2 @@ -2723,15 +2715,15 @@ cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ call .main2 lea dstq, [dstq+dsq*2] .start: - psrad m11, 17 - psrad m0, 17 - packssdw m11, m0 - pmulhrsw m11, m14 ; (x + (1 << 10)) >> 11 - vextracti128 xm0, m11, 1 - packuswb xm11, xm0 - pshufd xm0, xm11, q3120 - movq [dstq+dsq*0], xm0 - movhps [dstq+dsq*1], xm0 + psrad m7, 18 + psrad m0, 18 + packusdw m7, m0 + pavgw m7, m11 ; (x + (1 << 10)) >> 11 + vextracti128 xm0, m7, 1 + packuswb xm7, xm0 + pshufd xm7, xm7, q3120 + movq [dstq+dsq*0], xm7 + movhps [dstq+dsq*1], xm7 dec r4d jg .loop .end: @@ -2759,83 +2751,82 @@ ALIGN function_align mova m13, [warp_8x8_shufB] vpbroadcastd m14, [pw_8192] vpbroadcastd m15, [pd_32768] + pxor m11, m11 lea filterq, [mc_warp_filter] lea tmp1q, [ssq*3+3] add mxd, 512+(64<<10) lea tmp2d, [alphaq*3] - add tmp2d, tmp2d sub srcq, tmp1q ; src -= src_stride*3 + 3 - sub betad, tmp2d ; beta -= alpha*6 + sub betad, tmp2d ; beta -= alpha*3 mov myd, r7m call .h psrld m1, m0, 16 call .h - pblendw m1, m0, 0xaa ; 01 - psrld m2, m0, 16 + psrld m4, m0, 16 call .h - pblendw m2, m0, 0xaa ; 12 - psrld m3, m0, 16 + pblendw m1, m0, 0xaa ; 02 call .h - pblendw m3, m0, 0xaa ; 23 - psrld m4, m0, 16 + pblendw m4, m0, 
0xaa ; 13 call .h - pblendw m4, m0, 0xaa ; 34 - psrld m5, m0, 16 + psrld m2, m1, 16 + pblendw m2, m0, 0xaa ; 24 call .h - pblendw m5, m0, 0xaa ; 45 - psrld m6, m0, 16 + psrld m5, m4, 16 + pblendw m5, m0, 0xaa ; 35 call .h - pblendw m6, m0, 0xaa ; 56 + psrld m3, m2, 16 + pblendw m3, m0, 0xaa ; 46 movsx deltad, word [abcdq+2*2] movsx gammad, word [abcdq+2*3] add myd, 512+(64<<10) mov r4d, 4 lea tmp1d, [deltaq*3] - add tmp1d, tmp1d - sub gammad, tmp1d ; gamma -= delta*6 + sub gammad, tmp1d ; gamma -= delta*3 .main2: call .h - psrld m7, m6, 16 - pblendw m7, m0, 0xaa ; 67 - WARP_V 11, 1, 3, 5, 7 + psrld m6, m5, 16 + pblendw m6, m0, 0xaa ; 57 + WARP_V 7, 1, 3, 4, 6 call .h - psrld m7, 16 - pblendw m7, m0, 0xaa ; 78 - WARP_V 0, 2, 4, 6, 7 + mova m1, m2 + mova m2, m3 + psrld m3, 16 + pblendw m3, m0, 0xaa ; 68 + WARP_V 0, 4, 6, 1, 3 + mova m4, m5 + mova m5, m6 ret ALIGN function_align .h: - lea tmp1d, [mxq+alphaq*1] - lea tmp2d, [mxq+alphaq*2] + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + vbroadcasti128 m10, [srcq] shr mxd, 10 shr tmp1d, 10 - vbroadcasti128 m10, [srcq] movq xm8, [filterq+mxq *8] - movhps xm8, [filterq+tmp1q*8] - lea tmp1d, [tmp2q+alphaq*1] - lea mxd, [tmp2q+alphaq*2] + vinserti128 m8, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+alphaq*1] shr tmp2d, 10 shr tmp1d, 10 - movq xm9, [filterq+tmp2q*8] - movhps xm9, [filterq+tmp1q*8] - lea tmp1d, [mxq+alphaq*1] - lea tmp2d, [mxq+alphaq*2] + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] shr mxd, 10 shr tmp1d, 10 - vpbroadcastq m0, [filterq+mxq *8] - vpblendd m8, m0, 0x30 - vpbroadcastq m0, [filterq+tmp1q*8] - vpblendd m8, m0, 0xc0 ; 0 1 4 5 - pshufb m0, m10, m12 - pmaddubsw m0, m8 - lea tmp1d, [tmp2q+alphaq*1] + movq xm9, [filterq+mxq *8] + vinserti128 m9, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta shr tmp2d, 10 shr tmp1d, 10 - vpbroadcastq m8, 
[filterq+tmp2q*8] - vpblendd m9, m8, 0x30 - vpbroadcastq m8, [filterq+tmp1q*8] - vpblendd m9, m8, 0xc0 ; 2 3 6 7 + punpcklqdq m8, m0 ; 0 1 4 5 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 + punpcklqdq m9, m0 ; 2 3 6 7 + pshufb m0, m10, m12 + pmaddubsw m0, m8 pshufb m10, m13 pmaddubsw m10, m9 add srcq, ssq @@ -3837,7 +3828,10 @@ cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask movsxd wq, dword [r5+wq*4] vpbroadcastd m5, [base+pw_512] add wq, r5 - lea maskq, [base+obmc_masks+hq*4] + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] neg hq jmp wq .w2: diff --git a/third_party/dav1d/src/x86/mc_init_tmpl.c b/third_party/dav1d/src/x86/mc_init_tmpl.c index 0e33cd4960b41..6a9515f569746 100644 --- a/third_party/dav1d/src/x86/mc_init_tmpl.c +++ b/third_party/dav1d/src/x86/mc_init_tmpl.c @@ -93,7 +93,7 @@ decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2); decl_emu_edge_fn(dav1d_emu_edge_avx2); decl_emu_edge_fn(dav1d_emu_edge_ssse3); -void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { +COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { #define init_mc_fn(type, name, suffix) \ c->mc[type] = dav1d_put_##name##_##suffix #define init_mct_fn(type, name, suffix) \ diff --git a/third_party/dav1d/src/x86/mc_ssse3.asm b/third_party/dav1d/src/x86/mc_ssse3.asm index abca6cf6379e7..82dcb7528c821 100644 --- a/third_party/dav1d/src/x86/mc_ssse3.asm +++ b/third_party/dav1d/src/x86/mc_ssse3.asm @@ -44,8 +44,8 @@ obmc_masks: db 0, 0, 0, 0 db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 - db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 subpel_h_shufA: db 0, 
1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 @@ -53,7 +53,6 @@ subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 -blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 pb_64: times 16 db 64 pw_8: times 8 dw 8 @@ -3773,7 +3772,7 @@ cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask jg .w32 RET -cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask +cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_v_ssse3_table LEA r5, blend_v_ssse3_table tzcnt wd, wm @@ -3833,8 +3832,7 @@ cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask mova m2, [tmpq]; b BLEND_64M m1, m2, m3, m3 movq [dstq+dsq*0], m0 - punpckhqdq m0, m0 - movq [dstq+dsq*1], m0 + movhps [dstq+dsq*1], m0 add tmpq, 16 lea dstq, [dstq+dsq*2] sub hd, 2 @@ -3855,24 +3853,31 @@ cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask jg .w16_loop RET .w32: - mova m3, [maskq+64 ] ; obmc_masks_32[0] (64-m[0]) - mova m4, [maskq+80 ] ; obmc_masks_32[1] (64-m[1]) - mova m6, [maskq+96 ] ; obmc_masks_32[2] (64-m[2]) - mova m7, [maskq+112] ; obmc_masks_32[3] (64-m[3]) +%if WIN64 + mova [rsp+8], xmm6 +%endif + mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0]) + mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1]) + mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2]) ; 16 mask blend is provided for 64 pixels .w32_loop: mova m1, [dstq+16*0] ; a mova m2, [tmpq+16*0] ; b BLEND_64M m1, m2, m3, m4 + movq m1, [dstq+16*1] ; a + punpcklbw m1, [tmpq+16*1] ; b + pmaddubsw m1, m6 + pmulhrsw m1, m5 + packuswb m1, m1 mova [dstq+16*0], m0 - mova m1, [dstq+16*1] ; a - mova m2, [tmpq+16*1] ; b - BLEND_64M m1, m2, m6, m7 - mova [dstq+16*1], m0 + movq [dstq+16*1], m1 add tmpq, 32 add dstq, dsq dec hd jg .w32_loop +%if WIN64 + mova xmm6, [rsp+8] +%endif RET cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask @@ 
-3890,7 +3895,10 @@ cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask movsxd wq, dword [t0+wq*4] mova m5, [base+pw_512] add wq, t0 - lea maskq, [base+obmc_masks+hq*4] + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] neg hq jmp wq .w2: diff --git a/third_party/dav1d/src/x86/msac.asm b/third_party/dav1d/src/x86/msac.asm index 9f3a8207cf150..5956193f16eba 100644 --- a/third_party/dav1d/src/x86/msac.asm +++ b/third_party/dav1d/src/x86/msac.asm @@ -26,18 +26,40 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -%if ARCH_X86_64 - SECTION_RODATA 64 ; avoids cacheline splits dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 pw_0xff00: times 8 dw 0xff00 pw_32: times 8 dw 32 +%if ARCH_X86_64 +%define resp resq +%define movp movq +%define c_shuf q3333 +%define DECODE_SYMBOL_ADAPT_INIT +%else +%define resp resd +%define movp movd +%define c_shuf q1111 +%macro DECODE_SYMBOL_ADAPT_INIT 0 + mov t0, r0m + mov t1, r1m + mov t2, r2m +%if STACK_ALIGNMENT >= 16 + sub esp, 40 +%else + mov eax, esp + and esp, ~15 + sub esp, 40 + mov [esp], eax +%endif +%endmacro +%endif + struc msac - .buf: resq 1 - .end: resq 1 - .dif: resq 1 + .buf: resp 1 + .end: resp 1 + .dif: resp 1 .rng: resd 1 .cnt: resd 1 .update_cdf: resd 1 @@ -48,22 +70,26 @@ endstruc SECTION .text %if WIN64 -DECLARE_REG_TMP 3 -%define buf rsp+8 ; shadow space -%else -DECLARE_REG_TMP 0 +DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3 +%define buf rsp+8 ; shadow space +%elif UNIX64 +DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0 %define buf rsp-40 ; red zone +%else +DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2 +%define buf esp+8 %endif INIT_XMM sse2 -cglobal msac_decode_symbol_adapt4, 3, 7, 6, s, cdf, ns - movd m2, [sq+msac.rng] - movq m1, [cdfq] - lea rax, [pw_0xff00] - movq m3, [sq+msac.dif] - mov r3d, [sq+msac.update_cdf] - mov r4d, nsd - neg nsq +cglobal msac_decode_symbol_adapt4, 0, 6, 6 + DECODE_SYMBOL_ADAPT_INIT + LEA rax, pw_0xff00 + movd m2, [t0+msac.rng] + movq m1, [t1] 
+ movp m3, [t0+msac.dif] + mov t3d, [t0+msac.update_cdf] + mov t4d, t2d + neg t2 pshuflw m2, m2, q0000 movd [buf+12], m2 pand m2, [rax] @@ -71,110 +97,129 @@ cglobal msac_decode_symbol_adapt4, 3, 7, 6, s, cdf, ns psrlw m1, 6 psllw m1, 7 pmulhuw m1, m2 - movq m2, [rax+nsq*2] - pshuflw m3, m3, q3333 + movq m2, [rax+t2*2] + pshuflw m3, m3, c_shuf paddw m1, m2 mova [buf+16], m1 psubusw m1, m3 pxor m2, m2 pcmpeqw m1, m2 ; c >= v pmovmskb eax, m1 - test r3d, r3d + test t3d, t3d jz .renorm ; !allow_update_cdf ; update_cdf: - movzx r3d, word [cdfq+r4*2] ; count + movzx t3d, word [t1+t4*2] ; count pcmpeqw m2, m2 - mov r2d, r3d - shr r3d, 4 - cmp r4d, 4 - sbb r3d, -5 ; (count >> 4) + (n_symbols > 3) + 4 - cmp r2d, 32 - adc r2d, 0 ; count + (count < 32) - movd m3, r3d + mov t2d, t3d + shr t3d, 4 + cmp t4d, 4 + sbb t3d, -5 ; (count >> 4) + (n_symbols > 3) + 4 + cmp t2d, 32 + adc t2d, 0 ; count + (count < 32) + movd m3, t3d pavgw m2, m1 ; i >= val ? -1 : 32768 psubw m2, m0 ; for (i = 0; i < val; i++) psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate; psraw m2, m3 ; for (; i < n_symbols - 1; i++) paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1; - movq [cdfq], m0 - mov [cdfq+r4*2], r2w + movq [t1], m0 + mov [t1+t4*2], t2w .renorm: tzcnt eax, eax - mov r4, [sq+msac.dif] - movzx r1d, word [buf+rax+16] ; v - movzx r2d, word [buf+rax+14] ; u + mov t4, [t0+msac.dif] + movzx t1d, word [buf+rax+16] ; v + movzx t2d, word [buf+rax+14] ; u shr eax, 1 .renorm2: - not r4 - sub r2d, r1d ; rng - shl r1, 48 - add r4, r1 ; ~dif - mov r1d, [sq+msac.cnt] - movifnidn t0, sq - bsr ecx, r2d +%if ARCH_X86_64 == 0 +%if STACK_ALIGNMENT >= 16 + add esp, 40 +%else + mov esp, [esp] +%endif +%endif + not t4 + sub t2d, t1d ; rng + shl t1, gprsize*8-16 + add t4, t1 ; ~dif +.renorm3: + mov t1d, [t0+msac.cnt] + movifnidn t7, t0 +.renorm4: + bsr ecx, t2d xor ecx, 15 ; d - shl r2d, cl - shl r4, cl - mov [t0+msac.rng], r2d - not r4 - sub r1d, ecx + shl t2d, cl + shl t4, cl + mov [t7+msac.rng], t2d + not t4 + 
sub t1d, ecx jge .end ; no refill required ; refill: - mov r2, [t0+msac.buf] - mov rcx, [t0+msac.end] - lea r5, [r2+8] - cmp r5, rcx + mov t2, [t7+msac.buf] + mov rcx, [t7+msac.end] +%if ARCH_X86_64 == 0 + push t5 +%endif + lea t5, [t2+gprsize] + cmp t5, rcx jg .refill_eob - mov r2, [r2] - lea ecx, [r1+23] - add r1d, 16 + mov t2, [t2] + lea ecx, [t1+23] + add t1d, 16 shr ecx, 3 ; shift_bytes - bswap r2 - sub r5, rcx + bswap t2 + sub t5, rcx shl ecx, 3 ; shift_bits - shr r2, cl - sub ecx, r1d ; shift_bits - 16 - cnt - mov r1d, 48 - shl r2, cl - mov [t0+msac.buf], r5 - sub r1d, ecx ; cnt + 64 - shift_bits - xor r4, r2 + shr t2, cl + sub ecx, t1d ; shift_bits - 16 - cnt + mov t1d, gprsize*8-16 + shl t2, cl + mov [t7+msac.buf], t5 + sub t1d, ecx ; cnt + gprsize*8 - shift_bits + xor t4, t2 +%if ARCH_X86_64 == 0 + pop t5 +%endif .end: - mov [t0+msac.cnt], r1d - mov [t0+msac.dif], r4 + mov [t7+msac.cnt], t1d + mov [t7+msac.dif], t4 RET .refill_eob: ; avoid overreading the input buffer - mov r5, rcx - mov ecx, 40 - sub ecx, r1d ; c + mov t5, rcx + mov ecx, gprsize*8-24 + sub ecx, t1d ; c .refill_eob_loop: - cmp r2, r5 + cmp t2, t5 jge .refill_eob_end ; eob reached - movzx r1d, byte [r2] - inc r2 - shl r1, cl - xor r4, r1 + movzx t1d, byte [t2] + inc t2 + shl t1, cl + xor t4, t1 sub ecx, 8 jge .refill_eob_loop .refill_eob_end: - mov r1d, 40 - sub r1d, ecx - mov [t0+msac.buf], r2 - mov [t0+msac.dif], r4 - mov [t0+msac.cnt], r1d + mov t1d, gprsize*8-24 +%if ARCH_X86_64 == 0 + pop t5 +%endif + sub t1d, ecx + mov [t7+msac.buf], t2 + mov [t7+msac.dif], t4 + mov [t7+msac.cnt], t1d RET -cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns - movd m2, [sq+msac.rng] - movu m1, [cdfq] - lea rax, [pw_0xff00] - movq m3, [sq+msac.dif] - mov r3d, [sq+msac.update_cdf] - mov r4d, nsd - neg nsq +cglobal msac_decode_symbol_adapt8, 0, 6, 6 + DECODE_SYMBOL_ADAPT_INIT + LEA rax, pw_0xff00 + movd m2, [t0+msac.rng] + movu m1, [t1] + movp m3, [t0+msac.dif] + mov t3d, [t0+msac.update_cdf] + mov 
t4d, t2d + neg t2 pshuflw m2, m2, q0000 movd [buf+12], m2 punpcklqdq m2, m2 @@ -183,8 +228,8 @@ cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns pand m2, [rax] psllw m1, 7 pmulhuw m1, m2 - movu m2, [rax+nsq*2] - pshuflw m3, m3, q3333 + movu m2, [rax+t2*2] + pshuflw m3, m3, c_shuf paddw m1, m2 punpcklqdq m3, m3 mova [buf+16], m1 @@ -192,35 +237,36 @@ cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns pxor m2, m2 pcmpeqw m1, m2 pmovmskb eax, m1 - test r3d, r3d + test t3d, t3d jz m(msac_decode_symbol_adapt4).renorm - movzx r3d, word [cdfq+r4*2] + movzx t3d, word [t1+t4*2] pcmpeqw m2, m2 - mov r2d, r3d - shr r3d, 4 - cmp r4d, 4 ; may be called with n_symbols < 4 - sbb r3d, -5 - cmp r2d, 32 - adc r2d, 0 - movd m3, r3d + mov t2d, t3d + shr t3d, 4 + cmp t4d, 4 ; may be called with n_symbols < 4 + sbb t3d, -5 + cmp t2d, 32 + adc t2d, 0 + movd m3, t3d pavgw m2, m1 psubw m2, m0 psubw m0, m1 psraw m2, m3 paddw m0, m2 - movu [cdfq], m0 - mov [cdfq+r4*2], r2w + movu [t1], m0 + mov [t1+t4*2], t2w jmp m(msac_decode_symbol_adapt4).renorm -cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns - movd m4, [sq+msac.rng] - movu m2, [cdfq] - lea rax, [pw_0xff00] - movu m3, [cdfq+16] - movq m5, [sq+msac.dif] - mov r3d, [sq+msac.update_cdf] - mov r4d, nsd - neg nsq +cglobal msac_decode_symbol_adapt16, 0, 6, 6 + DECODE_SYMBOL_ADAPT_INIT + LEA rax, pw_0xff00 + movd m4, [t0+msac.rng] + movu m2, [t1] + movu m3, [t1+16] + movp m5, [t0+msac.dif] + mov t3d, [t0+msac.update_cdf] + mov t4d, t2d + neg t2 %if WIN64 sub rsp, 48 ; need 36 bytes, shadow space is only 32 %endif @@ -236,8 +282,8 @@ cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns psllw m3, 7 pmulhuw m2, m4 pmulhuw m3, m4 - movu m4, [rax+nsq*2] - pshuflw m5, m5, q3333 + movu m4, [rax+t2*2] + pshuflw m5, m5, c_shuf paddw m2, m4 psubw m4, [rax-pw_0xff00+pw_32] punpcklqdq m5, m5 @@ -251,37 +297,147 @@ cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns pcmpeqw m3, m4 packsswb m5, m2, m3 pmovmskb eax, m5 - test r3d, r3d 
+ test t3d, t3d jz .renorm - movzx r3d, word [cdfq+r4*2] + movzx t3d, word [t1+t4*2] pcmpeqw m4, m4 mova m5, m4 - lea r2d, [r3+80] ; only support n_symbols >= 4 - shr r2d, 4 - cmp r3d, 32 - adc r3d, 0 + lea t2d, [t3+80] ; only support n_symbols >= 4 + shr t2d, 4 + cmp t3d, 32 + adc t3d, 0 pavgw m4, m2 pavgw m5, m3 psubw m4, m0 psubw m0, m2 - movd m2, r2d + movd m2, t2d psubw m5, m1 psubw m1, m3 psraw m4, m2 psraw m5, m2 paddw m0, m4 paddw m1, m5 - movu [cdfq], m0 - movu [cdfq+16], m1 - mov [cdfq+r4*2], r3w + movu [t1], m0 + movu [t1+16], m1 + mov [t1+t4*2], t3w .renorm: tzcnt eax, eax - mov r4, [sq+msac.dif] - movzx r1d, word [buf+rax*2] - movzx r2d, word [buf+rax*2-2] + mov t4, [t0+msac.dif] + movzx t1d, word [buf+rax*2] + movzx t2d, word [buf+rax*2-2] %if WIN64 add rsp, 48 %endif jmp m(msac_decode_symbol_adapt4).renorm2 +cglobal msac_decode_bool_adapt, 0, 6, 0 + movifnidn t1, r1mp + movifnidn t0, r0mp + movzx eax, word [t1] + movzx t3d, byte [t0+msac.rng+1] + mov t4, [t0+msac.dif] + mov t2d, [t0+msac.rng] +%if ARCH_X86_64 + mov t5d, eax +%endif + and eax, ~63 + imul eax, t3d +%if UNIX64 + mov t6, t4 +%endif + shr eax, 7 + add eax, 4 ; v + mov t3d, eax + shl rax, gprsize*8-16 ; vw + sub t2d, t3d ; r - v + sub t4, rax ; dif - vw + setb al + cmovb t2d, t3d + mov t3d, [t0+msac.update_cdf] +%if UNIX64 + cmovb t4, t6 +%else + cmovb t4, [t0+msac.dif] +%endif +%if ARCH_X86_64 == 0 + movzx eax, al +%endif + not t4 + test t3d, t3d + jz m(msac_decode_symbol_adapt4).renorm3 +%if UNIX64 == 0 + push t6 +%endif + movzx t6d, word [t1+2] +%if ARCH_X86_64 == 0 + push t5 + movzx t5d, word [t1] +%endif + movifnidn t7, t0 + lea ecx, [t6+64] + cmp t6d, 32 + adc t6d, 0 + mov [t1+2], t6w + imul t6d, eax, -32769 + shr ecx, 4 ; rate + add t6d, t5d ; if (bit) + sub t5d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1; + sar t6d, cl ; else + sub t5d, t6d ; cdf[0] -= cdf[0] >> rate; + mov [t1], t5w +%if WIN64 + mov t1d, [t7+msac.cnt] + pop t6 + jmp m(msac_decode_symbol_adapt4).renorm4 +%else 
+%if ARCH_X86_64 == 0 + pop t5 + pop t6 +%endif + jmp m(msac_decode_symbol_adapt4).renorm3 +%endif + +cglobal msac_decode_bool_equi, 0, 6, 0 + movifnidn t0, r0mp + mov t1d, [t0+msac.rng] + mov t4, [t0+msac.dif] + mov t2d, t1d + mov t1b, 8 + mov t3, t4 + mov eax, t1d + shr t1d, 1 ; v + shl rax, gprsize*8-17 ; vw + sub t2d, t1d ; r - v + sub t4, rax ; dif - vw + cmovb t2d, t1d + cmovb t4, t3 + setb al ; the upper 32 bits contains garbage but that's OK + not t4 +%if ARCH_X86_64 == 0 + movzx eax, al +%endif + jmp m(msac_decode_symbol_adapt4).renorm3 + +cglobal msac_decode_bool, 0, 6, 0 + movifnidn t0, r0mp + movifnidn t1d, r1m + movzx eax, byte [t0+msac.rng+1] ; r >> 8 + mov t4, [t0+msac.dif] + mov t2d, [t0+msac.rng] + and t1d, ~63 + imul eax, t1d + mov t3, t4 + shr eax, 7 + add eax, 4 ; v + mov t1d, eax + shl rax, gprsize*8-16 ; vw + sub t2d, t1d ; r - v + sub t4, rax ; dif - vw + cmovb t2d, t1d + cmovb t4, t3 + setb al + not t4 +%if ARCH_X86_64 == 0 + movzx eax, al %endif + jmp m(msac_decode_symbol_adapt4).renorm3 diff --git a/third_party/dav1d/src/x86/msac.h b/third_party/dav1d/src/x86/msac.h new file mode 100644 index 0000000000000..3d8b76fd9d17d --- /dev/null +++ b/third_party/dav1d/src/x86/msac.h @@ -0,0 +1,51 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_X86_MSAC_H +#define DAV1D_SRC_X86_MSAC_H + +unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s); +unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f); + +#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2 +#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2 +#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2 +#endif + +#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2 +#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_sse2 +#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2 + +#endif /* DAV1D_SRC_X86_MSAC_H */ diff --git a/third_party/dav1d/tests/checkasm/mc.c 
b/third_party/dav1d/tests/checkasm/mc.c index fd26386ce8971..ca0c745504e1a 100644 --- a/third_party/dav1d/tests/checkasm/mc.c +++ b/third_party/dav1d/tests/checkasm/mc.c @@ -41,6 +41,21 @@ static const char *const filter_names[] = { static const char *const mxy_names[] = { "0", "h", "v", "hv" }; +static int mc_h_next(const int h) { + switch (h) { + case 4: + case 8: + case 16: + return (h * 3) >> 1; + case 6: + case 12: + case 24: + return (h & (h - 1)) * 2; + default: + return h * 2; + } +} + static void check_mc(Dav1dMCDSPContext *const c) { ALIGN_STK_32(pixel, src_buf, 135 * 135,); ALIGN_STK_32(pixel, c_dst, 128 * 128,); @@ -59,9 +74,9 @@ static void check_mc(Dav1dMCDSPContext *const c) { if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc", filter_names[filter], w, mxy_names[mxy], BITDEPTH)) { - const int min = w <= 32 ? 2 : w / 4; - const int max = imax(imin(w * 4, 128), 32); - for (int h = min; h <= max; h <<= 1) { + const int h_min = w <= 32 ? 2 : w / 4; + const int h_max = imax(imin(w * 4, 128), 32); + for (int h = h_min; h <= h_max; h = mc_h_next(h)) { const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0; const int my = (mxy & 2) ? rnd() % 15 + 1 : 0; #if BITDEPTH == 16 @@ -418,8 +433,8 @@ static void check_warp8x8(Dav1dMCDSPContext *const c) { HIGHBD_DECL_SUFFIX); if (check_func(c->warp8x8, "warp_8x8_%dbpc", BITDEPTH)) { - const int mx = (rnd() & 0x1fff) - 0x800; - const int my = (rnd() & 0x1fff) - 0x800; + const int mx = (rnd() & 0x1fff) - 0xa00; + const int my = (rnd() & 0x1fff) - 0xa00; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else @@ -427,7 +442,7 @@ static void check_warp8x8(Dav1dMCDSPContext *const c) { #endif for (int i = 0; i < 4; i++) - abcd[i] = (rnd() & 0x1fff) - 0x800; + abcd[i] = (rnd() & 0x1fff) - 0xa00; for (int i = 0; i < 15 * 15; i++) src_buf[i] = rnd() & bitdepth_max; @@ -444,8 +459,8 @@ static void check_warp8x8(Dav1dMCDSPContext *const c) { static void check_warp8x8t(Dav1dMCDSPContext *const c) { ALIGN_STK_32(pixel, src_buf, 15 * 15,); - ALIGN_STK_32(int16_t, c_tmp, 8 * 8,); - ALIGN_STK_32(int16_t, a_tmp, 8 * 8,); + ALIGN_STK_32(int16_t, c_tmp, 8 * 8,); + ALIGN_STK_32(int16_t, a_tmp, 8 * 8,); int16_t abcd[4]; const pixel *src = src_buf + 15 * 3 + 3; const ptrdiff_t src_stride = 15 * sizeof(pixel); @@ -455,8 +470,8 @@ static void check_warp8x8t(Dav1dMCDSPContext *const c) { HIGHBD_DECL_SUFFIX); if (check_func(c->warp8x8t, "warp_8x8t_%dbpc", BITDEPTH)) { - const int mx = (rnd() & 0x1fff) - 0x800; - const int my = (rnd() & 0x1fff) - 0x800; + const int mx = (rnd() & 0x1fff) - 0xa00; + const int my = (rnd() & 0x1fff) - 0xa00; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else @@ -464,7 +479,7 @@ static void check_warp8x8t(Dav1dMCDSPContext *const c) { #endif for (int i = 0; i < 4; i++) - abcd[i] = (rnd() & 0x1fff) - 0x800; + abcd[i] = (rnd() & 0x1fff) - 0xa00; for (int i = 0; i < 15 * 15; i++) src_buf[i] = rnd() & bitdepth_max; diff --git a/third_party/dav1d/tests/checkasm/msac.c b/third_party/dav1d/tests/checkasm/msac.c index 3808e310fece7..a490cdbe920f5 100644 --- a/third_party/dav1d/tests/checkasm/msac.c +++ b/third_party/dav1d/tests/checkasm/msac.c @@ -32,14 +32,22 @@ #include +#define BUF_SIZE 8192 + /* The normal code doesn't use function pointers */ typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf, size_t n_symbols); +typedef unsigned (*decode_bool_adapt_fn)(MsacContext *s, uint16_t *cdf); +typedef unsigned (*decode_bool_equi_fn)(MsacContext *s); +typedef unsigned (*decode_bool_fn)(MsacContext *s, unsigned f); typedef struct { decode_symbol_adapt_fn symbol_adapt4; decode_symbol_adapt_fn symbol_adapt8; decode_symbol_adapt_fn symbol_adapt16; + decode_bool_adapt_fn bool_adapt; + decode_bool_equi_fn bool_equi; + decode_bool_fn bool; } MsacDSPContext; static void randomize_cdf(uint16_t *const cdf, int n) { @@ -61,7 +69,7 @@ static int msac_cmp(const MsacContext *const a, const MsacContext *const b) { if (check_func(c->symbol_adapt##n, "msac_decode_symbol_adapt%d", n)) { \ for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { \ for (int ns = n_min; ns <= n_max; ns++) { \ - dav1d_msac_init(&s_c, buf, sizeof(buf), !cdf_update); \ + dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); \ s_a = s_c; \ randomize_cdf(cdf[0], ns); \ memcpy(cdf[1], cdf[0], sizeof(*cdf)); \ @@ -81,20 +89,72 @@ static int msac_cmp(const MsacContext *const a, const MsacContext *const b) { } \ } while (0) -static void check_decode_symbol_adapt(MsacDSPContext *const c) { +static void check_decode_symbol(MsacDSPContext *const c, uint8_t *const buf) { /* Use an aligned CDF buffer for more consistent benchmark * results, 
and a misaligned one for checking correctness. */ ALIGN_STK_16(uint16_t, cdf, 2, [17]); MsacContext s_c, s_a; - uint8_t buf[1024]; - for (int i = 0; i < 1024; i++) - buf[i] = rnd(); declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols); CHECK_SYMBOL_ADAPT( 4, 1, 5); CHECK_SYMBOL_ADAPT( 8, 1, 8); CHECK_SYMBOL_ADAPT(16, 4, 16); - report("decode_symbol_adapt"); + report("decode_symbol"); +} + +static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) { + MsacContext s_c, s_a; + + if (check_func(c->bool_adapt, "msac_decode_bool_adapt")) { + declare_func(unsigned, MsacContext *s, uint16_t *cdf); + uint16_t cdf[2][2]; + for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { + dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); + s_a = s_c; + cdf[0][0] = cdf[1][0] = rnd() % 32767 + 1; + cdf[0][1] = cdf[1][1] = 0; + for (int i = 0; i < 64; i++) { + unsigned c_res = call_ref(&s_c, cdf[0]); + unsigned a_res = call_new(&s_a, cdf[1]); + if (c_res != a_res || msac_cmp(&s_c, &s_a) || + memcmp(cdf[0], cdf[1], sizeof(*cdf))) + { + fail(); + } + } + if (cdf_update) + bench_new(&s_a, cdf[0]); + } + } + + if (check_func(c->bool_equi, "msac_decode_bool_equi")) { + declare_func(unsigned, MsacContext *s); + dav1d_msac_init(&s_c, buf, BUF_SIZE, 1); + s_a = s_c; + for (int i = 0; i < 64; i++) { + unsigned c_res = call_ref(&s_c); + unsigned a_res = call_new(&s_a); + if (c_res != a_res || msac_cmp(&s_c, &s_a)) + fail(); + } + bench_new(&s_a); + } + + if (check_func(c->bool, "msac_decode_bool")) { + declare_func(unsigned, MsacContext *s, unsigned f); + dav1d_msac_init(&s_c, buf, BUF_SIZE, 1); + s_a = s_c; + for (int i = 0; i < 64; i++) { + const unsigned f = rnd() & 0x7fff; + unsigned c_res = call_ref(&s_c, f); + unsigned a_res = call_new(&s_a, f); + if (c_res != a_res || msac_cmp(&s_c, &s_a)) + fail(); + } + bench_new(&s_a, 16384); + } + + report("decode_bool"); } void checkasm_check_msac(void) { @@ -102,20 +162,34 @@ void checkasm_check_msac(void) { 
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c; c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt_c; c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c; + c.bool_adapt = dav1d_msac_decode_bool_adapt_c; + c.bool_equi = dav1d_msac_decode_bool_equi_c; + c.bool = dav1d_msac_decode_bool_c; #if ARCH_AARCH64 && HAVE_ASM if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) { c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_neon; c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_neon; c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon; + c.bool_adapt = dav1d_msac_decode_bool_adapt_neon; + c.bool_equi = dav1d_msac_decode_bool_equi_neon; + c.bool = dav1d_msac_decode_bool_neon; } -#elif ARCH_X86_64 && HAVE_ASM +#elif ARCH_X86 && HAVE_ASM if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) { c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2; c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_sse2; c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; + c.bool_adapt = dav1d_msac_decode_bool_adapt_sse2; + c.bool_equi = dav1d_msac_decode_bool_equi_sse2; + c.bool = dav1d_msac_decode_bool_sse2; } #endif - check_decode_symbol_adapt(&c); + uint8_t buf[BUF_SIZE]; + for (int i = 0; i < BUF_SIZE; i++) + buf[i] = rnd(); + + check_decode_symbol(&c, buf); + check_decode_bool(&c, buf); } diff --git a/third_party/dav1d/tests/meson.build b/third_party/dav1d/tests/meson.build index 71957533d3190..3fe75746d89b8 100644 --- a/third_party/dav1d/tests/meson.build +++ b/third_party/dav1d/tests/meson.build @@ -98,13 +98,17 @@ endif dav1d_fuzzer_sources = files('libfuzzer/dav1d_fuzzer.c') fuzzer_ldflags = [] +if get_option('fuzzer_ldflags') != '' + fuzzer_ldflags += [get_option('fuzzer_ldflags')] +endif + if fuzzing_engine == 'none' dav1d_fuzzer_sources += files('libfuzzer/main.c') elif fuzzing_engine == 'libfuzzer' fuzzer_ldflags += ['-fsanitize=fuzzer'] elif fuzzing_engine == 'oss-fuzz' # libFuzzingEngine needs libc++ - fuzzer_ldflags += ['-fsanitize=fuzzer', '-lFuzzingEngine', 
'-lc++'] + fuzzer_ldflags += ['-lc++'] endif dav1d_fuzzer = executable('dav1d_fuzzer', diff --git a/third_party/dav1d/tools/dav1d.c b/third_party/dav1d/tools/dav1d.c index 63a658a1653a8..598755c6ac819 100644 --- a/third_party/dav1d/tools/dav1d.c +++ b/third_party/dav1d/tools/dav1d.c @@ -30,15 +30,20 @@ #include #include +#include #include #include #include +#include #ifdef HAVE_UNISTD_H # include #endif #ifdef HAVE_IO_H # include #endif +#ifdef _WIN32 +# include +#endif #include "dav1d/dav1d.h" @@ -48,18 +53,70 @@ #include "dav1d_cli_parse.h" -static void print_stats(const int istty, const unsigned n, - const unsigned num) +static uint64_t get_time_nanos(void) { +#ifdef _WIN32 + LARGE_INTEGER frequency; + QueryPerformanceFrequency(&frequency); + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return 1000000000 * t.QuadPart / frequency.QuadPart; +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return 1000000000ULL * ts.tv_sec + ts.tv_nsec; +#endif +} + +static void sleep_nanos(uint64_t d) { +#ifdef _WIN32 + Sleep((unsigned)(d / 1000000)); +#else + const struct timespec ts = { + .tv_sec = (time_t)(d / 1000000000), + .tv_nsec = d % 1000000000, + }; + nanosleep(&ts, NULL); +#endif +} + +static void synchronize(const int realtime, const unsigned cache, + const unsigned n_out, const uint64_t nspf, + const uint64_t tfirst, uint64_t *const elapsed, + FILE *const frametimes) { - const char *pre_string = istty ? "\r" : ""; - const char *post_string = istty ? 
"" : "\n"; + const uint64_t tcurr = get_time_nanos(); + const uint64_t last = *elapsed; + *elapsed = tcurr - tfirst; + if (realtime) { + const uint64_t deadline = nspf * n_out; + if (*elapsed < deadline) { + const uint64_t remaining = deadline - *elapsed; + if (remaining > nspf * cache) sleep_nanos(remaining - nspf * cache); + *elapsed = deadline; + } + } + if (frametimes) { + const uint64_t frametime = *elapsed - last; + fprintf(frametimes, "%" PRIu64 "\n", frametime); + fflush(frametimes); + } +} - if (num == 0xFFFFFFFFU) { - fprintf(stderr, "%sDecoded %u frames%s", pre_string, n, post_string); +static void print_stats(const int istty, const unsigned n, const unsigned num, + const uint64_t elapsed, const double i_fps) +{ + if (istty) fputs("\r", stderr); + const double d_fps = 1e9 * n / elapsed; + const double speed = d_fps / i_fps; + if (num == 0xFFFFFFFF) { + fprintf(stderr, "Decoded %u frames", n); } else { - fprintf(stderr, "%sDecoded %u/%u frames (%.1lf%%)%s", - pre_string, n, num, 100.0 * n / num, post_string); + fprintf(stderr, "Decoded %u/%u frames (%.1lf%%)", n, num, + 100.0 * n / num); } + if (i_fps) + fprintf(stderr, " - %.2lf/%.2lf fps (%.2lfx)", d_fps, i_fps, speed); + if (!istty) fputs("\n", stderr); } int main(const int argc, char *const *const argv) { @@ -73,6 +130,9 @@ int main(const int argc, char *const *const argv) { Dav1dContext *c; Dav1dData data; unsigned n_out = 0, total, fps[2]; + uint64_t nspf, tfirst, elapsed; + double i_fps; + FILE *frametimes = NULL; const char *version = dav1d_version(); if (strcmp(version, DAV1D_VERSION)) { @@ -126,6 +186,23 @@ int main(const int argc, char *const *const argv) { if ((res = dav1d_open(&c, &lib_settings))) return res; + if (cli_settings.frametimes) + frametimes = fopen(cli_settings.frametimes, "w"); + + if (cli_settings.realtime != REALTIME_CUSTOM) { + if (fps[1] == 0) { + i_fps = 0; + nspf = 0; + } else { + i_fps = (double)fps[0] / fps[1]; + nspf = 1000000000ULL * fps[1] / fps[0]; + } + } else { + 
i_fps = cli_settings.realtime_fps; + nspf = (uint64_t)(1000000000.0 / cli_settings.realtime_fps); + } + tfirst = get_time_nanos(); + do { memset(&p, 0, sizeof(p)); if ((res = dav1d_send_data(c, &data)) < 0) { @@ -149,14 +226,19 @@ int main(const int argc, char *const *const argv) { cli_settings.outputfile, &p.p, fps)) < 0) { + if (frametimes) fclose(frametimes); return res; } } if ((res = output_write(out, &p)) < 0) break; n_out++; + if (nspf) { + synchronize(cli_settings.realtime, cli_settings.realtime_cache, + n_out, nspf, tfirst, &elapsed, frametimes); + } if (!cli_settings.quiet) - print_stats(istty, n_out, total); + print_stats(istty, n_out, total, elapsed, i_fps); } if (cli_settings.limit && n_out == cli_settings.limit) @@ -181,17 +263,24 @@ int main(const int argc, char *const *const argv) { cli_settings.outputfile, &p.p, fps)) < 0) { + if (frametimes) fclose(frametimes); return res; } } if ((res = output_write(out, &p)) < 0) break; n_out++; + if (nspf) { + synchronize(cli_settings.realtime, cli_settings.realtime_cache, + n_out, nspf, tfirst, &elapsed, frametimes); + } if (!cli_settings.quiet) - print_stats(istty, n_out, total); + print_stats(istty, n_out, total, elapsed, i_fps); } } + if (frametimes) fclose(frametimes); + input_close(in); if (out) { if (!cli_settings.quiet && istty) diff --git a/third_party/dav1d/tools/dav1d_cli_parse.c b/third_party/dav1d/tools/dav1d_cli_parse.c index b364ca3758ca8..a2c183c128d9a 100644 --- a/third_party/dav1d/tools/dav1d_cli_parse.c +++ b/third_party/dav1d/tools/dav1d_cli_parse.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -46,12 +47,16 @@ static const char short_opts[] = "i:o:vql:s:"; enum { ARG_DEMUXER = 256, ARG_MUXER, + ARG_FRAME_TIMES, + ARG_REALTIME, + ARG_REALTIME_CACHE, ARG_FRAME_THREADS, ARG_TILE_THREADS, ARG_VERIFY, ARG_FILM_GRAIN, ARG_OPPOINT, ARG_ALL_LAYERS, + ARG_SIZE_LIMIT, ARG_CPU_MASK, }; @@ -62,14 +67,18 @@ static const struct option long_opts[] = { { "demuxer", 1, NULL, 
ARG_DEMUXER }, { "muxer", 1, NULL, ARG_MUXER }, { "version", 0, NULL, 'v' }, + { "frametimes", 1, NULL, ARG_FRAME_TIMES }, { "limit", 1, NULL, 'l' }, { "skip", 1, NULL, 's' }, + { "realtime", 2, NULL, ARG_REALTIME }, + { "realtimecache", 1, NULL, ARG_REALTIME_CACHE }, { "framethreads", 1, NULL, ARG_FRAME_THREADS }, { "tilethreads", 1, NULL, ARG_TILE_THREADS }, { "verify", 1, NULL, ARG_VERIFY }, { "filmgrain", 1, NULL, ARG_FILM_GRAIN }, { "oppoint", 1, NULL, ARG_OPPOINT }, { "alllayers", 1, NULL, ARG_ALL_LAYERS }, + { "sizelimit", 1, NULL, ARG_SIZE_LIMIT }, { "cpumask", 1, NULL, ARG_CPU_MASK }, { NULL, 0, NULL, 0 }, }; @@ -94,21 +103,25 @@ static void usage(const char *const app, const char *const reason, ...) { } fprintf(stderr, "Usage: %s [options]\n\n", app); fprintf(stderr, "Supported options:\n" - " --input/-i $file: input file\n" - " --output/-o $file: output file\n" - " --demuxer $name: force demuxer type ('ivf' or 'annexb'; default: detect from extension)\n" - " --muxer $name: force muxer type ('md5', 'yuv', 'yuv4mpeg2' or 'null'; default: detect from extension)\n" - " --quiet/-q: disable status messages\n" - " --limit/-l $num: stop decoding after $num frames\n" - " --skip/-s $num: skip decoding of the first $num frames\n" - " --version/-v: print version and exit\n" - " --framethreads $num: number of frame threads (default: 1)\n" - " --tilethreads $num: number of tile threads (default: 1)\n" - " --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n" - " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 32)\n" - " --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n" - " --verify $md5: verify decoded md5. 
implies --muxer md5, no output\n" - " --cpumask $mask: restrict permitted CPU instruction sets (0" ALLOWED_CPU_MASKS "; default: -1)\n"); + " --input/-i $file: input file\n" + " --output/-o $file: output file\n" + " --demuxer $name: force demuxer type ('ivf' or 'annexb'; default: detect from extension)\n" + " --muxer $name: force muxer type ('md5', 'yuv', 'yuv4mpeg2' or 'null'; default: detect from extension)\n" + " --quiet/-q: disable status messages\n" + " --frametimes $file: dump frame times to file\n" + " --limit/-l $num: stop decoding after $num frames\n" + " --skip/-s $num: skip decoding of the first $num frames\n" + " --realtime [$fract]: limit framerate, optional argument to override input framerate\n" + " --realtimecache $num: set the size of the cache in realtime mode (default: 0)\n" + " --version/-v: print version and exit\n" + " --framethreads $num: number of frame threads (default: 1)\n" + " --tilethreads $num: number of tile threads (default: 1)\n" + " --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n" + " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 32)\n" + " --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n" + " --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n" + " --verify $md5: verify decoded md5. 
implies --muxer md5, no output\n" + " --cpumask $mask: restrict permitted CPU instruction sets (0" ALLOWED_CPU_MASKS "; default: -1)\n"); exit(1); } @@ -132,13 +145,31 @@ static void error(const char *const app, const char *const optarg, optarg, optname, shouldbe); } -static unsigned parse_unsigned(char *optarg, const int option, const char *app) { +static unsigned parse_unsigned(const char *const optarg, const int option, + const char *const app) +{ char *end; const unsigned res = (unsigned) strtoul(optarg, &end, 0); if (*end || end == optarg) error(app, optarg, option, "an integer"); return res; } +static int parse_optional_fraction(const char *const optarg, const int option, + const char *const app, double *value) +{ + if (optarg == NULL) return 0; + char *end; + *value = strtod(optarg, &end); + if (*end == '/' && end != optarg) { + const char *optarg2 = end + 1; + *value /= strtod(optarg2, &end); + if (*end || end == optarg2) error(app, optarg, option, "a fraction"); + } else if (*end || end == optarg) { + error(app, optarg, option, "a fraction"); + } + return 1; +} + typedef struct EnumParseTable { const char *str; const int val; @@ -238,6 +269,25 @@ void parse(const int argc, char *const *const argv, case ARG_MUXER: cli_settings->muxer = optarg; break; + case ARG_FRAME_TIMES: + cli_settings->frametimes = optarg; + break; + case ARG_REALTIME: + // workaround to parse an optional argument of the form `--a b` + // (getopt only allows `--a=b`) + if (optarg == NULL && optind < argc && argv[optind] != NULL && + argv[optind][0] != '-') + { + optarg = argv[optind]; + optind++; + } + cli_settings->realtime = 1 + parse_optional_fraction(optarg, + ARG_REALTIME, argv[0], &cli_settings->realtime_fps); + break; + case ARG_REALTIME_CACHE: + cli_settings->realtime_cache = + parse_unsigned(optarg, ARG_REALTIME_CACHE, argv[0]); + break; case ARG_FRAME_THREADS: lib_settings->n_frame_threads = parse_unsigned(optarg, ARG_FRAME_THREADS, argv[0]); @@ -262,6 +312,16 @@ void 
parse(const int argc, char *const *const argv, lib_settings->all_layers = !!parse_unsigned(optarg, ARG_ALL_LAYERS, argv[0]); break; + case ARG_SIZE_LIMIT: { + char *arg = optarg, *end; + uint64_t res = strtoul(arg, &end, 0); + if (*end == 'x') // NxM + res *= strtoul((arg = end + 1), &end, 0); + if (*end || end == arg || res >= UINT_MAX) + error(argv[0], optarg, ARG_SIZE_LIMIT, "an integer or dimension"); + lib_settings->frame_size_limit = (unsigned) res; + break; + } case 'v': fprintf(stderr, "%s\n", dav1d_version()); exit(0); diff --git a/third_party/dav1d/tools/dav1d_cli_parse.h b/third_party/dav1d/tools/dav1d_cli_parse.h index 899f207ce4ad1..11e88e15d2c7c 100644 --- a/third_party/dav1d/tools/dav1d_cli_parse.h +++ b/third_party/dav1d/tools/dav1d_cli_parse.h @@ -35,9 +35,17 @@ typedef struct { const char *inputfile; const char *demuxer; const char *muxer; + const char *frametimes; const char *verify; unsigned limit, skip; int quiet; + enum { + REALTIME_DISABLE = 0, + REALTIME_INPUT, + REALTIME_CUSTOM, + } realtime; + double realtime_fps; + unsigned realtime_cache; } CLISettings; void parse(const int argc, char *const *const argv, diff --git a/third_party/dav1d/tools/output/output.c b/third_party/dav1d/tools/output/output.c index 7a37f007c1a26..f6e40a1259235 100644 --- a/third_party/dav1d/tools/output/output.c +++ b/third_party/dav1d/tools/output/output.c @@ -95,6 +95,8 @@ int output_open(MuxerContext **const c_out, fprintf(stderr, "Failed to find muxer named \"%s\"\n", name); return DAV1D_ERR(ENOPROTOOPT); } + } else if (!strcmp(filename, "/dev/null")) { + impl = muxers[0]; } else { const char *ext = find_extension(filename); if (!ext) {