diff options
author | Matej Dian | 2021-03-01 21:01:04 +0100 |
---|---|---|
committer | Matej Dian | 2021-03-01 21:01:04 +0100 |
commit | 653c9e7e5af9ca8328653be5ecf0b6cd4d7778c6 (patch) | |
tree | 5d0e0ea04c823fa67853e3419636f7f5a6a2536b | |
parent | ebd178f0a4a474b7cfaf320fa7ffa1d7fea952d3 (diff) | |
download | aur-dav1d-git-optimized.tar.gz |
update patches
-rw-r--r-- | .SRCINFO | 6 | ||||
-rw-r--r-- | 0001-1112.patch | 52 | ||||
-rw-r--r-- | 0002-1160.patch | 1331 | ||||
-rw-r--r-- | PKGBUILD | 9 |
4 files changed, 30 insertions, 1368 deletions
@@ -1,6 +1,6 @@ pkgbase = dav1d-git-optimized pkgdesc = AV1 cross-platform Decoder, focused on speed and correctness -- latest git version compiled with optimizations - pkgver = r1617.baa9237 + pkgver = r1627.f06148e pkgrel = 1 url = https://code.videolan.org/videolan/dav1d arch = x86_64 @@ -15,11 +15,9 @@ pkgbase = dav1d-git-optimized source = git+https://code.videolan.org/videolan/dav1d.git source = http://download.opencontent.netflix.com.s3.amazonaws.com/AV1/Sparks/Sparks-5994fps-AV1-10bit-1920x1080-film-grain-synthesis-2013kbps.obu source = 0001-1112.patch - source = 0002-1160.patch sha256sums = SKIP sha256sums = e56e20de5bfad7ec073d3e53ea6c760d0b11ed143f087b1bc32230e4840fea87 - sha256sums = 83807b996384f147cea3702a1a7fcd4accfc04c3937fea11d0f74b615c37f8d2 - sha256sums = de289262c9d4e1964e7b9130a5619c6501e82a074794ced6d7da7922630973f3 + sha256sums = SKIP pkgname = dav1d-git-optimized diff --git a/0001-1112.patch b/0001-1112.patch index b60c79a4a53a..94c89f906fc2 100644 --- a/0001-1112.patch +++ b/0001-1112.patch @@ -1,4 +1,4 @@ -From efd27b6182c04072e1cc4b80b24aa28e78d6bfea Mon Sep 17 00:00:00 2001 +From 3ab60cb9a868286bdeee77636ad4a271432df204 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" <unlord@xiph.org> Date: Mon, 21 Dec 2020 15:38:02 -0500 Subject: [PATCH 1/7] Add bpc suffix to mc functions @@ -11,7 +11,7 @@ Subject: [PATCH 1/7] Add bpc suffix to mc functions 4 files changed, 343 insertions(+), 343 deletions(-) diff --git a/src/x86/mc_avx2.asm b/src/x86/mc_avx2.asm -index dda8234f..e77a6474 100644 +index 6a1ab057..b45aabea 100644 --- a/src/x86/mc_avx2.asm +++ b/src/x86/mc_avx2.asm @@ -110,7 +110,7 @@ cextern resize_filter @@ -174,7 +174,7 @@ index dda8234f..e77a6474 100644 add wq, base_reg jmp wq %ifidn %1, put -@@ -4026,10 +4026,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy +@@ -4025,10 +4025,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy %endmacro %macro BILIN_SCALED_FN 1 @@ -187,7 +187,7 @@ index dda8234f..e77a6474 100644 %endmacro %if WIN64 -@@ -4114,11 +4114,11 @@ MC_8TAP_SCALED prep +@@ -4113,11 +4113,11 @@ MC_8TAP_SCALED prep paddd m%1, m0, m%2 %endmacro @@ -201,7 +201,7 @@ index dda8234f..e77a6474 100644 .loop: psrad m7, 13 psrad m0, 13 -@@ -4128,12 +4128,12 @@ cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts +@@ -4127,12 +4127,12 @@ cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts mova [tmpq+tsq*0], xm7 vextracti128 [tmpq+tsq*2], m7, 1 dec r4d @@ -217,7 +217,7 @@ index dda8234f..e77a6474 100644 beta, filter, tmp1, delta, my, gamma %if WIN64 sub rsp, 0xa0 -@@ -4390,9 +4390,9 @@ ALIGN function_align +@@ -4389,9 +4389,9 @@ ALIGN function_align add tmp2q, %1*32 %endmacro @@ -230,7 +230,7 @@ index dda8234f..e77a6474 100644 tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r6+wq*4] -@@ -4420,9 +4420,9 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +@@ -4419,9 +4419,9 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 %define W_AVG_INC_PTR AVG_INC_PTR @@ -243,7 +243,7 @@ index dda8234f..e77a6474 100644 tzcnt wd, wm movifnidn hd, hm vpbroadcastw m4, r6m ; weight -@@ -4470,9 +4470,9 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +@@ -4469,9 +4469,9 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 add tmp1q, %1*32 %endmacro @@ -256,7 +256,7 @@ index dda8234f..e77a6474 100644 tzcnt wd, wm movifnidn hd, hm mov maskq, maskmp -@@ -4513,9 +4513,9 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 +@@ -4512,9 +4512,9 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 packuswb m%1, m1 %endmacro @@ -269,7 +269,7 @@ index dda8234f..e77a6474 100644 tzcnt wd, wm movifnidn hd, hm movifnidn maskq, maskmp -@@ -4630,15 +4630,15 @@ ALIGN function_align +@@ -4629,15 +4629,15 @@ ALIGN function_align jg .w32 RET @@ -289,7 +289,7 @@ index dda8234f..e77a6474 100644 jmp wq .w2: vpbroadcastd xm2, [maskq+2*2] -@@ -4741,9 +4741,9 @@ ALIGN function_align +@@ -4740,9 +4740,9 @@ ALIGN function_align jg .w32_loop RET @@ -302,7 +302,7 @@ index dda8234f..e77a6474 100644 mov r6d, wd tzcnt wd, wd mov hd, hm -@@ -4867,7 +4867,7 @@ ALIGN function_align +@@ -4866,7 +4866,7 @@ ALIGN function_align jl .w32_loop0 RET @@ -311,7 +311,7 @@ index dda8234f..e77a6474 100644 bottomext, rightext ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes -@@ -5054,7 +5054,7 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ +@@ -5053,7 +5053,7 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ .end: RET @@ -320,7 +320,7 @@ index dda8234f..e77a6474 100644 dst_w, h, src_w, dx, mx0 sub dword mx0m, 4<<14 sub dword src_wm, 8 -@@ -5192,9 +5192,9 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \ +@@ -5191,9 +5191,9 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \ jg .loop_y RET @@ -333,7 +333,7 @@ index dda8234f..e77a6474 100644 tzcnt wd, wm mov r6d, r7m ; sign movifnidn hd, hm -@@ -5398,9 +5398,9 @@ cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 +@@ -5397,9 +5397,9 @@ cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 jg .w128_loop RET @@ -346,7 +346,7 @@ index dda8234f..e77a6474 100644 tzcnt wd, wm mov r6d, r7m ; sign movifnidn hd, hm -@@ -5571,9 +5571,9 @@ cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +@@ -5570,9 +5570,9 @@ cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 jg .w128_loop RET @@ -1324,7 +1324,7 @@ index edbd1865..8b38daa7 100644 GitLab -From da299eb148a2a799411132166d32613d15586578 Mon Sep 17 00:00:00 2001 +From 10b12fff1df60f403a2eb1a7025b10679619f85e Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" <unlord@xiph.org> Date: Mon, 21 Dec 2020 00:38:05 -0500 Subject: [PATCH 2/7] x86: mc: Add AVX2 implementation of 8tap put/prep for @@ -2954,7 +2954,7 @@ index 00000000..ea6cfdbf GitLab -From 07a3064c9ebd0827177706e135f25cc8a6c25399 Mon Sep 17 00:00:00 2001 +From 9622bbfd5a5a0ef1ee94a1b28f1a9ab62e5f3031 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" <unlord@xiph.org> Date: Sat, 26 Dec 2020 21:38:58 -0500 Subject: [PATCH 3/7] Enable AVX2 8tap put/prep HBD assembly @@ -2965,15 +2965,13 @@ Subject: [PATCH 3/7] Enable AVX2 8tap put/prep HBD assembly 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/meson.build b/src/meson.build -index f9f5c120..ff62a9d8 100644 +index 25729217..e499da2f 100644 --- a/src/meson.build +++ b/src/meson.build -@@ -208,8 +208,9 @@ if is_asm_enabled - - if dav1d_bitdepths.contains('16') - libdav1d_sources_asm += files( +@@ -212,6 +212,7 @@ if is_asm_enabled 'x86/cdef16_avx2.asm', 'x86/cdef16_sse.asm', + 'x86/looprestoration16_avx2.asm', + 'x86/mc16_avx2.asm', ) endif @@ -3040,7 +3038,7 @@ index 468069c5..fcfed9be 100644 GitLab -From 68aa4049fe160b318f0f037b0f5514aefea8a69b Mon Sep 17 00:00:00 2001 +From 3f3572bc6c91a51223756ac2baaf6e7616d09b2d Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" <unlord@xiph.org> Date: Sat, 26 Dec 2020 06:21:44 -0500 Subject: [PATCH 4/7] x86: mc: Add AVX2 implementation of avg/w_avg/mask for @@ -3281,7 +3279,7 @@ index ea6cfdbf..7b4f9cdf 100644 GitLab -From 6ba57502ac82b00d1441a36d4e12814eafd37982 Mon Sep 17 00:00:00 2001 +From 2bb19d9fdffbfd43a89059a855352c7e5155df1c Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" <unlord@xiph.org> Date: Sun, 27 Dec 2020 04:11:21 -0500 Subject: [PATCH 5/7] Enable AVX2 avg/w_avg/mask HBD assembly @@ -3319,7 +3317,7 @@ index fcfed9be..70798047 100644 GitLab -From c18338526a06c14409333e7d6ed34ae60a6dff46 Mon Sep 17 00:00:00 2001 +From fefedec18cb7561549b1bf16d82ab89f74519020 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" <unlord@xiph.org> Date: Sun, 27 Dec 2020 17:25:54 -0500 Subject: [PATCH 6/7] x86: mc: Add AVX2 implementation of blend/blend_h/blend_v @@ -3744,7 +3742,7 @@ index 7b4f9cdf..ee56da2e 100644 GitLab -From 8770797232201d6e0e4106e799b6d76865feff77 Mon Sep 17 00:00:00 2001 +From 31784170aecfa8bf8d9d0cefe537fbec89212460 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" <unlord@xiph.org> Date: Sun, 27 Dec 2020 21:13:15 -0500 Subject: [PATCH 7/7] Enable AVX2 blend/blend_h/blend_v HBD assembly diff --git a/0002-1160.patch b/0002-1160.patch deleted file mode 100644 index 06f9b186c4ea..000000000000 --- a/0002-1160.patch +++ /dev/null @@ -1,1331 +0,0 @@ -From 541a62936532c3edd83edf6eb7ec83ab3e8bac5f Mon Sep 17 00:00:00 2001 -From: "Nathan E. Egge" <unlord@xiph.org> -Date: Tue, 29 Dec 2020 06:58:33 -0500 -Subject: [PATCH 1/3] Add bpc suffix to lr functions - ---- - src/x86/looprestoration.asm | 36 ++-- - src/x86/looprestoration_init_tmpl.c | 266 ++++++++++++++-------------- - src/x86/looprestoration_sse.asm | 60 +++---- - 3 files changed, 177 insertions(+), 185 deletions(-) - -diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm -index 44aaaf49..71e3e0d2 100644 ---- a/src/x86/looprestoration.asm -+++ b/src/x86/looprestoration.asm -@@ -88,8 +88,8 @@ SECTION .text - DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers - - INIT_YMM avx2 --cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, flt, h -+cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, flt, h - mov fltq, fltmp - mov edged, r8m - mov wd, wm -@@ -436,8 +436,8 @@ ALIGN function_align - add dstq, dst_strideq - ret - --cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, flt, h -+cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, flt, h - mov fltq, fltmp - mov edged, r8m - mov wd, wm -@@ -554,7 +554,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ - jnz .h_have_right - cmp r10d, -33 - jl .h_have_right -- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right -+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right - .h_have_right: - pshufb m0, m4, m6 - pmaddubsw m0, m12 -@@ -613,7 +613,7 @@ ALIGN function_align - jnz .hv_have_right - cmp r10d, -33 - jl .hv_have_right -- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right -+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right - .hv_have_right: - pshufb m0, m4, m6 - pmaddubsw m0, m12 -@@ -727,8 +727,8 @@ ALIGN function_align - jl .v_loop - ret - --cglobal sgr_filter_5x5, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, params, h -+cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, params, h - %define base r12-sgr_x_by_x-256*4 - lea r12, [sgr_x_by_x+256*4] - mov paramsq, paramsmp -@@ -1187,8 +1187,8 @@ ALIGN function_align - add dstq, dst_strideq - ret - --cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, params, h -+cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, params, h - %define base r14-sgr_x_by_x-256*4 - mov paramsq, paramsmp - mov edged, r8m -@@ -1298,7 +1298,7 @@ cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ - jnz .h_have_right - cmp r10d, -17 - jl .h_have_right -- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right -+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right - .h_have_right: - pshufb m0, m5, m8 - pmullw m2, m0, m0 -@@ -1346,7 +1346,7 @@ ALIGN function_align - jnz .hv_have_right - cmp r10d, -17 - jl .hv_have_right -- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right -+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right - .hv_have_right: - pshufb m0, m5, m8 - pmullw m3, m0, m0 -@@ -1546,8 +1546,8 @@ ALIGN function_align - add dstq, dst_strideq - ret - --cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, params, h -+cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, params, h - %define base r12-sgr_x_by_x-256*4 - lea r12, [sgr_x_by_x+256*4] - mov paramsq, paramsmp -@@ -1573,7 +1573,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ - call .h_top - add lpfq, lpf_strideq - mov t2, t1 -- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).top_fixup -+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup - add t1, 400*12 - call .h_top - lea r10, [lpfq+lpf_strideq*4] -@@ -1681,7 +1681,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ - jnz .h_have_right - cmp r10d, -18 - jl .h_have_right -- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right -+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right - .h_have_right: - pshufb m6, m5, m9 - pshufb m4, m5, m10 -@@ -1742,7 +1742,7 @@ ALIGN function_align - jnz .hv0_have_right - cmp r10d, -18 - jl .hv0_have_right -- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right -+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right - .hv0_have_right: - pshufb m6, m5, m9 - pshufb m4, m5, m10 -@@ -1853,7 +1853,7 @@ ALIGN function_align - jnz .hv1_have_right - cmp r10d, -18 - jl .hv1_have_right -- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right -+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right - .hv1_have_right: - pshufb m6, m5, m9 - pshufb m3, m5, m10 -diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c -index 57727787..a9aa5acf 100644 ---- a/src/x86/looprestoration_init_tmpl.c -+++ b/src/x86/looprestoration_init_tmpl.c -@@ -30,179 +30,171 @@ - - #include "common/intops.h" - --#define WIENER_FILTER(ext) \ --void dav1d_wiener_filter7_##ext(pixel *dst, ptrdiff_t dst_stride, \ -- const pixel (*left)[4], const pixel *lpf, \ -- ptrdiff_t lpf_stride, int w, int h, \ -- const LooprestorationParams *params, \ -- enum LrEdgeFlags edges); \ --void dav1d_wiener_filter5_##ext(pixel *dst, ptrdiff_t dst_stride, \ -- const pixel (*left)[4], const pixel *lpf, \ -- ptrdiff_t lpf_stride, int w, int h, \ -- const LooprestorationParams *params, \ -- enum LrEdgeFlags edges); -+#define decl_wiener_filter_fns(ext) \ -+decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \ -+decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext)) - --#define SGR_FILTER(ext) \ --void dav1d_sgr_filter_5x5_##ext(pixel *dst, ptrdiff_t dst_stride, \ -- const pixel (*left)[4], const pixel *lpf, \ -- ptrdiff_t lpf_stride, int w, int h, \ -- const LooprestorationParams *params, \ -- enum LrEdgeFlags edges); \ --void dav1d_sgr_filter_3x3_##ext(pixel *dst, ptrdiff_t dst_stride, \ -- const pixel (*left)[4], const pixel *lpf, \ -- ptrdiff_t lpf_stride, int w, int h, \ -- const LooprestorationParams *params, \ -- enum LrEdgeFlags edges); \ --void dav1d_sgr_filter_mix_##ext(pixel *dst, ptrdiff_t dst_stride, \ -- const pixel (*left)[4], const pixel *lpf, \ -- ptrdiff_t lpf_stride, int w, int h, \ -- const LooprestorationParams *params, \ -- enum LrEdgeFlags edges); -+#define decl_sgr_filter_fns(ext) \ -+void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \ -+ const pixel (*left)[4], const pixel *lpf, \ -+ ptrdiff_t lpf_stride, int w, int h, \ -+ const LooprestorationParams *params, \ -+ enum LrEdgeFlags edges); \ -+void BF(dav1d_sgr_filter_3x3, ext)(pixel *dst, ptrdiff_t dst_stride, \ -+ const pixel (*left)[4], const pixel *lpf, \ -+ ptrdiff_t lpf_stride, int w, int h, \ -+ const LooprestorationParams *params, \ -+ enum LrEdgeFlags edges); \ -+void BF(dav1d_sgr_filter_mix, ext)(pixel *dst, ptrdiff_t dst_stride, \ -+ const pixel (*left)[4], const pixel *lpf, \ -+ ptrdiff_t lpf_stride, int w, int h, \ -+ const LooprestorationParams *params, \ -+ enum LrEdgeFlags edges); - - /* FIXME: Replace with a port of the AVX2 code */ - #define SGR_FILTER_OLD(ext) \ --void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \ -- const pixel (*left)[4], \ -- const pixel *src, const ptrdiff_t stride, \ -- const int w, const int h, \ -- const enum LrEdgeFlags edges); \ --void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \ -- const int w, const int h, \ -- const enum LrEdgeFlags edges); \ --void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \ -- const int w, const int h, const unsigned s); \ --void dav1d_sgr_finish_filter1_##ext(coef *tmp, \ -- const pixel *src, const ptrdiff_t stride, \ -- const int32_t *a, const int16_t *b, \ -- const int w, const int h); \ -+void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \ -+ const pixel (*left)[4], \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const int w, const int h, \ -+ const enum LrEdgeFlags edges); \ -+void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \ -+ const int w, const int h, \ -+ const enum LrEdgeFlags edges); \ -+void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \ -+ const int w, const int h, const unsigned s); \ -+void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const int32_t *a, const int16_t *b, \ -+ const int w, const int h); \ - \ - /* filter with a 3x3 box (radius=1) */ \ --static void dav1d_sgr_filter1_##ext(coef *tmp, \ -- const pixel *src, const ptrdiff_t stride, \ -- const pixel (*left)[4], \ -- const pixel *lpf, const ptrdiff_t lpf_stride, \ -- const int w, const int h, const int strength, \ -- const enum LrEdgeFlags edges) \ -+static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const pixel (*left)[4], \ -+ const pixel *lpf, const ptrdiff_t lpf_stride, \ -+ const int w, const int h, const int strength, \ -+ const enum LrEdgeFlags edges) \ - { \ - ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ - int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ - ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ - int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ - \ -- dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ -+ BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \ - if (edges & LR_HAVE_TOP) \ -- dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ -- NULL, lpf, lpf_stride, w, 2, edges); \ -+ BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ -+ NULL, lpf, lpf_stride, w, 2, edges); \ - \ - if (edges & LR_HAVE_BOTTOM) \ -- dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ -- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ -- lpf_stride, w, 2, edges); \ -+ BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ -+ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ -+ lpf_stride, w, 2, edges); \ - \ -- dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \ -- dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \ -- dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \ -+ BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \ -+ BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \ -+ BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \ - } \ - \ --void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \ -- const pixel (*left)[4], \ -- const pixel *src, const ptrdiff_t stride, \ -- const int w, const int h, \ -- const enum LrEdgeFlags edges); \ --void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \ -- const int w, const int h, \ -- const enum LrEdgeFlags edges); \ --void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \ -- const int w, const int h, const int strength); \ --void dav1d_sgr_finish_filter2_##ext(coef *tmp, \ -- const pixel *src, const ptrdiff_t stride, \ -- const int32_t *a, const int16_t *b, \ -- const int w, const int h); \ -+void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \ -+ const pixel (*left)[4], \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const int w, const int h, \ -+ const enum LrEdgeFlags edges); \ -+void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \ -+ const int w, const int h, \ -+ const enum LrEdgeFlags edges); \ -+void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \ -+ const int w, const int h, const int strength); \ -+void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const int32_t *a, const int16_t *b, \ -+ const int w, const int h); \ - \ - /* filter with a 5x5 box (radius=2) */ \ --static void dav1d_sgr_filter2_##ext(coef *tmp, \ -- const pixel *src, const ptrdiff_t stride, \ -- const pixel (*left)[4], \ -- const pixel *lpf, const ptrdiff_t lpf_stride, \ -- const int w, const int h, const int strength, \ -- const enum LrEdgeFlags edges) \ -+static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const pixel (*left)[4], \ -+ const pixel *lpf, const ptrdiff_t lpf_stride, \ -+ const int w, const int h, const int strength, \ -+ const enum LrEdgeFlags edges) \ - { \ - ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ - int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ - ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ - int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ - \ -- dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ -+ BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \ - if (edges & LR_HAVE_TOP) \ -- dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ -- NULL, lpf, lpf_stride, w, 2, edges); \ -+ BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ -+ NULL, lpf, lpf_stride, w, 2, edges); \ - \ - if (edges & LR_HAVE_BOTTOM) \ -- dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ -- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ -- lpf_stride, w, 2, edges); \ -+ BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ -+ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ -+ lpf_stride, w, 2, edges); \ - \ -- dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \ -- dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \ -- dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \ -+ BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \ -+ BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \ -+ BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \ - } \ - \ --void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \ -- const coef *t1, const int w, const int h, \ -- const int wt); \ --void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \ -- const coef *t1, const coef *t2, \ -- const int w, const int h, \ -- const uint32_t wt); \ -+void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \ -+ const coef *t1, const int w, const int h, \ -+ const int wt); \ -+void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \ -+ const coef *t1, const coef *t2, \ -+ const int w, const int h, \ -+ const uint32_t wt); \ - \ --static void sgr_filter_5x5_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ -- const pixel (*const left)[4], \ -- const pixel *lpf, const ptrdiff_t lpf_stride, \ -- const int w, const int h, \ -- const LooprestorationParams *const params, \ -- const enum LrEdgeFlags edges) \ -+static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ -+ const pixel (*const left)[4], \ -+ const pixel *lpf, const ptrdiff_t lpf_stride, \ -+ const int w, const int h, \ -+ const LooprestorationParams *const params, \ -+ const enum LrEdgeFlags edges) \ - { \ - ALIGN_STK_32(coef, tmp, 64 * 384,); \ -- dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ -- w, h, params->sgr.s0, edges); \ -- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w0); \ -+ BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \ -+ w, h, params->sgr.s0, edges); \ -+ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \ - } \ --static void sgr_filter_3x3_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ -- const pixel (*const left)[4], \ -- const pixel *lpf, const ptrdiff_t lpf_stride, \ -- const int w, const int h, \ -- const LooprestorationParams *const params, \ -- const enum LrEdgeFlags edges) \ -+static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ -+ const pixel (*const left)[4], \ -+ const pixel *lpf, const ptrdiff_t lpf_stride, \ -+ const int w, const int h, \ -+ const LooprestorationParams *const params, \ -+ const enum LrEdgeFlags edges) \ - { \ - ALIGN_STK_32(coef, tmp, 64 * 384,); \ -- dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ -- w, h, params->sgr.s1, edges); \ -- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w1); \ -+ BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \ -+ w, h, params->sgr.s1, edges); \ -+ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \ - } \ --static void sgr_filter_mix_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ -- const pixel (*const left)[4], \ -- const pixel *lpf, const ptrdiff_t lpf_stride, \ -- const int w, const int h, \ -- const LooprestorationParams *const params, \ -- const enum LrEdgeFlags edges) \ -+static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ -+ const pixel (*const left)[4], \ -+ const pixel *lpf, const ptrdiff_t lpf_stride, \ -+ const int w, const int h, \ -+ const LooprestorationParams *const params, \ -+ const enum LrEdgeFlags edges) \ - { \ - ALIGN_STK_32(coef, tmp1, 64 * 384,); \ - ALIGN_STK_32(coef, tmp2, 64 * 384,); \ -- dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ -- w, h, params->sgr.s0, edges); \ -- dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ -- w, h, params->sgr.s1, edges); \ -+ BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ -+ w, h, params->sgr.s0, edges); \ -+ BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ -+ w, h, params->sgr.s1, edges); \ - const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \ -- dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \ -+ BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \ - } - - #if BITDEPTH == 8 --WIENER_FILTER(sse2) --WIENER_FILTER(ssse3) -+decl_wiener_filter_fns(sse2); -+decl_wiener_filter_fns(ssse3); - SGR_FILTER_OLD(ssse3) - # if ARCH_X86_64 --WIENER_FILTER(avx2) --SGR_FILTER(avx2) -+decl_wiener_filter_fns(avx2); -+decl_sgr_filter_fns(avx2) - # endif - #endif - -@@ -211,25 +203,25 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont - - if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; - #if BITDEPTH == 8 -- c->wiener[0] = dav1d_wiener_filter7_sse2; -- c->wiener[1] = dav1d_wiener_filter5_sse2; -+ c->wiener[0] = BF(dav1d_wiener_filter7, sse2); -+ c->wiener[1] = BF(dav1d_wiener_filter5, sse2); - #endif - - if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; - #if BITDEPTH == 8 -- c->wiener[0] = dav1d_wiener_filter7_ssse3; -- c->wiener[1] = dav1d_wiener_filter5_ssse3; -- c->sgr[0] = sgr_filter_5x5_ssse3; -- c->sgr[1] = sgr_filter_3x3_ssse3; -- c->sgr[2] = sgr_filter_mix_ssse3; -+ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3); -+ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3); -+ c->sgr[0] = BF(sgr_filter_5x5, ssse3); -+ c->sgr[1] = BF(sgr_filter_3x3, ssse3); -+ c->sgr[2] = BF(sgr_filter_mix, ssse3); - #endif - - if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; - #if BITDEPTH == 8 && ARCH_X86_64 -- c->wiener[0] = dav1d_wiener_filter7_avx2; -- c->wiener[1] = dav1d_wiener_filter5_avx2; -- c->sgr[0] = dav1d_sgr_filter_5x5_avx2; -- c->sgr[1] = dav1d_sgr_filter_3x3_avx2; -- c->sgr[2] = dav1d_sgr_filter_mix_avx2; -+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2); -+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2); -+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2); -+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2); -+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2); - #endif - } -diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm -index 5d3ca492..4b77138d 100644 ---- a/src/x86/looprestoration_sse.asm -+++ b/src/x86/looprestoration_sse.asm -@@ -97,8 +97,8 @@ SECTION .text - %macro WIENER 0 - %if ARCH_X86_64 - DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers --cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, flt, h, x -+cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, flt, h, x - %define base 0 - mov fltq, fltmp - mov edged, r8m -@@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5 - %define m11 [stk+96] - %define stk_off 112 - %endif --cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride -+cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride - %define base r6-pb_right_ext_mask-21 - %define stk esp - %define dstq leftq -@@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride - add lpfq, [rsp+gprsize*1] - call .hv_bottom - .v1: -- call mangle(private_prefix %+ _wiener_filter7_ssse3).v -+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v - RET - .no_top: - lea t3, [lpfq+lpf_strideq*4] -@@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride - dec hd - jnz .main - .v3: -- call mangle(private_prefix %+ _wiener_filter7_ssse3).v -+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v - .v2: -- call mangle(private_prefix %+ _wiener_filter7_ssse3).v -+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v - jmp .v1 - .extend_right: - movd m2, [lpfq-4] -@@ -685,8 +685,8 @@ ALIGN function_align - %endif - - %if ARCH_X86_64 --cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, flt, h, x -+cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, flt, h, x - mov fltq, fltmp - mov edged, r8m - mov wd, wm -@@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ - %define m11 [stk+80] - %define stk_off 96 - %endif --cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride -+cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride - %define stk esp - %define leftmp [stk+28] - %define m8 [base+pw_m16380] -@@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride - dec hd - jnz .main - .v2: -- call mangle(private_prefix %+ _wiener_filter5_ssse3).v -+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v - add dstq, dst_strideq - mov t4, t3 - mov t3, t2 - mov t2, t1 - movifnidn dstmp, dstq - .v1: -- call mangle(private_prefix %+ _wiener_filter5_ssse3).v -+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v - jmp .end - .h: - %define stk esp+4 -@@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride - jnz .h_have_right - cmp xd, -17 - jl .h_have_right -- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right -+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right - .h_have_right: - %macro %%h5 0 - %if cpuflag(ssse3) -@@ -991,7 +991,7 @@ ALIGN function_align - jnz .hv_have_right - cmp xd, -17 - jl .hv_have_right -- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right -+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right - .hv_have_right: - %%h5 - mova m2, [t3+xq*2] -@@ -1161,7 +1161,7 @@ WIENER - %endmacro - - %if ARCH_X86_64 --cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim -+cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim - mov xlimd, edgem - movifnidn xd, xm - mov hd, hm -@@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim - add xd, xlimd - xor xlimd, 2 ; 2*!have_right - %else --cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim -+cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim - %define wq r0m - %define xlimd r1m - %define hd hmp -@@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim - RET - - %if ARCH_X86_64 --cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim -+cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim - movifnidn edged, edgem - %else --cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y -+cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y - %define sumsq_baseq dword [esp+0] - %define sum_baseq dword [esp+4] - %define ylimd dword [esp+8] -@@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y - jl .loop_x - RET - --cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s -+cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s - movifnidn sd, sm - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 -@@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s - RET - - %if ARCH_X86_64 --cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ -- tmp_base, src_base, a_base, b_base, x, y -+cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \ -+ tmp_base, src_base, a_base, b_base, x, y - movifnidn wd, wm - mov hd, hm - mova m15, [pw_16] -@@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ - mov b_baseq, bq - xor xd, xd - %else --cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y -+cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y - %define tmp_baseq [esp+8] - %define src_baseq [esp+12] - %define a_baseq [esp+16] -@@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y - jl .loop_x - RET - --cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt -+cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt - movifnidn hd, hm - %if ARCH_X86_32 - SETUP_PIC r6, 0 -@@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt - RET - - %if ARCH_X86_64 --cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim -+cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov edged, edgem - movifnidn wd, wm - mov hd, hm - mova m10, [pb_0] - mova m11, [pb_0_1] - %else --cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge -+cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge - %define edgeb byte edgem - %define wd xd - %define wq wd -@@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge - RET - - %if ARCH_X86_64 --cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim -+cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim - movifnidn edged, edgem - mov ylimd, edged - %else --cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr -+cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr - %define wm [esp+0] - %define hm [esp+4] - %define edgem [esp+8] -@@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr - jmp .sum_loop_y_noload - %endif - --cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s -+cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s - movifnidn sd, sm - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 -@@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s - RET - - %if ARCH_X86_64 --cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ -+cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \ - tmp_base, src_base, a_base, b_base, x, y - movifnidn wd, wm - mov hd, hm -@@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ - psrlw m11, m12, 1 ; pw_128 - pxor m13, m13 - %else --cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y -+cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y - %define tmp_baseq r0m - %define src_baseq r1m - %define a_baseq r3m -@@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y - RET - - %undef t2 --cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt -+cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt - movifnidn wd, wm - movd m0, wtm - %if ARCH_X86_64 --- -GitLab - - -From 1a4489861e55f0e4d70df60ecf15559dfda70aee Mon Sep 17 00:00:00 2001 -From: "Nathan E. Egge" <unlord@xiph.org> -Date: Sun, 10 Jan 2021 14:12:10 -0500 -Subject: [PATCH 2/3] x86: lr: Add AVX2 implementation of wiener filter for 16 - bpc - -Relative speed-ups over C code (compared with gcc-9.3.0): - - C AVX2 -wiener_5tap_10bpc: 194892.0 14831.9 13.14x -wiener_5tap_12bpc: 194295.4 14828.9 13.10x -wiener_7tap_10bpc: 194391.7 19461.4 9.99x -wiener_7tap_12bpc: 194136.1 19418.7 10.00x ---- - src/x86/looprestoration16_avx2.asm | 480 +++++++++++++++++++++++++++++ - 1 file changed, 480 insertions(+) - create mode 100644 src/x86/looprestoration16_avx2.asm - -diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm -new file mode 100644 -index 00000000..4eb1b805 ---- /dev/null -+++ b/src/x86/looprestoration16_avx2.asm -@@ -0,0 +1,480 @@ -+; Copyright (c) 2017-2021, The rav1e contributors -+; Copyright (c) 2021, Nathan Egge -+; All rights reserved. -+; -+; Redistribution and use in source and binary forms, with or without -+; modification, are permitted provided that the following conditions are met: -+; -+; 1. Redistributions of source code must retain the above copyright notice, this -+; list of conditions and the following disclaimer. -+; -+; 2. Redistributions in binary form must reproduce the above copyright notice, -+; this list of conditions and the following disclaimer in the documentation -+; and/or other materials provided with the distribution. -+; -+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+%include "config.asm" -+%include "ext/x86/x86inc.asm" -+ -+%if ARCH_X86_64 -+ -+SECTION_RODATA 32 -+ -+wiener5_shufB: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 -+wiener5_shufC: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13 -+wiener5_shufD: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1 -+wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -+pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 -+ -+wiener7_shufC: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9 -+wiener7_shufD: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 -+wiener7_shufE: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1 -+rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 -+rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 -+wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 -+ -+pq_3: dq (6 - 4) + 1 -+pq_5: dq (6 - 2) + 1 -+pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4)) -+pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2)) -+ -+pq_11: dq 12 - (6 - 4) + 1 -+pq_9: dq 12 - (6 - 2) + 1 -+nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8)) -+nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8)) -+ -+pb_wiener5_l: times 2 db 2, 3 -+pb_wiener5_r: times 2 db -6, -5 -+ -+pb_wiener7_l: times 2 db 4, 5 -+pb_wiener7_m: times 2 db -4, -3 -+pb_wiener7_r: times 2 db -8, -7 -+ -+SECTION .text -+ -+INIT_YMM avx2 -+cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax -+ movifnidn wd, wm -+ movifnidn hd, hm -+ movifnidn edgeb, edgem -+ vbroadcasti128 m6, [wiener5_shufB] -+ vpbroadcastd m12, [fq + 2] -+ vbroadcasti128 m7, [wiener5_shufC] -+ vpbroadcastw m13, [fq + 6] -+ vbroadcasti128 m8, [wiener5_shufD] -+ popcnt bdmaxd, bdmaxm -+ vpbroadcastd m9, [pd_65540] -+ movq xm10, [pq_3] -+ cmp bdmaxd, 10 -+ je .bits10 -+ vpbroadcastd m9, [pd_262160] -+ movq xm10, [pq_5] -+.bits10: -+ pxor m11, m11 -+ add wq, wq -+ add srcq, wq -+ add dstq, wq -+ neg wq -+ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x -+.v_loop: -+ mov xq, wq -+ test edgeb, 1 ; LR_HAVE_LEFT -+ jz .h_extend_left -+ test leftq, leftq -+ jz .h_loop -+ movd xm4, [leftq + 4] -+ vpblendd m4, [srcq + xq - 4], 0xfe -+ add leftq, 8 -+ jmp .h_main -+.h_extend_left: -+ vbroadcasti128 m5, [srcq + xq] -+ mova m4, [srcq + xq] -+ palignr m4, m5, 12 -+ pshufb m4, [wiener5_l_shuf] -+ jmp .h_main -+.h_loop: -+ movu m4, [srcq + xq - 4] -+.h_main: -+ movu m5, [srcq + xq + 4] -+ test edgeb, 2 ; LR_HAVE_RIGHT -+ jnz .h_have_right -+ cmp xd, -36 -+ jl .h_have_right -+ movd xm2, xd -+ vpbroadcastd m0, [pb_wiener5_l] -+ vpbroadcastd m1, [pb_wiener5_r] -+ vpbroadcastb m2, xm2 -+ movu m3, [pb_0to31] -+ psubb m0, m2 -+ psubb m1, m2 -+ pminub m0, m3 -+ pminub m1, m3 -+ pshufb m4, m0 -+ pshufb m5, m1 -+.h_have_right: -+ pshufb m0, m4, m6 -+ pshufb m2, m4, m7 -+ paddw m0, m2 -+ pmaddwd m0, m12 -+ pshufb m1, m5, m6 -+ pshufb m3, m5, m7 -+ paddw m1, m3 -+ pmaddwd m1, m12 -+ pshufb m4, m8 -+ pmaddwd m4, m13 -+ pshufb m5, m8 -+ pmaddwd m5, m13 -+ paddd m0, m4 -+ paddd m1, m5 -+ paddd m0, m9 -+ paddd m1, m9 -+ psrad m0, xm10 -+ psrad m1, xm10 -+ packssdw m0, m1 -+ pmaxsw m0, m11 -+ mova [dstq + xq], m0 -+ add xq, 32 -+ jl .h_loop -+ add srcq, ssq -+ add dstq, 384*2 -+ dec hd -+ jg .v_loop -+ RET -+ -+DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14 -+ -+INIT_YMM avx2 -+cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax -+ movifnidn wd, wm -+ movifnidn hd, hm -+ movifnidn edgeb, edgem -+ pxor m6, m6 -+ vpbroadcastd m7, [fq + 2] -+ vpbroadcastd m8, [fq + 6] -+ popcnt bdmaxd, bdmaxm -+ vpbroadcastd m9, [nd_1047552] -+ movq xm10, [pq_11] -+ cmp bdmaxd, 10 -+ je .bits10 -+ vpbroadcastd m9, [nd_1048320] -+ movq xm10, [pq_9] -+.bits10: -+ vpbroadcastw m11, bdmaxm -+ add wq, wq -+ add midq, wq -+ add dstq, wq -+ neg wq -+ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x -+ mov msq, 2*384 -+ mov t0, midq -+ lea t1, [t0 + msq] -+ lea t2, [t1 + msq] -+ lea t3, [t2 + msq] -+ lea t4, [t3 + msq] -+ test edgeb, 4 ; LR_HAVE_TOP -+ jnz .have_top -+ mov t0, t2 -+ mov t1, t2 -+.have_top: -+ test edgeb, 8 ; LR_HAVE_BOTTOM -+ jnz .v_loop -+ cmp hd, 2 -+ jg .v_loop -+ cmp hd, 1 -+ jne .limit_v -+ mov t3, t2 -+.limit_v: -+ mov t4, t3 -+.v_loop: -+ mov xq, wq -+.h_loop: -+ mova m1, [t0 + xq] -+ mova m2, [t1 + xq] -+ mova m3, [t2 + xq] -+ mova m4, [t3 + xq] -+ mova m5, [t4 + xq] -+ punpcklwd m0, m1, m2 -+ pmaddwd m0, m7 -+ punpckhwd m1, m2 -+ pmaddwd m1, m7 -+ punpcklwd m2, m5, m4 -+ pmaddwd m2, m7 -+ punpckhwd m5, m4 -+ pmaddwd m5, m7 -+ paddd m0, m2 -+ paddd m1, m5 -+ punpcklwd m2, m3, m6 -+ pmaddwd m2, m8 -+ punpckhwd m3, m6 -+ pmaddwd m3, m8 -+ paddd m0, m2 -+ paddd m1, m3 -+ paddd m0, m9 -+ paddd m1, m9 -+ psrad m0, xm10 -+ psrad m1, xm10 -+ packusdw m0, m1 -+ pminuw m0, m11 -+ mova [dstq + xq], m0 -+ add xq, 32 -+ jl .h_loop -+ add dstq, dsq -+ mov t0, t1 -+ mov t1, t2 -+ mov t2, t3 -+ mov t3, t4 -+ add t4, msq -+ test edgeb, 8 ; LR_HAVE_BOTTOM -+ jnz .have_bottom -+ cmp hd, 3 -+ jg .have_bottom -+ mov t4, t3 -+.have_bottom: -+ dec hd -+ jg .v_loop -+ RET -+ -+INIT_YMM avx2 -+cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh -+ movifnidn wd, wm -+ movifnidn hd, hm -+ movifnidn edgeb, edgem -+ vpbroadcastd m7, [fq] -+ vpbroadcastd m8, [fq + 4] -+ vbroadcasti128 m10, [rev_w] -+ vbroadcasti128 m11, [wiener5_shufB] -+ vbroadcasti128 m12, [wiener7_shufC] -+ vbroadcasti128 m13, [wiener7_shufD] -+ vbroadcasti128 m14, [wiener7_shufE] -+ vbroadcasti128 m15, [rev_d] -+ popcnt bdmaxd, bdmaxm -+ vpbroadcastd m9, [pd_65540] -+ mov rhq, [pq_3] -+ cmp bdmaxd, 10 -+ je .bits10 -+ vpbroadcastd m9, [pd_262160] -+ mov rhq, [pq_5] -+.bits10: -+ add wq, wq -+ add srcq, wq -+ add dstq, wq -+ neg wq -+ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh -+.v_loop: -+ mov xq, wq -+ test edgeb, 1 ; LR_HAVE_LEFT -+ jz .h_extend_left -+ test leftq, leftq -+ jz .h_loop -+ movq xm4, [leftq + 2] -+ vpblendw xm4, [srcq + xq - 6], 0xf8 -+ vinserti128 m4, [srcq + xq + 10], 1 -+ add leftq, 8 -+ jmp .h_main -+.h_extend_left: -+ vbroadcasti128 m5, [srcq + xq] -+ mova m4, [srcq + xq] -+ palignr m4, m5, 10 -+ pshufb m4, [wiener7_l_shuf] -+ jmp .h_main -+.h_loop: -+ movu m4, [srcq + xq - 6] -+.h_main: -+ movu m5, [srcq + xq + 2] -+ movu m6, [srcq + xq + 6] -+ test edgeb, 2 ; LR_HAVE_RIGHT -+ jnz .h_have_right -+ cmp xd, -38 -+ jl .h_have_right -+ movd xm3, xd -+ vpbroadcastd m0, [pb_wiener7_l] -+ vpbroadcastd m1, [pb_wiener7_m] -+ vpbroadcastd m2, [pb_wiener7_r] -+ vpbroadcastb m3, xm3 -+ psubb m0, m3 -+ psubb m1, m3 -+ psubb m2, m3 -+ movu m3, [pb_0to31] -+ pminub m0, m3 -+ pminub m1, m3 -+ pminub m2, m3 -+ pshufb m4, m0 -+ pshufb m5, m1 -+ pshufb m6, m2 -+ cmp xd, -9*2 -+ jne .hack -+ vpbroadcastw xm3, [srcq + xq + 16] -+ vinserti128 m5, xm3, 1 -+ jmp .h_have_right -+.hack: -+ cmp xd, -1*2 -+ jne .h_have_right -+ vpbroadcastw xm5, [srcq + xq] -+.h_have_right: -+ pshufb m6, m10 -+ pshufb m0, m4, m11 -+ pshufb m2, m5, m12 -+ paddw m0, m2 -+ pmaddwd m0, m7 -+ pshufb m2, m4, m13 -+ pshufb m4, m14 -+ paddw m2, m4 -+ pmaddwd m2, m8 -+ pshufb m1, m6, m11 -+ pshufb m5, m11 -+ pmaddwd m1, m7 -+ pmaddwd m5, m7 -+ pshufb m3, m6, m13 -+ pshufb m6, m14 -+ paddw m3, m6 -+ pmaddwd m3, m8 -+ paddd m0, m2 -+ paddd m1, m3 -+ pshufb m1, m15 -+ paddd m1, m5 -+ movq xm4, rhq -+ pxor m5, m5 -+ paddd m0, m9 -+ paddd m1, m9 -+ psrad m0, xm4 -+ psrad m1, xm4 -+ packssdw m0, m1 -+ pmaxsw m0, m5 -+ mova [dstq + xq], m0 -+ add xq, 32 -+ jl .h_loop -+ add srcq, ssq -+ add dstq, 384*2 -+ dec hd -+ jg .v_loop -+ RET -+ -+INIT_YMM avx2 -+cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax -+ movifnidn wd, wm -+ movifnidn hd, hm -+ movifnidn edgeb, edgem -+ pxor m6, m6 -+ vpbroadcastd m7, [fq] -+ vpbroadcastw m8, [fq + 4] -+ vpbroadcastd m9, [fq + 6] -+ popcnt bdmaxd, bdmaxm -+ vpbroadcastd m10, [nd_1047552] -+ movq xm11, [pq_11] -+ cmp bdmaxd, 10 -+ je .bits10 -+ vpbroadcastd m10, [nd_1048320] -+ movq xm11, [pq_9] -+.bits10: -+ vpbroadcastw m12, bdmaxm -+ add wq, wq -+ add midq, wq -+ add dstq, wq -+ neg wq -+ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x -+ mov msq, 2*384 -+ mov t0, midq -+ mov t1, t0 -+ lea t2, [t1 + msq] -+ lea t3, [t2 + msq] -+ lea t4, [t3 + msq] -+ lea t5, [t4 + msq] -+ lea t6, [t5 + msq] -+ test edgeb, 4 ; LR_HAVE_TOP -+ jnz .have_top -+ mov t0, t3 -+ mov t1, t3 -+ mov t2, t3 -+.have_top: -+ cmp hd, 3 -+ jg .v_loop -+ test edgeb, 8 ; LR_HAVE_BOTTOM -+ jz .no_bottom0 -+ cmp hd, 1 -+ jg .v_loop -+ jmp .h3 -+.no_bottom0: -+ cmp hd, 2 -+ je .h2 -+ jns .h3 -+.h1: -+ mov t4, t3 -+.h2: -+ mov t5, t4 -+.h3: -+ mov t6, t5 -+.v_loop: -+ mov xq, wq -+.h_loop: -+ mova m1, [t0 + xq] -+ mova m2, [t1 + xq] -+ mova m3, [t5 + xq] -+ mova m4, [t6 + xq] -+ punpcklwd m0, m1, m2 -+ pmaddwd m0, m7 -+ punpckhwd m1, m2 -+ pmaddwd m1, m7 -+ punpcklwd m2, m4, m3 -+ pmaddwd m2, m7 -+ punpckhwd m4, m3 -+ pmaddwd m4, m7 -+ paddd m0, m2 -+ paddd m1, m4 -+ mova m3, [t2 + xq] -+ mova m4, [t4 + xq] -+ punpcklwd m2, m3, m4 -+ pmaddwd m2, m8 -+ punpckhwd m3, m4 -+ pmaddwd m3, m8 -+ paddd m0, m2 -+ paddd m1, m3 -+ mova m3, [t3 + xq] -+ punpcklwd m2, m3, m6 -+ pmaddwd m2, m9 -+ punpckhwd m3, m6 -+ pmaddwd m3, m9 -+ paddd m0, m2 -+ paddd m1, m3 -+ paddd m0, m10 -+ paddd m1, m10 -+ psrad m0, xm11 -+ psrad m1, xm11 -+ packusdw m0, m1 -+ pminuw m0, m12 -+ mova [dstq + xq], m0 -+ add xq, 32 -+ jl .h_loop -+ add dstq, dsq -+ mov t0, t1 -+ mov t1, t2 -+ mov t2, t3 -+ mov t3, t4 -+ mov t4, t5 -+ mov t5, t6 -+ add t6, msq -+ cmp hd, 4 -+ jg .next_row -+ test edgeb, 8 ; LR_HAVE_BOTTOM -+ jz .no_bottom -+ cmp hd, 2 -+ jg .next_row -+.no_bottom: -+ mov t6, t5 -+.next_row: -+ dec hd -+ jg .v_loop -+ RET -+ -+%endif ; ARCH_X86_64 --- -GitLab - - -From 2ce581302a1536559aa5e56018a03ac6a3770c0f Mon Sep 17 00:00:00 2001 -From: "Nathan E. Egge" <unlord@xiph.org> -Date: Wed, 13 Jan 2021 14:54:42 -0500 -Subject: [PATCH 3/3] Enable AVX2 wiener filter HBD assembly - ---- - src/meson.build | 1 + - src/x86/looprestoration_init_tmpl.c | 40 +++++++++++++++++++++++++++-- - 2 files changed, 39 insertions(+), 2 deletions(-) - -diff --git a/src/meson.build b/src/meson.build -index 27946501..25729217 100644 ---- a/src/meson.build -+++ b/src/meson.build -@@ -211,6 +211,7 @@ if is_asm_enabled - libdav1d_sources_asm += files( - 'x86/cdef16_avx2.asm', - 'x86/cdef16_sse.asm', -+ 'x86/looprestoration16_avx2.asm', - 'x86/mc16_avx2.asm', - ) - endif - -diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c -index a9aa5acf..52de0faf 100644 ---- a/src/x86/looprestoration_init_tmpl.c -+++ b/src/x86/looprestoration_init_tmpl.c -@@ -30,9 +30,40 @@ - - #include "common/intops.h" - -+#if BITDEPTH != 8 -+#define decl_wiener_filter_fn(name, ext) \ -+void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \ -+ ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \ -+ int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \ -+void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \ -+ const int16_t fv[7], int w, int h, \ -+ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \ -+static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ -+ const pixel (*const left)[4], \ -+ const pixel *lpf, const ptrdiff_t lpf_stride, \ -+ const int w, const int h, const LooprestorationParams *params, \ -+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \ -+ ALIGN_STK_64(int16_t, mid, 68 * 384,); \ -+ BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, params->filter[0], w, h, \ -+ edges HIGHBD_TAIL_SUFFIX); \ -+ if (edges & LR_HAVE_TOP) { \ -+ BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, params->filter[0], w, 2, \ -+ edges HIGHBD_TAIL_SUFFIX); \ -+ } \ -+ if (edges & LR_HAVE_BOTTOM) { \ -+ BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \ -+ lpf_stride, params->filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \ -+ } \ -+ BF(name##_v, ext)(dst, dst_stride, mid, params->filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \ -+} -+#define decl_wiener_filter_fns(ext) \ -+decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \ -+decl_wiener_filter_fn(dav1d_wiener_filter5, ext) -+#else - #define decl_wiener_filter_fns(ext) \ - decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \ - decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext)) -+#endif - - #define decl_sgr_filter_fns(ext) \ - void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \ -@@ -193,11 +224,14 @@ decl_wiener_filter_fns(sse2); - decl_wiener_filter_fns(ssse3); - SGR_FILTER_OLD(ssse3) - # if ARCH_X86_64 --decl_wiener_filter_fns(avx2); - decl_sgr_filter_fns(avx2) - # endif - #endif - -+#if ARCH_X86_64 -+decl_wiener_filter_fns(avx2); -+#endif -+ - COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) { - const unsigned flags = dav1d_get_cpu_flags(); - -@@ -217,11 +251,13 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont - #endif - - if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; --#if BITDEPTH == 8 && ARCH_X86_64 -+#if ARCH_X86_64 - c->wiener[0] = BF(dav1d_wiener_filter7, avx2); - c->wiener[1] = BF(dav1d_wiener_filter5, avx2); -+# if BITDEPTH == 8 - c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2); - c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2); - c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2); -+# endif - #endif - } --- -GitLab - @@ -3,7 +3,7 @@ _testvideo=Sparks-5994fps-AV1-10bit-1920x1080-film-grain-synthesis-2013kbps.obu pkgname=dav1d-git-optimized -pkgver=r1617.baa9237 +pkgver=r1627.f06148e pkgrel=1 license=('BSD') pkgdesc='AV1 cross-platform Decoder, focused on speed and correctness -- latest git version compiled with optimizations' @@ -14,12 +14,10 @@ conflicts=('dav1d' 'dav1d-git') makedepends=('meson' 'git' 'nasm') source=('git+https://code.videolan.org/videolan/dav1d.git' "http://download.opencontent.netflix.com.s3.amazonaws.com/AV1/Sparks/$_testvideo" - '0001-1112.patch' - '0002-1160.patch') + '0001-1112.patch') sha256sums=('SKIP' 'e56e20de5bfad7ec073d3e53ea6c760d0b11ed143f087b1bc32230e4840fea87' - '83807b996384f147cea3702a1a7fcd4accfc04c3937fea11d0f74b615c37f8d2' - 'de289262c9d4e1964e7b9130a5619c6501e82a074794ced6d7da7922630973f3') + 'SKIP') pkgver () { cd dav1d @@ -29,7 +27,6 @@ pkgver () { prepare () { cd dav1d patch -Np1 -i ${srcdir}/0001-1112.patch - patch -Np1 -i ${srcdir}/0002-1160.patch } build () { |