author     Matej Dian    2021-02-17 19:38:06 +0100
committer  Matej Dian    2021-02-17 19:38:06 +0100
commit     0cf5a0c19b7a52ab1d3426a724d6c8f0c863118f (patch)
tree       d36d578406412127128ddc34f356062274d907e7
parent     c8557275c5b4b6acbd1c963c61fd3518005975d5 (diff)
download   aur-0cf5a0c19b7a52ab1d3426a724d6c8f0c863118f.tar.gz
update patches, misc changes, aur is retarded
-rw-r--r--  0001-1112.patch         4
-rw-r--r--  0002-1160.patch      1331
-rw-r--r--  0002-wiener_2.patch   661
-rw-r--r--  0003-wiener_3.patch   492
-rw-r--r--  0004-wiener_4.patch   101
-rw-r--r--  PKGBUILD               21

6 files changed, 1341 insertions, 1269 deletions
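
The new 0002-1160.patch below renames every 8 bpc loop-restoration assembly entry point (wiener_filter7 becomes wiener_filter7_8bpc, and so on) and switches the C init template over to a BF() bitdepth-suffix macro, so the 16 bpc AVX2 Wiener filter added later in the same patch can coexist with the 8 bpc code. The BF() definition itself is not part of this diff; as a rough sketch of how such a macro behaves in dav1d's per-bitdepth template builds (an assumption, shown for illustration only):

    #if BITDEPTH == 8
    #define BF(name, suffix) name##_8bpc_##suffix   /* 8 bpc template */
    #else
    #define BF(name, suffix) name##_16bpc_##suffix  /* 16 bpc template */
    #endif

    /* Example expansion in the 8 bpc build:
     *   BF(dav1d_wiener_filter7, avx2) -> dav1d_wiener_filter7_8bpc_avx2
     * which is the symbol produced by the renamed
     * "cglobal wiener_filter7_8bpc" label under private_prefix dav1d. */
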
diff --git a/0001-1112.patch b/0001-1112.patch
index e5ed26b240a9..b60c79a4a53a 100644
--- a/0001-1112.patch
+++ b/0001-1112.patch
@@ -2968,10 +2968,12 @@ diff --git a/src/meson.build b/src/meson.build
index f9f5c120..ff62a9d8 100644
--- a/src/meson.build
+++ b/src/meson.build
-@@ -208,6 +208,7 @@ if is_asm_enabled
+@@ -208,8 +208,9 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
+ 'x86/cdef16_avx2.asm',
+ 'x86/cdef16_sse.asm',
+ 'x86/mc16_avx2.asm',
)
endif
diff --git a/0002-1160.patch b/0002-1160.patch
new file mode 100644
index 000000000000..06f9b186c4ea
--- /dev/null
+++ b/0002-1160.patch
@@ -0,0 +1,1331 @@
+From 541a62936532c3edd83edf6eb7ec83ab3e8bac5f Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Tue, 29 Dec 2020 06:58:33 -0500
+Subject: [PATCH 1/3] Add bpc suffix to lr functions
+
+---
+ src/x86/looprestoration.asm | 36 ++--
+ src/x86/looprestoration_init_tmpl.c | 266 ++++++++++++++--------------
+ src/x86/looprestoration_sse.asm | 60 +++----
+ 3 files changed, 177 insertions(+), 185 deletions(-)
+
+diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm
+index 44aaaf49..71e3e0d2 100644
+--- a/src/x86/looprestoration.asm
++++ b/src/x86/looprestoration.asm
+@@ -88,8 +88,8 @@ SECTION .text
+ DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers
+
+ INIT_YMM avx2
+-cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, flt, h
++cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, flt, h
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+@@ -436,8 +436,8 @@ ALIGN function_align
+ add dstq, dst_strideq
+ ret
+
+-cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, flt, h
++cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, flt, h
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+@@ -554,7 +554,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+ jnz .h_have_right
+ cmp r10d, -33
+ jl .h_have_right
+- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
++ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
+ .h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+@@ -613,7 +613,7 @@ ALIGN function_align
+ jnz .hv_have_right
+ cmp r10d, -33
+ jl .hv_have_right
+- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
++ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
+ .hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+@@ -727,8 +727,8 @@ ALIGN function_align
+ jl .v_loop
+ ret
+
+-cglobal sgr_filter_5x5, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, params, h
++cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, params, h
+ %define base r12-sgr_x_by_x-256*4
+ lea r12, [sgr_x_by_x+256*4]
+ mov paramsq, paramsmp
+@@ -1187,8 +1187,8 @@ ALIGN function_align
+ add dstq, dst_strideq
+ ret
+
+-cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, params, h
++cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, params, h
+ %define base r14-sgr_x_by_x-256*4
+ mov paramsq, paramsmp
+ mov edged, r8m
+@@ -1298,7 +1298,7 @@ cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
+ jnz .h_have_right
+ cmp r10d, -17
+ jl .h_have_right
+- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+ .h_have_right:
+ pshufb m0, m5, m8
+ pmullw m2, m0, m0
+@@ -1346,7 +1346,7 @@ ALIGN function_align
+ jnz .hv_have_right
+ cmp r10d, -17
+ jl .hv_have_right
+- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+ .hv_have_right:
+ pshufb m0, m5, m8
+ pmullw m3, m0, m0
+@@ -1546,8 +1546,8 @@ ALIGN function_align
+ add dstq, dst_strideq
+ ret
+
+-cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, params, h
++cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, params, h
+ %define base r12-sgr_x_by_x-256*4
+ lea r12, [sgr_x_by_x+256*4]
+ mov paramsq, paramsmp
+@@ -1573,7 +1573,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
+ call .h_top
+ add lpfq, lpf_strideq
+ mov t2, t1
+- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).top_fixup
++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
+ add t1, 400*12
+ call .h_top
+ lea r10, [lpfq+lpf_strideq*4]
+@@ -1681,7 +1681,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
+ jnz .h_have_right
+ cmp r10d, -18
+ jl .h_have_right
+- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+ .h_have_right:
+ pshufb m6, m5, m9
+ pshufb m4, m5, m10
+@@ -1742,7 +1742,7 @@ ALIGN function_align
+ jnz .hv0_have_right
+ cmp r10d, -18
+ jl .hv0_have_right
+- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+ .hv0_have_right:
+ pshufb m6, m5, m9
+ pshufb m4, m5, m10
+@@ -1853,7 +1853,7 @@ ALIGN function_align
+ jnz .hv1_have_right
+ cmp r10d, -18
+ jl .hv1_have_right
+- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+ .hv1_have_right:
+ pshufb m6, m5, m9
+ pshufb m3, m5, m10
+diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c
+index 57727787..a9aa5acf 100644
+--- a/src/x86/looprestoration_init_tmpl.c
++++ b/src/x86/looprestoration_init_tmpl.c
+@@ -30,179 +30,171 @@
+
+ #include "common/intops.h"
+
+-#define WIENER_FILTER(ext) \
+-void dav1d_wiener_filter7_##ext(pixel *dst, ptrdiff_t dst_stride, \
+- const pixel (*left)[4], const pixel *lpf, \
+- ptrdiff_t lpf_stride, int w, int h, \
+- const LooprestorationParams *params, \
+- enum LrEdgeFlags edges); \
+-void dav1d_wiener_filter5_##ext(pixel *dst, ptrdiff_t dst_stride, \
+- const pixel (*left)[4], const pixel *lpf, \
+- ptrdiff_t lpf_stride, int w, int h, \
+- const LooprestorationParams *params, \
+- enum LrEdgeFlags edges);
++#define decl_wiener_filter_fns(ext) \
++decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
++decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
+
+-#define SGR_FILTER(ext) \
+-void dav1d_sgr_filter_5x5_##ext(pixel *dst, ptrdiff_t dst_stride, \
+- const pixel (*left)[4], const pixel *lpf, \
+- ptrdiff_t lpf_stride, int w, int h, \
+- const LooprestorationParams *params, \
+- enum LrEdgeFlags edges); \
+-void dav1d_sgr_filter_3x3_##ext(pixel *dst, ptrdiff_t dst_stride, \
+- const pixel (*left)[4], const pixel *lpf, \
+- ptrdiff_t lpf_stride, int w, int h, \
+- const LooprestorationParams *params, \
+- enum LrEdgeFlags edges); \
+-void dav1d_sgr_filter_mix_##ext(pixel *dst, ptrdiff_t dst_stride, \
+- const pixel (*left)[4], const pixel *lpf, \
+- ptrdiff_t lpf_stride, int w, int h, \
+- const LooprestorationParams *params, \
+- enum LrEdgeFlags edges);
++#define decl_sgr_filter_fns(ext) \
++void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \
++ const pixel (*left)[4], const pixel *lpf, \
++ ptrdiff_t lpf_stride, int w, int h, \
++ const LooprestorationParams *params, \
++ enum LrEdgeFlags edges); \
++void BF(dav1d_sgr_filter_3x3, ext)(pixel *dst, ptrdiff_t dst_stride, \
++ const pixel (*left)[4], const pixel *lpf, \
++ ptrdiff_t lpf_stride, int w, int h, \
++ const LooprestorationParams *params, \
++ enum LrEdgeFlags edges); \
++void BF(dav1d_sgr_filter_mix, ext)(pixel *dst, ptrdiff_t dst_stride, \
++ const pixel (*left)[4], const pixel *lpf, \
++ ptrdiff_t lpf_stride, int w, int h, \
++ const LooprestorationParams *params, \
++ enum LrEdgeFlags edges);
+
+ /* FIXME: Replace with a port of the AVX2 code */
+ #define SGR_FILTER_OLD(ext) \
+-void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
+- const pixel (*left)[4], \
+- const pixel *src, const ptrdiff_t stride, \
+- const int w, const int h, \
+- const enum LrEdgeFlags edges); \
+-void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \
+- const int w, const int h, \
+- const enum LrEdgeFlags edges); \
+-void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \
+- const int w, const int h, const unsigned s); \
+-void dav1d_sgr_finish_filter1_##ext(coef *tmp, \
+- const pixel *src, const ptrdiff_t stride, \
+- const int32_t *a, const int16_t *b, \
+- const int w, const int h); \
++void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
++ const pixel (*left)[4], \
++ const pixel *src, const ptrdiff_t stride, \
++ const int w, const int h, \
++ const enum LrEdgeFlags edges); \
++void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
++ const int w, const int h, \
++ const enum LrEdgeFlags edges); \
++void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
++ const int w, const int h, const unsigned s); \
++void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \
++ const pixel *src, const ptrdiff_t stride, \
++ const int32_t *a, const int16_t *b, \
++ const int w, const int h); \
+ \
+ /* filter with a 3x3 box (radius=1) */ \
+-static void dav1d_sgr_filter1_##ext(coef *tmp, \
+- const pixel *src, const ptrdiff_t stride, \
+- const pixel (*left)[4], \
+- const pixel *lpf, const ptrdiff_t lpf_stride, \
+- const int w, const int h, const int strength, \
+- const enum LrEdgeFlags edges) \
++static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \
++ const pixel *src, const ptrdiff_t stride, \
++ const pixel (*left)[4], \
++ const pixel *lpf, const ptrdiff_t lpf_stride, \
++ const int w, const int h, const int strength, \
++ const enum LrEdgeFlags edges) \
+ { \
+ ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
+ ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
+ \
+- dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
++ BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
+ if (edges & LR_HAVE_TOP) \
+- dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
+- NULL, lpf, lpf_stride, w, 2, edges); \
++ BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
++ NULL, lpf, lpf_stride, w, 2, edges); \
+ \
+ if (edges & LR_HAVE_BOTTOM) \
+- dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
+- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
+- lpf_stride, w, 2, edges); \
++ BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
++ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
++ lpf_stride, w, 2, edges); \
+ \
+- dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \
+- dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \
+- dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \
++ BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \
++ BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \
++ BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \
+ } \
+ \
+-void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \
+- const pixel (*left)[4], \
+- const pixel *src, const ptrdiff_t stride, \
+- const int w, const int h, \
+- const enum LrEdgeFlags edges); \
+-void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \
+- const int w, const int h, \
+- const enum LrEdgeFlags edges); \
+-void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \
+- const int w, const int h, const int strength); \
+-void dav1d_sgr_finish_filter2_##ext(coef *tmp, \
+- const pixel *src, const ptrdiff_t stride, \
+- const int32_t *a, const int16_t *b, \
+- const int w, const int h); \
++void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \
++ const pixel (*left)[4], \
++ const pixel *src, const ptrdiff_t stride, \
++ const int w, const int h, \
++ const enum LrEdgeFlags edges); \
++void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
++ const int w, const int h, \
++ const enum LrEdgeFlags edges); \
++void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
++ const int w, const int h, const int strength); \
++void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \
++ const pixel *src, const ptrdiff_t stride, \
++ const int32_t *a, const int16_t *b, \
++ const int w, const int h); \
+ \
+ /* filter with a 5x5 box (radius=2) */ \
+-static void dav1d_sgr_filter2_##ext(coef *tmp, \
+- const pixel *src, const ptrdiff_t stride, \
+- const pixel (*left)[4], \
+- const pixel *lpf, const ptrdiff_t lpf_stride, \
+- const int w, const int h, const int strength, \
+- const enum LrEdgeFlags edges) \
++static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \
++ const pixel *src, const ptrdiff_t stride, \
++ const pixel (*left)[4], \
++ const pixel *lpf, const ptrdiff_t lpf_stride, \
++ const int w, const int h, const int strength, \
++ const enum LrEdgeFlags edges) \
+ { \
+ ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
+ ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
+ \
+- dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
++ BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
+ if (edges & LR_HAVE_TOP) \
+- dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
+- NULL, lpf, lpf_stride, w, 2, edges); \
++ BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
++ NULL, lpf, lpf_stride, w, 2, edges); \
+ \
+ if (edges & LR_HAVE_BOTTOM) \
+- dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
+- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
+- lpf_stride, w, 2, edges); \
++ BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
++ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
++ lpf_stride, w, 2, edges); \
+ \
+- dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \
+- dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \
+- dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \
++ BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \
++ BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \
++ BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \
+ } \
+ \
+-void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
+- const coef *t1, const int w, const int h, \
+- const int wt); \
+-void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
+- const coef *t1, const coef *t2, \
+- const int w, const int h, \
+- const uint32_t wt); \
++void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
++ const coef *t1, const int w, const int h, \
++ const int wt); \
++void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
++ const coef *t1, const coef *t2, \
++ const int w, const int h, \
++ const uint32_t wt); \
+ \
+-static void sgr_filter_5x5_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
+- const pixel (*const left)[4], \
+- const pixel *lpf, const ptrdiff_t lpf_stride, \
+- const int w, const int h, \
+- const LooprestorationParams *const params, \
+- const enum LrEdgeFlags edges) \
++static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
++ const pixel (*const left)[4], \
++ const pixel *lpf, const ptrdiff_t lpf_stride, \
++ const int w, const int h, \
++ const LooprestorationParams *const params, \
++ const enum LrEdgeFlags edges) \
+ { \
+ ALIGN_STK_32(coef, tmp, 64 * 384,); \
+- dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
+- w, h, params->sgr.s0, edges); \
+- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w0); \
++ BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
++ w, h, params->sgr.s0, edges); \
++ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \
+ } \
+-static void sgr_filter_3x3_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
+- const pixel (*const left)[4], \
+- const pixel *lpf, const ptrdiff_t lpf_stride, \
+- const int w, const int h, \
+- const LooprestorationParams *const params, \
+- const enum LrEdgeFlags edges) \
++static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
++ const pixel (*const left)[4], \
++ const pixel *lpf, const ptrdiff_t lpf_stride, \
++ const int w, const int h, \
++ const LooprestorationParams *const params, \
++ const enum LrEdgeFlags edges) \
+ { \
+ ALIGN_STK_32(coef, tmp, 64 * 384,); \
+- dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
+- w, h, params->sgr.s1, edges); \
+- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w1); \
++ BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
++ w, h, params->sgr.s1, edges); \
++ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \
+ } \
+-static void sgr_filter_mix_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
+- const pixel (*const left)[4], \
+- const pixel *lpf, const ptrdiff_t lpf_stride, \
+- const int w, const int h, \
+- const LooprestorationParams *const params, \
+- const enum LrEdgeFlags edges) \
++static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
++ const pixel (*const left)[4], \
++ const pixel *lpf, const ptrdiff_t lpf_stride, \
++ const int w, const int h, \
++ const LooprestorationParams *const params, \
++ const enum LrEdgeFlags edges) \
+ { \
+ ALIGN_STK_32(coef, tmp1, 64 * 384,); \
+ ALIGN_STK_32(coef, tmp2, 64 * 384,); \
+- dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
+- w, h, params->sgr.s0, edges); \
+- dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
+- w, h, params->sgr.s1, edges); \
++ BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
++ w, h, params->sgr.s0, edges); \
++ BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
++ w, h, params->sgr.s1, edges); \
+ const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \
+- dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
++ BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \
+ }
+
+ #if BITDEPTH == 8
+-WIENER_FILTER(sse2)
+-WIENER_FILTER(ssse3)
++decl_wiener_filter_fns(sse2);
++decl_wiener_filter_fns(ssse3);
+ SGR_FILTER_OLD(ssse3)
+ # if ARCH_X86_64
+-WIENER_FILTER(avx2)
+-SGR_FILTER(avx2)
++decl_wiener_filter_fns(avx2);
++decl_sgr_filter_fns(avx2)
+ # endif
+ #endif
+
+@@ -211,25 +203,25 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+ #if BITDEPTH == 8
+- c->wiener[0] = dav1d_wiener_filter7_sse2;
+- c->wiener[1] = dav1d_wiener_filter5_sse2;
++ c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
++ c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
+ #endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+ #if BITDEPTH == 8
+- c->wiener[0] = dav1d_wiener_filter7_ssse3;
+- c->wiener[1] = dav1d_wiener_filter5_ssse3;
+- c->sgr[0] = sgr_filter_5x5_ssse3;
+- c->sgr[1] = sgr_filter_3x3_ssse3;
+- c->sgr[2] = sgr_filter_mix_ssse3;
++ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
++ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
++ c->sgr[0] = BF(sgr_filter_5x5, ssse3);
++ c->sgr[1] = BF(sgr_filter_3x3, ssse3);
++ c->sgr[2] = BF(sgr_filter_mix, ssse3);
+ #endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+ #if BITDEPTH == 8 && ARCH_X86_64
+- c->wiener[0] = dav1d_wiener_filter7_avx2;
+- c->wiener[1] = dav1d_wiener_filter5_avx2;
+- c->sgr[0] = dav1d_sgr_filter_5x5_avx2;
+- c->sgr[1] = dav1d_sgr_filter_3x3_avx2;
+- c->sgr[2] = dav1d_sgr_filter_mix_avx2;
++ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
++ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
++ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
++ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
++ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
+ #endif
+ }
+diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm
+index 5d3ca492..4b77138d 100644
+--- a/src/x86/looprestoration_sse.asm
++++ b/src/x86/looprestoration_sse.asm
+@@ -97,8 +97,8 @@ SECTION .text
+ %macro WIENER 0
+ %if ARCH_X86_64
+ DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
+-cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, flt, h, x
++cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, flt, h, x
+ %define base 0
+ mov fltq, fltmp
+ mov edged, r8m
+@@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5
+ %define m11 [stk+96]
+ %define stk_off 112
+ %endif
+-cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
++cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
+ %define base r6-pb_right_ext_mask-21
+ %define stk esp
+ %define dstq leftq
+@@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
+ add lpfq, [rsp+gprsize*1]
+ call .hv_bottom
+ .v1:
+- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ RET
+ .no_top:
+ lea t3, [lpfq+lpf_strideq*4]
+@@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
+ dec hd
+ jnz .main
+ .v3:
+- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ .v2:
+- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ jmp .v1
+ .extend_right:
+ movd m2, [lpfq-4]
+@@ -685,8 +685,8 @@ ALIGN function_align
+ %endif
+
+ %if ARCH_X86_64
+-cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, flt, h, x
++cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, flt, h, x
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+@@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+ %define m11 [stk+80]
+ %define stk_off 96
+ %endif
+-cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
++cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
+ %define stk esp
+ %define leftmp [stk+28]
+ %define m8 [base+pw_m16380]
+@@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
+ dec hd
+ jnz .main
+ .v2:
+- call mangle(private_prefix %+ _wiener_filter5_ssse3).v
++ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ add dstq, dst_strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ movifnidn dstmp, dstq
+ .v1:
+- call mangle(private_prefix %+ _wiener_filter5_ssse3).v
++ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ jmp .end
+ .h:
+ %define stk esp+4
+@@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
+ jnz .h_have_right
+ cmp xd, -17
+ jl .h_have_right
+- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
++ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+ .h_have_right:
+ %macro %%h5 0
+ %if cpuflag(ssse3)
+@@ -991,7 +991,7 @@ ALIGN function_align
+ jnz .hv_have_right
+ cmp xd, -17
+ jl .hv_have_right
+- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
++ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+ .hv_have_right:
+ %%h5
+ mova m2, [t3+xq*2]
+@@ -1161,7 +1161,7 @@ WIENER
+ %endmacro
+
+ %if ARCH_X86_64
+-cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
++cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ mov xlimd, edgem
+ movifnidn xd, xm
+ mov hd, hm
+@@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ add xd, xlimd
+ xor xlimd, 2 ; 2*!have_right
+ %else
+-cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
++cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ %define wq r0m
+ %define xlimd r1m
+ %define hd hmp
+@@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ RET
+
+ %if ARCH_X86_64
+-cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
++cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
+ movifnidn edged, edgem
+ %else
+-cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
++cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
+ %define sumsq_baseq dword [esp+0]
+ %define sum_baseq dword [esp+4]
+ %define ylimd dword [esp+8]
+@@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
+ jl .loop_x
+ RET
+
+-cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
++cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s
+ movifnidn sd, sm
+ sub aq, (384+16-1)*4
+ sub bq, (384+16-1)*2
+@@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
+ RET
+
+ %if ARCH_X86_64
+-cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+- tmp_base, src_base, a_base, b_base, x, y
++cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
++ tmp_base, src_base, a_base, b_base, x, y
+ movifnidn wd, wm
+ mov hd, hm
+ mova m15, [pw_16]
+@@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+ mov b_baseq, bq
+ xor xd, xd
+ %else
+-cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
++cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y
+ %define tmp_baseq [esp+8]
+ %define src_baseq [esp+12]
+ %define a_baseq [esp+16]
+@@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
+ jl .loop_x
+ RET
+
+-cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
++cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt
+ movifnidn hd, hm
+ %if ARCH_X86_32
+ SETUP_PIC r6, 0
+@@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
+ RET
+
+ %if ARCH_X86_64
+-cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
++cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ mov edged, edgem
+ movifnidn wd, wm
+ mov hd, hm
+ mova m10, [pb_0]
+ mova m11, [pb_0_1]
+ %else
+-cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
++cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
+ %define edgeb byte edgem
+ %define wd xd
+ %define wq wd
+@@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
+ RET
+
+ %if ARCH_X86_64
+-cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
++cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+ movifnidn edged, edgem
+ mov ylimd, edged
+ %else
+-cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
++cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
+ %define wm [esp+0]
+ %define hm [esp+4]
+ %define edgem [esp+8]
+@@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
+ jmp .sum_loop_y_noload
+ %endif
+
+-cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
++cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s
+ movifnidn sd, sm
+ sub aq, (384+16-1)*4
+ sub bq, (384+16-1)*2
+@@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
+ RET
+
+ %if ARCH_X86_64
+-cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
++cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \
+ tmp_base, src_base, a_base, b_base, x, y
+ movifnidn wd, wm
+ mov hd, hm
+@@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
+ psrlw m11, m12, 1 ; pw_128
+ pxor m13, m13
+ %else
+-cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
++cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y
+ %define tmp_baseq r0m
+ %define src_baseq r1m
+ %define a_baseq r3m
+@@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
+ RET
+
+ %undef t2
+-cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
++cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt
+ movifnidn wd, wm
+ movd m0, wtm
+ %if ARCH_X86_64
+--
+GitLab
+
+
+From 1a4489861e55f0e4d70df60ecf15559dfda70aee Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Sun, 10 Jan 2021 14:12:10 -0500
+Subject: [PATCH 2/3] x86: lr: Add AVX2 implementation of wiener filter for 16
+ bpc
+
+Relative speed-ups over C code (compared with gcc-9.3.0):
+
+ C AVX2
+wiener_5tap_10bpc: 194892.0 14831.9 13.14x
+wiener_5tap_12bpc: 194295.4 14828.9 13.10x
+wiener_7tap_10bpc: 194391.7 19461.4 9.99x
+wiener_7tap_12bpc: 194136.1 19418.7 10.00x
+---
+ src/x86/looprestoration16_avx2.asm | 480 +++++++++++++++++++++++++++++
+ 1 file changed, 480 insertions(+)
+ create mode 100644 src/x86/looprestoration16_avx2.asm
+
+diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm
+new file mode 100644
+index 00000000..4eb1b805
+--- /dev/null
++++ b/src/x86/looprestoration16_avx2.asm
+@@ -0,0 +1,480 @@
++; Copyright (c) 2017-2021, The rav1e contributors
++; Copyright (c) 2021, Nathan Egge
++; All rights reserved.
++;
++; Redistribution and use in source and binary forms, with or without
++; modification, are permitted provided that the following conditions are met:
++;
++; 1. Redistributions of source code must retain the above copyright notice, this
++; list of conditions and the following disclaimer.
++;
++; 2. Redistributions in binary form must reproduce the above copyright notice,
++; this list of conditions and the following disclaimer in the documentation
++; and/or other materials provided with the distribution.
++;
++; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
++; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++%include "config.asm"
++%include "ext/x86/x86inc.asm"
++
++%if ARCH_X86_64
++
++SECTION_RODATA 32
++
++wiener5_shufB: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
++wiener5_shufC: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13
++wiener5_shufD: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1
++wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
++pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
++ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++
++wiener7_shufC: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9
++wiener7_shufD: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
++wiener7_shufE: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1
++rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
++rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
++wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
++ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++
++pq_3: dq (6 - 4) + 1
++pq_5: dq (6 - 2) + 1
++pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4))
++pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2))
++
++pq_11: dq 12 - (6 - 4) + 1
++pq_9: dq 12 - (6 - 2) + 1
++nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8))
++nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8))
++
++pb_wiener5_l: times 2 db 2, 3
++pb_wiener5_r: times 2 db -6, -5
++
++pb_wiener7_l: times 2 db 4, 5
++pb_wiener7_m: times 2 db -4, -3
++pb_wiener7_r: times 2 db -8, -7
++
++SECTION .text
++
++INIT_YMM avx2
++cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax
++ movifnidn wd, wm
++ movifnidn hd, hm
++ movifnidn edgeb, edgem
++ vbroadcasti128 m6, [wiener5_shufB]
++ vpbroadcastd m12, [fq + 2]
++ vbroadcasti128 m7, [wiener5_shufC]
++ vpbroadcastw m13, [fq + 6]
++ vbroadcasti128 m8, [wiener5_shufD]
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastd m9, [pd_65540]
++ movq xm10, [pq_3]
++ cmp bdmaxd, 10
++ je .bits10
++ vpbroadcastd m9, [pd_262160]
++ movq xm10, [pq_5]
++.bits10:
++ pxor m11, m11
++ add wq, wq
++ add srcq, wq
++ add dstq, wq
++ neg wq
++ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x
++.v_loop:
++ mov xq, wq
++ test edgeb, 1 ; LR_HAVE_LEFT
++ jz .h_extend_left
++ test leftq, leftq
++ jz .h_loop
++ movd xm4, [leftq + 4]
++ vpblendd m4, [srcq + xq - 4], 0xfe
++ add leftq, 8
++ jmp .h_main
++.h_extend_left:
++ vbroadcasti128 m5, [srcq + xq]
++ mova m4, [srcq + xq]
++ palignr m4, m5, 12
++ pshufb m4, [wiener5_l_shuf]
++ jmp .h_main
++.h_loop:
++ movu m4, [srcq + xq - 4]
++.h_main:
++ movu m5, [srcq + xq + 4]
++ test edgeb, 2 ; LR_HAVE_RIGHT
++ jnz .h_have_right
++ cmp xd, -36
++ jl .h_have_right
++ movd xm2, xd
++ vpbroadcastd m0, [pb_wiener5_l]
++ vpbroadcastd m1, [pb_wiener5_r]
++ vpbroadcastb m2, xm2
++ movu m3, [pb_0to31]
++ psubb m0, m2
++ psubb m1, m2
++ pminub m0, m3
++ pminub m1, m3
++ pshufb m4, m0
++ pshufb m5, m1
++.h_have_right:
++ pshufb m0, m4, m6
++ pshufb m2, m4, m7
++ paddw m0, m2
++ pmaddwd m0, m12
++ pshufb m1, m5, m6
++ pshufb m3, m5, m7
++ paddw m1, m3
++ pmaddwd m1, m12
++ pshufb m4, m8
++ pmaddwd m4, m13
++ pshufb m5, m8
++ pmaddwd m5, m13
++ paddd m0, m4
++ paddd m1, m5
++ paddd m0, m9
++ paddd m1, m9
++ psrad m0, xm10
++ psrad m1, xm10
++ packssdw m0, m1
++ pmaxsw m0, m11
++ mova [dstq + xq], m0
++ add xq, 32
++ jl .h_loop
++ add srcq, ssq
++ add dstq, 384*2
++ dec hd
++ jg .v_loop
++ RET
++
++DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14
++
++INIT_YMM avx2
++cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax
++ movifnidn wd, wm
++ movifnidn hd, hm
++ movifnidn edgeb, edgem
++ pxor m6, m6
++ vpbroadcastd m7, [fq + 2]
++ vpbroadcastd m8, [fq + 6]
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastd m9, [nd_1047552]
++ movq xm10, [pq_11]
++ cmp bdmaxd, 10
++ je .bits10
++ vpbroadcastd m9, [nd_1048320]
++ movq xm10, [pq_9]
++.bits10:
++ vpbroadcastw m11, bdmaxm
++ add wq, wq
++ add midq, wq
++ add dstq, wq
++ neg wq
++ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
++ mov msq, 2*384
++ mov t0, midq
++ lea t1, [t0 + msq]
++ lea t2, [t1 + msq]
++ lea t3, [t2 + msq]
++ lea t4, [t3 + msq]
++ test edgeb, 4 ; LR_HAVE_TOP
++ jnz .have_top
++ mov t0, t2
++ mov t1, t2
++.have_top:
++ test edgeb, 8 ; LR_HAVE_BOTTOM
++ jnz .v_loop
++ cmp hd, 2
++ jg .v_loop
++ cmp hd, 1
++ jne .limit_v
++ mov t3, t2
++.limit_v:
++ mov t4, t3
++.v_loop:
++ mov xq, wq
++.h_loop:
++ mova m1, [t0 + xq]
++ mova m2, [t1 + xq]
++ mova m3, [t2 + xq]
++ mova m4, [t3 + xq]
++ mova m5, [t4 + xq]
++ punpcklwd m0, m1, m2
++ pmaddwd m0, m7
++ punpckhwd m1, m2
++ pmaddwd m1, m7
++ punpcklwd m2, m5, m4
++ pmaddwd m2, m7
++ punpckhwd m5, m4
++ pmaddwd m5, m7
++ paddd m0, m2
++ paddd m1, m5
++ punpcklwd m2, m3, m6
++ pmaddwd m2, m8
++ punpckhwd m3, m6
++ pmaddwd m3, m8
++ paddd m0, m2
++ paddd m1, m3
++ paddd m0, m9
++ paddd m1, m9
++ psrad m0, xm10
++ psrad m1, xm10
++ packusdw m0, m1
++ pminuw m0, m11
++ mova [dstq + xq], m0
++ add xq, 32
++ jl .h_loop
++ add dstq, dsq
++ mov t0, t1
++ mov t1, t2
++ mov t2, t3
++ mov t3, t4
++ add t4, msq
++ test edgeb, 8 ; LR_HAVE_BOTTOM
++ jnz .have_bottom
++ cmp hd, 3
++ jg .have_bottom
++ mov t4, t3
++.have_bottom:
++ dec hd
++ jg .v_loop
++ RET
++
++INIT_YMM avx2
++cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh
++ movifnidn wd, wm
++ movifnidn hd, hm
++ movifnidn edgeb, edgem
++ vpbroadcastd m7, [fq]
++ vpbroadcastd m8, [fq + 4]
++ vbroadcasti128 m10, [rev_w]
++ vbroadcasti128 m11, [wiener5_shufB]
++ vbroadcasti128 m12, [wiener7_shufC]
++ vbroadcasti128 m13, [wiener7_shufD]
++ vbroadcasti128 m14, [wiener7_shufE]
++ vbroadcasti128 m15, [rev_d]
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastd m9, [pd_65540]
++ mov rhq, [pq_3]
++ cmp bdmaxd, 10
++ je .bits10
++ vpbroadcastd m9, [pd_262160]
++ mov rhq, [pq_5]
++.bits10:
++ add wq, wq
++ add srcq, wq
++ add dstq, wq
++ neg wq
++ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh
++.v_loop:
++ mov xq, wq
++ test edgeb, 1 ; LR_HAVE_LEFT
++ jz .h_extend_left
++ test leftq, leftq
++ jz .h_loop
++ movq xm4, [leftq + 2]
++ vpblendw xm4, [srcq + xq - 6], 0xf8
++ vinserti128 m4, [srcq + xq + 10], 1
++ add leftq, 8
++ jmp .h_main
++.h_extend_left:
++ vbroadcasti128 m5, [srcq + xq]
++ mova m4, [srcq + xq]
++ palignr m4, m5, 10
++ pshufb m4, [wiener7_l_shuf]
++ jmp .h_main
++.h_loop:
++ movu m4, [srcq + xq - 6]
++.h_main:
++ movu m5, [srcq + xq + 2]
++ movu m6, [srcq + xq + 6]
++ test edgeb, 2 ; LR_HAVE_RIGHT
++ jnz .h_have_right
++ cmp xd, -38
++ jl .h_have_right
++ movd xm3, xd
++ vpbroadcastd m0, [pb_wiener7_l]
++ vpbroadcastd m1, [pb_wiener7_m]
++ vpbroadcastd m2, [pb_wiener7_r]
++ vpbroadcastb m3, xm3
++ psubb m0, m3
++ psubb m1, m3
++ psubb m2, m3
++ movu m3, [pb_0to31]
++ pminub m0, m3
++ pminub m1, m3
++ pminub m2, m3
++ pshufb m4, m0
++ pshufb m5, m1
++ pshufb m6, m2
++ cmp xd, -9*2
++ jne .hack
++ vpbroadcastw xm3, [srcq + xq + 16]
++ vinserti128 m5, xm3, 1
++ jmp .h_have_right
++.hack:
++ cmp xd, -1*2
++ jne .h_have_right
++ vpbroadcastw xm5, [srcq + xq]
++.h_have_right:
++ pshufb m6, m10
++ pshufb m0, m4, m11
++ pshufb m2, m5, m12
++ paddw m0, m2
++ pmaddwd m0, m7
++ pshufb m2, m4, m13
++ pshufb m4, m14
++ paddw m2, m4
++ pmaddwd m2, m8
++ pshufb m1, m6, m11
++ pshufb m5, m11
++ pmaddwd m1, m7
++ pmaddwd m5, m7
++ pshufb m3, m6, m13
++ pshufb m6, m14
++ paddw m3, m6
++ pmaddwd m3, m8
++ paddd m0, m2
++ paddd m1, m3
++ pshufb m1, m15
++ paddd m1, m5
++ movq xm4, rhq
++ pxor m5, m5
++ paddd m0, m9
++ paddd m1, m9
++ psrad m0, xm4
++ psrad m1, xm4
++ packssdw m0, m1
++ pmaxsw m0, m5
++ mova [dstq + xq], m0
++ add xq, 32
++ jl .h_loop
++ add srcq, ssq
++ add dstq, 384*2
++ dec hd
++ jg .v_loop
++ RET
++
++INIT_YMM avx2
++cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax
++ movifnidn wd, wm
++ movifnidn hd, hm
++ movifnidn edgeb, edgem
++ pxor m6, m6
++ vpbroadcastd m7, [fq]
++ vpbroadcastw m8, [fq + 4]
++ vpbroadcastd m9, [fq + 6]
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastd m10, [nd_1047552]
++ movq xm11, [pq_11]
++ cmp bdmaxd, 10
++ je .bits10
++ vpbroadcastd m10, [nd_1048320]
++ movq xm11, [pq_9]
++.bits10:
++ vpbroadcastw m12, bdmaxm
++ add wq, wq
++ add midq, wq
++ add dstq, wq
++ neg wq
++ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
++ mov msq, 2*384
++ mov t0, midq
++ mov t1, t0
++ lea t2, [t1 + msq]
++ lea t3, [t2 + msq]
++ lea t4, [t3 + msq]
++ lea t5, [t4 + msq]
++ lea t6, [t5 + msq]
++ test edgeb, 4 ; LR_HAVE_TOP
++ jnz .have_top
++ mov t0, t3
++ mov t1, t3
++ mov t2, t3
++.have_top:
++ cmp hd, 3
++ jg .v_loop
++ test edgeb, 8 ; LR_HAVE_BOTTOM
++ jz .no_bottom0
++ cmp hd, 1
++ jg .v_loop
++ jmp .h3
++.no_bottom0:
++ cmp hd, 2
++ je .h2
++ jns .h3
++.h1:
++ mov t4, t3
++.h2:
++ mov t5, t4
++.h3:
++ mov t6, t5
++.v_loop:
++ mov xq, wq
++.h_loop:
++ mova m1, [t0 + xq]
++ mova m2, [t1 + xq]
++ mova m3, [t5 + xq]
++ mova m4, [t6 + xq]
++ punpcklwd m0, m1, m2
++ pmaddwd m0, m7
++ punpckhwd m1, m2
++ pmaddwd m1, m7
++ punpcklwd m2, m4, m3
++ pmaddwd m2, m7
++ punpckhwd m4, m3
++ pmaddwd m4, m7
++ paddd m0, m2
++ paddd m1, m4
++ mova m3, [t2 + xq]
++ mova m4, [t4 + xq]
++ punpcklwd m2, m3, m4
++ pmaddwd m2, m8
++ punpckhwd m3, m4
++ pmaddwd m3, m8
++ paddd m0, m2
++ paddd m1, m3
++ mova m3, [t3 + xq]
++ punpcklwd m2, m3, m6
++ pmaddwd m2, m9
++ punpckhwd m3, m6
++ pmaddwd m3, m9
++ paddd m0, m2
++ paddd m1, m3
++ paddd m0, m10
++ paddd m1, m10
++ psrad m0, xm11
++ psrad m1, xm11
++ packusdw m0, m1
++ pminuw m0, m12
++ mova [dstq + xq], m0
++ add xq, 32
++ jl .h_loop
++ add dstq, dsq
++ mov t0, t1
++ mov t1, t2
++ mov t2, t3
++ mov t3, t4
++ mov t4, t5
++ mov t5, t6
++ add t6, msq
++ cmp hd, 4
++ jg .next_row
++ test edgeb, 8 ; LR_HAVE_BOTTOM
++ jz .no_bottom
++ cmp hd, 2
++ jg .next_row
++.no_bottom:
++ mov t6, t5
++.next_row:
++ dec hd
++ jg .v_loop
++ RET
++
++%endif ; ARCH_X86_64
+--
+GitLab
+
+
+From 2ce581302a1536559aa5e56018a03ac6a3770c0f Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Wed, 13 Jan 2021 14:54:42 -0500
+Subject: [PATCH 3/3] Enable AVX2 wiener filter HBD assembly
+
+---
+ src/meson.build | 1 +
+ src/x86/looprestoration_init_tmpl.c | 40 +++++++++++++++++++++++++++--
+ 2 files changed, 39 insertions(+), 2 deletions(-)
+
+diff --git a/src/meson.build b/src/meson.build
+index 27946501..25729217 100644
+--- a/src/meson.build
++++ b/src/meson.build
+@@ -211,6 +211,7 @@ if is_asm_enabled
+ libdav1d_sources_asm += files(
+ 'x86/cdef16_avx2.asm',
+ 'x86/cdef16_sse.asm',
++ 'x86/looprestoration16_avx2.asm',
+ 'x86/mc16_avx2.asm',
+ )
+ endif
+
+diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c
+index a9aa5acf..52de0faf 100644
+--- a/src/x86/looprestoration_init_tmpl.c
++++ b/src/x86/looprestoration_init_tmpl.c
+@@ -30,9 +30,40 @@
+
+ #include "common/intops.h"
+
++#if BITDEPTH != 8
++#define decl_wiener_filter_fn(name, ext) \
++void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \
++ ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \
++ int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
++void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \
++ const int16_t fv[7], int w, int h, \
++ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
++static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
++ const pixel (*const left)[4], \
++ const pixel *lpf, const ptrdiff_t lpf_stride, \
++ const int w, const int h, const LooprestorationParams *params, \
++ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \
++ ALIGN_STK_64(int16_t, mid, 68 * 384,); \
++ BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, params->filter[0], w, h, \
++ edges HIGHBD_TAIL_SUFFIX); \
++ if (edges & LR_HAVE_TOP) { \
++ BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, params->filter[0], w, 2, \
++ edges HIGHBD_TAIL_SUFFIX); \
++ } \
++ if (edges & LR_HAVE_BOTTOM) { \
++ BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \
++ lpf_stride, params->filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \
++ } \
++ BF(name##_v, ext)(dst, dst_stride, mid, params->filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \
++}
++#define decl_wiener_filter_fns(ext) \
++decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \
++decl_wiener_filter_fn(dav1d_wiener_filter5, ext)
++#else
+ #define decl_wiener_filter_fns(ext) \
+ decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
+ decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
++#endif
+
+ #define decl_sgr_filter_fns(ext) \
+ void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \
+@@ -193,11 +224,14 @@ decl_wiener_filter_fns(sse2);
+ decl_wiener_filter_fns(ssse3);
+ SGR_FILTER_OLD(ssse3)
+ # if ARCH_X86_64
+-decl_wiener_filter_fns(avx2);
+ decl_sgr_filter_fns(avx2)
+ # endif
+ #endif
+
++#if ARCH_X86_64
++decl_wiener_filter_fns(avx2);
++#endif
++
+ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+@@ -217,11 +251,13 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
+ #endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+-#if BITDEPTH == 8 && ARCH_X86_64
++#if ARCH_X86_64
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
++# if BITDEPTH == 8
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
++# endif
+ #endif
+ }
+--
+GitLab
+
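
The looprestoration16_avx2.asm added above derives the bit depth inside each function from the bitdepth_max argument: bitdepth_max is 2^bpc - 1, so popcnt yields bpc directly, and the code keeps the 10 bpc rounding constants unless bpc is 12. A minimal C sketch of that selection for the 5-tap horizontal pass follows; the wrapper function name is hypothetical, while the constant expressions are copied from the .asm data section:

    #include <stdint.h>

    /* Illustration only: mirrors the popcnt + constant selection at the top of
     * wiener_filter5_h_16bpc. pd_* is the rounding bias added before the
     * arithmetic right shift pq_*. */
    static void wiener5_h_rounding(unsigned bitdepth_max, int32_t *bias, int *shift)
    {
        const int bpc = __builtin_popcount(bitdepth_max); /* 1023 -> 10, 4095 -> 12 */
        if (bpc == 10) {
            *bias  = (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4)); /* pd_65540 */
            *shift = (6 - 4) + 1;                               /* pq_3 */
        } else {                                                /* 12 bpc */
            *bias  = (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2)); /* pd_262160 */
            *shift = (6 - 2) + 1;                               /* pq_5 */
        }
    }

The vertical pass makes the same per-bit-depth choice between nd_1047552 with pq_11 and nd_1048320 with pq_9.
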
diff --git a/0002-wiener_2.patch b/0002-wiener_2.patch
index 149be2fe1293..e69de29bb2d1 100644
--- a/0002-wiener_2.patch
+++ b/0002-wiener_2.patch
@@ -1,661 +0,0 @@
-From 269eeaf7c01afc79a53537881ad03185bf491cf6 Mon Sep 17 00:00:00 2001
-From: "Nathan E. Egge" <unlord@xiph.org>
-Date: Tue, 29 Dec 2020 06:58:33 -0500
-Subject: [PATCH] Add bpc suffix to lr functions
-
----
- src/x86/looprestoration.asm | 36 ++---
- src/x86/looprestoration_init_tmpl.c | 204 +++++++++++++---------------
- src/x86/looprestoration_sse.asm | 60 ++++----
- 3 files changed, 146 insertions(+), 154 deletions(-)
-
-diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm
-index 8ebe230..e077cdd 100644
---- a/src/x86/looprestoration.asm
-+++ b/src/x86/looprestoration.asm
-@@ -66,8 +66,8 @@ SECTION .text
- DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers
-
- INIT_YMM avx2
--cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, flt, h
-+cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, flt, h
- mov fltq, fltmp
- mov edged, r8m
- mov wd, wm
-@@ -414,8 +414,8 @@ ALIGN function_align
- add dstq, dst_strideq
- ret
-
--cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, flt, h
-+cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, flt, h
- mov fltq, fltmp
- mov edged, r8m
- mov wd, wm
-@@ -532,7 +532,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
- jnz .h_have_right
- cmp r10d, -33
- jl .h_have_right
-- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
- .h_have_right:
- pshufb m0, m4, m6
- pmaddubsw m0, m12
-@@ -591,7 +591,7 @@ ALIGN function_align
- jnz .hv_have_right
- cmp r10d, -33
- jl .hv_have_right
-- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
- .hv_have_right:
- pshufb m0, m4, m6
- pmaddubsw m0, m12
-@@ -705,7 +705,7 @@ ALIGN function_align
- jl .v_loop
- ret
-
--cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
-+cglobal sgr_box3_h_8bpc, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- mov xlimd, edgem
- movifnidn wd, wm
- mov hd, hm
-@@ -805,7 +805,7 @@ cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- RET
-
- INIT_YMM avx2
--cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
-+cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
- movifnidn edged, edgem
- mov xq, -2
- rorx ylimd, edged, 2
-@@ -868,7 +868,7 @@ cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
- RET
-
- INIT_YMM avx2
--cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
-+cglobal sgr_calc_ab1_8bpc, 4, 6, 11, a, b, w, h, s
- sub aq, (384+16-1)*4
- sub bq, (384+16-1)*2
- add hd, 2
-@@ -937,8 +937,8 @@ cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
- RET
-
- INIT_YMM avx2
--cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
-- tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
-+cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
-+ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
- movifnidn wd, wm
- mov hd, hm
- vpbroadcastd m15, [pw_16]
-@@ -1043,7 +1043,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
- RET
-
- INIT_YMM avx2
--cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt
-+cglobal sgr_weighted1_8bpc, 4, 6, 6, dst, stride, t, w, h, wt
- %ifidn wtd, wtm
- shl wtd, 4
- movd xm5, wtd
-@@ -1082,7 +1082,7 @@ cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt
- RET
-
- INIT_YMM avx2
--cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
-+cglobal sgr_box5_h_8bpc, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- mov edged, edgem
- movifnidn wd, wm
- mov hd, hm
-@@ -1200,7 +1200,7 @@ cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli
- RET
-
- INIT_YMM avx2
--cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
-+cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
- movifnidn edged, edgem
- mov xq, -2
- rorx ylimd, edged, 2
-@@ -1293,7 +1293,7 @@ cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
- jmp .loop_y_noload
-
- INIT_YMM avx2
--cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
-+cglobal sgr_calc_ab2_8bpc, 4, 6, 11, a, b, w, h, s
- sub aq, (384+16-1)*4
- sub bq, (384+16-1)*2
- add hd, 2
-@@ -1364,8 +1364,8 @@ cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
- RET
-
- INIT_YMM avx2
--cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \
-- tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
-+cglobal sgr_finish_filter2_8bpc, 5, 13, 13, t, src, stride, a, b, w, h, \
-+ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
- movifnidn wd, wm
- mov hd, hm
- vpbroadcastd m9, [pw_5_6]
-@@ -1483,7 +1483,7 @@ cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \
- RET
-
- INIT_YMM avx2
--cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt
-+cglobal sgr_weighted2_8bpc, 4, 7, 11, dst, stride, t1, t2, w, h, wt
- movifnidn wd, wm
- movifnidn hd, hm
- vpbroadcastd m0, wtm
-diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c
-index 5df449c..11ebdd1 100644
---- a/src/x86/looprestoration_init_tmpl.c
-+++ b/src/x86/looprestoration_init_tmpl.c
-@@ -31,148 +31,140 @@
- #include "common/intops.h"
- #include "src/tables.h"
-
--#define WIENER_FILTER(ext) \
--void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \
-- const pixel (*left)[4], const pixel *lpf, \
-- ptrdiff_t lpf_stride, int w, int h, \
-- const int16_t filter[2][8], \
-- enum LrEdgeFlags edges); \
--void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \
-- const pixel (*left)[4], const pixel *lpf, \
-- ptrdiff_t lpf_stride, int w, int h, \
-- const int16_t filter[2][8], \
-- enum LrEdgeFlags edges);
-+#define decl_wiener_filter_fns(ext) \
-+decl_wiener_filter_fn(BF(dav1d_wiener_filter7, ext)); \
-+decl_wiener_filter_fn(BF(dav1d_wiener_filter5, ext))
-
--#define SGR_FILTER(ext) \
--void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
-- const pixel (*left)[4], \
-- const pixel *src, const ptrdiff_t stride, \
-- const int w, const int h, \
-- const enum LrEdgeFlags edges); \
--void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \
-- const int w, const int h, \
-- const enum LrEdgeFlags edges); \
--void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \
-- const int w, const int h, const int strength); \
--void dav1d_sgr_finish_filter1_##ext(coef *tmp, \
-- const pixel *src, const ptrdiff_t stride, \
-- const int32_t *a, const int16_t *b, \
-- const int w, const int h); \
-+#define decl_sgr_filter_fn(ext) \
-+void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
-+ const pixel (*left)[4], \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const int w, const int h, \
-+ const enum LrEdgeFlags edges); \
-+void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
-+ const int w, const int h, \
-+ const enum LrEdgeFlags edges); \
-+void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
-+ const int w, const int h, const int strength); \
-+void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const int32_t *a, const int16_t *b, \
-+ const int w, const int h); \
- \
- /* filter with a 3x3 box (radius=1) */ \
--static void dav1d_sgr_filter1_##ext(coef *tmp, \
-- const pixel *src, const ptrdiff_t stride, \
-- const pixel (*left)[4], \
-- const pixel *lpf, const ptrdiff_t lpf_stride, \
-- const int w, const int h, const int strength, \
-- const enum LrEdgeFlags edges) \
-+static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const pixel (*left)[4], \
-+ const pixel *lpf, const ptrdiff_t lpf_stride, \
-+ const int w, const int h, const int strength, \
-+ const enum LrEdgeFlags edges) \
- { \
- ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
- int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
- ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
- int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
- \
-- dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
-+ BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
- if (edges & LR_HAVE_TOP) \
-- dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
-- NULL, lpf, lpf_stride, w, 2, edges); \
-+ BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
-+ NULL, lpf, lpf_stride, w, 2, edges); \
- \
- if (edges & LR_HAVE_BOTTOM) \
-- dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
-- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
-- lpf_stride, w, 2, edges); \
-+ BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
-+ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
-+ lpf_stride, w, 2, edges); \
- \
-- dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \
-- dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \
-- dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \
-+ BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \
-+ BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \
-+ BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \
- } \
- \
--void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \
-- const pixel (*left)[4], \
-- const pixel *src, const ptrdiff_t stride, \
-- const int w, const int h, \
-- const enum LrEdgeFlags edges); \
--void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \
-- const int w, const int h, \
-- const enum LrEdgeFlags edges); \
--void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \
-- const int w, const int h, const int strength); \
--void dav1d_sgr_finish_filter2_##ext(coef *tmp, \
-- const pixel *src, const ptrdiff_t stride, \
-- const int32_t *a, const int16_t *b, \
-- const int w, const int h); \
-+void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \
-+ const pixel (*left)[4], \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const int w, const int h, \
-+ const enum LrEdgeFlags edges); \
-+void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
-+ const int w, const int h, \
-+ const enum LrEdgeFlags edges); \
-+void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
-+ const int w, const int h, const int strength); \
-+void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const int32_t *a, const int16_t *b, \
-+ const int w, const int h); \
- \
- /* filter with a 5x5 box (radius=2) */ \
--static void dav1d_sgr_filter2_##ext(coef *tmp, \
-- const pixel *src, const ptrdiff_t stride, \
-- const pixel (*left)[4], \
-- const pixel *lpf, const ptrdiff_t lpf_stride, \
-- const int w, const int h, const int strength, \
-- const enum LrEdgeFlags edges) \
-+static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const pixel (*left)[4], \
-+ const pixel *lpf, const ptrdiff_t lpf_stride, \
-+ const int w, const int h, const int strength, \
-+ const enum LrEdgeFlags edges) \
- { \
- ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
- int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
- ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
- int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
- \
-- dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
-+ BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
- if (edges & LR_HAVE_TOP) \
-- dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
-- NULL, lpf, lpf_stride, w, 2, edges); \
-+ BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
-+ NULL, lpf, lpf_stride, w, 2, edges); \
- \
- if (edges & LR_HAVE_BOTTOM) \
-- dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
-- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
-- lpf_stride, w, 2, edges); \
-+ BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
-+ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
-+ lpf_stride, w, 2, edges); \
- \
-- dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \
-- dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \
-- dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \
-+ BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \
-+ BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \
-+ BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \
- } \
- \
--void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
-- const coef *t1, const int w, const int h, \
-- const int wt); \
--void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
-- const coef *t1, const coef *t2, \
-- const int w, const int h, \
-- const uint32_t wt); \
-+void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
-+ const coef *t1, const int w, const int h, \
-+ const int wt); \
-+void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
-+ const coef *t1, const coef *t2, \
-+ const int w, const int h, \
-+ const uint32_t wt); \
- \
--static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
-- const pixel (*const left)[4], \
-- const pixel *lpf, const ptrdiff_t lpf_stride, \
-- const int w, const int h, const int sgr_idx, \
-- const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \
-+static void BF(sgr_filter, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
-+ const pixel (*const left)[4], \
-+ const pixel *lpf, const ptrdiff_t lpf_stride, \
-+ const int w, const int h, const int sgr_idx, \
-+ const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \
- { \
- if (!dav1d_sgr_params[sgr_idx][0]) { \
- ALIGN_STK_32(coef, tmp, 64 * 384,); \
-- dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
-- w, h, dav1d_sgr_params[sgr_idx][3], edges); \
-- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \
-+ BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
-+ w, h, dav1d_sgr_params[sgr_idx][3], edges); \
-+ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \
- } else if (!dav1d_sgr_params[sgr_idx][1]) { \
- ALIGN_STK_32(coef, tmp, 64 * 384,); \
-- dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
-- w, h, dav1d_sgr_params[sgr_idx][2], edges); \
-- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \
-+ BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
-+ w, h, dav1d_sgr_params[sgr_idx][2], edges); \
-+ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, sgr_wt[0]); \
- } else { \
- ALIGN_STK_32(coef, tmp1, 64 * 384,); \
- ALIGN_STK_32(coef, tmp2, 64 * 384,); \
-- dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
-- w, h, dav1d_sgr_params[sgr_idx][2], edges); \
-- dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
-- w, h, dav1d_sgr_params[sgr_idx][3], edges); \
-+ BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
-+ w, h, dav1d_sgr_params[sgr_idx][2], edges); \
-+ BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
-+ w, h, dav1d_sgr_params[sgr_idx][3], edges); \
- const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \
-- dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
-+ BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \
- } \
- }
-
- #if BITDEPTH == 8
--WIENER_FILTER(sse2)
--WIENER_FILTER(ssse3)
--SGR_FILTER(ssse3)
-+decl_wiener_filter_fns(sse2);
-+decl_wiener_filter_fns(ssse3);
-+decl_sgr_filter_fn(ssse3)
- # if ARCH_X86_64
--WIENER_FILTER(avx2)
--SGR_FILTER(avx2)
-+decl_wiener_filter_fns(avx2);
-+decl_sgr_filter_fn(avx2)
- # endif
- #endif
-
-@@ -181,21 +173,21 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
-
- if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
- #if BITDEPTH == 8
-- c->wiener[0] = dav1d_wiener_filter7_sse2;
-- c->wiener[1] = dav1d_wiener_filter5_sse2;
-+ c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
-+ c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
- #endif
-
- if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
- #if BITDEPTH == 8
-- c->wiener[0] = dav1d_wiener_filter7_ssse3;
-- c->wiener[1] = dav1d_wiener_filter5_ssse3;
-- c->selfguided = sgr_filter_ssse3;
-+ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
-+ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
-+ c->selfguided = BF(sgr_filter, ssse3);
- #endif
-
- if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
- #if BITDEPTH == 8 && ARCH_X86_64
-- c->wiener[0] = dav1d_wiener_filter7_avx2;
-- c->wiener[1] = dav1d_wiener_filter5_avx2;
-- c->selfguided = sgr_filter_avx2;
-+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
-+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
-+ c->selfguided = BF(sgr_filter, avx2);
- #endif
- }
-diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm
-index 5d3ca49..4b77138 100644
---- a/src/x86/looprestoration_sse.asm
-+++ b/src/x86/looprestoration_sse.asm
-@@ -97,8 +97,8 @@ SECTION .text
- %macro WIENER 0
- %if ARCH_X86_64
- DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
--cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, flt, h, x
-+cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, flt, h, x
- %define base 0
- mov fltq, fltmp
- mov edged, r8m
-@@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5
- %define m11 [stk+96]
- %define stk_off 112
- %endif
--cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
-+cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
- %define base r6-pb_right_ext_mask-21
- %define stk esp
- %define dstq leftq
-@@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
- add lpfq, [rsp+gprsize*1]
- call .hv_bottom
- .v1:
-- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
- RET
- .no_top:
- lea t3, [lpfq+lpf_strideq*4]
-@@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
- dec hd
- jnz .main
- .v3:
-- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
- .v2:
-- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
- jmp .v1
- .extend_right:
- movd m2, [lpfq-4]
-@@ -685,8 +685,8 @@ ALIGN function_align
- %endif
-
- %if ARCH_X86_64
--cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, flt, h, x
-+cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, flt, h, x
- mov fltq, fltmp
- mov edged, r8m
- mov wd, wm
-@@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
- %define m11 [stk+80]
- %define stk_off 96
- %endif
--cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
-+cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
- %define stk esp
- %define leftmp [stk+28]
- %define m8 [base+pw_m16380]
-@@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
- dec hd
- jnz .main
- .v2:
-- call mangle(private_prefix %+ _wiener_filter5_ssse3).v
-+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
- add dstq, dst_strideq
- mov t4, t3
- mov t3, t2
- mov t2, t1
- movifnidn dstmp, dstq
- .v1:
-- call mangle(private_prefix %+ _wiener_filter5_ssse3).v
-+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
- jmp .end
- .h:
- %define stk esp+4
-@@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
- jnz .h_have_right
- cmp xd, -17
- jl .h_have_right
-- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
- .h_have_right:
- %macro %%h5 0
- %if cpuflag(ssse3)
-@@ -991,7 +991,7 @@ ALIGN function_align
- jnz .hv_have_right
- cmp xd, -17
- jl .hv_have_right
-- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
- .hv_have_right:
- %%h5
- mova m2, [t3+xq*2]
-@@ -1161,7 +1161,7 @@ WIENER
- %endmacro
-
- %if ARCH_X86_64
--cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
-+cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- mov xlimd, edgem
- movifnidn xd, xm
- mov hd, hm
-@@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- add xd, xlimd
- xor xlimd, 2 ; 2*!have_right
- %else
--cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
-+cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- %define wq r0m
- %define xlimd r1m
- %define hd hmp
-@@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- RET
-
- %if ARCH_X86_64
--cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
-+cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
- movifnidn edged, edgem
- %else
--cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
-+cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
- %define sumsq_baseq dword [esp+0]
- %define sum_baseq dword [esp+4]
- %define ylimd dword [esp+8]
-@@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
- jl .loop_x
- RET
-
--cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
-+cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s
- movifnidn sd, sm
- sub aq, (384+16-1)*4
- sub bq, (384+16-1)*2
-@@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
- RET
-
- %if ARCH_X86_64
--cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
-- tmp_base, src_base, a_base, b_base, x, y
-+cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
-+ tmp_base, src_base, a_base, b_base, x, y
- movifnidn wd, wm
- mov hd, hm
- mova m15, [pw_16]
-@@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
- mov b_baseq, bq
- xor xd, xd
- %else
--cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
-+cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y
- %define tmp_baseq [esp+8]
- %define src_baseq [esp+12]
- %define a_baseq [esp+16]
-@@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
- jl .loop_x
- RET
-
--cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
-+cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt
- movifnidn hd, hm
- %if ARCH_X86_32
- SETUP_PIC r6, 0
-@@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
- RET
-
- %if ARCH_X86_64
--cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
-+cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- mov edged, edgem
- movifnidn wd, wm
- mov hd, hm
- mova m10, [pb_0]
- mova m11, [pb_0_1]
- %else
--cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
-+cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
- %define edgeb byte edgem
- %define wd xd
- %define wq wd
-@@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
- RET
-
- %if ARCH_X86_64
--cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
-+cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
- movifnidn edged, edgem
- mov ylimd, edged
- %else
--cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
-+cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
- %define wm [esp+0]
- %define hm [esp+4]
- %define edgem [esp+8]
-@@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
- jmp .sum_loop_y_noload
- %endif
-
--cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
-+cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s
- movifnidn sd, sm
- sub aq, (384+16-1)*4
- sub bq, (384+16-1)*2
-@@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
- RET
-
- %if ARCH_X86_64
--cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
-+cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \
- tmp_base, src_base, a_base, b_base, x, y
- movifnidn wd, wm
- mov hd, hm
-@@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
- psrlw m11, m12, 1 ; pw_128
- pxor m13, m13
- %else
--cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
-+cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y
- %define tmp_baseq r0m
- %define src_baseq r1m
- %define a_baseq r3m
-@@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
- RET
-
- %undef t2
--cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
-+cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt
- movifnidn wd, wm
- movd m0, wtm
- %if ARCH_X86_64
---
-GitLab
-
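For orientation: both the removed wiener patches and the new 0002-1160.patch switch every SIMD entry point from a hand-written _##ext suffix to the BF() wrapper, so the assembly labels gain a bitdepth suffix (wiener_filter7_8bpc_avx2 and friends) and the 8-bit and 16-bit templates can no longer collide at link time. A minimal sketch of how such a bitdepth-suffixing macro can be built with token pasting, as an illustration only and not dav1d's exact definition, follows:

    /* Hypothetical sketch of a BF()-style bitdepth-suffixing macro; the real
     * definition lives in dav1d's bitdepth templating headers and may differ. */
    #include <stdio.h>

    #define BITDEPTH 8               /* the template is compiled once per bitdepth */

    #if BITDEPTH == 8
    #define BF(name, suffix) name##_8bpc_##suffix
    #else
    #define BF(name, suffix) name##_16bpc_##suffix
    #endif

    /* BF(dav1d_wiener_filter7, avx2) expands to dav1d_wiener_filter7_8bpc_avx2,
     * matching the renamed cglobal labels in the assembly above. */
    void BF(dav1d_wiener_filter7, avx2)(void) {
        puts("dav1d_wiener_filter7_8bpc_avx2 called");
    }

    int main(void) {
        BF(dav1d_wiener_filter7, avx2)();
        return 0;
    }
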
diff --git a/0003-wiener_3.patch b/0003-wiener_3.patch
index b2852c881396..e69de29bb2d1 100644
--- a/0003-wiener_3.patch
+++ b/0003-wiener_3.patch
@@ -1,492 +0,0 @@
-From 43c61c3f259400cde5facbe7ce50769088b5f5b6 Mon Sep 17 00:00:00 2001
-From: "Nathan E. Egge" <unlord@xiph.org>
-Date: Sun, 10 Jan 2021 14:12:10 -0500
-Subject: [PATCH] x86: lr: Add AVX2 implementation of wiener filter for 16 bpc
-
-Relative speed-ups over C code (compared with gcc-9.3.0):
-
- C AVX2
-wiener_5tap_10bpc: 194892.0 14831.9 13.14x
-wiener_5tap_12bpc: 194295.4 14828.9 13.10x
-wiener_7tap_10bpc: 194391.7 19461.4 9.99x
-wiener_7tap_12bpc: 194136.1 19418.7 10.00x
----
- src/x86/looprestoration16_avx2.asm | 466 +++++++++++++++++++++++++++++
- 1 file changed, 466 insertions(+)
- create mode 100644 src/x86/looprestoration16_avx2.asm
-
-diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm
-new file mode 100644
-index 0000000..2012860
---- /dev/null
-+++ b/src/x86/looprestoration16_avx2.asm
-@@ -0,0 +1,466 @@
-+; Copyright (c) 2017-2021, The rav1e contributors
-+; Copyright (c) 2021, Nathan Egge
-+; All rights reserved.
-+;
-+; This source code is subject to the terms of the BSD 2 Clause License and
-+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-+; was not distributed with this source code in the LICENSE file, you can
-+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-+; Media Patent License 1.0 was not distributed with this source code in the
-+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-+
-+%include "config.asm"
-+%include "ext/x86/x86inc.asm"
-+
-+%if ARCH_X86_64
-+
-+SECTION_RODATA 32
-+
-+wiener5_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
-+wiener5_shufB: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13
-+wiener5_shufC: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1
-+wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-+pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-+
-+wiener7_shufB: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9
-+wiener7_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
-+wiener7_shufD: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1
-+rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
-+rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
-+wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-+
-+pq_3: dq (6 - 4) + 1
-+pq_5: dq (6 - 2) + 1
-+pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4))
-+pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2))
-+
-+pq_11: dq 12 - (6 - 4) + 1
-+pq_9: dq 12 - (6 - 2) + 1
-+nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8))
-+nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8))
-+
-+pb_wiener5_l: times 2 db 2, 3
-+pb_wiener5_r: times 2 db -6, -5
-+
-+pb_wiener7_l: times 2 db 4, 5
-+pb_wiener7_m: times 2 db -4, -3
-+pb_wiener7_r: times 2 db -8, -7
-+
-+SECTION .text
-+
-+INIT_YMM avx2
-+cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax
-+ movifnidn wd, wm
-+ movifnidn hd, hm
-+ movifnidn edgeb, edgem
-+ vbroadcasti128 m6, [wiener5_shufA]
-+ vpbroadcastd m12, [fq + 2]
-+ vbroadcasti128 m7, [wiener5_shufB]
-+ vpbroadcastw m13, [fq + 6]
-+ vbroadcasti128 m8, [wiener5_shufC]
-+ popcnt bdmaxd, bdmaxm
-+ vpbroadcastd m9, [pd_65540]
-+ movq xm10, [pq_3]
-+ cmp bdmaxd, 10
-+ je .bits10
-+ vpbroadcastd m9, [pd_262160]
-+ movq xm10, [pq_5]
-+.bits10:
-+ pxor m11, m11
-+ add wq, wq
-+ add srcq, wq
-+ add dstq, wq
-+ neg wq
-+ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x
-+.v_loop:
-+ mov xq, wq
-+ test edgeb, 1 ; LR_HAVE_LEFT
-+ jz .h_extend_left
-+ test leftq, leftq
-+ jz .h_loop
-+ movd xm4, [leftq + 4]
-+ vpblendd m4, [srcq + xq - 4], 0xfe
-+ add leftq, 8
-+ jmp .h_main
-+.h_extend_left:
-+ vbroadcasti128 m5, [srcq + xq]
-+ mova m4, [srcq + xq]
-+ palignr m4, m5, 12
-+ pshufb m4, [wiener5_l_shuf]
-+ jmp .h_main
-+.h_loop:
-+ movu m4, [srcq + xq - 4]
-+.h_main:
-+ movu m5, [srcq + xq + 4]
-+ test edgeb, 2 ; LR_HAVE_RIGHT
-+ jnz .h_have_right
-+ cmp xd, -18*2
-+ jl .h_have_right
-+ movd xm2, xd
-+ vpbroadcastd m0, [pb_wiener5_l]
-+ vpbroadcastd m1, [pb_wiener5_r]
-+ vpbroadcastb m2, xm2
-+ movu m3, [pb_0to31]
-+ psubb m0, m2
-+ psubb m1, m2
-+ pminub m0, m3
-+ pminub m1, m3
-+ pshufb m4, m0
-+ pshufb m5, m1
-+.h_have_right:
-+ pshufb m0, m4, m6
-+ pshufb m2, m4, m7
-+ paddw m0, m2
-+ pmaddwd m0, m12
-+ pshufb m1, m5, m6
-+ pshufb m3, m5, m7
-+ paddw m1, m3
-+ pmaddwd m1, m12
-+ pshufb m4, m8
-+ pmaddwd m4, m13
-+ pshufb m5, m8
-+ pmaddwd m5, m13
-+ paddd m0, m4
-+ paddd m1, m5
-+ paddd m0, m9
-+ paddd m1, m9
-+ psrad m0, xm10
-+ psrad m1, xm10
-+ packssdw m0, m1
-+ pmaxsw m0, m11
-+ mova [dstq + xq], m0
-+ add xq, 32
-+ jl .h_loop
-+ add srcq, ssq
-+ add dstq, 384*2
-+ dec hd
-+ jg .v_loop
-+ RET
-+
-+DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14
-+
-+INIT_YMM avx2
-+cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax
-+ movifnidn wd, wm
-+ movifnidn hd, hm
-+ movifnidn edgeb, edgem
-+ pxor m6, m6
-+ vpbroadcastd m7, [fq + 2]
-+ vpbroadcastd m8, [fq + 6]
-+ popcnt bdmaxd, bdmaxm
-+ vpbroadcastd m9, [nd_1047552]
-+ movq xm10, [pq_11]
-+ cmp bdmaxd, 10
-+ je .bits10
-+ vpbroadcastd m9, [nd_1048320]
-+ movq xm10, [pq_9]
-+.bits10:
-+ vpbroadcastw m11, bdmaxm
-+ add wq, wq
-+ add midq, wq
-+ add dstq, wq
-+ neg wq
-+ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
-+ mov msq, 2*384
-+ mov t0, midq
-+ lea t1, [t0 + msq]
-+ lea t2, [t1 + msq]
-+ lea t3, [t2 + msq]
-+ lea t4, [t3 + msq]
-+ test edgeb, 4 ; LR_HAVE_TOP
-+ jnz .have_top
-+ mov t0, t2
-+ mov t1, t2
-+.have_top:
-+ test edgeb, 8 ; LR_HAVE_BOTTOM
-+ jnz .v_loop
-+ cmp hd, 2
-+ jg .v_loop
-+ cmp hd, 1
-+ jne .limit_v
-+ mov t3, t2
-+.limit_v:
-+ mov t4, t3
-+.v_loop:
-+ mov xq, wq
-+.h_loop:
-+ mova m1, [t0 + xq]
-+ mova m2, [t1 + xq]
-+ mova m3, [t2 + xq]
-+ mova m4, [t3 + xq]
-+ mova m5, [t4 + xq]
-+ punpcklwd m0, m1, m2
-+ pmaddwd m0, m7
-+ punpckhwd m1, m2
-+ pmaddwd m1, m7
-+ punpcklwd m2, m5, m4
-+ pmaddwd m2, m7
-+ punpckhwd m5, m4
-+ pmaddwd m5, m7
-+ paddd m0, m2
-+ paddd m1, m5
-+ punpcklwd m2, m3, m6
-+ pmaddwd m2, m8
-+ punpckhwd m3, m6
-+ pmaddwd m3, m8
-+ paddd m0, m2
-+ paddd m1, m3
-+ paddd m0, m9
-+ paddd m1, m9
-+ psrad m0, xm10
-+ psrad m1, xm10
-+ packusdw m0, m1
-+ pminuw m0, m11
-+ mova [dstq + xq], m0
-+ add xq, 32
-+ jl .h_loop
-+ add dstq, dsq
-+ mov t0, t1
-+ mov t1, t2
-+ mov t2, t3
-+ mov t3, t4
-+ add t4, msq
-+ test edgeb, 8 ; LR_HAVE_BOTTOM
-+ jnz .have_bottom
-+ cmp hd, 3
-+ jg .have_bottom
-+ mov t4, t3
-+.have_bottom:
-+ dec hd
-+ jg .v_loop
-+ RET
-+
-+INIT_YMM avx2
-+cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh
-+ movifnidn wd, wm
-+ movifnidn hd, hm
-+ movifnidn edgeb, edgem
-+ vpbroadcastd m7, [fq]
-+ vpbroadcastd m8, [fq + 4]
-+ vbroadcasti128 m10, [rev_w]
-+ vbroadcasti128 m11, [wiener5_shufA]
-+ vbroadcasti128 m12, [wiener7_shufB]
-+ vbroadcasti128 m13, [wiener7_shufC]
-+ vbroadcasti128 m14, [wiener7_shufD]
-+ vbroadcasti128 m15, [rev_d]
-+ popcnt bdmaxd, bdmaxm
-+ vpbroadcastd m9, [pd_65540]
-+ mov rhq, [pq_3]
-+ cmp bdmaxd, 10
-+ je .bits10
-+ vpbroadcastd m9, [pd_262160]
-+ mov rhq, [pq_5]
-+.bits10:
-+ add wq, wq
-+ add srcq, wq
-+ add dstq, wq
-+ neg wq
-+ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh
-+.v_loop:
-+ mov xq, wq
-+ test edgeb, 1 ; LR_HAVE_LEFT
-+ jz .h_extend_left
-+ test leftq, leftq
-+ jz .h_loop
-+ movq xm4, [leftq + 2]
-+ vpblendw xm4, [srcq + xq - 6], 0xf8
-+ vinserti128 m4, [srcq + xq + 10], 1
-+ add leftq, 8
-+ jmp .h_main
-+.h_extend_left:
-+ vbroadcasti128 m5, [srcq + xq]
-+ mova m4, [srcq + xq]
-+ palignr m4, m5, 10
-+ pshufb m4, [wiener7_l_shuf]
-+ jmp .h_main
-+.h_loop:
-+ movu m4, [srcq + xq - 6]
-+.h_main:
-+ movu m5, [srcq + xq + 2]
-+ movu m6, [srcq + xq + 6]
-+ test edgeb, 2 ; LR_HAVE_RIGHT
-+ jnz .h_have_right
-+ cmp xd, -19*2
-+ jl .h_have_right
-+ movd xm3, xd
-+ vpbroadcastd m0, [pb_wiener7_l]
-+ vpbroadcastd m1, [pb_wiener7_m]
-+ vpbroadcastd m2, [pb_wiener7_r]
-+ vpbroadcastb m3, xm3
-+ psubb m0, m3
-+ psubb m1, m3
-+ psubb m2, m3
-+ movu m3, [pb_0to31]
-+ pminub m0, m3
-+ pminub m1, m3
-+ pminub m2, m3
-+ pshufb m4, m0
-+ pshufb m5, m1
-+ pshufb m6, m2
-+ cmp xd, -9*2
-+ jne .hack
-+ vpbroadcastw xm3, [srcq + xq + 16]
-+ vinserti128 m5, xm3, 1
-+ jmp .h_have_right
-+.hack:
-+ cmp xd, -1*2
-+ jne .h_have_right
-+ vpbroadcastw xm5, [srcq + xq]
-+.h_have_right:
-+ pshufb m6, m10
-+ pshufb m0, m4, m11
-+ pshufb m2, m5, m12
-+ paddw m0, m2
-+ pmaddwd m0, m7
-+ pshufb m2, m4, m13
-+ pshufb m4, m14
-+ paddw m2, m4
-+ pmaddwd m2, m8
-+ pshufb m1, m6, m11
-+ pshufb m5, m11
-+ pmaddwd m1, m7
-+ pmaddwd m5, m7
-+ pshufb m3, m6, m13
-+ pshufb m6, m14
-+ paddw m3, m6
-+ pmaddwd m3, m8
-+ paddd m0, m2
-+ paddd m1, m3
-+ pshufb m1, m15
-+ paddd m1, m5
-+ movq xm4, rhq
-+ pxor m5, m5
-+ paddd m0, m9
-+ paddd m1, m9
-+ psrad m0, xm4
-+ psrad m1, xm4
-+ packssdw m0, m1
-+ pmaxsw m0, m5
-+ mova [dstq + xq], m0
-+ add xq, 32
-+ jl .h_loop
-+ add srcq, ssq
-+ add dstq, 384*2
-+ dec hd
-+ jg .v_loop
-+ RET
-+
-+INIT_YMM avx2
-+cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax
-+ movifnidn wd, wm
-+ movifnidn hd, hm
-+ movifnidn edgeb, edgem
-+ pxor m6, m6
-+ vpbroadcastd m7, [fq]
-+ vpbroadcastw m8, [fq + 4]
-+ vpbroadcastd m9, [fq + 6]
-+ popcnt bdmaxd, bdmaxm
-+ vpbroadcastd m10, [nd_1047552]
-+ movq xm11, [pq_11]
-+ cmp bdmaxd, 10
-+ je .bits10
-+ vpbroadcastd m10, [nd_1048320]
-+ movq xm11, [pq_9]
-+.bits10:
-+ vpbroadcastw m12, bdmaxm
-+ add wq, wq
-+ add midq, wq
-+ add dstq, wq
-+ neg wq
-+ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
-+ mov msq, 2*384
-+ mov t0, midq
-+ mov t1, t0
-+ lea t2, [t1 + msq]
-+ lea t3, [t2 + msq]
-+ lea t4, [t3 + msq]
-+ lea t5, [t4 + msq]
-+ lea t6, [t5 + msq]
-+ test edgeb, 4 ; LR_HAVE_TOP
-+ jnz .have_top
-+ mov t0, t3
-+ mov t1, t3
-+ mov t2, t3
-+.have_top:
-+ cmp hd, 3
-+ jg .v_loop
-+ test edgeb, 8 ; LR_HAVE_BOTTOM
-+ jz .no_bottom0
-+ cmp hd, 1
-+ jg .v_loop
-+ jmp .h3
-+.no_bottom0:
-+ cmp hd, 2
-+ je .h2
-+ jns .h3
-+.h1:
-+ mov t4, t3
-+.h2:
-+ mov t5, t4
-+.h3:
-+ mov t6, t5
-+.v_loop:
-+ mov xq, wq
-+.h_loop:
-+ mova m1, [t0 + xq]
-+ mova m2, [t1 + xq]
-+ mova m3, [t5 + xq]
-+ mova m4, [t6 + xq]
-+ punpcklwd m0, m1, m2
-+ pmaddwd m0, m7
-+ punpckhwd m1, m2
-+ pmaddwd m1, m7
-+ punpcklwd m2, m4, m3
-+ pmaddwd m2, m7
-+ punpckhwd m4, m3
-+ pmaddwd m4, m7
-+ paddd m0, m2
-+ paddd m1, m4
-+ mova m3, [t2 + xq]
-+ mova m4, [t4 + xq]
-+ punpcklwd m2, m3, m4
-+ pmaddwd m2, m8
-+ punpckhwd m3, m4
-+ pmaddwd m3, m8
-+ paddd m0, m2
-+ paddd m1, m3
-+ mova m3, [t3 + xq]
-+ punpcklwd m2, m3, m6
-+ pmaddwd m2, m9
-+ punpckhwd m3, m6
-+ pmaddwd m3, m9
-+ paddd m0, m2
-+ paddd m1, m3
-+ paddd m0, m10
-+ paddd m1, m10
-+ psrad m0, xm11
-+ psrad m1, xm11
-+ packusdw m0, m1
-+ pminuw m0, m12
-+ mova [dstq + xq], m0
-+ add xq, 32
-+ jl .h_loop
-+ add dstq, dsq
-+ mov t0, t1
-+ mov t1, t2
-+ mov t2, t3
-+ mov t3, t4
-+ mov t4, t5
-+ mov t5, t6
-+ add t6, msq
-+ cmp hd, 4
-+ jg .next_row
-+ test edgeb, 8 ; LR_HAVE_BOTTOM
-+ jz .no_bottom
-+ cmp hd, 2
-+ jg .next_row
-+.no_bottom:
-+ mov t6, t5
-+.next_row:
-+ dec hd
-+ jg .v_loop
-+ RET
-+
-+%endif ; ARCH_X86_64
---
-GitLab
-
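The assembly removed above in 0003-wiener_3.patch implements the Wiener filter as two separable passes; the vertical 5-tap pass in wiener_filter5_v_16bpc reduces to a symmetric weighted sum of five intermediate rows per output sample, followed by a rounding add, an arithmetic shift and a clamp to the bitdepth maximum. A scalar model of that per-sample math, with the rounding offset and shift left as parameters since the exact constants depend on bitdepth and on the scaling done by the horizontal pass, might look like:

    /* Scalar sketch of one output sample of the symmetric 5-tap vertical pass;
     * `mid` points at the centre row of the int16 buffer written by the
     * horizontal pass. round_offset and shift are illustrative parameters,
     * not dav1d's exact constants. */
    #include <stddef.h>
    #include <stdint.h>

    static inline uint16_t wiener5_v_sample(const int16_t *mid, ptrdiff_t mid_stride,
                                            const int16_t taps[3], int32_t round_offset,
                                            int shift, int bdmax)
    {
        /* symmetric filter: taps[0], taps[1], taps[2], taps[1], taps[0] */
        int32_t sum = taps[0] * (mid[-2 * mid_stride] + mid[2 * mid_stride])
                    + taps[1] * (mid[-1 * mid_stride] + mid[1 * mid_stride])
                    + taps[2] *  mid[0];
        sum = (sum + round_offset) >> shift;
        if (sum < 0)     sum = 0;       /* packusdw-style unsigned saturation */
        if (sum > bdmax) sum = bdmax;   /* pminuw against the bitdepth maximum */
        return (uint16_t)sum;
    }
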
diff --git a/0004-wiener_4.patch b/0004-wiener_4.patch
index 1876e7bd1d25..e69de29bb2d1 100644
--- a/0004-wiener_4.patch
+++ b/0004-wiener_4.patch
@@ -1,101 +0,0 @@
-From 2d59aa7b52713b77243bda12066213fca8447f9d Mon Sep 17 00:00:00 2001
-From: "Nathan E. Egge" <unlord@xiph.org>
-Date: Wed, 13 Jan 2021 14:54:42 -0500
-Subject: [PATCH] Enable AVX2 wiener filter HBD assembly
-
----
- src/meson.build | 1 +
- src/x86/looprestoration_init_tmpl.c | 41 +++++++++++++++++++++++++++--
- 2 files changed, 40 insertions(+), 2 deletions(-)
-
-diff --git a/src/meson.build b/src/meson.build
-index ca0b406..c5c305d 100644
---- a/src/meson.build
-+++ b/src/meson.build
-@@ -209,7 +209,8 @@ if is_asm_enabled
-
- if dav1d_bitdepths.contains('16')
- libdav1d_sources_asm += files(
-+ 'x86/looprestoration16_avx2.asm',
- 'x86/mc16_avx2.asm',
- )
- endif
-
-diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c
-index 11ebdd1..dfc9f84 100644
---- a/src/x86/looprestoration_init_tmpl.c
-+++ b/src/x86/looprestoration_init_tmpl.c
-@@ -31,9 +31,41 @@
- #include "common/intops.h"
- #include "src/tables.h"
-
-+#if BITDEPTH != 8
-+#undef decl_wiener_filter_fn
-+#define decl_wiener_filter_fn(name, ext) \
-+void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \
-+ ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \
-+ int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
-+void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \
-+ const int16_t fv[7], int w, int h, \
-+ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
-+static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
-+ const pixel (*const left)[4], \
-+ const pixel *lpf, const ptrdiff_t lpf_stride, \
-+ const int w, const int h, const int16_t filter[2][8], \
-+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \
-+ ALIGN_STK_64(int16_t, mid, 68 * 384,); \
-+ BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, filter[0], w, h, \
-+ edges HIGHBD_TAIL_SUFFIX); \
-+ if (edges & LR_HAVE_TOP) { \
-+ BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, filter[0], w, 2, \
-+ edges HIGHBD_TAIL_SUFFIX); \
-+ } \
-+ if (edges & LR_HAVE_BOTTOM) { \
-+ BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \
-+ lpf_stride, filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \
-+ } \
-+ BF(name##_v, ext)(dst, dst_stride, mid, filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \
-+}
-+#define decl_wiener_filter_fns(ext) \
-+decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \
-+decl_wiener_filter_fn(dav1d_wiener_filter5, ext)
-+#else
- #define decl_wiener_filter_fns(ext) \
- decl_wiener_filter_fn(BF(dav1d_wiener_filter7, ext)); \
- decl_wiener_filter_fn(BF(dav1d_wiener_filter5, ext))
-+#endif
-
- #define decl_sgr_filter_fn(ext) \
- void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
-@@ -163,11 +195,14 @@ decl_wiener_filter_fns(sse2);
- decl_wiener_filter_fns(ssse3);
- decl_sgr_filter_fn(ssse3)
- # if ARCH_X86_64
--decl_wiener_filter_fns(avx2);
- decl_sgr_filter_fn(avx2)
- # endif
- #endif
-
-+#if ARCH_X86_64
-+decl_wiener_filter_fns(avx2);
-+#endif
-+
- COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
- const unsigned flags = dav1d_get_cpu_flags();
-
-@@ -185,9 +220,11 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
- #endif
-
- if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--#if BITDEPTH == 8 && ARCH_X86_64
-+#if ARCH_X86_64
- c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
- c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
-+# if BITDEPTH == 8
- c->selfguided = BF(sgr_filter, avx2);
-+# endif
- #endif
- }
---
-GitLab
-
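The wrapper that the removed 0004-wiener_4.patch generates with decl_wiener_filter_fn() stitches the two assembly passes together through a 68x384 int16_t scratch buffer: the horizontal pass fills two rows of top padding, up to 64 body rows and two rows of bottom padding, and the vertical pass then reads the neighbouring rows it needs for each output line. The index arithmetic, spelled out as hypothetical helpers rather than anything from the dav1d API:

    /* Layout of the intermediate buffer used by the HBD wiener wrapper above:
     * 68 rows of 384 int16 samples (2 top padding + 64 body + 2 bottom padding).
     * These helpers are illustrative only. */
    #include <stdint.h>

    #define MID_STRIDE 384
    #define MID_ROWS    68

    static inline int16_t *mid_top(int16_t *mid)            { return mid; }                        /* rows filled from lpf when LR_HAVE_TOP */
    static inline int16_t *mid_body(int16_t *mid)           { return &mid[2 * MID_STRIDE]; }       /* rows 0..h-1 of the unit */
    static inline int16_t *mid_bottom(int16_t *mid, int h)  { return &mid[(2 + h) * MID_STRIDE]; } /* rows filled from lpf + 6*stride when LR_HAVE_BOTTOM */
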
diff --git a/PKGBUILD b/PKGBUILD
index 658a3cdee81e..e4274b503901 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -1,9 +1,9 @@
# Maintainer: Ben Grant <ben@190n.org>
-# Maintainer: Snoop05
+# Maintainer: Matej Dian <snoop05b@gmail.com>
_testvideo=Sparks-5994fps-AV1-10bit-1920x1080-film-grain-synthesis-2013kbps.obu
pkgname=dav1d-git-optimized
-pkgver=r1556.05d05f9
+pkgver=r1617.baa9237
pkgrel=1
license=('BSD')
pkgdesc='AV1 cross-platform Decoder, focused on speed and correctness -- latest git version compiled with optimizations'
@@ -15,15 +15,11 @@ makedepends=('meson' 'git' 'nasm')
source=('git+https://code.videolan.org/videolan/dav1d.git'
"http://download.opencontent.netflix.com.s3.amazonaws.com/AV1/Sparks/$_testvideo"
'0001-1112.patch'
- '0002-wiener_2.patch'
- '0003-wiener_3.patch'
- '0004-wiener_4.patch')
+ '0002-1160.patch')
sha256sums=('SKIP'
'e56e20de5bfad7ec073d3e53ea6c760d0b11ed143f087b1bc32230e4840fea87'
- '421c4732d3a3fc85428263f4e4419f7b3bfc7059a29c2b81055a6ebf4345d0eb'
- '32385f2316886cef326e7887a3de96fdada2ee723b269908794ed770da460626'
- '1cf8db585f98ef8e63bb3f44f11679cdc554377f58964bebc7ca29aa1639d1ea'
- '5e46d8d6fcf2d2cdb062368b23af534ecf123321594f9d548a6f14d80d16d981')
+ '83807b996384f147cea3702a1a7fcd4accfc04c3937fea11d0f74b615c37f8d2'
+ 'de289262c9d4e1964e7b9130a5619c6501e82a074794ced6d7da7922630973f3')
pkgver () {
cd dav1d
@@ -32,11 +28,8 @@ pkgver () {
prepare () {
cd dav1d
- # from https://code.videolan.org/videolan/dav1d/-/merge_requests/1112
patch -Np1 -i ${srcdir}/0001-1112.patch
- patch -Np1 -i ${srcdir}/0002-wiener_2.patch
- patch -Np1 -i ${srcdir}/0003-wiener_3.patch
- patch -Np1 -i ${srcdir}/0004-wiener_4.patch
+ patch -Np1 -i ${srcdir}/0002-1160.patch
}
build () {
@@ -48,7 +41,7 @@ build () {
-Db_lto=false \
-Db_pgo=generate
ninja -C build
- ./build/tools/dav1d -i "$srcdir/$_testvideo" --muxer null --framethreads $(nproc) --tilethread 4
+ LD_PRELOAD=./build/src/libdav1d.so ./build/tools/dav1d -i "$srcdir/$_testvideo" --muxer null --framethreads $(nproc) --tilethreads 4 --pfthreads $(nproc)
meson configure build -Db_pgo=use
ninja -C build
}