author      Matej Dian    2021-02-17 19:38:06 +0100
committer   Matej Dian    2021-02-17 19:38:06 +0100
commit      0cf5a0c19b7a52ab1d3426a724d6c8f0c863118f (patch)
tree        d36d578406412127128ddc34f356062274d907e7
parent      c8557275c5b4b6acbd1c963c61fd3518005975d5 (diff)
download    aur-0cf5a0c19b7a52ab1d3426a724d6c8f0c863118f.tar.gz
update patches, misc changes
-rw-r--r--   0001-1112.patch     |    4
-rw-r--r--   0002-1160.patch     | 1331
-rw-r--r--   0002-wiener_2.patch |  661
-rw-r--r--   0003-wiener_3.patch |  492
-rw-r--r--   0004-wiener_4.patch |  101
-rw-r--r--   PKGBUILD            |   21

6 files changed, 1341 insertions, 1269 deletions
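Context for the renames in the diff below (an illustrative sketch, not part of the commit): these patches follow dav1d's convention of giving every bitdepth-templated symbol an _8bpc/_16bpc suffix through a BF() macro, so the same C template can be compiled once per bitdepth and the DSP init can hook up the matching asm per CPU flag. The BF() definition in this standalone sketch is assumed, inferred from the expanded names visible in the diff (e.g. wiener_filter7_8bpc_avx2); the function bodies and the have_avx2 flag are stand-ins, not dav1d's real entry points.

    #include <stdio.h>

    /* Assumed expansion, inferred from the suffixed symbols in the diff;
     * dav1d compiles each template once per BITDEPTH value. */
    #define BITDEPTH 8
    #if BITDEPTH == 8
    #define BF(name, suffix) name##_8bpc_##suffix
    #else
    #define BF(name, suffix) name##_16bpc_##suffix
    #endif

    /* Stand-ins for the real asm entry points declared in
     * looprestoration_init_tmpl.c. */
    static void dav1d_wiener_filter7_8bpc_ssse3(void) { puts("wiener7, 8 bpc, SSSE3"); }
    static void dav1d_wiener_filter7_8bpc_avx2(void)  { puts("wiener7, 8 bpc, AVX2"); }

    int main(void) {
        /* Stands in for dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_AVX2. */
        int have_avx2 = 1;

        /* Mirrors the init code in the diff: start with the baseline
         * implementation, then upgrade the pointer when the CPU allows. */
        void (*wiener7)(void) = BF(dav1d_wiener_filter7, ssse3);
        if (have_avx2)
            wiener7 = BF(dav1d_wiener_filter7, avx2); /* -> ..._8bpc_avx2 */
        wiener7();
        return 0;
    }

With BITDEPTH set to 16 instead, the same source resolves to the _16bpc_ symbols, which is what lets the third patch in 0002-1160.patch enable the new AVX2 high-bitdepth wiener filter without duplicating the init logic.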
diff --git a/0001-1112.patch b/0001-1112.patch index e5ed26b240a9..b60c79a4a53a 100644 --- a/0001-1112.patch +++ b/0001-1112.patch @@ -2968,10 +2968,12 @@ diff --git a/src/meson.build b/src/meson.build index f9f5c120..ff62a9d8 100644 --- a/src/meson.build +++ b/src/meson.build -@@ -208,6 +208,7 @@ if is_asm_enabled +@@ -208,8 +208,9 @@ if is_asm_enabled if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( + 'x86/cdef16_avx2.asm', + 'x86/cdef16_sse.asm', + 'x86/mc16_avx2.asm', ) endif diff --git a/0002-1160.patch b/0002-1160.patch new file mode 100644 index 000000000000..06f9b186c4ea --- /dev/null +++ b/0002-1160.patch @@ -0,0 +1,1331 @@ +From 541a62936532c3edd83edf6eb7ec83ab3e8bac5f Mon Sep 17 00:00:00 2001 +From: "Nathan E. Egge" <unlord@xiph.org> +Date: Tue, 29 Dec 2020 06:58:33 -0500 +Subject: [PATCH 1/3] Add bpc suffix to lr functions + +--- + src/x86/looprestoration.asm | 36 ++-- + src/x86/looprestoration_init_tmpl.c | 266 ++++++++++++++-------------- + src/x86/looprestoration_sse.asm | 60 +++---- + 3 files changed, 177 insertions(+), 185 deletions(-) + +diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm +index 44aaaf49..71e3e0d2 100644 +--- a/src/x86/looprestoration.asm ++++ b/src/x86/looprestoration.asm +@@ -88,8 +88,8 @@ SECTION .text + DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers + + INIT_YMM avx2 +-cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, flt, h ++cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, flt, h + mov fltq, fltmp + mov edged, r8m + mov wd, wm +@@ -436,8 +436,8 @@ ALIGN function_align + add dstq, dst_strideq + ret + +-cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, flt, h ++cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, flt, h + mov fltq, fltmp + mov edged, r8m + mov wd, wm +@@ -554,7 +554,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + jnz .h_have_right + cmp r10d, -33 + jl .h_have_right +- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right ++ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right + .h_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 +@@ -613,7 +613,7 @@ ALIGN function_align + jnz .hv_have_right + cmp r10d, -33 + jl .hv_have_right +- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right ++ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right + .hv_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 +@@ -727,8 +727,8 @@ ALIGN function_align + jl .v_loop + ret + +-cglobal sgr_filter_5x5, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, params, h ++cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, params, h + %define base r12-sgr_x_by_x-256*4 + lea r12, [sgr_x_by_x+256*4] + mov paramsq, paramsmp +@@ -1187,8 +1187,8 @@ ALIGN function_align + add dstq, dst_strideq + ret + +-cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, params, h ++cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, params, h + %define base r14-sgr_x_by_x-256*4 + mov paramsq, paramsmp + mov edged, r8m +@@ -1298,7 +1298,7 @@ cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ + jnz .h_have_right + cmp r10d, -17 + jl 
.h_have_right +- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right ++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right + .h_have_right: + pshufb m0, m5, m8 + pmullw m2, m0, m0 +@@ -1346,7 +1346,7 @@ ALIGN function_align + jnz .hv_have_right + cmp r10d, -17 + jl .hv_have_right +- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right ++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right + .hv_have_right: + pshufb m0, m5, m8 + pmullw m3, m0, m0 +@@ -1546,8 +1546,8 @@ ALIGN function_align + add dstq, dst_strideq + ret + +-cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, params, h ++cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, params, h + %define base r12-sgr_x_by_x-256*4 + lea r12, [sgr_x_by_x+256*4] + mov paramsq, paramsmp +@@ -1573,7 +1573,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ + call .h_top + add lpfq, lpf_strideq + mov t2, t1 +- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).top_fixup ++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup + add t1, 400*12 + call .h_top + lea r10, [lpfq+lpf_strideq*4] +@@ -1681,7 +1681,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ + jnz .h_have_right + cmp r10d, -18 + jl .h_have_right +- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right ++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right + .h_have_right: + pshufb m6, m5, m9 + pshufb m4, m5, m10 +@@ -1742,7 +1742,7 @@ ALIGN function_align + jnz .hv0_have_right + cmp r10d, -18 + jl .hv0_have_right +- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right ++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right + .hv0_have_right: + pshufb m6, m5, m9 + pshufb m4, m5, m10 +@@ -1853,7 +1853,7 @@ ALIGN function_align + jnz .hv1_have_right + cmp r10d, -18 + jl .hv1_have_right +- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right ++ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right + .hv1_have_right: + pshufb m6, m5, m9 + pshufb m3, m5, m10 +diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c +index 57727787..a9aa5acf 100644 +--- a/src/x86/looprestoration_init_tmpl.c ++++ b/src/x86/looprestoration_init_tmpl.c +@@ -30,179 +30,171 @@ + + #include "common/intops.h" + +-#define WIENER_FILTER(ext) \ +-void dav1d_wiener_filter7_##ext(pixel *dst, ptrdiff_t dst_stride, \ +- const pixel (*left)[4], const pixel *lpf, \ +- ptrdiff_t lpf_stride, int w, int h, \ +- const LooprestorationParams *params, \ +- enum LrEdgeFlags edges); \ +-void dav1d_wiener_filter5_##ext(pixel *dst, ptrdiff_t dst_stride, \ +- const pixel (*left)[4], const pixel *lpf, \ +- ptrdiff_t lpf_stride, int w, int h, \ +- const LooprestorationParams *params, \ +- enum LrEdgeFlags edges); ++#define decl_wiener_filter_fns(ext) \ ++decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \ ++decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext)) + +-#define SGR_FILTER(ext) \ +-void dav1d_sgr_filter_5x5_##ext(pixel *dst, ptrdiff_t dst_stride, \ +- const pixel (*left)[4], const pixel *lpf, \ +- ptrdiff_t lpf_stride, int w, int h, \ +- const LooprestorationParams *params, \ +- enum LrEdgeFlags edges); \ +-void dav1d_sgr_filter_3x3_##ext(pixel *dst, ptrdiff_t dst_stride, \ +- const pixel (*left)[4], const pixel *lpf, \ +- ptrdiff_t lpf_stride, int w, int h, \ +- const 
LooprestorationParams *params, \ +- enum LrEdgeFlags edges); \ +-void dav1d_sgr_filter_mix_##ext(pixel *dst, ptrdiff_t dst_stride, \ +- const pixel (*left)[4], const pixel *lpf, \ +- ptrdiff_t lpf_stride, int w, int h, \ +- const LooprestorationParams *params, \ +- enum LrEdgeFlags edges); ++#define decl_sgr_filter_fns(ext) \ ++void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \ ++ const pixel (*left)[4], const pixel *lpf, \ ++ ptrdiff_t lpf_stride, int w, int h, \ ++ const LooprestorationParams *params, \ ++ enum LrEdgeFlags edges); \ ++void BF(dav1d_sgr_filter_3x3, ext)(pixel *dst, ptrdiff_t dst_stride, \ ++ const pixel (*left)[4], const pixel *lpf, \ ++ ptrdiff_t lpf_stride, int w, int h, \ ++ const LooprestorationParams *params, \ ++ enum LrEdgeFlags edges); \ ++void BF(dav1d_sgr_filter_mix, ext)(pixel *dst, ptrdiff_t dst_stride, \ ++ const pixel (*left)[4], const pixel *lpf, \ ++ ptrdiff_t lpf_stride, int w, int h, \ ++ const LooprestorationParams *params, \ ++ enum LrEdgeFlags edges); + + /* FIXME: Replace with a port of the AVX2 code */ + #define SGR_FILTER_OLD(ext) \ +-void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \ +- const pixel (*left)[4], \ +- const pixel *src, const ptrdiff_t stride, \ +- const int w, const int h, \ +- const enum LrEdgeFlags edges); \ +-void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \ +- const int w, const int h, \ +- const enum LrEdgeFlags edges); \ +-void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \ +- const int w, const int h, const unsigned s); \ +-void dav1d_sgr_finish_filter1_##ext(coef *tmp, \ +- const pixel *src, const ptrdiff_t stride, \ +- const int32_t *a, const int16_t *b, \ +- const int w, const int h); \ ++void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \ ++ const pixel (*left)[4], \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \ ++ const int w, const int h, const unsigned s); \ ++void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int32_t *a, const int16_t *b, \ ++ const int w, const int h); \ + \ + /* filter with a 3x3 box (radius=1) */ \ +-static void dav1d_sgr_filter1_##ext(coef *tmp, \ +- const pixel *src, const ptrdiff_t stride, \ +- const pixel (*left)[4], \ +- const pixel *lpf, const ptrdiff_t lpf_stride, \ +- const int w, const int h, const int strength, \ +- const enum LrEdgeFlags edges) \ ++static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const pixel (*left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const int strength, \ ++ const enum LrEdgeFlags edges) \ + { \ + ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ + int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ + ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ + int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ + \ +- dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ ++ BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \ + if (edges & LR_HAVE_TOP) \ +- dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ +- NULL, lpf, lpf_stride, w, 2, edges); \ ++ BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], 
&sum[-2 * (384 + 16)], \ ++ NULL, lpf, lpf_stride, w, 2, edges); \ + \ + if (edges & LR_HAVE_BOTTOM) \ +- dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ +- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ +- lpf_stride, w, 2, edges); \ ++ BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ ++ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ ++ lpf_stride, w, 2, edges); \ + \ +- dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \ +- dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \ +- dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \ ++ BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \ ++ BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \ ++ BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \ + } \ + \ +-void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \ +- const pixel (*left)[4], \ +- const pixel *src, const ptrdiff_t stride, \ +- const int w, const int h, \ +- const enum LrEdgeFlags edges); \ +-void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \ +- const int w, const int h, \ +- const enum LrEdgeFlags edges); \ +-void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \ +- const int w, const int h, const int strength); \ +-void dav1d_sgr_finish_filter2_##ext(coef *tmp, \ +- const pixel *src, const ptrdiff_t stride, \ +- const int32_t *a, const int16_t *b, \ +- const int w, const int h); \ ++void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \ ++ const pixel (*left)[4], \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \ ++ const int w, const int h, const int strength); \ ++void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int32_t *a, const int16_t *b, \ ++ const int w, const int h); \ + \ + /* filter with a 5x5 box (radius=2) */ \ +-static void dav1d_sgr_filter2_##ext(coef *tmp, \ +- const pixel *src, const ptrdiff_t stride, \ +- const pixel (*left)[4], \ +- const pixel *lpf, const ptrdiff_t lpf_stride, \ +- const int w, const int h, const int strength, \ +- const enum LrEdgeFlags edges) \ ++static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const pixel (*left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const int strength, \ ++ const enum LrEdgeFlags edges) \ + { \ + ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ + int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ + ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ + int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ + \ +- dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ ++ BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \ + if (edges & LR_HAVE_TOP) \ +- dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ +- NULL, lpf, lpf_stride, w, 2, edges); \ ++ BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ ++ NULL, lpf, lpf_stride, w, 2, edges); \ + \ + if (edges & LR_HAVE_BOTTOM) \ +- dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ +- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ +- lpf_stride, w, 2, edges); \ ++ BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ ++ 
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ ++ lpf_stride, w, 2, edges); \ + \ +- dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \ +- dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \ +- dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \ ++ BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \ ++ BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \ ++ BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \ + } \ + \ +-void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \ +- const coef *t1, const int w, const int h, \ +- const int wt); \ +-void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \ +- const coef *t1, const coef *t2, \ +- const int w, const int h, \ +- const uint32_t wt); \ ++void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \ ++ const coef *t1, const int w, const int h, \ ++ const int wt); \ ++void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \ ++ const coef *t1, const coef *t2, \ ++ const int w, const int h, \ ++ const uint32_t wt); \ + \ +-static void sgr_filter_5x5_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ +- const pixel (*const left)[4], \ +- const pixel *lpf, const ptrdiff_t lpf_stride, \ +- const int w, const int h, \ +- const LooprestorationParams *const params, \ +- const enum LrEdgeFlags edges) \ ++static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ ++ const pixel (*const left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, \ ++ const LooprestorationParams *const params, \ ++ const enum LrEdgeFlags edges) \ + { \ + ALIGN_STK_32(coef, tmp, 64 * 384,); \ +- dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ +- w, h, params->sgr.s0, edges); \ +- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w0); \ ++ BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, params->sgr.s0, edges); \ ++ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \ + } \ +-static void sgr_filter_3x3_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ +- const pixel (*const left)[4], \ +- const pixel *lpf, const ptrdiff_t lpf_stride, \ +- const int w, const int h, \ +- const LooprestorationParams *const params, \ +- const enum LrEdgeFlags edges) \ ++static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ ++ const pixel (*const left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, \ ++ const LooprestorationParams *const params, \ ++ const enum LrEdgeFlags edges) \ + { \ + ALIGN_STK_32(coef, tmp, 64 * 384,); \ +- dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ +- w, h, params->sgr.s1, edges); \ +- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w1); \ ++ BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, params->sgr.s1, edges); \ ++ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \ + } \ +-static void sgr_filter_mix_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ +- const pixel (*const left)[4], \ +- const pixel *lpf, const ptrdiff_t lpf_stride, \ +- const int w, const int h, \ +- const LooprestorationParams *const params, \ +- const enum LrEdgeFlags edges) \ ++static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ ++ const pixel (*const left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, \ ++ const 
LooprestorationParams *const params, \ ++ const enum LrEdgeFlags edges) \ + { \ + ALIGN_STK_32(coef, tmp1, 64 * 384,); \ + ALIGN_STK_32(coef, tmp2, 64 * 384,); \ +- dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ +- w, h, params->sgr.s0, edges); \ +- dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ +- w, h, params->sgr.s1, edges); \ ++ BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, params->sgr.s0, edges); \ ++ BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, params->sgr.s1, edges); \ + const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \ +- dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \ ++ BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \ + } + + #if BITDEPTH == 8 +-WIENER_FILTER(sse2) +-WIENER_FILTER(ssse3) ++decl_wiener_filter_fns(sse2); ++decl_wiener_filter_fns(ssse3); + SGR_FILTER_OLD(ssse3) + # if ARCH_X86_64 +-WIENER_FILTER(avx2) +-SGR_FILTER(avx2) ++decl_wiener_filter_fns(avx2); ++decl_sgr_filter_fns(avx2) + # endif + #endif + +@@ -211,25 +203,25 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; + #if BITDEPTH == 8 +- c->wiener[0] = dav1d_wiener_filter7_sse2; +- c->wiener[1] = dav1d_wiener_filter5_sse2; ++ c->wiener[0] = BF(dav1d_wiener_filter7, sse2); ++ c->wiener[1] = BF(dav1d_wiener_filter5, sse2); + #endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + #if BITDEPTH == 8 +- c->wiener[0] = dav1d_wiener_filter7_ssse3; +- c->wiener[1] = dav1d_wiener_filter5_ssse3; +- c->sgr[0] = sgr_filter_5x5_ssse3; +- c->sgr[1] = sgr_filter_3x3_ssse3; +- c->sgr[2] = sgr_filter_mix_ssse3; ++ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3); ++ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3); ++ c->sgr[0] = BF(sgr_filter_5x5, ssse3); ++ c->sgr[1] = BF(sgr_filter_3x3, ssse3); ++ c->sgr[2] = BF(sgr_filter_mix, ssse3); + #endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + #if BITDEPTH == 8 && ARCH_X86_64 +- c->wiener[0] = dav1d_wiener_filter7_avx2; +- c->wiener[1] = dav1d_wiener_filter5_avx2; +- c->sgr[0] = dav1d_sgr_filter_5x5_avx2; +- c->sgr[1] = dav1d_sgr_filter_3x3_avx2; +- c->sgr[2] = dav1d_sgr_filter_mix_avx2; ++ c->wiener[0] = BF(dav1d_wiener_filter7, avx2); ++ c->wiener[1] = BF(dav1d_wiener_filter5, avx2); ++ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2); ++ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2); ++ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2); + #endif + } +diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm +index 5d3ca492..4b77138d 100644 +--- a/src/x86/looprestoration_sse.asm ++++ b/src/x86/looprestoration_sse.asm +@@ -97,8 +97,8 @@ SECTION .text + %macro WIENER 0 + %if ARCH_X86_64 + DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers +-cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, flt, h, x ++cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, flt, h, x + %define base 0 + mov fltq, fltmp + mov edged, r8m +@@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5 + %define m11 [stk+96] + %define stk_off 112 + %endif +-cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride ++cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride + %define base r6-pb_right_ext_mask-21 + %define stk esp + %define dstq leftq +@@ -245,7 +245,7 @@ 
cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride + add lpfq, [rsp+gprsize*1] + call .hv_bottom + .v1: +- call mangle(private_prefix %+ _wiener_filter7_ssse3).v ++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v + RET + .no_top: + lea t3, [lpfq+lpf_strideq*4] +@@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride + dec hd + jnz .main + .v3: +- call mangle(private_prefix %+ _wiener_filter7_ssse3).v ++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v + .v2: +- call mangle(private_prefix %+ _wiener_filter7_ssse3).v ++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v + jmp .v1 + .extend_right: + movd m2, [lpfq-4] +@@ -685,8 +685,8 @@ ALIGN function_align + %endif + + %if ARCH_X86_64 +-cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, flt, h, x ++cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, flt, h, x + mov fltq, fltmp + mov edged, r8m + mov wd, wm +@@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + %define m11 [stk+80] + %define stk_off 96 + %endif +-cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride ++cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride + %define stk esp + %define leftmp [stk+28] + %define m8 [base+pw_m16380] +@@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride + dec hd + jnz .main + .v2: +- call mangle(private_prefix %+ _wiener_filter5_ssse3).v ++ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v + add dstq, dst_strideq + mov t4, t3 + mov t3, t2 + mov t2, t1 + movifnidn dstmp, dstq + .v1: +- call mangle(private_prefix %+ _wiener_filter5_ssse3).v ++ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v + jmp .end + .h: + %define stk esp+4 +@@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride + jnz .h_have_right + cmp xd, -17 + jl .h_have_right +- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right ++ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right + .h_have_right: + %macro %%h5 0 + %if cpuflag(ssse3) +@@ -991,7 +991,7 @@ ALIGN function_align + jnz .hv_have_right + cmp xd, -17 + jl .hv_have_right +- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right ++ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right + .hv_have_right: + %%h5 + mova m2, [t3+xq*2] +@@ -1161,7 +1161,7 @@ WIENER + %endmacro + + %if ARCH_X86_64 +-cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim ++cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + mov xlimd, edgem + movifnidn xd, xm + mov hd, hm +@@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + add xd, xlimd + xor xlimd, 2 ; 2*!have_right + %else +-cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim ++cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + %define wq r0m + %define xlimd r1m + %define hd hmp +@@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + RET + + %if ARCH_X86_64 +-cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim ++cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, 
x, y, sumsq_base, sum_base, ylim + movifnidn edged, edgem + %else +-cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y ++cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y + %define sumsq_baseq dword [esp+0] + %define sum_baseq dword [esp+4] + %define ylimd dword [esp+8] +@@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y + jl .loop_x + RET + +-cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s ++cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s + movifnidn sd, sm + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 +@@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s + RET + + %if ARCH_X86_64 +-cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ +- tmp_base, src_base, a_base, b_base, x, y ++cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \ ++ tmp_base, src_base, a_base, b_base, x, y + movifnidn wd, wm + mov hd, hm + mova m15, [pw_16] +@@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ + mov b_baseq, bq + xor xd, xd + %else +-cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y ++cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y + %define tmp_baseq [esp+8] + %define src_baseq [esp+12] + %define a_baseq [esp+16] +@@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y + jl .loop_x + RET + +-cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt ++cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt + movifnidn hd, hm + %if ARCH_X86_32 + SETUP_PIC r6, 0 +@@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt + RET + + %if ARCH_X86_64 +-cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim ++cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov edged, edgem + movifnidn wd, wm + mov hd, hm + mova m10, [pb_0] + mova m11, [pb_0_1] + %else +-cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge ++cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge + %define edgeb byte edgem + %define wd xd + %define wq wd +@@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge + RET + + %if ARCH_X86_64 +-cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim ++cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem + mov ylimd, edged + %else +-cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr ++cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr + %define wm [esp+0] + %define hm [esp+4] + %define edgem [esp+8] +@@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr + jmp .sum_loop_y_noload + %endif + +-cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s ++cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s + movifnidn sd, sm + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 +@@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s + RET + + %if ARCH_X86_64 +-cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ ++cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \ + tmp_base, src_base, a_base, b_base, x, y + movifnidn wd, wm + mov hd, hm +@@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ + psrlw m11, m12, 1 ; pw_128 + pxor m13, m13 + %else +-cglobal 
sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y ++cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y + %define tmp_baseq r0m + %define src_baseq r1m + %define a_baseq r3m +@@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y + RET + + %undef t2 +-cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt ++cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt + movifnidn wd, wm + movd m0, wtm + %if ARCH_X86_64 +-- +GitLab + + +From 1a4489861e55f0e4d70df60ecf15559dfda70aee Mon Sep 17 00:00:00 2001 +From: "Nathan E. Egge" <unlord@xiph.org> +Date: Sun, 10 Jan 2021 14:12:10 -0500 +Subject: [PATCH 2/3] x86: lr: Add AVX2 implementation of wiener filter for 16 + bpc + +Relative speed-ups over C code (compared with gcc-9.3.0): + + C AVX2 +wiener_5tap_10bpc: 194892.0 14831.9 13.14x +wiener_5tap_12bpc: 194295.4 14828.9 13.10x +wiener_7tap_10bpc: 194391.7 19461.4 9.99x +wiener_7tap_12bpc: 194136.1 19418.7 10.00x +--- + src/x86/looprestoration16_avx2.asm | 480 +++++++++++++++++++++++++++++ + 1 file changed, 480 insertions(+) + create mode 100644 src/x86/looprestoration16_avx2.asm + +diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm +new file mode 100644 +index 00000000..4eb1b805 +--- /dev/null ++++ b/src/x86/looprestoration16_avx2.asm +@@ -0,0 +1,480 @@ ++; Copyright (c) 2017-2021, The rav1e contributors ++; Copyright (c) 2021, Nathan Egge ++; All rights reserved. ++; ++; Redistribution and use in source and binary forms, with or without ++; modification, are permitted provided that the following conditions are met: ++; ++; 1. Redistributions of source code must retain the above copyright notice, this ++; list of conditions and the following disclaimer. ++; ++; 2. Redistributions in binary form must reproduce the above copyright notice, ++; this list of conditions and the following disclaimer in the documentation ++; and/or other materials provided with the distribution. ++; ++; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++%include "config.asm" ++%include "ext/x86/x86inc.asm" ++ ++%if ARCH_X86_64 ++ ++SECTION_RODATA 32 ++ ++wiener5_shufB: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 ++wiener5_shufC: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13 ++wiener5_shufD: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1 ++wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ++ ++wiener7_shufC: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9 ++wiener7_shufD: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 ++wiener7_shufE: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1 ++rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 ++rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 ++wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ++ ++pq_3: dq (6 - 4) + 1 ++pq_5: dq (6 - 2) + 1 ++pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4)) ++pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2)) ++ ++pq_11: dq 12 - (6 - 4) + 1 ++pq_9: dq 12 - (6 - 2) + 1 ++nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8)) ++nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8)) ++ ++pb_wiener5_l: times 2 db 2, 3 ++pb_wiener5_r: times 2 db -6, -5 ++ ++pb_wiener7_l: times 2 db 4, 5 ++pb_wiener7_m: times 2 db -4, -3 ++pb_wiener7_r: times 2 db -8, -7 ++ ++SECTION .text ++ ++INIT_YMM avx2 ++cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax ++ movifnidn wd, wm ++ movifnidn hd, hm ++ movifnidn edgeb, edgem ++ vbroadcasti128 m6, [wiener5_shufB] ++ vpbroadcastd m12, [fq + 2] ++ vbroadcasti128 m7, [wiener5_shufC] ++ vpbroadcastw m13, [fq + 6] ++ vbroadcasti128 m8, [wiener5_shufD] ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastd m9, [pd_65540] ++ movq xm10, [pq_3] ++ cmp bdmaxd, 10 ++ je .bits10 ++ vpbroadcastd m9, [pd_262160] ++ movq xm10, [pq_5] ++.bits10: ++ pxor m11, m11 ++ add wq, wq ++ add srcq, wq ++ add dstq, wq ++ neg wq ++ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x ++.v_loop: ++ mov xq, wq ++ test edgeb, 1 ; LR_HAVE_LEFT ++ jz .h_extend_left ++ test leftq, leftq ++ jz .h_loop ++ movd xm4, [leftq + 4] ++ vpblendd m4, [srcq + xq - 4], 0xfe ++ add leftq, 8 ++ jmp .h_main ++.h_extend_left: ++ vbroadcasti128 m5, [srcq + xq] ++ mova m4, [srcq + xq] ++ palignr m4, m5, 12 ++ pshufb m4, [wiener5_l_shuf] ++ jmp .h_main ++.h_loop: ++ movu m4, [srcq + xq - 4] ++.h_main: ++ movu m5, [srcq + xq + 4] ++ test edgeb, 2 ; LR_HAVE_RIGHT ++ jnz .h_have_right ++ cmp xd, -36 ++ jl .h_have_right ++ movd xm2, xd ++ vpbroadcastd m0, [pb_wiener5_l] ++ vpbroadcastd m1, [pb_wiener5_r] ++ vpbroadcastb m2, xm2 ++ movu m3, [pb_0to31] ++ psubb m0, m2 ++ psubb m1, m2 ++ pminub m0, m3 ++ pminub m1, m3 ++ pshufb m4, m0 ++ pshufb m5, m1 ++.h_have_right: ++ pshufb m0, m4, m6 ++ pshufb m2, m4, m7 ++ paddw m0, m2 ++ pmaddwd m0, m12 ++ pshufb m1, m5, m6 ++ pshufb m3, m5, m7 ++ paddw m1, m3 ++ pmaddwd m1, m12 ++ pshufb m4, m8 ++ pmaddwd m4, m13 ++ pshufb m5, m8 ++ pmaddwd m5, m13 ++ paddd m0, m4 ++ paddd m1, m5 ++ paddd m0, m9 ++ paddd m1, m9 ++ psrad m0, xm10 ++ psrad m1, xm10 ++ packssdw m0, m1 ++ pmaxsw m0, m11 ++ mova [dstq + xq], m0 ++ add xq, 32 ++ jl .h_loop ++ add srcq, ssq ++ add dstq, 384*2 ++ dec hd ++ jg .v_loop ++ RET ++ ++DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14 ++ ++INIT_YMM avx2 ++cglobal 
wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax ++ movifnidn wd, wm ++ movifnidn hd, hm ++ movifnidn edgeb, edgem ++ pxor m6, m6 ++ vpbroadcastd m7, [fq + 2] ++ vpbroadcastd m8, [fq + 6] ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastd m9, [nd_1047552] ++ movq xm10, [pq_11] ++ cmp bdmaxd, 10 ++ je .bits10 ++ vpbroadcastd m9, [nd_1048320] ++ movq xm10, [pq_9] ++.bits10: ++ vpbroadcastw m11, bdmaxm ++ add wq, wq ++ add midq, wq ++ add dstq, wq ++ neg wq ++ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x ++ mov msq, 2*384 ++ mov t0, midq ++ lea t1, [t0 + msq] ++ lea t2, [t1 + msq] ++ lea t3, [t2 + msq] ++ lea t4, [t3 + msq] ++ test edgeb, 4 ; LR_HAVE_TOP ++ jnz .have_top ++ mov t0, t2 ++ mov t1, t2 ++.have_top: ++ test edgeb, 8 ; LR_HAVE_BOTTOM ++ jnz .v_loop ++ cmp hd, 2 ++ jg .v_loop ++ cmp hd, 1 ++ jne .limit_v ++ mov t3, t2 ++.limit_v: ++ mov t4, t3 ++.v_loop: ++ mov xq, wq ++.h_loop: ++ mova m1, [t0 + xq] ++ mova m2, [t1 + xq] ++ mova m3, [t2 + xq] ++ mova m4, [t3 + xq] ++ mova m5, [t4 + xq] ++ punpcklwd m0, m1, m2 ++ pmaddwd m0, m7 ++ punpckhwd m1, m2 ++ pmaddwd m1, m7 ++ punpcklwd m2, m5, m4 ++ pmaddwd m2, m7 ++ punpckhwd m5, m4 ++ pmaddwd m5, m7 ++ paddd m0, m2 ++ paddd m1, m5 ++ punpcklwd m2, m3, m6 ++ pmaddwd m2, m8 ++ punpckhwd m3, m6 ++ pmaddwd m3, m8 ++ paddd m0, m2 ++ paddd m1, m3 ++ paddd m0, m9 ++ paddd m1, m9 ++ psrad m0, xm10 ++ psrad m1, xm10 ++ packusdw m0, m1 ++ pminuw m0, m11 ++ mova [dstq + xq], m0 ++ add xq, 32 ++ jl .h_loop ++ add dstq, dsq ++ mov t0, t1 ++ mov t1, t2 ++ mov t2, t3 ++ mov t3, t4 ++ add t4, msq ++ test edgeb, 8 ; LR_HAVE_BOTTOM ++ jnz .have_bottom ++ cmp hd, 3 ++ jg .have_bottom ++ mov t4, t3 ++.have_bottom: ++ dec hd ++ jg .v_loop ++ RET ++ ++INIT_YMM avx2 ++cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh ++ movifnidn wd, wm ++ movifnidn hd, hm ++ movifnidn edgeb, edgem ++ vpbroadcastd m7, [fq] ++ vpbroadcastd m8, [fq + 4] ++ vbroadcasti128 m10, [rev_w] ++ vbroadcasti128 m11, [wiener5_shufB] ++ vbroadcasti128 m12, [wiener7_shufC] ++ vbroadcasti128 m13, [wiener7_shufD] ++ vbroadcasti128 m14, [wiener7_shufE] ++ vbroadcasti128 m15, [rev_d] ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastd m9, [pd_65540] ++ mov rhq, [pq_3] ++ cmp bdmaxd, 10 ++ je .bits10 ++ vpbroadcastd m9, [pd_262160] ++ mov rhq, [pq_5] ++.bits10: ++ add wq, wq ++ add srcq, wq ++ add dstq, wq ++ neg wq ++ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh ++.v_loop: ++ mov xq, wq ++ test edgeb, 1 ; LR_HAVE_LEFT ++ jz .h_extend_left ++ test leftq, leftq ++ jz .h_loop ++ movq xm4, [leftq + 2] ++ vpblendw xm4, [srcq + xq - 6], 0xf8 ++ vinserti128 m4, [srcq + xq + 10], 1 ++ add leftq, 8 ++ jmp .h_main ++.h_extend_left: ++ vbroadcasti128 m5, [srcq + xq] ++ mova m4, [srcq + xq] ++ palignr m4, m5, 10 ++ pshufb m4, [wiener7_l_shuf] ++ jmp .h_main ++.h_loop: ++ movu m4, [srcq + xq - 6] ++.h_main: ++ movu m5, [srcq + xq + 2] ++ movu m6, [srcq + xq + 6] ++ test edgeb, 2 ; LR_HAVE_RIGHT ++ jnz .h_have_right ++ cmp xd, -38 ++ jl .h_have_right ++ movd xm3, xd ++ vpbroadcastd m0, [pb_wiener7_l] ++ vpbroadcastd m1, [pb_wiener7_m] ++ vpbroadcastd m2, [pb_wiener7_r] ++ vpbroadcastb m3, xm3 ++ psubb m0, m3 ++ psubb m1, m3 ++ psubb m2, m3 ++ movu m3, [pb_0to31] ++ pminub m0, m3 ++ pminub m1, m3 ++ pminub m2, m3 ++ pshufb m4, m0 ++ pshufb m5, m1 ++ pshufb m6, m2 ++ cmp xd, -9*2 ++ jne .hack ++ vpbroadcastw xm3, [srcq + xq + 16] ++ vinserti128 m5, xm3, 1 ++ jmp .h_have_right ++.hack: ++ cmp xd, -1*2 ++ jne .h_have_right ++ vpbroadcastw xm5, [srcq + xq] 
++.h_have_right: ++ pshufb m6, m10 ++ pshufb m0, m4, m11 ++ pshufb m2, m5, m12 ++ paddw m0, m2 ++ pmaddwd m0, m7 ++ pshufb m2, m4, m13 ++ pshufb m4, m14 ++ paddw m2, m4 ++ pmaddwd m2, m8 ++ pshufb m1, m6, m11 ++ pshufb m5, m11 ++ pmaddwd m1, m7 ++ pmaddwd m5, m7 ++ pshufb m3, m6, m13 ++ pshufb m6, m14 ++ paddw m3, m6 ++ pmaddwd m3, m8 ++ paddd m0, m2 ++ paddd m1, m3 ++ pshufb m1, m15 ++ paddd m1, m5 ++ movq xm4, rhq ++ pxor m5, m5 ++ paddd m0, m9 ++ paddd m1, m9 ++ psrad m0, xm4 ++ psrad m1, xm4 ++ packssdw m0, m1 ++ pmaxsw m0, m5 ++ mova [dstq + xq], m0 ++ add xq, 32 ++ jl .h_loop ++ add srcq, ssq ++ add dstq, 384*2 ++ dec hd ++ jg .v_loop ++ RET ++ ++INIT_YMM avx2 ++cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax ++ movifnidn wd, wm ++ movifnidn hd, hm ++ movifnidn edgeb, edgem ++ pxor m6, m6 ++ vpbroadcastd m7, [fq] ++ vpbroadcastw m8, [fq + 4] ++ vpbroadcastd m9, [fq + 6] ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastd m10, [nd_1047552] ++ movq xm11, [pq_11] ++ cmp bdmaxd, 10 ++ je .bits10 ++ vpbroadcastd m10, [nd_1048320] ++ movq xm11, [pq_9] ++.bits10: ++ vpbroadcastw m12, bdmaxm ++ add wq, wq ++ add midq, wq ++ add dstq, wq ++ neg wq ++ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x ++ mov msq, 2*384 ++ mov t0, midq ++ mov t1, t0 ++ lea t2, [t1 + msq] ++ lea t3, [t2 + msq] ++ lea t4, [t3 + msq] ++ lea t5, [t4 + msq] ++ lea t6, [t5 + msq] ++ test edgeb, 4 ; LR_HAVE_TOP ++ jnz .have_top ++ mov t0, t3 ++ mov t1, t3 ++ mov t2, t3 ++.have_top: ++ cmp hd, 3 ++ jg .v_loop ++ test edgeb, 8 ; LR_HAVE_BOTTOM ++ jz .no_bottom0 ++ cmp hd, 1 ++ jg .v_loop ++ jmp .h3 ++.no_bottom0: ++ cmp hd, 2 ++ je .h2 ++ jns .h3 ++.h1: ++ mov t4, t3 ++.h2: ++ mov t5, t4 ++.h3: ++ mov t6, t5 ++.v_loop: ++ mov xq, wq ++.h_loop: ++ mova m1, [t0 + xq] ++ mova m2, [t1 + xq] ++ mova m3, [t5 + xq] ++ mova m4, [t6 + xq] ++ punpcklwd m0, m1, m2 ++ pmaddwd m0, m7 ++ punpckhwd m1, m2 ++ pmaddwd m1, m7 ++ punpcklwd m2, m4, m3 ++ pmaddwd m2, m7 ++ punpckhwd m4, m3 ++ pmaddwd m4, m7 ++ paddd m0, m2 ++ paddd m1, m4 ++ mova m3, [t2 + xq] ++ mova m4, [t4 + xq] ++ punpcklwd m2, m3, m4 ++ pmaddwd m2, m8 ++ punpckhwd m3, m4 ++ pmaddwd m3, m8 ++ paddd m0, m2 ++ paddd m1, m3 ++ mova m3, [t3 + xq] ++ punpcklwd m2, m3, m6 ++ pmaddwd m2, m9 ++ punpckhwd m3, m6 ++ pmaddwd m3, m9 ++ paddd m0, m2 ++ paddd m1, m3 ++ paddd m0, m10 ++ paddd m1, m10 ++ psrad m0, xm11 ++ psrad m1, xm11 ++ packusdw m0, m1 ++ pminuw m0, m12 ++ mova [dstq + xq], m0 ++ add xq, 32 ++ jl .h_loop ++ add dstq, dsq ++ mov t0, t1 ++ mov t1, t2 ++ mov t2, t3 ++ mov t3, t4 ++ mov t4, t5 ++ mov t5, t6 ++ add t6, msq ++ cmp hd, 4 ++ jg .next_row ++ test edgeb, 8 ; LR_HAVE_BOTTOM ++ jz .no_bottom ++ cmp hd, 2 ++ jg .next_row ++.no_bottom: ++ mov t6, t5 ++.next_row: ++ dec hd ++ jg .v_loop ++ RET ++ ++%endif ; ARCH_X86_64 +-- +GitLab + + +From 2ce581302a1536559aa5e56018a03ac6a3770c0f Mon Sep 17 00:00:00 2001 +From: "Nathan E. 
Egge" <unlord@xiph.org> +Date: Wed, 13 Jan 2021 14:54:42 -0500 +Subject: [PATCH 3/3] Enable AVX2 wiener filter HBD assembly + +--- + src/meson.build | 1 + + src/x86/looprestoration_init_tmpl.c | 40 +++++++++++++++++++++++++++-- + 2 files changed, 39 insertions(+), 2 deletions(-) + +diff --git a/src/meson.build b/src/meson.build +index 27946501..25729217 100644 +--- a/src/meson.build ++++ b/src/meson.build +@@ -211,6 +211,7 @@ if is_asm_enabled + libdav1d_sources_asm += files( + 'x86/cdef16_avx2.asm', + 'x86/cdef16_sse.asm', ++ 'x86/looprestoration16_avx2.asm', + 'x86/mc16_avx2.asm', + ) + endif + +diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c +index a9aa5acf..52de0faf 100644 +--- a/src/x86/looprestoration_init_tmpl.c ++++ b/src/x86/looprestoration_init_tmpl.c +@@ -30,9 +30,40 @@ + + #include "common/intops.h" + ++#if BITDEPTH != 8 ++#define decl_wiener_filter_fn(name, ext) \ ++void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \ ++ ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \ ++ int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \ ++void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \ ++ const int16_t fv[7], int w, int h, \ ++ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \ ++static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ ++ const pixel (*const left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const LooprestorationParams *params, \ ++ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \ ++ ALIGN_STK_64(int16_t, mid, 68 * 384,); \ ++ BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, params->filter[0], w, h, \ ++ edges HIGHBD_TAIL_SUFFIX); \ ++ if (edges & LR_HAVE_TOP) { \ ++ BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, params->filter[0], w, 2, \ ++ edges HIGHBD_TAIL_SUFFIX); \ ++ } \ ++ if (edges & LR_HAVE_BOTTOM) { \ ++ BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \ ++ lpf_stride, params->filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \ ++ } \ ++ BF(name##_v, ext)(dst, dst_stride, mid, params->filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \ ++} ++#define decl_wiener_filter_fns(ext) \ ++decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \ ++decl_wiener_filter_fn(dav1d_wiener_filter5, ext) ++#else + #define decl_wiener_filter_fns(ext) \ + decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \ + decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext)) ++#endif + + #define decl_sgr_filter_fns(ext) \ + void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \ +@@ -193,11 +224,14 @@ decl_wiener_filter_fns(sse2); + decl_wiener_filter_fns(ssse3); + SGR_FILTER_OLD(ssse3) + # if ARCH_X86_64 +-decl_wiener_filter_fns(avx2); + decl_sgr_filter_fns(avx2) + # endif + #endif + ++#if ARCH_X86_64 ++decl_wiener_filter_fns(avx2); ++#endif ++ + COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + +@@ -217,11 +251,13 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont + #endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; +-#if BITDEPTH == 8 && ARCH_X86_64 ++#if ARCH_X86_64 + c->wiener[0] = BF(dav1d_wiener_filter7, avx2); + c->wiener[1] = BF(dav1d_wiener_filter5, avx2); ++# if BITDEPTH == 8 + c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2); + c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2); ++# endif + #endif + } +-- +GitLab + diff --git 
a/0002-wiener_2.patch b/0002-wiener_2.patch index 149be2fe1293..e69de29bb2d1 100644 --- a/0002-wiener_2.patch +++ b/0002-wiener_2.patch @@ -1,661 +0,0 @@ -From 269eeaf7c01afc79a53537881ad03185bf491cf6 Mon Sep 17 00:00:00 2001 -From: "Nathan E. Egge" <unlord@xiph.org> -Date: Tue, 29 Dec 2020 06:58:33 -0500 -Subject: [PATCH] Add bpc suffix to lr functions - ---- - src/x86/looprestoration.asm | 36 ++--- - src/x86/looprestoration_init_tmpl.c | 204 +++++++++++++--------------- - src/x86/looprestoration_sse.asm | 60 ++++---- - 3 files changed, 146 insertions(+), 154 deletions(-) - -diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm -index 8ebe230..e077cdd 100644 ---- a/src/x86/looprestoration.asm -+++ b/src/x86/looprestoration.asm -@@ -66,8 +66,8 @@ SECTION .text - DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers - - INIT_YMM avx2 --cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, flt, h -+cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, flt, h - mov fltq, fltmp - mov edged, r8m - mov wd, wm -@@ -414,8 +414,8 @@ ALIGN function_align - add dstq, dst_strideq - ret - --cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, flt, h -+cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, flt, h - mov fltq, fltmp - mov edged, r8m - mov wd, wm -@@ -532,7 +532,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ - jnz .h_have_right - cmp r10d, -33 - jl .h_have_right -- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right -+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right - .h_have_right: - pshufb m0, m4, m6 - pmaddubsw m0, m12 -@@ -591,7 +591,7 @@ ALIGN function_align - jnz .hv_have_right - cmp r10d, -33 - jl .hv_have_right -- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right -+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right - .hv_have_right: - pshufb m0, m4, m6 - pmaddubsw m0, m12 -@@ -705,7 +705,7 @@ ALIGN function_align - jl .v_loop - ret - --cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim -+cglobal sgr_box3_h_8bpc, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov xlimd, edgem - movifnidn wd, wm - mov hd, hm -@@ -805,7 +805,7 @@ cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim - RET - - INIT_YMM avx2 --cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim -+cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim - movifnidn edged, edgem - mov xq, -2 - rorx ylimd, edged, 2 -@@ -868,7 +868,7 @@ cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, - RET - - INIT_YMM avx2 --cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s -+cglobal sgr_calc_ab1_8bpc, 4, 6, 11, a, b, w, h, s - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 - add hd, 2 -@@ -937,8 +937,8 @@ cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s - RET - - INIT_YMM avx2 --cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ -- tmp_ptr, src_ptr, a_ptr, b_ptr, x, y -+cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \ -+ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y - movifnidn wd, wm - mov hd, hm - vpbroadcastd m15, [pw_16] -@@ -1043,7 +1043,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ - RET - 
- INIT_YMM avx2 --cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt -+cglobal sgr_weighted1_8bpc, 4, 6, 6, dst, stride, t, w, h, wt - %ifidn wtd, wtm - shl wtd, 4 - movd xm5, wtd -@@ -1082,7 +1082,7 @@ cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt - RET - - INIT_YMM avx2 --cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim -+cglobal sgr_box5_h_8bpc, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov edged, edgem - movifnidn wd, wm - mov hd, hm -@@ -1200,7 +1200,7 @@ cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli - RET - - INIT_YMM avx2 --cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim -+cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim - movifnidn edged, edgem - mov xq, -2 - rorx ylimd, edged, 2 -@@ -1293,7 +1293,7 @@ cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, - jmp .loop_y_noload - - INIT_YMM avx2 --cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s -+cglobal sgr_calc_ab2_8bpc, 4, 6, 11, a, b, w, h, s - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 - add hd, 2 -@@ -1364,8 +1364,8 @@ cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s - RET - - INIT_YMM avx2 --cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \ -- tmp_ptr, src_ptr, a_ptr, b_ptr, x, y -+cglobal sgr_finish_filter2_8bpc, 5, 13, 13, t, src, stride, a, b, w, h, \ -+ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y - movifnidn wd, wm - mov hd, hm - vpbroadcastd m9, [pw_5_6] -@@ -1483,7 +1483,7 @@ cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \ - RET - - INIT_YMM avx2 --cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt -+cglobal sgr_weighted2_8bpc, 4, 7, 11, dst, stride, t1, t2, w, h, wt - movifnidn wd, wm - movifnidn hd, hm - vpbroadcastd m0, wtm -diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c -index 5df449c..11ebdd1 100644 ---- a/src/x86/looprestoration_init_tmpl.c -+++ b/src/x86/looprestoration_init_tmpl.c -@@ -31,148 +31,140 @@ - #include "common/intops.h" - #include "src/tables.h" - --#define WIENER_FILTER(ext) \ --void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \ -- const pixel (*left)[4], const pixel *lpf, \ -- ptrdiff_t lpf_stride, int w, int h, \ -- const int16_t filter[2][8], \ -- enum LrEdgeFlags edges); \ --void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \ -- const pixel (*left)[4], const pixel *lpf, \ -- ptrdiff_t lpf_stride, int w, int h, \ -- const int16_t filter[2][8], \ -- enum LrEdgeFlags edges); -+#define decl_wiener_filter_fns(ext) \ -+decl_wiener_filter_fn(BF(dav1d_wiener_filter7, ext)); \ -+decl_wiener_filter_fn(BF(dav1d_wiener_filter5, ext)) - --#define SGR_FILTER(ext) \ --void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \ -- const pixel (*left)[4], \ -- const pixel *src, const ptrdiff_t stride, \ -- const int w, const int h, \ -- const enum LrEdgeFlags edges); \ --void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \ -- const int w, const int h, \ -- const enum LrEdgeFlags edges); \ --void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \ -- const int w, const int h, const int strength); \ --void dav1d_sgr_finish_filter1_##ext(coef *tmp, \ -- const pixel *src, const ptrdiff_t stride, \ -- const int32_t *a, const int16_t *b, \ -- const int w, const int h); \ -+#define decl_sgr_filter_fn(ext) \ -+void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \ -+ const 
pixel (*left)[4], \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const int w, const int h, \ -+ const enum LrEdgeFlags edges); \ -+void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \ -+ const int w, const int h, \ -+ const enum LrEdgeFlags edges); \ -+void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \ -+ const int w, const int h, const int strength); \ -+void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const int32_t *a, const int16_t *b, \ -+ const int w, const int h); \ - \ - /* filter with a 3x3 box (radius=1) */ \ --static void dav1d_sgr_filter1_##ext(coef *tmp, \ -- const pixel *src, const ptrdiff_t stride, \ -- const pixel (*left)[4], \ -- const pixel *lpf, const ptrdiff_t lpf_stride, \ -- const int w, const int h, const int strength, \ -- const enum LrEdgeFlags edges) \ -+static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const pixel (*left)[4], \ -+ const pixel *lpf, const ptrdiff_t lpf_stride, \ -+ const int w, const int h, const int strength, \ -+ const enum LrEdgeFlags edges) \ - { \ - ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ - int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ - ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ - int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ - \ -- dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ -+ BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \ - if (edges & LR_HAVE_TOP) \ -- dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ -- NULL, lpf, lpf_stride, w, 2, edges); \ -+ BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ -+ NULL, lpf, lpf_stride, w, 2, edges); \ - \ - if (edges & LR_HAVE_BOTTOM) \ -- dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ -- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ -- lpf_stride, w, 2, edges); \ -+ BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ -+ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ -+ lpf_stride, w, 2, edges); \ - \ -- dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \ -- dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \ -- dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \ -+ BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \ -+ BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \ -+ BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \ - } \ - \ --void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \ -- const pixel (*left)[4], \ -- const pixel *src, const ptrdiff_t stride, \ -- const int w, const int h, \ -- const enum LrEdgeFlags edges); \ --void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \ -- const int w, const int h, \ -- const enum LrEdgeFlags edges); \ --void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \ -- const int w, const int h, const int strength); \ --void dav1d_sgr_finish_filter2_##ext(coef *tmp, \ -- const pixel *src, const ptrdiff_t stride, \ -- const int32_t *a, const int16_t *b, \ -- const int w, const int h); \ -+void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \ -+ const pixel (*left)[4], \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const int w, const int h, \ -+ const enum LrEdgeFlags edges); \ -+void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \ -+ const int w, const int h, \ -+ const enum LrEdgeFlags edges); \ -+void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \ -+ 
const int w, const int h, const int strength); \ -+void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const int32_t *a, const int16_t *b, \ -+ const int w, const int h); \ - \ - /* filter with a 5x5 box (radius=2) */ \ --static void dav1d_sgr_filter2_##ext(coef *tmp, \ -- const pixel *src, const ptrdiff_t stride, \ -- const pixel (*left)[4], \ -- const pixel *lpf, const ptrdiff_t lpf_stride, \ -- const int w, const int h, const int strength, \ -- const enum LrEdgeFlags edges) \ -+static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \ -+ const pixel *src, const ptrdiff_t stride, \ -+ const pixel (*left)[4], \ -+ const pixel *lpf, const ptrdiff_t lpf_stride, \ -+ const int w, const int h, const int strength, \ -+ const enum LrEdgeFlags edges) \ - { \ - ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ - int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ - ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ - int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ - \ -- dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ -+ BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \ - if (edges & LR_HAVE_TOP) \ -- dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ -- NULL, lpf, lpf_stride, w, 2, edges); \ -+ BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ -+ NULL, lpf, lpf_stride, w, 2, edges); \ - \ - if (edges & LR_HAVE_BOTTOM) \ -- dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ -- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ -- lpf_stride, w, 2, edges); \ -+ BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ -+ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ -+ lpf_stride, w, 2, edges); \ - \ -- dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \ -- dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \ -- dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \ -+ BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \ -+ BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \ -+ BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \ - } \ - \ --void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \ -- const coef *t1, const int w, const int h, \ -- const int wt); \ --void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \ -- const coef *t1, const coef *t2, \ -- const int w, const int h, \ -- const uint32_t wt); \ -+void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \ -+ const coef *t1, const int w, const int h, \ -+ const int wt); \ -+void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \ -+ const coef *t1, const coef *t2, \ -+ const int w, const int h, \ -+ const uint32_t wt); \ - \ --static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ -- const pixel (*const left)[4], \ -- const pixel *lpf, const ptrdiff_t lpf_stride, \ -- const int w, const int h, const int sgr_idx, \ -- const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \ -+static void BF(sgr_filter, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ -+ const pixel (*const left)[4], \ -+ const pixel *lpf, const ptrdiff_t lpf_stride, \ -+ const int w, const int h, const int sgr_idx, \ -+ const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \ - { \ - if (!dav1d_sgr_params[sgr_idx][0]) { \ - ALIGN_STK_32(coef, tmp, 64 * 384,); \ -- dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ -- w, h, 
dav1d_sgr_params[sgr_idx][3], edges); \ -- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \ -+ BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \ -+ w, h, dav1d_sgr_params[sgr_idx][3], edges); \ -+ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \ - } else if (!dav1d_sgr_params[sgr_idx][1]) { \ - ALIGN_STK_32(coef, tmp, 64 * 384,); \ -- dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ -- w, h, dav1d_sgr_params[sgr_idx][2], edges); \ -- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \ -+ BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \ -+ w, h, dav1d_sgr_params[sgr_idx][2], edges); \ -+ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, sgr_wt[0]); \ - } else { \ - ALIGN_STK_32(coef, tmp1, 64 * 384,); \ - ALIGN_STK_32(coef, tmp2, 64 * 384,); \ -- dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ -- w, h, dav1d_sgr_params[sgr_idx][2], edges); \ -- dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ -- w, h, dav1d_sgr_params[sgr_idx][3], edges); \ -+ BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ -+ w, h, dav1d_sgr_params[sgr_idx][2], edges); \ -+ BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ -+ w, h, dav1d_sgr_params[sgr_idx][3], edges); \ - const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \ -- dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \ -+ BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \ - } \ - } - - #if BITDEPTH == 8 --WIENER_FILTER(sse2) --WIENER_FILTER(ssse3) --SGR_FILTER(ssse3) -+decl_wiener_filter_fns(sse2); -+decl_wiener_filter_fns(ssse3); -+decl_sgr_filter_fn(ssse3) - # if ARCH_X86_64 --WIENER_FILTER(avx2) --SGR_FILTER(avx2) -+decl_wiener_filter_fns(avx2); -+decl_sgr_filter_fn(avx2) - # endif - #endif - -@@ -181,21 +173,21 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont - - if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; - #if BITDEPTH == 8 -- c->wiener[0] = dav1d_wiener_filter7_sse2; -- c->wiener[1] = dav1d_wiener_filter5_sse2; -+ c->wiener[0] = BF(dav1d_wiener_filter7, sse2); -+ c->wiener[1] = BF(dav1d_wiener_filter5, sse2); - #endif - - if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; - #if BITDEPTH == 8 -- c->wiener[0] = dav1d_wiener_filter7_ssse3; -- c->wiener[1] = dav1d_wiener_filter5_ssse3; -- c->selfguided = sgr_filter_ssse3; -+ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3); -+ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3); -+ c->selfguided = BF(sgr_filter, ssse3); - #endif - - if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; - #if BITDEPTH == 8 && ARCH_X86_64 -- c->wiener[0] = dav1d_wiener_filter7_avx2; -- c->wiener[1] = dav1d_wiener_filter5_avx2; -- c->selfguided = sgr_filter_avx2; -+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2); -+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2); -+ c->selfguided = BF(sgr_filter, avx2); - #endif - } -diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm -index 5d3ca49..4b77138 100644 ---- a/src/x86/looprestoration_sse.asm -+++ b/src/x86/looprestoration_sse.asm -@@ -97,8 +97,8 @@ SECTION .text - %macro WIENER 0 - %if ARCH_X86_64 - DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers --cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, flt, h, x -+cglobal wiener_filter7_8bpc, 5, 15, 16, 
-384*12-16, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, flt, h, x - %define base 0 - mov fltq, fltmp - mov edged, r8m -@@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5 - %define m11 [stk+96] - %define stk_off 112 - %endif --cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride -+cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride - %define base r6-pb_right_ext_mask-21 - %define stk esp - %define dstq leftq -@@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride - add lpfq, [rsp+gprsize*1] - call .hv_bottom - .v1: -- call mangle(private_prefix %+ _wiener_filter7_ssse3).v -+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v - RET - .no_top: - lea t3, [lpfq+lpf_strideq*4] -@@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride - dec hd - jnz .main - .v3: -- call mangle(private_prefix %+ _wiener_filter7_ssse3).v -+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v - .v2: -- call mangle(private_prefix %+ _wiener_filter7_ssse3).v -+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v - jmp .v1 - .extend_right: - movd m2, [lpfq-4] -@@ -685,8 +685,8 @@ ALIGN function_align - %endif - - %if ARCH_X86_64 --cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ -- lpf_stride, w, edge, flt, h, x -+cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ -+ lpf_stride, w, edge, flt, h, x - mov fltq, fltmp - mov edged, r8m - mov wd, wm -@@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ - %define m11 [stk+80] - %define stk_off 96 - %endif --cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride -+cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride - %define stk esp - %define leftmp [stk+28] - %define m8 [base+pw_m16380] -@@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride - dec hd - jnz .main - .v2: -- call mangle(private_prefix %+ _wiener_filter5_ssse3).v -+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v - add dstq, dst_strideq - mov t4, t3 - mov t3, t2 - mov t2, t1 - movifnidn dstmp, dstq - .v1: -- call mangle(private_prefix %+ _wiener_filter5_ssse3).v -+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v - jmp .end - .h: - %define stk esp+4 -@@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride - jnz .h_have_right - cmp xd, -17 - jl .h_have_right -- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right -+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right - .h_have_right: - %macro %%h5 0 - %if cpuflag(ssse3) -@@ -991,7 +991,7 @@ ALIGN function_align - jnz .hv_have_right - cmp xd, -17 - jl .hv_have_right -- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right -+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right - .hv_have_right: - %%h5 - mova m2, [t3+xq*2] -@@ -1161,7 +1161,7 @@ WIENER - %endmacro - - %if ARCH_X86_64 --cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim -+cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim - mov xlimd, edgem - movifnidn xd, xm - mov hd, hm -@@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim - add xd, xlimd - xor xlimd, 2 ; 2*!have_right - %else --cglobal 
sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim -+cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim - %define wq r0m - %define xlimd r1m - %define hd hmp -@@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim - RET - - %if ARCH_X86_64 --cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim -+cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim - movifnidn edged, edgem - %else --cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y -+cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y - %define sumsq_baseq dword [esp+0] - %define sum_baseq dword [esp+4] - %define ylimd dword [esp+8] -@@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y - jl .loop_x - RET - --cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s -+cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s - movifnidn sd, sm - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 -@@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s - RET - - %if ARCH_X86_64 --cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ -- tmp_base, src_base, a_base, b_base, x, y -+cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \ -+ tmp_base, src_base, a_base, b_base, x, y - movifnidn wd, wm - mov hd, hm - mova m15, [pw_16] -@@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ - mov b_baseq, bq - xor xd, xd - %else --cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y -+cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y - %define tmp_baseq [esp+8] - %define src_baseq [esp+12] - %define a_baseq [esp+16] -@@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y - jl .loop_x - RET - --cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt -+cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt - movifnidn hd, hm - %if ARCH_X86_32 - SETUP_PIC r6, 0 -@@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt - RET - - %if ARCH_X86_64 --cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim -+cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov edged, edgem - movifnidn wd, wm - mov hd, hm - mova m10, [pb_0] - mova m11, [pb_0_1] - %else --cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge -+cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge - %define edgeb byte edgem - %define wd xd - %define wq wd -@@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge - RET - - %if ARCH_X86_64 --cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim -+cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim - movifnidn edged, edgem - mov ylimd, edged - %else --cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr -+cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr - %define wm [esp+0] - %define hm [esp+4] - %define edgem [esp+8] -@@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr - jmp .sum_loop_y_noload - %endif - --cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s -+cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s - movifnidn sd, sm - sub aq, (384+16-1)*4 - 
sub bq, (384+16-1)*2 -@@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s - RET - - %if ARCH_X86_64 --cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ -+cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \ - tmp_base, src_base, a_base, b_base, x, y - movifnidn wd, wm - mov hd, hm -@@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ - psrlw m11, m12, 1 ; pw_128 - pxor m13, m13 - %else --cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y -+cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y - %define tmp_baseq r0m - %define src_baseq r1m - %define a_baseq r3m -@@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y - RET - - %undef t2 --cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt -+cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt - movifnidn wd, wm - movd m0, wtm - %if ARCH_X86_64 --- -GitLab - diff --git a/0003-wiener_3.patch b/0003-wiener_3.patch index b2852c881396..e69de29bb2d1 100644 --- a/0003-wiener_3.patch +++ b/0003-wiener_3.patch @@ -1,492 +0,0 @@ -From 43c61c3f259400cde5facbe7ce50769088b5f5b6 Mon Sep 17 00:00:00 2001 -From: "Nathan E. Egge" <unlord@xiph.org> -Date: Sun, 10 Jan 2021 14:12:10 -0500 -Subject: [PATCH] x86: lr: Add AVX2 implementation of wiener filter for 16 bpc - -Relative speed-ups over C code (compared with gcc-9.3.0): - - C AVX2 -wiener_5tap_10bpc: 194892.0 14831.9 13.14x -wiener_5tap_12bpc: 194295.4 14828.9 13.10x -wiener_7tap_10bpc: 194391.7 19461.4 9.99x -wiener_7tap_12bpc: 194136.1 19418.7 10.00x ---- - src/x86/looprestoration16_avx2.asm | 466 +++++++++++++++++++++++++++++ - 1 file changed, 466 insertions(+) - create mode 100644 src/x86/looprestoration16_avx2.asm - -diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm -new file mode 100644 -index 0000000..2012860 ---- /dev/null -+++ b/src/x86/looprestoration16_avx2.asm -@@ -0,0 +1,466 @@ -+; Copyright (c) 2017-2021, The rav1e contributors -+; Copyright (c) 2021, Nathan Egge -+; All rights reserved. -+; -+; This source code is subject to the terms of the BSD 2 Clause License and -+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -+; was not distributed with this source code in the LICENSE file, you can -+; obtain it at www.aomedia.org/license/software. If the Alliance for Open -+; Media Patent License 1.0 was not distributed with this source code in the -+; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-+ -+%include "config.asm" -+%include "ext/x86/x86inc.asm" -+ -+%if ARCH_X86_64 -+ -+SECTION_RODATA 32 -+ -+wiener5_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 -+wiener5_shufB: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13 -+wiener5_shufC: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1 -+wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -+pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 -+ -+wiener7_shufB: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9 -+wiener7_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 -+wiener7_shufD: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1 -+rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 -+rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 -+wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 -+ -+pq_3: dq (6 - 4) + 1 -+pq_5: dq (6 - 2) + 1 -+pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4)) -+pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2)) -+ -+pq_11: dq 12 - (6 - 4) + 1 -+pq_9: dq 12 - (6 - 2) + 1 -+nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8)) -+nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8)) -+ -+pb_wiener5_l: times 2 db 2, 3 -+pb_wiener5_r: times 2 db -6, -5 -+ -+pb_wiener7_l: times 2 db 4, 5 -+pb_wiener7_m: times 2 db -4, -3 -+pb_wiener7_r: times 2 db -8, -7 -+ -+SECTION .text -+ -+INIT_YMM avx2 -+cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax -+ movifnidn wd, wm -+ movifnidn hd, hm -+ movifnidn edgeb, edgem -+ vbroadcasti128 m6, [wiener5_shufA] -+ vpbroadcastd m12, [fq + 2] -+ vbroadcasti128 m7, [wiener5_shufB] -+ vpbroadcastw m13, [fq + 6] -+ vbroadcasti128 m8, [wiener5_shufC] -+ popcnt bdmaxd, bdmaxm -+ vpbroadcastd m9, [pd_65540] -+ movq xm10, [pq_3] -+ cmp bdmaxd, 10 -+ je .bits10 -+ vpbroadcastd m9, [pd_262160] -+ movq xm10, [pq_5] -+.bits10: -+ pxor m11, m11 -+ add wq, wq -+ add srcq, wq -+ add dstq, wq -+ neg wq -+ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x -+.v_loop: -+ mov xq, wq -+ test edgeb, 1 ; LR_HAVE_LEFT -+ jz .h_extend_left -+ test leftq, leftq -+ jz .h_loop -+ movd xm4, [leftq + 4] -+ vpblendd m4, [srcq + xq - 4], 0xfe -+ add leftq, 8 -+ jmp .h_main -+.h_extend_left: -+ vbroadcasti128 m5, [srcq + xq] -+ mova m4, [srcq + xq] -+ palignr m4, m5, 12 -+ pshufb m4, [wiener5_l_shuf] -+ jmp .h_main -+.h_loop: -+ movu m4, [srcq + xq - 4] -+.h_main: -+ movu m5, [srcq + xq + 4] -+ test edgeb, 2 ; LR_HAVE_RIGHT -+ jnz .h_have_right -+ cmp xd, -18*2 -+ jl .h_have_right -+ movd xm2, xd -+ vpbroadcastd m0, [pb_wiener5_l] -+ vpbroadcastd m1, [pb_wiener5_r] -+ vpbroadcastb m2, xm2 -+ movu m3, [pb_0to31] -+ psubb m0, m2 -+ psubb m1, m2 -+ pminub m0, m3 -+ pminub m1, m3 -+ pshufb m4, m0 -+ pshufb m5, m1 -+.h_have_right: -+ pshufb m0, m4, m6 -+ pshufb m2, m4, m7 -+ paddw m0, m2 -+ pmaddwd m0, m12 -+ pshufb m1, m5, m6 -+ pshufb m3, m5, m7 -+ paddw m1, m3 -+ pmaddwd m1, m12 -+ pshufb m4, m8 -+ pmaddwd m4, m13 -+ pshufb m5, m8 -+ pmaddwd m5, m13 -+ paddd m0, m4 -+ paddd m1, m5 -+ paddd m0, m9 -+ paddd m1, m9 -+ psrad m0, xm10 -+ psrad m1, xm10 -+ packssdw m0, m1 -+ pmaxsw m0, m11 -+ mova [dstq + xq], m0 -+ add xq, 32 -+ jl .h_loop -+ add srcq, ssq -+ add dstq, 384*2 -+ dec hd -+ jg .v_loop -+ RET -+ -+DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14 -+ -+INIT_YMM avx2 -+cglobal 
wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax -+ movifnidn wd, wm -+ movifnidn hd, hm -+ movifnidn edgeb, edgem -+ pxor m6, m6 -+ vpbroadcastd m7, [fq + 2] -+ vpbroadcastd m8, [fq + 6] -+ popcnt bdmaxd, bdmaxm -+ vpbroadcastd m9, [nd_1047552] -+ movq xm10, [pq_11] -+ cmp bdmaxd, 10 -+ je .bits10 -+ vpbroadcastd m9, [nd_1048320] -+ movq xm10, [pq_9] -+.bits10: -+ vpbroadcastw m11, bdmaxm -+ add wq, wq -+ add midq, wq -+ add dstq, wq -+ neg wq -+ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x -+ mov msq, 2*384 -+ mov t0, midq -+ lea t1, [t0 + msq] -+ lea t2, [t1 + msq] -+ lea t3, [t2 + msq] -+ lea t4, [t3 + msq] -+ test edgeb, 4 ; LR_HAVE_TOP -+ jnz .have_top -+ mov t0, t2 -+ mov t1, t2 -+.have_top: -+ test edgeb, 8 ; LR_HAVE_BOTTOM -+ jnz .v_loop -+ cmp hd, 2 -+ jg .v_loop -+ cmp hd, 1 -+ jne .limit_v -+ mov t3, t2 -+.limit_v: -+ mov t4, t3 -+.v_loop: -+ mov xq, wq -+.h_loop: -+ mova m1, [t0 + xq] -+ mova m2, [t1 + xq] -+ mova m3, [t2 + xq] -+ mova m4, [t3 + xq] -+ mova m5, [t4 + xq] -+ punpcklwd m0, m1, m2 -+ pmaddwd m0, m7 -+ punpckhwd m1, m2 -+ pmaddwd m1, m7 -+ punpcklwd m2, m5, m4 -+ pmaddwd m2, m7 -+ punpckhwd m5, m4 -+ pmaddwd m5, m7 -+ paddd m0, m2 -+ paddd m1, m5 -+ punpcklwd m2, m3, m6 -+ pmaddwd m2, m8 -+ punpckhwd m3, m6 -+ pmaddwd m3, m8 -+ paddd m0, m2 -+ paddd m1, m3 -+ paddd m0, m9 -+ paddd m1, m9 -+ psrad m0, xm10 -+ psrad m1, xm10 -+ packusdw m0, m1 -+ pminuw m0, m11 -+ mova [dstq + xq], m0 -+ add xq, 32 -+ jl .h_loop -+ add dstq, dsq -+ mov t0, t1 -+ mov t1, t2 -+ mov t2, t3 -+ mov t3, t4 -+ add t4, msq -+ test edgeb, 8 ; LR_HAVE_BOTTOM -+ jnz .have_bottom -+ cmp hd, 3 -+ jg .have_bottom -+ mov t4, t3 -+.have_bottom: -+ dec hd -+ jg .v_loop -+ RET -+ -+INIT_YMM avx2 -+cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh -+ movifnidn wd, wm -+ movifnidn hd, hm -+ movifnidn edgeb, edgem -+ vpbroadcastd m7, [fq] -+ vpbroadcastd m8, [fq + 4] -+ vbroadcasti128 m10, [rev_w] -+ vbroadcasti128 m11, [wiener5_shufA] -+ vbroadcasti128 m12, [wiener7_shufB] -+ vbroadcasti128 m13, [wiener7_shufC] -+ vbroadcasti128 m14, [wiener7_shufD] -+ vbroadcasti128 m15, [rev_d] -+ popcnt bdmaxd, bdmaxm -+ vpbroadcastd m9, [pd_65540] -+ mov rhq, [pq_3] -+ cmp bdmaxd, 10 -+ je .bits10 -+ vpbroadcastd m9, [pd_262160] -+ mov rhq, [pq_5] -+.bits10: -+ add wq, wq -+ add srcq, wq -+ add dstq, wq -+ neg wq -+ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh -+.v_loop: -+ mov xq, wq -+ test edgeb, 1 ; LR_HAVE_LEFT -+ jz .h_extend_left -+ test leftq, leftq -+ jz .h_loop -+ movq xm4, [leftq + 2] -+ vpblendw xm4, [srcq + xq - 6], 0xf8 -+ vinserti128 m4, [srcq + xq + 10], 1 -+ add leftq, 8 -+ jmp .h_main -+.h_extend_left: -+ vbroadcasti128 m5, [srcq + xq] -+ mova m4, [srcq + xq] -+ palignr m4, m5, 10 -+ pshufb m4, [wiener7_l_shuf] -+ jmp .h_main -+.h_loop: -+ movu m4, [srcq + xq - 6] -+.h_main: -+ movu m5, [srcq + xq + 2] -+ movu m6, [srcq + xq + 6] -+ test edgeb, 2 ; LR_HAVE_RIGHT -+ jnz .h_have_right -+ cmp xd, -19*2 -+ jl .h_have_right -+ movd xm3, xd -+ vpbroadcastd m0, [pb_wiener7_l] -+ vpbroadcastd m1, [pb_wiener7_m] -+ vpbroadcastd m2, [pb_wiener7_r] -+ vpbroadcastb m3, xm3 -+ psubb m0, m3 -+ psubb m1, m3 -+ psubb m2, m3 -+ movu m3, [pb_0to31] -+ pminub m0, m3 -+ pminub m1, m3 -+ pminub m2, m3 -+ pshufb m4, m0 -+ pshufb m5, m1 -+ pshufb m6, m2 -+ cmp xd, -9*2 -+ jne .hack -+ vpbroadcastw xm3, [srcq + xq + 16] -+ vinserti128 m5, xm3, 1 -+ jmp .h_have_right -+.hack: -+ cmp xd, -1*2 -+ jne .h_have_right -+ vpbroadcastw xm5, [srcq + xq] 
-+.h_have_right: -+ pshufb m6, m10 -+ pshufb m0, m4, m11 -+ pshufb m2, m5, m12 -+ paddw m0, m2 -+ pmaddwd m0, m7 -+ pshufb m2, m4, m13 -+ pshufb m4, m14 -+ paddw m2, m4 -+ pmaddwd m2, m8 -+ pshufb m1, m6, m11 -+ pshufb m5, m11 -+ pmaddwd m1, m7 -+ pmaddwd m5, m7 -+ pshufb m3, m6, m13 -+ pshufb m6, m14 -+ paddw m3, m6 -+ pmaddwd m3, m8 -+ paddd m0, m2 -+ paddd m1, m3 -+ pshufb m1, m15 -+ paddd m1, m5 -+ movq xm4, rhq -+ pxor m5, m5 -+ paddd m0, m9 -+ paddd m1, m9 -+ psrad m0, xm4 -+ psrad m1, xm4 -+ packssdw m0, m1 -+ pmaxsw m0, m5 -+ mova [dstq + xq], m0 -+ add xq, 32 -+ jl .h_loop -+ add srcq, ssq -+ add dstq, 384*2 -+ dec hd -+ jg .v_loop -+ RET -+ -+INIT_YMM avx2 -+cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax -+ movifnidn wd, wm -+ movifnidn hd, hm -+ movifnidn edgeb, edgem -+ pxor m6, m6 -+ vpbroadcastd m7, [fq] -+ vpbroadcastw m8, [fq + 4] -+ vpbroadcastd m9, [fq + 6] -+ popcnt bdmaxd, bdmaxm -+ vpbroadcastd m10, [nd_1047552] -+ movq xm11, [pq_11] -+ cmp bdmaxd, 10 -+ je .bits10 -+ vpbroadcastd m10, [nd_1048320] -+ movq xm11, [pq_9] -+.bits10: -+ vpbroadcastw m12, bdmaxm -+ add wq, wq -+ add midq, wq -+ add dstq, wq -+ neg wq -+ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x -+ mov msq, 2*384 -+ mov t0, midq -+ mov t1, t0 -+ lea t2, [t1 + msq] -+ lea t3, [t2 + msq] -+ lea t4, [t3 + msq] -+ lea t5, [t4 + msq] -+ lea t6, [t5 + msq] -+ test edgeb, 4 ; LR_HAVE_TOP -+ jnz .have_top -+ mov t0, t3 -+ mov t1, t3 -+ mov t2, t3 -+.have_top: -+ cmp hd, 3 -+ jg .v_loop -+ test edgeb, 8 ; LR_HAVE_BOTTOM -+ jz .no_bottom0 -+ cmp hd, 1 -+ jg .v_loop -+ jmp .h3 -+.no_bottom0: -+ cmp hd, 2 -+ je .h2 -+ jns .h3 -+.h1: -+ mov t4, t3 -+.h2: -+ mov t5, t4 -+.h3: -+ mov t6, t5 -+.v_loop: -+ mov xq, wq -+.h_loop: -+ mova m1, [t0 + xq] -+ mova m2, [t1 + xq] -+ mova m3, [t5 + xq] -+ mova m4, [t6 + xq] -+ punpcklwd m0, m1, m2 -+ pmaddwd m0, m7 -+ punpckhwd m1, m2 -+ pmaddwd m1, m7 -+ punpcklwd m2, m4, m3 -+ pmaddwd m2, m7 -+ punpckhwd m4, m3 -+ pmaddwd m4, m7 -+ paddd m0, m2 -+ paddd m1, m4 -+ mova m3, [t2 + xq] -+ mova m4, [t4 + xq] -+ punpcklwd m2, m3, m4 -+ pmaddwd m2, m8 -+ punpckhwd m3, m4 -+ pmaddwd m3, m8 -+ paddd m0, m2 -+ paddd m1, m3 -+ mova m3, [t3 + xq] -+ punpcklwd m2, m3, m6 -+ pmaddwd m2, m9 -+ punpckhwd m3, m6 -+ pmaddwd m3, m9 -+ paddd m0, m2 -+ paddd m1, m3 -+ paddd m0, m10 -+ paddd m1, m10 -+ psrad m0, xm11 -+ psrad m1, xm11 -+ packusdw m0, m1 -+ pminuw m0, m12 -+ mova [dstq + xq], m0 -+ add xq, 32 -+ jl .h_loop -+ add dstq, dsq -+ mov t0, t1 -+ mov t1, t2 -+ mov t2, t3 -+ mov t3, t4 -+ mov t4, t5 -+ mov t5, t6 -+ add t6, msq -+ cmp hd, 4 -+ jg .next_row -+ test edgeb, 8 ; LR_HAVE_BOTTOM -+ jz .no_bottom -+ cmp hd, 2 -+ jg .next_row -+.no_bottom: -+ mov t6, t5 -+.next_row: -+ dec hd -+ jg .v_loop -+ RET -+ -+%endif ; ARCH_X86_64 --- -GitLab - diff --git a/0004-wiener_4.patch b/0004-wiener_4.patch index 1876e7bd1d25..e69de29bb2d1 100644 --- a/0004-wiener_4.patch +++ b/0004-wiener_4.patch @@ -1,101 +0,0 @@ -From 2d59aa7b52713b77243bda12066213fca8447f9d Mon Sep 17 00:00:00 2001 -From: "Nathan E. 
Egge" <unlord@xiph.org> -Date: Wed, 13 Jan 2021 14:54:42 -0500 -Subject: [PATCH] Enable AVX2 wiener filter HBD assembly - ---- - src/meson.build | 1 + - src/x86/looprestoration_init_tmpl.c | 41 +++++++++++++++++++++++++++-- - 2 files changed, 40 insertions(+), 2 deletions(-) - -diff --git a/src/meson.build b/src/meson.build -index ca0b406..c5c305d 100644 ---- a/src/meson.build -+++ b/src/meson.build -@@ -209,7 +209,8 @@ if is_asm_enabled - - if dav1d_bitdepths.contains('16') - libdav1d_sources_asm += files( -+ 'x86/looprestoration16_avx2.asm', - 'x86/mc16_avx2.asm', - ) - endif - -diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c -index 11ebdd1..dfc9f84 100644 ---- a/src/x86/looprestoration_init_tmpl.c -+++ b/src/x86/looprestoration_init_tmpl.c -@@ -31,9 +31,41 @@ - #include "common/intops.h" - #include "src/tables.h" - -+#if BITDEPTH != 8 -+#undef decl_wiener_filter_fn -+#define decl_wiener_filter_fn(name, ext) \ -+void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \ -+ ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \ -+ int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \ -+void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \ -+ const int16_t fv[7], int w, int h, \ -+ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \ -+static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ -+ const pixel (*const left)[4], \ -+ const pixel *lpf, const ptrdiff_t lpf_stride, \ -+ const int w, const int h, const int16_t filter[2][8], \ -+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \ -+ ALIGN_STK_64(int16_t, mid, 68 * 384,); \ -+ BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, filter[0], w, h, \ -+ edges HIGHBD_TAIL_SUFFIX); \ -+ if (edges & LR_HAVE_TOP) { \ -+ BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, filter[0], w, 2, \ -+ edges HIGHBD_TAIL_SUFFIX); \ -+ } \ -+ if (edges & LR_HAVE_BOTTOM) { \ -+ BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \ -+ lpf_stride, filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \ -+ } \ -+ BF(name##_v, ext)(dst, dst_stride, mid, filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \ -+} -+#define decl_wiener_filter_fns(ext) \ -+decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \ -+decl_wiener_filter_fn(dav1d_wiener_filter5, ext) -+#else - #define decl_wiener_filter_fns(ext) \ - decl_wiener_filter_fn(BF(dav1d_wiener_filter7, ext)); \ - decl_wiener_filter_fn(BF(dav1d_wiener_filter5, ext)) -+#endif - - #define decl_sgr_filter_fn(ext) \ - void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \ -@@ -163,11 +195,14 @@ decl_wiener_filter_fns(sse2); - decl_wiener_filter_fns(ssse3); - decl_sgr_filter_fn(ssse3) - # if ARCH_X86_64 --decl_wiener_filter_fns(avx2); - decl_sgr_filter_fn(avx2) - # endif - #endif - -+#if ARCH_X86_64 -+decl_wiener_filter_fns(avx2); -+#endif -+ - COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) { - const unsigned flags = dav1d_get_cpu_flags(); - -@@ -185,9 +220,11 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont - #endif - - if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; --#if BITDEPTH == 8 && ARCH_X86_64 -+#if ARCH_X86_64 - c->wiener[0] = BF(dav1d_wiener_filter7, avx2); - c->wiener[1] = BF(dav1d_wiener_filter5, avx2); -+# if BITDEPTH == 8 - c->selfguided = BF(sgr_filter, avx2); -+# endif - #endif - } --- -GitLab - @@ -1,9 +1,9 @@ # Maintainer: Ben Grant <ben@190n.org> -# Maintainer: Snoop05 +# Maintainer: Matej Dian <snoop05b@gmail.com> 
_testvideo=Sparks-5994fps-AV1-10bit-1920x1080-film-grain-synthesis-2013kbps.obu pkgname=dav1d-git-optimized -pkgver=r1556.05d05f9 +pkgver=r1617.baa9237 pkgrel=1 license=('BSD') pkgdesc='AV1 cross-platform Decoder, focused on speed and correctness -- latest git version compiled with optimizations' @@ -15,15 +15,11 @@ makedepends=('meson' 'git' 'nasm') source=('git+https://code.videolan.org/videolan/dav1d.git' "http://download.opencontent.netflix.com.s3.amazonaws.com/AV1/Sparks/$_testvideo" '0001-1112.patch' - '0002-wiener_2.patch' - '0003-wiener_3.patch' - '0004-wiener_4.patch') + '0002-1160.patch') sha256sums=('SKIP' 'e56e20de5bfad7ec073d3e53ea6c760d0b11ed143f087b1bc32230e4840fea87' - '421c4732d3a3fc85428263f4e4419f7b3bfc7059a29c2b81055a6ebf4345d0eb' - '32385f2316886cef326e7887a3de96fdada2ee723b269908794ed770da460626' - '1cf8db585f98ef8e63bb3f44f11679cdc554377f58964bebc7ca29aa1639d1ea' - '5e46d8d6fcf2d2cdb062368b23af534ecf123321594f9d548a6f14d80d16d981') + '83807b996384f147cea3702a1a7fcd4accfc04c3937fea11d0f74b615c37f8d2' + 'de289262c9d4e1964e7b9130a5619c6501e82a074794ced6d7da7922630973f3') pkgver () { cd dav1d @@ -32,11 +28,8 @@ pkgver () { prepare () { cd dav1d - # from https://code.videolan.org/videolan/dav1d/-/merge_requests/1112 patch -Np1 -i ${srcdir}/0001-1112.patch - patch -Np1 -i ${srcdir}/0002-wiener_2.patch - patch -Np1 -i ${srcdir}/0003-wiener_3.patch - patch -Np1 -i ${srcdir}/0004-wiener_4.patch + patch -Np1 -i ${srcdir}/0002-1160.patch } build () { @@ -48,7 +41,7 @@ build () { -Db_lto=false \ -Db_pgo=generate ninja -C build - ./build/tools/dav1d -i "$srcdir/$_testvideo" --muxer null --framethreads $(nproc) --tilethread 4 + LD_PRELOAD=./build/src/libdav1d.so ./build/tools/dav1d -i "$srcdir/$_testvideo" --muxer null --framethreads $(nproc) --tilethreads 4 --pfthreads $(nproc) meson configure build -Db_pgo=use ninja -C build } |
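Note on the central rename in 0002-1160.patch: the old hand-written `_##ext` name pasting is replaced with dav1d's BF() bitdepth mangling, and the asm labels gain a matching `_8bpc` suffix (`cglobal wiener_filter7` becomes `cglobal wiener_filter7_8bpc`), so 8-bit and 16-bit builds of the same init template can coexist in one library. The sketch below is an illustration only; it approximates what dav1d's include/common/bitdepth.h provides and is not copied from that header.

/* Sketch (assumption, not the verbatim dav1d header): how BF() mangles
 * per-bitdepth function names. The real macro lives in
 * include/common/bitdepth.h. */
#include <stdio.h>

#ifndef BITDEPTH
#define BITDEPTH 8
#endif

#if BITDEPTH == 8
#define BF(name, suffix) name##_8bpc_##suffix
#else
#define BF(name, suffix) name##_16bpc_##suffix
#endif

/* Two-step stringification so the token-pasted identifier is expanded
 * before it is turned into a string. */
#define STR_(x) #x
#define STR(x) STR_(x)

int main(void) {
    /* Matches the renamed asm label in 0002-1160.patch:
     * cglobal wiener_filter7 -> cglobal wiener_filter7_8bpc */
    puts(STR(BF(dav1d_wiener_filter7, avx2)));
    return 0;
}

Built as-is this prints dav1d_wiener_filter7_8bpc_avx2, the exact symbol form the renamed asm exports. When dav1d_bitdepths contains both '8' and '16', the template C file is compiled once per bitdepth, and the suffix is what keeps the two sets of function pointers and asm symbols from colliding.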