summary log tree commit diff stats
diff options
context:
space:
mode:
author    Matej Dian  2021-03-01 21:01:04 +0100
committer Matej Dian  2021-03-01 21:01:04 +0100
commit653c9e7e5af9ca8328653be5ecf0b6cd4d7778c6 (patch)
tree5d0e0ea04c823fa67853e3419636f7f5a6a2536b
parentebd178f0a4a474b7cfaf320fa7ffa1d7fea952d3 (diff)
downloadaur-dav1d-git-optimized.tar.gz
update patches
-rw-r--r--  .SRCINFO           6
-rw-r--r--  0001-1112.patch   52
-rw-r--r--  0002-1160.patch 1331
-rw-r--r--  PKGBUILD           9
4 files changed, 30 insertions, 1368 deletions
diff --git a/.SRCINFO b/.SRCINFO
index 8c5ce90f2f7b..220e05e93248 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -1,6 +1,6 @@
pkgbase = dav1d-git-optimized
pkgdesc = AV1 cross-platform Decoder, focused on speed and correctness -- latest git version compiled with optimizations
- pkgver = r1617.baa9237
+ pkgver = r1627.f06148e
pkgrel = 1
url = https://code.videolan.org/videolan/dav1d
arch = x86_64
@@ -15,11 +15,9 @@ pkgbase = dav1d-git-optimized
source = git+https://code.videolan.org/videolan/dav1d.git
source = http://download.opencontent.netflix.com.s3.amazonaws.com/AV1/Sparks/Sparks-5994fps-AV1-10bit-1920x1080-film-grain-synthesis-2013kbps.obu
source = 0001-1112.patch
- source = 0002-1160.patch
sha256sums = SKIP
sha256sums = e56e20de5bfad7ec073d3e53ea6c760d0b11ed143f087b1bc32230e4840fea87
- sha256sums = 83807b996384f147cea3702a1a7fcd4accfc04c3937fea11d0f74b615c37f8d2
- sha256sums = de289262c9d4e1964e7b9130a5619c6501e82a074794ced6d7da7922630973f3
+ sha256sums = SKIP
pkgname = dav1d-git-optimized
diff --git a/0001-1112.patch b/0001-1112.patch
index b60c79a4a53a..94c89f906fc2 100644
--- a/0001-1112.patch
+++ b/0001-1112.patch
@@ -1,4 +1,4 @@
-From efd27b6182c04072e1cc4b80b24aa28e78d6bfea Mon Sep 17 00:00:00 2001
+From 3ab60cb9a868286bdeee77636ad4a271432df204 Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge" <unlord@xiph.org>
Date: Mon, 21 Dec 2020 15:38:02 -0500
Subject: [PATCH 1/7] Add bpc suffix to mc functions
@@ -11,7 +11,7 @@ Subject: [PATCH 1/7] Add bpc suffix to mc functions
4 files changed, 343 insertions(+), 343 deletions(-)
diff --git a/src/x86/mc_avx2.asm b/src/x86/mc_avx2.asm
-index dda8234f..e77a6474 100644
+index 6a1ab057..b45aabea 100644
--- a/src/x86/mc_avx2.asm
+++ b/src/x86/mc_avx2.asm
@@ -110,7 +110,7 @@ cextern resize_filter
@@ -174,7 +174,7 @@ index dda8234f..e77a6474 100644
add wq, base_reg
jmp wq
%ifidn %1, put
-@@ -4026,10 +4026,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+@@ -4025,10 +4025,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
%endmacro
%macro BILIN_SCALED_FN 1
@@ -187,7 +187,7 @@ index dda8234f..e77a6474 100644
%endmacro
%if WIN64
-@@ -4114,11 +4114,11 @@ MC_8TAP_SCALED prep
+@@ -4113,11 +4113,11 @@ MC_8TAP_SCALED prep
paddd m%1, m0, m%2
%endmacro
@@ -201,7 +201,7 @@ index dda8234f..e77a6474 100644
.loop:
psrad m7, 13
psrad m0, 13
-@@ -4128,12 +4128,12 @@ cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
+@@ -4127,12 +4127,12 @@ cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
mova [tmpq+tsq*0], xm7
vextracti128 [tmpq+tsq*2], m7, 1
dec r4d
@@ -217,7 +217,7 @@ index dda8234f..e77a6474 100644
beta, filter, tmp1, delta, my, gamma
%if WIN64
sub rsp, 0xa0
-@@ -4390,9 +4390,9 @@ ALIGN function_align
+@@ -4389,9 +4389,9 @@ ALIGN function_align
add tmp2q, %1*32
%endmacro
@@ -230,7 +230,7 @@ index dda8234f..e77a6474 100644
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, dword [r6+wq*4]
-@@ -4420,9 +4420,9 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+@@ -4419,9 +4419,9 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
%define W_AVG_INC_PTR AVG_INC_PTR
@@ -243,7 +243,7 @@ index dda8234f..e77a6474 100644
tzcnt wd, wm
movifnidn hd, hm
vpbroadcastw m4, r6m ; weight
-@@ -4470,9 +4470,9 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+@@ -4469,9 +4469,9 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
add tmp1q, %1*32
%endmacro
@@ -256,7 +256,7 @@ index dda8234f..e77a6474 100644
tzcnt wd, wm
movifnidn hd, hm
mov maskq, maskmp
-@@ -4513,9 +4513,9 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+@@ -4512,9 +4512,9 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
packuswb m%1, m1
%endmacro
@@ -269,7 +269,7 @@ index dda8234f..e77a6474 100644
tzcnt wd, wm
movifnidn hd, hm
movifnidn maskq, maskmp
-@@ -4630,15 +4630,15 @@ ALIGN function_align
+@@ -4629,15 +4629,15 @@ ALIGN function_align
jg .w32
RET
@@ -289,7 +289,7 @@ index dda8234f..e77a6474 100644
jmp wq
.w2:
vpbroadcastd xm2, [maskq+2*2]
-@@ -4741,9 +4741,9 @@ ALIGN function_align
+@@ -4740,9 +4740,9 @@ ALIGN function_align
jg .w32_loop
RET
@@ -302,7 +302,7 @@ index dda8234f..e77a6474 100644
mov r6d, wd
tzcnt wd, wd
mov hd, hm
-@@ -4867,7 +4867,7 @@ ALIGN function_align
+@@ -4866,7 +4866,7 @@ ALIGN function_align
jl .w32_loop0
RET
@@ -311,7 +311,7 @@ index dda8234f..e77a6474 100644
bottomext, rightext
; we assume that the buffer (stride) is larger than width, so we can
; safely overwrite by a few bytes
-@@ -5054,7 +5054,7 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+@@ -5053,7 +5053,7 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
.end:
RET
@@ -320,7 +320,7 @@ index dda8234f..e77a6474 100644
dst_w, h, src_w, dx, mx0
sub dword mx0m, 4<<14
sub dword src_wm, 8
-@@ -5192,9 +5192,9 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
+@@ -5191,9 +5191,9 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
jg .loop_y
RET
@@ -333,7 +333,7 @@ index dda8234f..e77a6474 100644
tzcnt wd, wm
mov r6d, r7m ; sign
movifnidn hd, hm
-@@ -5398,9 +5398,9 @@ cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+@@ -5397,9 +5397,9 @@ cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
jg .w128_loop
RET
@@ -346,7 +346,7 @@ index dda8234f..e77a6474 100644
tzcnt wd, wm
mov r6d, r7m ; sign
movifnidn hd, hm
-@@ -5571,9 +5571,9 @@ cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+@@ -5570,9 +5570,9 @@ cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
jg .w128_loop
RET
@@ -1324,7 +1324,7 @@ index edbd1865..8b38daa7 100644
GitLab
-From da299eb148a2a799411132166d32613d15586578 Mon Sep 17 00:00:00 2001
+From 10b12fff1df60f403a2eb1a7025b10679619f85e Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge" <unlord@xiph.org>
Date: Mon, 21 Dec 2020 00:38:05 -0500
Subject: [PATCH 2/7] x86: mc: Add AVX2 implementation of 8tap put/prep for
@@ -2954,7 +2954,7 @@ index 00000000..ea6cfdbf
GitLab
-From 07a3064c9ebd0827177706e135f25cc8a6c25399 Mon Sep 17 00:00:00 2001
+From 9622bbfd5a5a0ef1ee94a1b28f1a9ab62e5f3031 Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge" <unlord@xiph.org>
Date: Sat, 26 Dec 2020 21:38:58 -0500
Subject: [PATCH 3/7] Enable AVX2 8tap put/prep HBD assembly
@@ -2965,15 +2965,13 @@ Subject: [PATCH 3/7] Enable AVX2 8tap put/prep HBD assembly
2 files changed, 21 insertions(+), 18 deletions(-)
diff --git a/src/meson.build b/src/meson.build
-index f9f5c120..ff62a9d8 100644
+index 25729217..e499da2f 100644
--- a/src/meson.build
+++ b/src/meson.build
-@@ -208,8 +208,9 @@ if is_asm_enabled
-
- if dav1d_bitdepths.contains('16')
- libdav1d_sources_asm += files(
+@@ -212,6 +212,7 @@ if is_asm_enabled
'x86/cdef16_avx2.asm',
'x86/cdef16_sse.asm',
+ 'x86/looprestoration16_avx2.asm',
+ 'x86/mc16_avx2.asm',
)
endif
@@ -3040,7 +3038,7 @@ index 468069c5..fcfed9be 100644
GitLab
-From 68aa4049fe160b318f0f037b0f5514aefea8a69b Mon Sep 17 00:00:00 2001
+From 3f3572bc6c91a51223756ac2baaf6e7616d09b2d Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge" <unlord@xiph.org>
Date: Sat, 26 Dec 2020 06:21:44 -0500
Subject: [PATCH 4/7] x86: mc: Add AVX2 implementation of avg/w_avg/mask for
@@ -3281,7 +3279,7 @@ index ea6cfdbf..7b4f9cdf 100644
GitLab
-From 6ba57502ac82b00d1441a36d4e12814eafd37982 Mon Sep 17 00:00:00 2001
+From 2bb19d9fdffbfd43a89059a855352c7e5155df1c Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge" <unlord@xiph.org>
Date: Sun, 27 Dec 2020 04:11:21 -0500
Subject: [PATCH 5/7] Enable AVX2 avg/w_avg/mask HBD assembly
@@ -3319,7 +3317,7 @@ index fcfed9be..70798047 100644
GitLab
-From c18338526a06c14409333e7d6ed34ae60a6dff46 Mon Sep 17 00:00:00 2001
+From fefedec18cb7561549b1bf16d82ab89f74519020 Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge" <unlord@xiph.org>
Date: Sun, 27 Dec 2020 17:25:54 -0500
Subject: [PATCH 6/7] x86: mc: Add AVX2 implementation of blend/blend_h/blend_v
@@ -3744,7 +3742,7 @@ index 7b4f9cdf..ee56da2e 100644
GitLab
-From 8770797232201d6e0e4106e799b6d76865feff77 Mon Sep 17 00:00:00 2001
+From 31784170aecfa8bf8d9d0cefe537fbec89212460 Mon Sep 17 00:00:00 2001
From: "Nathan E. Egge" <unlord@xiph.org>
Date: Sun, 27 Dec 2020 21:13:15 -0500
Subject: [PATCH 7/7] Enable AVX2 blend/blend_h/blend_v HBD assembly
diff --git a/0002-1160.patch b/0002-1160.patch
deleted file mode 100644
index 06f9b186c4ea..000000000000
--- a/0002-1160.patch
+++ /dev/null
@@ -1,1331 +0,0 @@
-From 541a62936532c3edd83edf6eb7ec83ab3e8bac5f Mon Sep 17 00:00:00 2001
-From: "Nathan E. Egge" <unlord@xiph.org>
-Date: Tue, 29 Dec 2020 06:58:33 -0500
-Subject: [PATCH 1/3] Add bpc suffix to lr functions
-
----
- src/x86/looprestoration.asm | 36 ++--
- src/x86/looprestoration_init_tmpl.c | 266 ++++++++++++++--------------
- src/x86/looprestoration_sse.asm | 60 +++----
- 3 files changed, 177 insertions(+), 185 deletions(-)
-
-diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm
-index 44aaaf49..71e3e0d2 100644
---- a/src/x86/looprestoration.asm
-+++ b/src/x86/looprestoration.asm
-@@ -88,8 +88,8 @@ SECTION .text
- DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers
-
- INIT_YMM avx2
--cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, flt, h
-+cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, flt, h
- mov fltq, fltmp
- mov edged, r8m
- mov wd, wm
-@@ -436,8 +436,8 @@ ALIGN function_align
- add dstq, dst_strideq
- ret
-
--cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, flt, h
-+cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, flt, h
- mov fltq, fltmp
- mov edged, r8m
- mov wd, wm
-@@ -554,7 +554,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
- jnz .h_have_right
- cmp r10d, -33
- jl .h_have_right
-- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
- .h_have_right:
- pshufb m0, m4, m6
- pmaddubsw m0, m12
-@@ -613,7 +613,7 @@ ALIGN function_align
- jnz .hv_have_right
- cmp r10d, -33
- jl .hv_have_right
-- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
- .hv_have_right:
- pshufb m0, m4, m6
- pmaddubsw m0, m12
-@@ -727,8 +727,8 @@ ALIGN function_align
- jl .v_loop
- ret
-
--cglobal sgr_filter_5x5, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, params, h
-+cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, params, h
- %define base r12-sgr_x_by_x-256*4
- lea r12, [sgr_x_by_x+256*4]
- mov paramsq, paramsmp
-@@ -1187,8 +1187,8 @@ ALIGN function_align
- add dstq, dst_strideq
- ret
-
--cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, params, h
-+cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, params, h
- %define base r14-sgr_x_by_x-256*4
- mov paramsq, paramsmp
- mov edged, r8m
-@@ -1298,7 +1298,7 @@ cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
- jnz .h_have_right
- cmp r10d, -17
- jl .h_have_right
-- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
-+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
- .h_have_right:
- pshufb m0, m5, m8
- pmullw m2, m0, m0
-@@ -1346,7 +1346,7 @@ ALIGN function_align
- jnz .hv_have_right
- cmp r10d, -17
- jl .hv_have_right
-- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
-+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
- .hv_have_right:
- pshufb m0, m5, m8
- pmullw m3, m0, m0
-@@ -1546,8 +1546,8 @@ ALIGN function_align
- add dstq, dst_strideq
- ret
-
--cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, params, h
-+cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, params, h
- %define base r12-sgr_x_by_x-256*4
- lea r12, [sgr_x_by_x+256*4]
- mov paramsq, paramsmp
-@@ -1573,7 +1573,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
- call .h_top
- add lpfq, lpf_strideq
- mov t2, t1
-- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).top_fixup
-+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
- add t1, 400*12
- call .h_top
- lea r10, [lpfq+lpf_strideq*4]
-@@ -1681,7 +1681,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
- jnz .h_have_right
- cmp r10d, -18
- jl .h_have_right
-- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
-+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
- .h_have_right:
- pshufb m6, m5, m9
- pshufb m4, m5, m10
-@@ -1742,7 +1742,7 @@ ALIGN function_align
- jnz .hv0_have_right
- cmp r10d, -18
- jl .hv0_have_right
-- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
-+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
- .hv0_have_right:
- pshufb m6, m5, m9
- pshufb m4, m5, m10
-@@ -1853,7 +1853,7 @@ ALIGN function_align
- jnz .hv1_have_right
- cmp r10d, -18
- jl .hv1_have_right
-- call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
-+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
- .hv1_have_right:
- pshufb m6, m5, m9
- pshufb m3, m5, m10
-diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c
-index 57727787..a9aa5acf 100644
---- a/src/x86/looprestoration_init_tmpl.c
-+++ b/src/x86/looprestoration_init_tmpl.c
-@@ -30,179 +30,171 @@
-
- #include "common/intops.h"
-
--#define WIENER_FILTER(ext) \
--void dav1d_wiener_filter7_##ext(pixel *dst, ptrdiff_t dst_stride, \
-- const pixel (*left)[4], const pixel *lpf, \
-- ptrdiff_t lpf_stride, int w, int h, \
-- const LooprestorationParams *params, \
-- enum LrEdgeFlags edges); \
--void dav1d_wiener_filter5_##ext(pixel *dst, ptrdiff_t dst_stride, \
-- const pixel (*left)[4], const pixel *lpf, \
-- ptrdiff_t lpf_stride, int w, int h, \
-- const LooprestorationParams *params, \
-- enum LrEdgeFlags edges);
-+#define decl_wiener_filter_fns(ext) \
-+decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
-+decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
-
--#define SGR_FILTER(ext) \
--void dav1d_sgr_filter_5x5_##ext(pixel *dst, ptrdiff_t dst_stride, \
-- const pixel (*left)[4], const pixel *lpf, \
-- ptrdiff_t lpf_stride, int w, int h, \
-- const LooprestorationParams *params, \
-- enum LrEdgeFlags edges); \
--void dav1d_sgr_filter_3x3_##ext(pixel *dst, ptrdiff_t dst_stride, \
-- const pixel (*left)[4], const pixel *lpf, \
-- ptrdiff_t lpf_stride, int w, int h, \
-- const LooprestorationParams *params, \
-- enum LrEdgeFlags edges); \
--void dav1d_sgr_filter_mix_##ext(pixel *dst, ptrdiff_t dst_stride, \
-- const pixel (*left)[4], const pixel *lpf, \
-- ptrdiff_t lpf_stride, int w, int h, \
-- const LooprestorationParams *params, \
-- enum LrEdgeFlags edges);
-+#define decl_sgr_filter_fns(ext) \
-+void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \
-+ const pixel (*left)[4], const pixel *lpf, \
-+ ptrdiff_t lpf_stride, int w, int h, \
-+ const LooprestorationParams *params, \
-+ enum LrEdgeFlags edges); \
-+void BF(dav1d_sgr_filter_3x3, ext)(pixel *dst, ptrdiff_t dst_stride, \
-+ const pixel (*left)[4], const pixel *lpf, \
-+ ptrdiff_t lpf_stride, int w, int h, \
-+ const LooprestorationParams *params, \
-+ enum LrEdgeFlags edges); \
-+void BF(dav1d_sgr_filter_mix, ext)(pixel *dst, ptrdiff_t dst_stride, \
-+ const pixel (*left)[4], const pixel *lpf, \
-+ ptrdiff_t lpf_stride, int w, int h, \
-+ const LooprestorationParams *params, \
-+ enum LrEdgeFlags edges);
-
- /* FIXME: Replace with a port of the AVX2 code */
- #define SGR_FILTER_OLD(ext) \
--void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
-- const pixel (*left)[4], \
-- const pixel *src, const ptrdiff_t stride, \
-- const int w, const int h, \
-- const enum LrEdgeFlags edges); \
--void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \
-- const int w, const int h, \
-- const enum LrEdgeFlags edges); \
--void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \
-- const int w, const int h, const unsigned s); \
--void dav1d_sgr_finish_filter1_##ext(coef *tmp, \
-- const pixel *src, const ptrdiff_t stride, \
-- const int32_t *a, const int16_t *b, \
-- const int w, const int h); \
-+void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
-+ const pixel (*left)[4], \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const int w, const int h, \
-+ const enum LrEdgeFlags edges); \
-+void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
-+ const int w, const int h, \
-+ const enum LrEdgeFlags edges); \
-+void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
-+ const int w, const int h, const unsigned s); \
-+void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const int32_t *a, const int16_t *b, \
-+ const int w, const int h); \
- \
- /* filter with a 3x3 box (radius=1) */ \
--static void dav1d_sgr_filter1_##ext(coef *tmp, \
-- const pixel *src, const ptrdiff_t stride, \
-- const pixel (*left)[4], \
-- const pixel *lpf, const ptrdiff_t lpf_stride, \
-- const int w, const int h, const int strength, \
-- const enum LrEdgeFlags edges) \
-+static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const pixel (*left)[4], \
-+ const pixel *lpf, const ptrdiff_t lpf_stride, \
-+ const int w, const int h, const int strength, \
-+ const enum LrEdgeFlags edges) \
- { \
- ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
- int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
- ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
- int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
- \
-- dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
-+ BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
- if (edges & LR_HAVE_TOP) \
-- dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
-- NULL, lpf, lpf_stride, w, 2, edges); \
-+ BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
-+ NULL, lpf, lpf_stride, w, 2, edges); \
- \
- if (edges & LR_HAVE_BOTTOM) \
-- dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
-- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
-- lpf_stride, w, 2, edges); \
-+ BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
-+ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
-+ lpf_stride, w, 2, edges); \
- \
-- dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \
-- dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \
-- dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \
-+ BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \
-+ BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \
-+ BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \
- } \
- \
--void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \
-- const pixel (*left)[4], \
-- const pixel *src, const ptrdiff_t stride, \
-- const int w, const int h, \
-- const enum LrEdgeFlags edges); \
--void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \
-- const int w, const int h, \
-- const enum LrEdgeFlags edges); \
--void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \
-- const int w, const int h, const int strength); \
--void dav1d_sgr_finish_filter2_##ext(coef *tmp, \
-- const pixel *src, const ptrdiff_t stride, \
-- const int32_t *a, const int16_t *b, \
-- const int w, const int h); \
-+void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \
-+ const pixel (*left)[4], \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const int w, const int h, \
-+ const enum LrEdgeFlags edges); \
-+void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
-+ const int w, const int h, \
-+ const enum LrEdgeFlags edges); \
-+void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
-+ const int w, const int h, const int strength); \
-+void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const int32_t *a, const int16_t *b, \
-+ const int w, const int h); \
- \
- /* filter with a 5x5 box (radius=2) */ \
--static void dav1d_sgr_filter2_##ext(coef *tmp, \
-- const pixel *src, const ptrdiff_t stride, \
-- const pixel (*left)[4], \
-- const pixel *lpf, const ptrdiff_t lpf_stride, \
-- const int w, const int h, const int strength, \
-- const enum LrEdgeFlags edges) \
-+static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \
-+ const pixel *src, const ptrdiff_t stride, \
-+ const pixel (*left)[4], \
-+ const pixel *lpf, const ptrdiff_t lpf_stride, \
-+ const int w, const int h, const int strength, \
-+ const enum LrEdgeFlags edges) \
- { \
- ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
- int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
- ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
- int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
- \
-- dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
-+ BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
- if (edges & LR_HAVE_TOP) \
-- dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
-- NULL, lpf, lpf_stride, w, 2, edges); \
-+ BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
-+ NULL, lpf, lpf_stride, w, 2, edges); \
- \
- if (edges & LR_HAVE_BOTTOM) \
-- dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
-- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
-- lpf_stride, w, 2, edges); \
-+ BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
-+ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
-+ lpf_stride, w, 2, edges); \
- \
-- dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \
-- dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \
-- dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \
-+ BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \
-+ BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \
-+ BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \
- } \
- \
--void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
-- const coef *t1, const int w, const int h, \
-- const int wt); \
--void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
-- const coef *t1, const coef *t2, \
-- const int w, const int h, \
-- const uint32_t wt); \
-+void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
-+ const coef *t1, const int w, const int h, \
-+ const int wt); \
-+void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
-+ const coef *t1, const coef *t2, \
-+ const int w, const int h, \
-+ const uint32_t wt); \
- \
--static void sgr_filter_5x5_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
-- const pixel (*const left)[4], \
-- const pixel *lpf, const ptrdiff_t lpf_stride, \
-- const int w, const int h, \
-- const LooprestorationParams *const params, \
-- const enum LrEdgeFlags edges) \
-+static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
-+ const pixel (*const left)[4], \
-+ const pixel *lpf, const ptrdiff_t lpf_stride, \
-+ const int w, const int h, \
-+ const LooprestorationParams *const params, \
-+ const enum LrEdgeFlags edges) \
- { \
- ALIGN_STK_32(coef, tmp, 64 * 384,); \
-- dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
-- w, h, params->sgr.s0, edges); \
-- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w0); \
-+ BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
-+ w, h, params->sgr.s0, edges); \
-+ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \
- } \
--static void sgr_filter_3x3_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
-- const pixel (*const left)[4], \
-- const pixel *lpf, const ptrdiff_t lpf_stride, \
-- const int w, const int h, \
-- const LooprestorationParams *const params, \
-- const enum LrEdgeFlags edges) \
-+static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
-+ const pixel (*const left)[4], \
-+ const pixel *lpf, const ptrdiff_t lpf_stride, \
-+ const int w, const int h, \
-+ const LooprestorationParams *const params, \
-+ const enum LrEdgeFlags edges) \
- { \
- ALIGN_STK_32(coef, tmp, 64 * 384,); \
-- dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
-- w, h, params->sgr.s1, edges); \
-- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w1); \
-+ BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
-+ w, h, params->sgr.s1, edges); \
-+ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \
- } \
--static void sgr_filter_mix_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
-- const pixel (*const left)[4], \
-- const pixel *lpf, const ptrdiff_t lpf_stride, \
-- const int w, const int h, \
-- const LooprestorationParams *const params, \
-- const enum LrEdgeFlags edges) \
-+static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
-+ const pixel (*const left)[4], \
-+ const pixel *lpf, const ptrdiff_t lpf_stride, \
-+ const int w, const int h, \
-+ const LooprestorationParams *const params, \
-+ const enum LrEdgeFlags edges) \
- { \
- ALIGN_STK_32(coef, tmp1, 64 * 384,); \
- ALIGN_STK_32(coef, tmp2, 64 * 384,); \
-- dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
-- w, h, params->sgr.s0, edges); \
-- dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
-- w, h, params->sgr.s1, edges); \
-+ BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
-+ w, h, params->sgr.s0, edges); \
-+ BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
-+ w, h, params->sgr.s1, edges); \
- const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \
-- dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
-+ BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \
- }
-
- #if BITDEPTH == 8
--WIENER_FILTER(sse2)
--WIENER_FILTER(ssse3)
-+decl_wiener_filter_fns(sse2);
-+decl_wiener_filter_fns(ssse3);
- SGR_FILTER_OLD(ssse3)
- # if ARCH_X86_64
--WIENER_FILTER(avx2)
--SGR_FILTER(avx2)
-+decl_wiener_filter_fns(avx2);
-+decl_sgr_filter_fns(avx2)
- # endif
- #endif
-
-@@ -211,25 +203,25 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
-
- if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
- #if BITDEPTH == 8
-- c->wiener[0] = dav1d_wiener_filter7_sse2;
-- c->wiener[1] = dav1d_wiener_filter5_sse2;
-+ c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
-+ c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
- #endif
-
- if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
- #if BITDEPTH == 8
-- c->wiener[0] = dav1d_wiener_filter7_ssse3;
-- c->wiener[1] = dav1d_wiener_filter5_ssse3;
-- c->sgr[0] = sgr_filter_5x5_ssse3;
-- c->sgr[1] = sgr_filter_3x3_ssse3;
-- c->sgr[2] = sgr_filter_mix_ssse3;
-+ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
-+ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
-+ c->sgr[0] = BF(sgr_filter_5x5, ssse3);
-+ c->sgr[1] = BF(sgr_filter_3x3, ssse3);
-+ c->sgr[2] = BF(sgr_filter_mix, ssse3);
- #endif
-
- if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
- #if BITDEPTH == 8 && ARCH_X86_64
-- c->wiener[0] = dav1d_wiener_filter7_avx2;
-- c->wiener[1] = dav1d_wiener_filter5_avx2;
-- c->sgr[0] = dav1d_sgr_filter_5x5_avx2;
-- c->sgr[1] = dav1d_sgr_filter_3x3_avx2;
-- c->sgr[2] = dav1d_sgr_filter_mix_avx2;
-+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
-+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
-+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
-+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
-+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
- #endif
- }
-diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm
-index 5d3ca492..4b77138d 100644
---- a/src/x86/looprestoration_sse.asm
-+++ b/src/x86/looprestoration_sse.asm
-@@ -97,8 +97,8 @@ SECTION .text
- %macro WIENER 0
- %if ARCH_X86_64
- DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
--cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, flt, h, x
-+cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, flt, h, x
- %define base 0
- mov fltq, fltmp
- mov edged, r8m
-@@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5
- %define m11 [stk+96]
- %define stk_off 112
- %endif
--cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
-+cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
- %define base r6-pb_right_ext_mask-21
- %define stk esp
- %define dstq leftq
-@@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
- add lpfq, [rsp+gprsize*1]
- call .hv_bottom
- .v1:
-- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
- RET
- .no_top:
- lea t3, [lpfq+lpf_strideq*4]
-@@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
- dec hd
- jnz .main
- .v3:
-- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
- .v2:
-- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
- jmp .v1
- .extend_right:
- movd m2, [lpfq-4]
-@@ -685,8 +685,8 @@ ALIGN function_align
- %endif
-
- %if ARCH_X86_64
--cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
-- lpf_stride, w, edge, flt, h, x
-+cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
-+ lpf_stride, w, edge, flt, h, x
- mov fltq, fltmp
- mov edged, r8m
- mov wd, wm
-@@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
- %define m11 [stk+80]
- %define stk_off 96
- %endif
--cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
-+cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
- %define stk esp
- %define leftmp [stk+28]
- %define m8 [base+pw_m16380]
-@@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
- dec hd
- jnz .main
- .v2:
-- call mangle(private_prefix %+ _wiener_filter5_ssse3).v
-+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
- add dstq, dst_strideq
- mov t4, t3
- mov t3, t2
- mov t2, t1
- movifnidn dstmp, dstq
- .v1:
-- call mangle(private_prefix %+ _wiener_filter5_ssse3).v
-+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
- jmp .end
- .h:
- %define stk esp+4
-@@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
- jnz .h_have_right
- cmp xd, -17
- jl .h_have_right
-- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
- .h_have_right:
- %macro %%h5 0
- %if cpuflag(ssse3)
-@@ -991,7 +991,7 @@ ALIGN function_align
- jnz .hv_have_right
- cmp xd, -17
- jl .hv_have_right
-- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
-+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
- .hv_have_right:
- %%h5
- mova m2, [t3+xq*2]
-@@ -1161,7 +1161,7 @@ WIENER
- %endmacro
-
- %if ARCH_X86_64
--cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
-+cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- mov xlimd, edgem
- movifnidn xd, xm
- mov hd, hm
-@@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- add xd, xlimd
- xor xlimd, 2 ; 2*!have_right
- %else
--cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
-+cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- %define wq r0m
- %define xlimd r1m
- %define hd hmp
-@@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- RET
-
- %if ARCH_X86_64
--cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
-+cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
- movifnidn edged, edgem
- %else
--cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
-+cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
- %define sumsq_baseq dword [esp+0]
- %define sum_baseq dword [esp+4]
- %define ylimd dword [esp+8]
-@@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
- jl .loop_x
- RET
-
--cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
-+cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s
- movifnidn sd, sm
- sub aq, (384+16-1)*4
- sub bq, (384+16-1)*2
-@@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
- RET
-
- %if ARCH_X86_64
--cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
-- tmp_base, src_base, a_base, b_base, x, y
-+cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
-+ tmp_base, src_base, a_base, b_base, x, y
- movifnidn wd, wm
- mov hd, hm
- mova m15, [pw_16]
-@@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
- mov b_baseq, bq
- xor xd, xd
- %else
--cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
-+cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y
- %define tmp_baseq [esp+8]
- %define src_baseq [esp+12]
- %define a_baseq [esp+16]
-@@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
- jl .loop_x
- RET
-
--cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
-+cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt
- movifnidn hd, hm
- %if ARCH_X86_32
- SETUP_PIC r6, 0
-@@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
- RET
-
- %if ARCH_X86_64
--cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
-+cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- mov edged, edgem
- movifnidn wd, wm
- mov hd, hm
- mova m10, [pb_0]
- mova m11, [pb_0_1]
- %else
--cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
-+cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
- %define edgeb byte edgem
- %define wd xd
- %define wq wd
-@@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
- RET
-
- %if ARCH_X86_64
--cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
-+cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
- movifnidn edged, edgem
- mov ylimd, edged
- %else
--cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
-+cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
- %define wm [esp+0]
- %define hm [esp+4]
- %define edgem [esp+8]
-@@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
- jmp .sum_loop_y_noload
- %endif
-
--cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
-+cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s
- movifnidn sd, sm
- sub aq, (384+16-1)*4
- sub bq, (384+16-1)*2
-@@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
- RET
-
- %if ARCH_X86_64
--cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
-+cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \
- tmp_base, src_base, a_base, b_base, x, y
- movifnidn wd, wm
- mov hd, hm
-@@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
- psrlw m11, m12, 1 ; pw_128
- pxor m13, m13
- %else
--cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
-+cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y
- %define tmp_baseq r0m
- %define src_baseq r1m
- %define a_baseq r3m
-@@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
- RET
-
- %undef t2
--cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
-+cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt
- movifnidn wd, wm
- movd m0, wtm
- %if ARCH_X86_64
---
-GitLab
-
-
-From 1a4489861e55f0e4d70df60ecf15559dfda70aee Mon Sep 17 00:00:00 2001
-From: "Nathan E. Egge" <unlord@xiph.org>
-Date: Sun, 10 Jan 2021 14:12:10 -0500
-Subject: [PATCH 2/3] x86: lr: Add AVX2 implementation of wiener filter for 16
- bpc
-
-Relative speed-ups over C code (compared with gcc-9.3.0):
-
- C AVX2
-wiener_5tap_10bpc: 194892.0 14831.9 13.14x
-wiener_5tap_12bpc: 194295.4 14828.9 13.10x
-wiener_7tap_10bpc: 194391.7 19461.4 9.99x
-wiener_7tap_12bpc: 194136.1 19418.7 10.00x
----
- src/x86/looprestoration16_avx2.asm | 480 +++++++++++++++++++++++++++++
- 1 file changed, 480 insertions(+)
- create mode 100644 src/x86/looprestoration16_avx2.asm
-
-diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm
-new file mode 100644
-index 00000000..4eb1b805
---- /dev/null
-+++ b/src/x86/looprestoration16_avx2.asm
-@@ -0,0 +1,480 @@
-+; Copyright (c) 2017-2021, The rav1e contributors
-+; Copyright (c) 2021, Nathan Egge
-+; All rights reserved.
-+;
-+; Redistribution and use in source and binary forms, with or without
-+; modification, are permitted provided that the following conditions are met:
-+;
-+; 1. Redistributions of source code must retain the above copyright notice, this
-+; list of conditions and the following disclaimer.
-+;
-+; 2. Redistributions in binary form must reproduce the above copyright notice,
-+; this list of conditions and the following disclaimer in the documentation
-+; and/or other materials provided with the distribution.
-+;
-+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+%include "config.asm"
-+%include "ext/x86/x86inc.asm"
-+
-+%if ARCH_X86_64
-+
-+SECTION_RODATA 32
-+
-+wiener5_shufB: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
-+wiener5_shufC: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13
-+wiener5_shufD: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1
-+wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-+pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-+
-+wiener7_shufC: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9
-+wiener7_shufD: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
-+wiener7_shufE: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1
-+rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
-+rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
-+wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-+
-+pq_3: dq (6 - 4) + 1
-+pq_5: dq (6 - 2) + 1
-+pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4))
-+pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2))
-+
-+pq_11: dq 12 - (6 - 4) + 1
-+pq_9: dq 12 - (6 - 2) + 1
-+nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8))
-+nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8))
-+
-+pb_wiener5_l: times 2 db 2, 3
-+pb_wiener5_r: times 2 db -6, -5
-+
-+pb_wiener7_l: times 2 db 4, 5
-+pb_wiener7_m: times 2 db -4, -3
-+pb_wiener7_r: times 2 db -8, -7
-+
-+SECTION .text
-+
-+INIT_YMM avx2
-+cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax
-+ movifnidn wd, wm
-+ movifnidn hd, hm
-+ movifnidn edgeb, edgem
-+ vbroadcasti128 m6, [wiener5_shufB]
-+ vpbroadcastd m12, [fq + 2]
-+ vbroadcasti128 m7, [wiener5_shufC]
-+ vpbroadcastw m13, [fq + 6]
-+ vbroadcasti128 m8, [wiener5_shufD]
-+ popcnt bdmaxd, bdmaxm
-+ vpbroadcastd m9, [pd_65540]
-+ movq xm10, [pq_3]
-+ cmp bdmaxd, 10
-+ je .bits10
-+ vpbroadcastd m9, [pd_262160]
-+ movq xm10, [pq_5]
-+.bits10:
-+ pxor m11, m11
-+ add wq, wq
-+ add srcq, wq
-+ add dstq, wq
-+ neg wq
-+ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x
-+.v_loop:
-+ mov xq, wq
-+ test edgeb, 1 ; LR_HAVE_LEFT
-+ jz .h_extend_left
-+ test leftq, leftq
-+ jz .h_loop
-+ movd xm4, [leftq + 4]
-+ vpblendd m4, [srcq + xq - 4], 0xfe
-+ add leftq, 8
-+ jmp .h_main
-+.h_extend_left:
-+ vbroadcasti128 m5, [srcq + xq]
-+ mova m4, [srcq + xq]
-+ palignr m4, m5, 12
-+ pshufb m4, [wiener5_l_shuf]
-+ jmp .h_main
-+.h_loop:
-+ movu m4, [srcq + xq - 4]
-+.h_main:
-+ movu m5, [srcq + xq + 4]
-+ test edgeb, 2 ; LR_HAVE_RIGHT
-+ jnz .h_have_right
-+ cmp xd, -36
-+ jl .h_have_right
-+ movd xm2, xd
-+ vpbroadcastd m0, [pb_wiener5_l]
-+ vpbroadcastd m1, [pb_wiener5_r]
-+ vpbroadcastb m2, xm2
-+ movu m3, [pb_0to31]
-+ psubb m0, m2
-+ psubb m1, m2
-+ pminub m0, m3
-+ pminub m1, m3
-+ pshufb m4, m0
-+ pshufb m5, m1
-+.h_have_right:
-+ pshufb m0, m4, m6
-+ pshufb m2, m4, m7
-+ paddw m0, m2
-+ pmaddwd m0, m12
-+ pshufb m1, m5, m6
-+ pshufb m3, m5, m7
-+ paddw m1, m3
-+ pmaddwd m1, m12
-+ pshufb m4, m8
-+ pmaddwd m4, m13
-+ pshufb m5, m8
-+ pmaddwd m5, m13
-+ paddd m0, m4
-+ paddd m1, m5
-+ paddd m0, m9
-+ paddd m1, m9
-+ psrad m0, xm10
-+ psrad m1, xm10
-+ packssdw m0, m1
-+ pmaxsw m0, m11
-+ mova [dstq + xq], m0
-+ add xq, 32
-+ jl .h_loop
-+ add srcq, ssq
-+ add dstq, 384*2
-+ dec hd
-+ jg .v_loop
-+ RET
-+
-+DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14
-+
-+INIT_YMM avx2
-+cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax
-+ movifnidn wd, wm
-+ movifnidn hd, hm
-+ movifnidn edgeb, edgem
-+ pxor m6, m6
-+ vpbroadcastd m7, [fq + 2]
-+ vpbroadcastd m8, [fq + 6]
-+ popcnt bdmaxd, bdmaxm
-+ vpbroadcastd m9, [nd_1047552]
-+ movq xm10, [pq_11]
-+ cmp bdmaxd, 10
-+ je .bits10
-+ vpbroadcastd m9, [nd_1048320]
-+ movq xm10, [pq_9]
-+.bits10:
-+ vpbroadcastw m11, bdmaxm
-+ add wq, wq
-+ add midq, wq
-+ add dstq, wq
-+ neg wq
-+ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
-+ mov msq, 2*384
-+ mov t0, midq
-+ lea t1, [t0 + msq]
-+ lea t2, [t1 + msq]
-+ lea t3, [t2 + msq]
-+ lea t4, [t3 + msq]
-+ test edgeb, 4 ; LR_HAVE_TOP
-+ jnz .have_top
-+ mov t0, t2
-+ mov t1, t2
-+.have_top:
-+ test edgeb, 8 ; LR_HAVE_BOTTOM
-+ jnz .v_loop
-+ cmp hd, 2
-+ jg .v_loop
-+ cmp hd, 1
-+ jne .limit_v
-+ mov t3, t2
-+.limit_v:
-+ mov t4, t3
-+.v_loop:
-+ mov xq, wq
-+.h_loop:
-+ mova m1, [t0 + xq]
-+ mova m2, [t1 + xq]
-+ mova m3, [t2 + xq]
-+ mova m4, [t3 + xq]
-+ mova m5, [t4 + xq]
-+ punpcklwd m0, m1, m2
-+ pmaddwd m0, m7
-+ punpckhwd m1, m2
-+ pmaddwd m1, m7
-+ punpcklwd m2, m5, m4
-+ pmaddwd m2, m7
-+ punpckhwd m5, m4
-+ pmaddwd m5, m7
-+ paddd m0, m2
-+ paddd m1, m5
-+ punpcklwd m2, m3, m6
-+ pmaddwd m2, m8
-+ punpckhwd m3, m6
-+ pmaddwd m3, m8
-+ paddd m0, m2
-+ paddd m1, m3
-+ paddd m0, m9
-+ paddd m1, m9
-+ psrad m0, xm10
-+ psrad m1, xm10
-+ packusdw m0, m1
-+ pminuw m0, m11
-+ mova [dstq + xq], m0
-+ add xq, 32
-+ jl .h_loop
-+ add dstq, dsq
-+ mov t0, t1
-+ mov t1, t2
-+ mov t2, t3
-+ mov t3, t4
-+ add t4, msq
-+ test edgeb, 8 ; LR_HAVE_BOTTOM
-+ jnz .have_bottom
-+ cmp hd, 3
-+ jg .have_bottom
-+ mov t4, t3
-+.have_bottom:
-+ dec hd
-+ jg .v_loop
-+ RET
-+
-+INIT_YMM avx2
-+cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh
-+ movifnidn wd, wm
-+ movifnidn hd, hm
-+ movifnidn edgeb, edgem
-+ vpbroadcastd m7, [fq]
-+ vpbroadcastd m8, [fq + 4]
-+ vbroadcasti128 m10, [rev_w]
-+ vbroadcasti128 m11, [wiener5_shufB]
-+ vbroadcasti128 m12, [wiener7_shufC]
-+ vbroadcasti128 m13, [wiener7_shufD]
-+ vbroadcasti128 m14, [wiener7_shufE]
-+ vbroadcasti128 m15, [rev_d]
-+ popcnt bdmaxd, bdmaxm
-+ vpbroadcastd m9, [pd_65540]
-+ mov rhq, [pq_3]
-+ cmp bdmaxd, 10
-+ je .bits10
-+ vpbroadcastd m9, [pd_262160]
-+ mov rhq, [pq_5]
-+.bits10:
-+ add wq, wq
-+ add srcq, wq
-+ add dstq, wq
-+ neg wq
-+ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh
-+.v_loop:
-+ mov xq, wq
-+ test edgeb, 1 ; LR_HAVE_LEFT
-+ jz .h_extend_left
-+ test leftq, leftq
-+ jz .h_loop
-+ movq xm4, [leftq + 2]
-+ vpblendw xm4, [srcq + xq - 6], 0xf8
-+ vinserti128 m4, [srcq + xq + 10], 1
-+ add leftq, 8
-+ jmp .h_main
-+.h_extend_left:
-+ vbroadcasti128 m5, [srcq + xq]
-+ mova m4, [srcq + xq]
-+ palignr m4, m5, 10
-+ pshufb m4, [wiener7_l_shuf]
-+ jmp .h_main
-+.h_loop:
-+ movu m4, [srcq + xq - 6]
-+.h_main:
-+ movu m5, [srcq + xq + 2]
-+ movu m6, [srcq + xq + 6]
-+ test edgeb, 2 ; LR_HAVE_RIGHT
-+ jnz .h_have_right
-+ cmp xd, -38
-+ jl .h_have_right
-+ movd xm3, xd
-+ vpbroadcastd m0, [pb_wiener7_l]
-+ vpbroadcastd m1, [pb_wiener7_m]
-+ vpbroadcastd m2, [pb_wiener7_r]
-+ vpbroadcastb m3, xm3
-+ psubb m0, m3
-+ psubb m1, m3
-+ psubb m2, m3
-+ movu m3, [pb_0to31]
-+ pminub m0, m3
-+ pminub m1, m3
-+ pminub m2, m3
-+ pshufb m4, m0
-+ pshufb m5, m1
-+ pshufb m6, m2
-+ cmp xd, -9*2
-+ jne .hack
-+ vpbroadcastw xm3, [srcq + xq + 16]
-+ vinserti128 m5, xm3, 1
-+ jmp .h_have_right
-+.hack:
-+ cmp xd, -1*2
-+ jne .h_have_right
-+ vpbroadcastw xm5, [srcq + xq]
-+.h_have_right:
-+ pshufb m6, m10
-+ pshufb m0, m4, m11
-+ pshufb m2, m5, m12
-+ paddw m0, m2
-+ pmaddwd m0, m7
-+ pshufb m2, m4, m13
-+ pshufb m4, m14
-+ paddw m2, m4
-+ pmaddwd m2, m8
-+ pshufb m1, m6, m11
-+ pshufb m5, m11
-+ pmaddwd m1, m7
-+ pmaddwd m5, m7
-+ pshufb m3, m6, m13
-+ pshufb m6, m14
-+ paddw m3, m6
-+ pmaddwd m3, m8
-+ paddd m0, m2
-+ paddd m1, m3
-+ pshufb m1, m15
-+ paddd m1, m5
-+ movq xm4, rhq
-+ pxor m5, m5
-+ paddd m0, m9
-+ paddd m1, m9
-+ psrad m0, xm4
-+ psrad m1, xm4
-+ packssdw m0, m1
-+ pmaxsw m0, m5
-+ mova [dstq + xq], m0
-+ add xq, 32
-+ jl .h_loop
-+ add srcq, ssq
-+ add dstq, 384*2
-+ dec hd
-+ jg .v_loop
-+ RET
-+
-+INIT_YMM avx2
-+cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax
-+ movifnidn wd, wm
-+ movifnidn hd, hm
-+ movifnidn edgeb, edgem
-+ pxor m6, m6
-+ vpbroadcastd m7, [fq]
-+ vpbroadcastw m8, [fq + 4]
-+ vpbroadcastd m9, [fq + 6]
-+ popcnt bdmaxd, bdmaxm
-+ vpbroadcastd m10, [nd_1047552]
-+ movq xm11, [pq_11]
-+ cmp bdmaxd, 10
-+ je .bits10
-+ vpbroadcastd m10, [nd_1048320]
-+ movq xm11, [pq_9]
-+.bits10:
-+ vpbroadcastw m12, bdmaxm
-+ add wq, wq
-+ add midq, wq
-+ add dstq, wq
-+ neg wq
-+ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
-+ mov msq, 2*384
-+ mov t0, midq
-+ mov t1, t0
-+ lea t2, [t1 + msq]
-+ lea t3, [t2 + msq]
-+ lea t4, [t3 + msq]
-+ lea t5, [t4 + msq]
-+ lea t6, [t5 + msq]
-+ test edgeb, 4 ; LR_HAVE_TOP
-+ jnz .have_top
-+ mov t0, t3
-+ mov t1, t3
-+ mov t2, t3
-+.have_top:
-+ cmp hd, 3
-+ jg .v_loop
-+ test edgeb, 8 ; LR_HAVE_BOTTOM
-+ jz .no_bottom0
-+ cmp hd, 1
-+ jg .v_loop
-+ jmp .h3
-+.no_bottom0:
-+ cmp hd, 2
-+ je .h2
-+ jns .h3
-+.h1:
-+ mov t4, t3
-+.h2:
-+ mov t5, t4
-+.h3:
-+ mov t6, t5
-+.v_loop:
-+ mov xq, wq
-+.h_loop:
-+ mova m1, [t0 + xq]
-+ mova m2, [t1 + xq]
-+ mova m3, [t5 + xq]
-+ mova m4, [t6 + xq]
-+ punpcklwd m0, m1, m2
-+ pmaddwd m0, m7
-+ punpckhwd m1, m2
-+ pmaddwd m1, m7
-+ punpcklwd m2, m4, m3
-+ pmaddwd m2, m7
-+ punpckhwd m4, m3
-+ pmaddwd m4, m7
-+ paddd m0, m2
-+ paddd m1, m4
-+ mova m3, [t2 + xq]
-+ mova m4, [t4 + xq]
-+ punpcklwd m2, m3, m4
-+ pmaddwd m2, m8
-+ punpckhwd m3, m4
-+ pmaddwd m3, m8
-+ paddd m0, m2
-+ paddd m1, m3
-+ mova m3, [t3 + xq]
-+ punpcklwd m2, m3, m6
-+ pmaddwd m2, m9
-+ punpckhwd m3, m6
-+ pmaddwd m3, m9
-+ paddd m0, m2
-+ paddd m1, m3
-+ paddd m0, m10
-+ paddd m1, m10
-+ psrad m0, xm11
-+ psrad m1, xm11
-+ packusdw m0, m1
-+ pminuw m0, m12
-+ mova [dstq + xq], m0
-+ add xq, 32
-+ jl .h_loop
-+ add dstq, dsq
-+ mov t0, t1
-+ mov t1, t2
-+ mov t2, t3
-+ mov t3, t4
-+ mov t4, t5
-+ mov t5, t6
-+ add t6, msq
-+ cmp hd, 4
-+ jg .next_row
-+ test edgeb, 8 ; LR_HAVE_BOTTOM
-+ jz .no_bottom
-+ cmp hd, 2
-+ jg .next_row
-+.no_bottom:
-+ mov t6, t5
-+.next_row:
-+ dec hd
-+ jg .v_loop
-+ RET
-+
-+%endif ; ARCH_X86_64
---
-GitLab
-
-
-From 2ce581302a1536559aa5e56018a03ac6a3770c0f Mon Sep 17 00:00:00 2001
-From: "Nathan E. Egge" <unlord@xiph.org>
-Date: Wed, 13 Jan 2021 14:54:42 -0500
-Subject: [PATCH 3/3] Enable AVX2 wiener filter HBD assembly
-
----
- src/meson.build | 1 +
- src/x86/looprestoration_init_tmpl.c | 40 +++++++++++++++++++++++++++--
- 2 files changed, 39 insertions(+), 2 deletions(-)
-
-diff --git a/src/meson.build b/src/meson.build
-index 27946501..25729217 100644
---- a/src/meson.build
-+++ b/src/meson.build
-@@ -211,6 +211,7 @@ if is_asm_enabled
- libdav1d_sources_asm += files(
- 'x86/cdef16_avx2.asm',
- 'x86/cdef16_sse.asm',
-+ 'x86/looprestoration16_avx2.asm',
- 'x86/mc16_avx2.asm',
- )
- endif
-
-diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c
-index a9aa5acf..52de0faf 100644
---- a/src/x86/looprestoration_init_tmpl.c
-+++ b/src/x86/looprestoration_init_tmpl.c
-@@ -30,9 +30,40 @@
-
- #include "common/intops.h"
-
-+#if BITDEPTH != 8
-+#define decl_wiener_filter_fn(name, ext) \
-+void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \
-+ ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \
-+ int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
-+void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \
-+ const int16_t fv[7], int w, int h, \
-+ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
-+static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
-+ const pixel (*const left)[4], \
-+ const pixel *lpf, const ptrdiff_t lpf_stride, \
-+ const int w, const int h, const LooprestorationParams *params, \
-+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \
-+ ALIGN_STK_64(int16_t, mid, 68 * 384,); \
-+ BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, params->filter[0], w, h, \
-+ edges HIGHBD_TAIL_SUFFIX); \
-+ if (edges & LR_HAVE_TOP) { \
-+ BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, params->filter[0], w, 2, \
-+ edges HIGHBD_TAIL_SUFFIX); \
-+ } \
-+ if (edges & LR_HAVE_BOTTOM) { \
-+ BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \
-+ lpf_stride, params->filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \
-+ } \
-+ BF(name##_v, ext)(dst, dst_stride, mid, params->filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \
-+}
-+#define decl_wiener_filter_fns(ext) \
-+decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \
-+decl_wiener_filter_fn(dav1d_wiener_filter5, ext)
-+#else
- #define decl_wiener_filter_fns(ext) \
- decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
- decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
-+#endif
-
- #define decl_sgr_filter_fns(ext) \
- void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \
-@@ -193,11 +224,14 @@ decl_wiener_filter_fns(sse2);
- decl_wiener_filter_fns(ssse3);
- SGR_FILTER_OLD(ssse3)
- # if ARCH_X86_64
--decl_wiener_filter_fns(avx2);
- decl_sgr_filter_fns(avx2)
- # endif
- #endif
-
-+#if ARCH_X86_64
-+decl_wiener_filter_fns(avx2);
-+#endif
-+
- COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
- const unsigned flags = dav1d_get_cpu_flags();
-
-@@ -217,11 +251,13 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
- #endif
-
- if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--#if BITDEPTH == 8 && ARCH_X86_64
-+#if ARCH_X86_64
- c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
- c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
-+# if BITDEPTH == 8
- c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
- c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
- c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
-+# endif
- #endif
- }
---
-GitLab
-
diff --git a/PKGBUILD b/PKGBUILD
index e4274b503901..a3e40306f49b 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -3,7 +3,7 @@
_testvideo=Sparks-5994fps-AV1-10bit-1920x1080-film-grain-synthesis-2013kbps.obu
pkgname=dav1d-git-optimized
-pkgver=r1617.baa9237
+pkgver=r1627.f06148e
pkgrel=1
license=('BSD')
pkgdesc='AV1 cross-platform Decoder, focused on speed and correctness -- latest git version compiled with optimizations'
@@ -14,12 +14,10 @@ conflicts=('dav1d' 'dav1d-git')
makedepends=('meson' 'git' 'nasm')
source=('git+https://code.videolan.org/videolan/dav1d.git'
"http://download.opencontent.netflix.com.s3.amazonaws.com/AV1/Sparks/$_testvideo"
- '0001-1112.patch'
- '0002-1160.patch')
+ '0001-1112.patch')
sha256sums=('SKIP'
'e56e20de5bfad7ec073d3e53ea6c760d0b11ed143f087b1bc32230e4840fea87'
- '83807b996384f147cea3702a1a7fcd4accfc04c3937fea11d0f74b615c37f8d2'
- 'de289262c9d4e1964e7b9130a5619c6501e82a074794ced6d7da7922630973f3')
+ 'SKIP')
pkgver () {
cd dav1d
@@ -29,7 +27,6 @@ pkgver () {
prepare () {
cd dav1d
patch -Np1 -i ${srcdir}/0001-1112.patch
- patch -Np1 -i ${srcdir}/0002-1160.patch
}
build () {