author    Ben Grant    2021-01-19 10:31:16 -0800
committer Ben Grant    2021-01-19 10:31:16 -0800
commit    0ef85b5b789f87392cd0ae19291693c6c718b3c3 (patch)
tree      e6dd5699565b7a8d939c2465ee90047a699e9ab1
parent    e5a701db715ae63ab1d888a3358221752a696abc (diff)
download  aur-0ef85b5b789f87392cd0ae19291693c6c718b3c3.tar.gz
Add patches and PGO support from SnoopCat
-rw-r--r--  0001-1112.patch       3780
-rw-r--r--  0002-wiener_2.patch    661
-rw-r--r--  0003-wiener_3.patch    492
-rw-r--r--  0004-wiener_4.patch    101
-rw-r--r--  PKGBUILD                42
5 files changed, 5067 insertions, 9 deletions
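
The PKGBUILD diff itself is not reproduced in this excerpt, only the diffstat above. As a rough sketch (not the actual change in this commit), PGO for dav1d's meson build is typically wired into a PKGBUILD as a two-pass build: compile instrumented with -Db_pgo=generate, run a representative decode to collect profiles, then rebuild with -Db_pgo=use. The build directory and sample clip path below are placeholders.

build() {
  cd dav1d
  # Pass 1: instrumented build that emits profile data at runtime
  meson setup build --buildtype=release -Db_pgo=generate
  ninja -C build
  # Train on a representative workload (sample.ivf is a placeholder clip)
  ./build/tools/dav1d -i "$srcdir/sample.ivf" --muxer null -o /dev/null
  # Pass 2: rebuild using the collected profiles
  meson configure build -Db_pgo=use
  ninja -C build
}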
diff --git a/0001-1112.patch b/0001-1112.patch
new file mode 100644
index 000000000000..e5ed26b240a9
--- /dev/null
+++ b/0001-1112.patch
@@ -0,0 +1,3780 @@
+From efd27b6182c04072e1cc4b80b24aa28e78d6bfea Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Mon, 21 Dec 2020 15:38:02 -0500
+Subject: [PATCH 1/7] Add bpc suffix to mc functions
+
+---
+ src/x86/mc_avx2.asm | 132 +++++++--------
+ src/x86/mc_avx512.asm | 60 +++----
+ src/x86/mc_init_tmpl.c | 362 ++++++++++++++++++++---------------------
+ src/x86/mc_sse.asm | 132 +++++++--------
+ 4 files changed, 343 insertions(+), 343 deletions(-)
+
+diff --git a/src/x86/mc_avx2.asm b/src/x86/mc_avx2.asm
+index dda8234f..e77a6474 100644
+--- a/src/x86/mc_avx2.asm
++++ b/src/x86/mc_avx2.asm
+@@ -110,7 +110,7 @@ cextern resize_filter
+ %endmacro
+
+ %macro HV_JMP_TABLE 5-*
+- %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
++ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+@@ -176,8 +176,8 @@ cextern resize_filter
+ %endrep
+ %endmacro
+
+-%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
+-%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
++%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put)
++%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep)
+
+ %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+@@ -187,22 +187,22 @@ HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+ HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+ HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
+ HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
+-SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
+-SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32
+-BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
+-BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
++SCALED_JMP_TABLE put_8tap_scaled_8bpc_avx2, 2, 4, 8, 16, 32, 64, 128
++SCALED_JMP_TABLE prep_8tap_scaled_8bpc_avx2, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE avg_8bpc_avx2, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE w_avg_8bpc_avx2, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE mask_8bpc_avx2, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE w_mask_420_8bpc_avx2, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE w_mask_422_8bpc_avx2, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE w_mask_444_8bpc_avx2, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE blend_8bpc_avx2, 4, 8, 16, 32
++BIDIR_JMP_TABLE blend_v_8bpc_avx2, 2, 4, 8, 16, 32
++BIDIR_JMP_TABLE blend_h_8bpc_avx2, 2, 4, 8, 16, 32, 32, 32
+
+ SECTION .text
+
+ INIT_XMM avx2
+-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
++cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ lea r7, [put_avx2]
+ tzcnt wd, wm
+@@ -769,7 +769,7 @@ INIT_YMM avx2
+ %endif
+ RET
+
+-cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
++cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep%+SUFFIX]
+ tzcnt wd, wm
+@@ -1439,7 +1439,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ %assign FILTER_SHARP (2*15 << 16) | 3*15
+
+ %macro FN 4 ; fn, type, type_h, type_v
+-cglobal %1_%2
++cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+ %ifidn %3, %4
+ mov t1d, t0d
+@@ -1447,7 +1447,7 @@ cglobal %1_%2
+ mov t1d, FILTER_%4
+ %endif
+ %ifnidn %2, regular ; skip the jump in the last filter
+- jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
++ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+ %endif
+ %endmacro
+
+@@ -1469,7 +1469,7 @@ PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+ PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+ PUT_8TAP_FN regular, REGULAR, REGULAR
+
+-cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
++cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+@@ -2135,7 +2135,7 @@ PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+ PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+ PREP_8TAP_FN regular, REGULAR, REGULAR
+
+-cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
++cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+@@ -2725,26 +2725,26 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ %ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
+-cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
++cglobal put_8tap_scaled_8bpc, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+-cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
++cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+ %else
+ %assign isprep 1
+ %if required_stack_alignment <= STACK_ALIGNMENT
+-cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
++cglobal prep_8tap_scaled_8bpc, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ %xdefine tmp_stridem r14q
+ %else
+-cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
++cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+120]
+ %endif
+ %xdefine base_reg r11
+ %define rndshift 6
+ %endif
+- lea base_reg, [%1_8tap_scaled_avx2]
+-%define base base_reg-%1_8tap_scaled_avx2
++ lea base_reg, [%1_8tap_scaled_8bpc_avx2]
++%define base base_reg-%1_8tap_scaled_8bpc_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+ %if isprep && UNIX64
+@@ -2807,7 +2807,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+- movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
++ movzx wd, word [base+%1_8tap_scaled_8bpc_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+ %ifidn %1, put
+@@ -3280,7 +3280,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ pmulhrsw m3, m12
+ jmp .vloop
+ .dy1:
+- movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
++ movzx wd, word [base+%1_8tap_scaled_8bpc_avx2_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+ %ifidn %1, put
+@@ -3647,7 +3647,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ pblendw m3, m4, 0xaa
+ jmp .dy1_vloop
+ .dy2:
+- movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
++ movzx wd, word [base+%1_8tap_scaled_8bpc_avx2_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+ %ifidn %1, put
+@@ -4026,10 +4026,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ %endmacro
+
+ %macro BILIN_SCALED_FN 1
+-cglobal %1_bilin_scaled
++cglobal %1_bilin_scaled_8bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, t0d
+- jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
++ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
+ %endmacro
+
+ %if WIN64
+@@ -4114,11 +4114,11 @@ MC_8TAP_SCALED prep
+ paddd m%1, m0, m%2
+ %endmacro
+
+-cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
++cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
+ %if WIN64
+ sub rsp, 0xa0
+ %endif
+- call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
++ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
+ .loop:
+ psrad m7, 13
+ psrad m0, 13
+@@ -4128,12 +4128,12 @@ cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
+ mova [tmpq+tsq*0], xm7
+ vextracti128 [tmpq+tsq*2], m7, 1
+ dec r4d
+- jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end
+- call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
++ jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end
++ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+
+-cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
++cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
+ beta, filter, tmp1, delta, my, gamma
+ %if WIN64
+ sub rsp, 0xa0
+@@ -4390,9 +4390,9 @@ ALIGN function_align
+ add tmp2q, %1*32
+ %endmacro
+
+-cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+-%define base r6-avg %+ SUFFIX %+ _table
+- lea r6, [avg %+ SUFFIX %+ _table]
++cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
++%define base r6-avg_8bpc %+ SUFFIX %+ _table
++ lea r6, [avg_8bpc %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+@@ -4420,9 +4420,9 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+
+ %define W_AVG_INC_PTR AVG_INC_PTR
+
+-cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+-%define base r6-w_avg %+ SUFFIX %+ _table
+- lea r6, [w_avg %+ SUFFIX %+ _table]
++cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
++%define base r6-w_avg_8bpc %+ SUFFIX %+ _table
++ lea r6, [w_avg_8bpc %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+@@ -4470,9 +4470,9 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+ add tmp1q, %1*32
+ %endmacro
+
+-cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+-%define base r7-mask %+ SUFFIX %+ _table
+- lea r7, [mask %+ SUFFIX %+ _table]
++cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
++%define base r7-mask_8bpc %+ SUFFIX %+ _table
++ lea r7, [mask_8bpc %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+@@ -4513,9 +4513,9 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ packuswb m%1, m1
+ %endmacro
+
+-cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
+-%define base r6-blend_avx2_table
+- lea r6, [blend_avx2_table]
++cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
++%define base r6-blend_8bpc_avx2_table
++ lea r6, [blend_8bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movifnidn maskq, maskmp
+@@ -4630,15 +4630,15 @@ ALIGN function_align
+ jg .w32
+ RET
+
+-cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
+-%define base r5-blend_v_avx2_table
+- lea r5, [blend_v_avx2_table]
++cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
++%define base r5-blend_v_8bpc_avx2_table
++ lea r5, [blend_v_8bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+- add maskq, obmc_masks-blend_v_avx2_table
++ add maskq, obmc_masks-blend_v_8bpc_avx2_table
+ jmp wq
+ .w2:
+ vpbroadcastd xm2, [maskq+2*2]
+@@ -4741,9 +4741,9 @@ ALIGN function_align
+ jg .w32_loop
+ RET
+
+-cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
+-%define base r5-blend_h_avx2_table
+- lea r5, [blend_h_avx2_table]
++cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask
++%define base r5-blend_h_8bpc_avx2_table
++ lea r5, [blend_h_8bpc_avx2_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ mov hd, hm
+@@ -4867,7 +4867,7 @@ ALIGN function_align
+ jl .w32_loop0
+ RET
+
+-cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
++cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ bottomext, rightext
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+@@ -5054,7 +5054,7 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ .end:
+ RET
+
+-cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
++cglobal resize_8bpc, 6, 14, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+@@ -5192,9 +5192,9 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
+ jg .loop_y
+ RET
+
+-cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+-%define base r7-w_mask_420_avx2_table
+- lea r7, [w_mask_420_avx2_table]
++cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
++%define base r7-w_mask_420_8bpc_avx2_table
++ lea r7, [w_mask_420_8bpc_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+@@ -5398,9 +5398,9 @@ cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ jg .w128_loop
+ RET
+
+-cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+-%define base r7-w_mask_422_avx2_table
+- lea r7, [w_mask_422_avx2_table]
++cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
++%define base r7-w_mask_422_8bpc_avx2_table
++ lea r7, [w_mask_422_8bpc_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+@@ -5571,9 +5571,9 @@ cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ jg .w128_loop
+ RET
+
+-cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
+-%define base r7-w_mask_444_avx2_table
+- lea r7, [w_mask_444_avx2_table]
++cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
++%define base r7-w_mask_444_8bpc_avx2_table
++ lea r7, [w_mask_444_8bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+diff --git a/src/x86/mc_avx512.asm b/src/x86/mc_avx512.asm
+index a13c2423..72525f86 100644
+--- a/src/x86/mc_avx512.asm
++++ b/src/x86/mc_avx512.asm
+@@ -146,7 +146,7 @@ cextern mc_subpel_filters
+ %endmacro
+
+ %macro HV_JMP_TABLE 5-*
+- %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
++ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+@@ -188,19 +188,19 @@ cextern mc_subpel_filters
+ %endrep
+ %endmacro
+
+-%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep)
++%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)
+
+ %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+ BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
+ HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+ HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE avg_avx512icl, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE w_avg_avx512icl, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE w_mask_422_avx512icl, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE w_mask_444_avx512icl, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE avg_8bpc_avx512icl, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE w_avg_8bpc_avx512icl, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE mask_8bpc_avx512icl, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE w_mask_420_8bpc_avx512icl, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE w_mask_422_8bpc_avx512icl, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE w_mask_444_8bpc_avx512icl, 4, 8, 16, 32, 64, 128
+
+ SECTION .text
+
+@@ -221,7 +221,7 @@ INIT_ZMM cpuname
+ DECLARE_REG_TMP 3, 5, 6
+
+ INIT_ZMM avx512icl
+-cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
++cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea t2, [prep_avx512icl]
+ tzcnt wd, wm
+@@ -772,7 +772,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ %assign FILTER_SHARP (2*15 << 16) | 3*15
+
+ %macro FN 4 ; fn, type, type_h, type_v
+-cglobal %1_%2
++cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+ %ifidn %3, %4
+ mov t1d, t0d
+@@ -780,7 +780,7 @@ cglobal %1_%2
+ mov t1d, FILTER_%4
+ %endif
+ %ifnidn %2, regular ; skip the jump in the last filter
+- jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
++ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+ %endif
+ %endmacro
+
+@@ -829,7 +829,7 @@ PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+ PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+ PREP_8TAP_FN regular, REGULAR, REGULAR
+
+-cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
++cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+@@ -1753,9 +1753,9 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ add tmp2q, %1*mmsize
+ %endmacro
+
+-cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+-%define base r6-avg_avx512icl_table
+- lea r6, [avg_avx512icl_table]
++cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
++%define base r6-avg_8bpc_avx512icl_table
++ lea r6, [avg_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+@@ -1783,9 +1783,9 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+
+ %define W_AVG_INC_PTR AVG_INC_PTR
+
+-cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+-%define base r6-w_avg_avx512icl_table
+- lea r6, [w_avg_avx512icl_table]
++cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
++%define base r6-w_avg_8bpc_avx512icl_table
++ lea r6, [w_avg_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+@@ -1837,9 +1837,9 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+ add tmp1q, %1*64
+ %endmacro
+
+-cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+-%define base r7-mask_avx512icl_table
+- lea r7, [mask_avx512icl_table]
++cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
++%define base r7-mask_8bpc_avx512icl_table
++ lea r7, [mask_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+@@ -1877,9 +1877,9 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ packuswb m%1, m1
+ %endmacro
+
+-cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+-%define base r7-w_mask_420_avx512icl_table
+- lea r7, [w_mask_420_avx512icl_table]
++cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
++%define base r7-w_mask_420_8bpc_avx512icl_table
++ lea r7, [w_mask_420_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+@@ -2070,9 +2070,9 @@ cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ jg .w128_loop
+ RET
+
+-cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+-%define base r7-w_mask_422_avx512icl_table
+- lea r7, [w_mask_422_avx512icl_table]
++cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
++%define base r7-w_mask_422_8bpc_avx512icl_table
++ lea r7, [w_mask_422_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+@@ -2243,9 +2243,9 @@ cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ jg .w128_loop
+ RET
+
+-cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
+-%define base r7-w_mask_444_avx512icl_table
+- lea r7, [w_mask_444_avx512icl_table]
++cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
++%define base r7-w_mask_444_8bpc_avx512icl_table
++ lea r7, [w_mask_444_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c
+index 47f0104a..468069c5 100644
+--- a/src/x86/mc_init_tmpl.c
++++ b/src/x86/mc_init_tmpl.c
+@@ -28,157 +28,157 @@
+ #include "src/cpu.h"
+ #include "src/mc.h"
+
+-decl_mc_fn(dav1d_put_8tap_regular_avx2);
+-decl_mc_fn(dav1d_put_8tap_regular_ssse3);
+-decl_mc_fn(dav1d_put_8tap_regular_smooth_avx2);
+-decl_mc_fn(dav1d_put_8tap_regular_smooth_ssse3);
+-decl_mc_fn(dav1d_put_8tap_regular_sharp_avx2);
+-decl_mc_fn(dav1d_put_8tap_regular_sharp_ssse3);
+-decl_mc_fn(dav1d_put_8tap_smooth_avx2);
+-decl_mc_fn(dav1d_put_8tap_smooth_ssse3);
+-decl_mc_fn(dav1d_put_8tap_smooth_regular_avx2);
+-decl_mc_fn(dav1d_put_8tap_smooth_regular_ssse3);
+-decl_mc_fn(dav1d_put_8tap_smooth_sharp_avx2);
+-decl_mc_fn(dav1d_put_8tap_smooth_sharp_ssse3);
+-decl_mc_fn(dav1d_put_8tap_sharp_avx2);
+-decl_mc_fn(dav1d_put_8tap_sharp_ssse3);
+-decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2);
+-decl_mc_fn(dav1d_put_8tap_sharp_regular_ssse3);
+-decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2);
+-decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3);
+-decl_mc_fn(dav1d_put_bilin_avx2);
+-decl_mc_fn(dav1d_put_bilin_ssse3);
+-
+-decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
+-decl_mct_fn(dav1d_prep_8tap_regular_avx2);
+-decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
+-decl_mct_fn(dav1d_prep_8tap_regular_sse2);
+-decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl);
+-decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
+-decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
+-decl_mct_fn(dav1d_prep_8tap_regular_smooth_sse2);
+-decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl);
+-decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
+-decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
+-decl_mct_fn(dav1d_prep_8tap_regular_sharp_sse2);
+-decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl);
+-decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
+-decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
+-decl_mct_fn(dav1d_prep_8tap_smooth_sse2);
+-decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl);
+-decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
+-decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
+-decl_mct_fn(dav1d_prep_8tap_smooth_regular_sse2);
+-decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl);
+-decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
+-decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
+-decl_mct_fn(dav1d_prep_8tap_smooth_sharp_sse2);
+-decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl);
+-decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
+-decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
+-decl_mct_fn(dav1d_prep_8tap_sharp_sse2);
+-decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl);
+-decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
+-decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
+-decl_mct_fn(dav1d_prep_8tap_sharp_regular_sse2);
+-decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl);
+-decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
+-decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
+-decl_mct_fn(dav1d_prep_8tap_sharp_smooth_sse2);
+-decl_mct_fn(dav1d_prep_bilin_avx512icl);
+-decl_mct_fn(dav1d_prep_bilin_avx2);
+-decl_mct_fn(dav1d_prep_bilin_ssse3);
+-decl_mct_fn(dav1d_prep_bilin_sse2);
+-
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_ssse3);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_ssse3);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_ssse3);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_ssse3);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_ssse3);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_ssse3);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_ssse3);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_ssse3);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_ssse3);
+-decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2);
+-decl_mc_scaled_fn(dav1d_put_bilin_scaled_ssse3);
+-
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_ssse3);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_ssse3);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_ssse3);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_ssse3);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_ssse3);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_ssse3);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_ssse3);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_ssse3);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_ssse3);
+-decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2);
+-decl_mct_scaled_fn(dav1d_prep_bilin_scaled_ssse3);
+-
+-decl_avg_fn(dav1d_avg_avx512icl);
+-decl_avg_fn(dav1d_avg_avx2);
+-decl_avg_fn(dav1d_avg_ssse3);
+-decl_w_avg_fn(dav1d_w_avg_avx512icl);
+-decl_w_avg_fn(dav1d_w_avg_avx2);
+-decl_w_avg_fn(dav1d_w_avg_ssse3);
+-decl_mask_fn(dav1d_mask_avx512icl);
+-decl_mask_fn(dav1d_mask_avx2);
+-decl_mask_fn(dav1d_mask_ssse3);
+-decl_w_mask_fn(dav1d_w_mask_420_avx512icl);
+-decl_w_mask_fn(dav1d_w_mask_420_avx2);
+-decl_w_mask_fn(dav1d_w_mask_420_ssse3);
+-decl_w_mask_fn(dav1d_w_mask_422_avx512icl);
+-decl_w_mask_fn(dav1d_w_mask_422_avx2);
+-decl_w_mask_fn(dav1d_w_mask_444_avx512icl);
+-decl_w_mask_fn(dav1d_w_mask_444_avx2);
+-decl_blend_fn(dav1d_blend_avx2);
+-decl_blend_fn(dav1d_blend_ssse3);
+-decl_blend_dir_fn(dav1d_blend_v_avx2);
+-decl_blend_dir_fn(dav1d_blend_v_ssse3);
+-decl_blend_dir_fn(dav1d_blend_h_avx2);
+-decl_blend_dir_fn(dav1d_blend_h_ssse3);
+-
+-decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
+-decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
+-decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3);
+-decl_warp8x8_fn(dav1d_warp_affine_8x8_sse2);
+-decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
+-decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
+-decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3);
+-decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2);
+-
+-decl_emu_edge_fn(dav1d_emu_edge_avx2);
+-decl_emu_edge_fn(dav1d_emu_edge_ssse3);
+-
+-decl_resize_fn(dav1d_resize_avx2);
+-decl_resize_fn(dav1d_resize_ssse3);
++decl_mc_fn(BF(dav1d_put_8tap_regular, avx2));
++decl_mc_fn(BF(dav1d_put_8tap_regular, ssse3));
++decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, avx2));
++decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, ssse3));
++decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, avx2));
++decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, ssse3));
++decl_mc_fn(BF(dav1d_put_8tap_smooth, avx2));
++decl_mc_fn(BF(dav1d_put_8tap_smooth, ssse3));
++decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, avx2));
++decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, ssse3));
++decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, avx2));
++decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, ssse3));
++decl_mc_fn(BF(dav1d_put_8tap_sharp, avx2));
++decl_mc_fn(BF(dav1d_put_8tap_sharp, ssse3));
++decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, avx2));
++decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, ssse3));
++decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, avx2));
++decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, ssse3));
++decl_mc_fn(BF(dav1d_put_bilin, avx2));
++decl_mc_fn(BF(dav1d_put_bilin, ssse3));
++
++decl_mct_fn(BF(dav1d_prep_8tap_regular, avx512icl));
++decl_mct_fn(BF(dav1d_prep_8tap_regular, avx2));
++decl_mct_fn(BF(dav1d_prep_8tap_regular, ssse3));
++decl_mct_fn(BF(dav1d_prep_8tap_regular, sse2));
++decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, avx512icl));
++decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, avx2));
++decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, ssse3));
++decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, sse2));
++decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, avx512icl));
++decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, avx2));
++decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, ssse3));
++decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, sse2));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth, avx512icl));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth, avx2));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth, ssse3));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth, sse2));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, avx512icl));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, avx2));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, ssse3));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, sse2));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, avx512icl));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, avx2));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, ssse3));
++decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, sse2));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp, avx512icl));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp, avx2));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp, ssse3));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp, sse2));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, avx512icl));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, avx2));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, ssse3));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, sse2));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, avx512icl));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, avx2));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, ssse3));
++decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, sse2));
++decl_mct_fn(BF(dav1d_prep_bilin, avx512icl));
++decl_mct_fn(BF(dav1d_prep_bilin, avx2));
++decl_mct_fn(BF(dav1d_prep_bilin, ssse3));
++decl_mct_fn(BF(dav1d_prep_bilin, sse2));
++
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular, avx2));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular, ssse3));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular_smooth, avx2));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular_smooth, ssse3));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular_sharp, avx2));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular_sharp, ssse3));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth, avx2));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth, ssse3));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth_regular, avx2));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth_regular, ssse3));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth_sharp, avx2));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth_sharp, ssse3));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp, avx2));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp, ssse3));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp_regular, avx2));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp_regular, ssse3));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp_smooth, avx2));
++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp_smooth, ssse3));
++decl_mc_scaled_fn(BF(dav1d_put_bilin_scaled, avx2));
++decl_mc_scaled_fn(BF(dav1d_put_bilin_scaled, ssse3));
++
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular, avx2));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular, ssse3));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular_smooth, avx2));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular_smooth, ssse3));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular_sharp, avx2));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular_sharp, ssse3));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth, avx2));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth, ssse3));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth_regular, avx2));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth_regular, ssse3));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth_sharp, avx2));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth_sharp, ssse3));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp, avx2));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp, ssse3));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp_regular, avx2));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp_regular, ssse3));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp_smooth, avx2));
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp_smooth, ssse3));
++decl_mct_scaled_fn(BF(dav1d_prep_bilin_scaled, avx2));
++decl_mct_scaled_fn(BF(dav1d_prep_bilin_scaled, ssse3));
++
++decl_avg_fn(BF(dav1d_avg, avx512icl));
++decl_avg_fn(BF(dav1d_avg, avx2));
++decl_avg_fn(BF(dav1d_avg, ssse3));
++decl_w_avg_fn(BF(dav1d_w_avg, avx512icl));
++decl_w_avg_fn(BF(dav1d_w_avg, avx2));
++decl_w_avg_fn(BF(dav1d_w_avg, ssse3));
++decl_mask_fn(BF(dav1d_mask, avx512icl));
++decl_mask_fn(BF(dav1d_mask, avx2));
++decl_mask_fn(BF(dav1d_mask, ssse3));
++decl_w_mask_fn(BF(dav1d_w_mask_420, avx512icl));
++decl_w_mask_fn(BF(dav1d_w_mask_420, avx2));
++decl_w_mask_fn(BF(dav1d_w_mask_420, ssse3));
++decl_w_mask_fn(BF(dav1d_w_mask_422, avx512icl));
++decl_w_mask_fn(BF(dav1d_w_mask_422, avx2));
++decl_w_mask_fn(BF(dav1d_w_mask_444, avx512icl));
++decl_w_mask_fn(BF(dav1d_w_mask_444, avx2));
++decl_blend_fn(BF(dav1d_blend, avx2));
++decl_blend_fn(BF(dav1d_blend, ssse3));
++decl_blend_dir_fn(BF(dav1d_blend_v, avx2));
++decl_blend_dir_fn(BF(dav1d_blend_v, ssse3));
++decl_blend_dir_fn(BF(dav1d_blend_h, avx2));
++decl_blend_dir_fn(BF(dav1d_blend_h, ssse3));
++
++decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, avx2));
++decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
++decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, ssse3));
++decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse2));
++decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, avx2));
++decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));
++decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, ssse3));
++decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse2));
++
++decl_emu_edge_fn(BF(dav1d_emu_edge, avx2));
++decl_emu_edge_fn(BF(dav1d_emu_edge, ssse3));
++
++decl_resize_fn(BF(dav1d_resize, avx2));
++decl_resize_fn(BF(dav1d_resize, ssse3));
+
+ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ #define init_mc_fn(type, name, suffix) \
+- c->mc[type] = dav1d_put_##name##_##suffix
++ c->mc[type] = BF(dav1d_put_##name, suffix)
+ #define init_mct_fn(type, name, suffix) \
+- c->mct[type] = dav1d_prep_##name##_##suffix
++ c->mct[type] = BF(dav1d_prep_##name, suffix)
+ #define init_mc_scaled_fn(type, name, suffix) \
+- c->mc_scaled[type] = dav1d_put_##name##_##suffix
++ c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
+ #define init_mct_scaled_fn(type, name, suffix) \
+- c->mct_scaled[type] = dav1d_prep_##name##_##suffix
++ c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
+
+ const unsigned flags = dav1d_get_cpu_flags();
+
+@@ -197,8 +197,8 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);
+
+- c->warp8x8 = dav1d_warp_affine_8x8_sse2;
+- c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
++ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
++ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
+ #endif
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
+@@ -251,27 +251,27 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+ #endif
+
+- c->avg = dav1d_avg_ssse3;
+- c->w_avg = dav1d_w_avg_ssse3;
+- c->mask = dav1d_mask_ssse3;
+- c->w_mask[2] = dav1d_w_mask_420_ssse3;
+- c->blend = dav1d_blend_ssse3;
+- c->blend_v = dav1d_blend_v_ssse3;
+- c->blend_h = dav1d_blend_h_ssse3;
++ c->avg = BF(dav1d_avg, ssse3);
++ c->w_avg = BF(dav1d_w_avg, ssse3);
++ c->mask = BF(dav1d_mask, ssse3);
++ c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
++ c->blend = BF(dav1d_blend, ssse3);
++ c->blend_v = BF(dav1d_blend_v, ssse3);
++ c->blend_h = BF(dav1d_blend_h, ssse3);
+
+- c->warp8x8 = dav1d_warp_affine_8x8_ssse3;
+- c->warp8x8t = dav1d_warp_affine_8x8t_ssse3;
++ c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3);
++ c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
+
+- c->emu_edge = dav1d_emu_edge_ssse3;
+- c->resize = dav1d_resize_ssse3;
++ c->emu_edge = BF(dav1d_emu_edge, ssse3);
++ c->resize = BF(dav1d_resize, ssse3);
+ #endif
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
+ return;
+
+ #if BITDEPTH == 8
+- c->warp8x8 = dav1d_warp_affine_8x8_sse4;
+- c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
++ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4);
++ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
+ #endif
+
+ #if ARCH_X86_64
+@@ -323,21 +323,21 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+- c->avg = dav1d_avg_avx2;
+- c->w_avg = dav1d_w_avg_avx2;
+- c->mask = dav1d_mask_avx2;
+- c->w_mask[0] = dav1d_w_mask_444_avx2;
+- c->w_mask[1] = dav1d_w_mask_422_avx2;
+- c->w_mask[2] = dav1d_w_mask_420_avx2;
+- c->blend = dav1d_blend_avx2;
+- c->blend_v = dav1d_blend_v_avx2;
+- c->blend_h = dav1d_blend_h_avx2;
+-
+- c->warp8x8 = dav1d_warp_affine_8x8_avx2;
+- c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
+-
+- c->emu_edge = dav1d_emu_edge_avx2;
+- c->resize = dav1d_resize_avx2;
++ c->avg = BF(dav1d_avg, avx2);
++ c->w_avg = BF(dav1d_w_avg, avx2);
++ c->mask = BF(dav1d_mask, avx2);
++ c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
++ c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
++ c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
++ c->blend = BF(dav1d_blend, avx2);
++ c->blend_v = BF(dav1d_blend_v, avx2);
++ c->blend_h = BF(dav1d_blend_h, avx2);
++
++ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
++ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
++
++ c->emu_edge = BF(dav1d_emu_edge, avx2);
++ c->resize = BF(dav1d_resize, avx2);
+ #endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+@@ -355,12 +355,12 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
+
+- c->avg = dav1d_avg_avx512icl;
+- c->w_avg = dav1d_w_avg_avx512icl;
+- c->mask = dav1d_mask_avx512icl;
+- c->w_mask[0] = dav1d_w_mask_444_avx512icl;
+- c->w_mask[1] = dav1d_w_mask_422_avx512icl;
+- c->w_mask[2] = dav1d_w_mask_420_avx512icl;
++ c->avg = BF(dav1d_avg, avx512icl);
++ c->w_avg = BF(dav1d_w_avg, avx512icl);
++ c->mask = BF(dav1d_mask, avx512icl);
++ c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
++ c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
++ c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
+ #endif
+ #endif
+ }
+diff --git a/src/x86/mc_sse.asm b/src/x86/mc_sse.asm
+index edbd1865..8b38daa7 100644
+--- a/src/x86/mc_sse.asm
++++ b/src/x86/mc_sse.asm
+@@ -113,13 +113,13 @@ cextern mc_subpel_filters
+ %endrep
+ %endmacro
+
+-BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128
+-BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
+-BIDIR_JMP_TABLE blend_ssse3, 4, 8, 16, 32
+-BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32
+-BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
++BIDIR_JMP_TABLE avg_8bpc_ssse3, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE w_avg_8bpc_ssse3, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE mask_8bpc_ssse3, 4, 8, 16, 32, 64, 128
++BIDIR_JMP_TABLE w_mask_420_8bpc_ssse3, 4, 8, 16, 16, 16, 16
++BIDIR_JMP_TABLE blend_8bpc_ssse3, 4, 8, 16, 32
++BIDIR_JMP_TABLE blend_v_8bpc_ssse3, 2, 4, 8, 16, 32
++BIDIR_JMP_TABLE blend_h_8bpc_ssse3, 2, 4, 8, 16, 16, 16, 16
+
+ %macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+@@ -131,15 +131,15 @@ BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
+ %endrep
+ %endmacro
+
+-%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_sse2.prep)
+-%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
+-%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep)
++%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep)
++%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put)
++%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep)
+
+ BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
+ BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
+
+ %macro HV_JMP_TABLE 5-*
+- %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
++ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+@@ -202,8 +202,8 @@ HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
+ %endmacro
+
+ %if ARCH_X86_64
+-SCALED_JMP_TABLE put_8tap_scaled_ssse3, 2, 4, 8, 16, 32, 64, 128
+-SCALED_JMP_TABLE prep_8tap_scaled_ssse3, 4, 8, 16, 32, 64, 128
++SCALED_JMP_TABLE put_8tap_scaled_8bpc_ssse3, 2, 4, 8, 16, 32, 64, 128
++SCALED_JMP_TABLE prep_8tap_scaled_8bpc_ssse3, 4, 8, 16, 32, 64, 128
+ %endif
+
+ %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+@@ -228,7 +228,7 @@ INIT_XMM ssse3
+ %endif
+ %endmacro
+
+-cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy
++cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ LEA t0, put_ssse3
+ movifnidn srcq, srcmp
+@@ -853,7 +853,7 @@ cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy
+ %define base 0
+ %endif
+
+-cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
++cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ LEA r6, prep%+SUFFIX
+ tzcnt wd, wm
+@@ -1450,7 +1450,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ %assign FILTER_SHARP (2*15 << 16) | 3*15
+
+ %macro FN 4 ; prefix, type, type_h, type_v
+-cglobal %1_%2
++cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+ %ifidn %3, %4
+ mov t1d, t0d
+@@ -1458,7 +1458,7 @@ cglobal %1_%2
+ mov t1d, FILTER_%4
+ %endif
+ %ifnidn %2, regular ; skip the jump in the last filter
+- jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
++ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+ %endif
+ %endmacro
+
+@@ -1488,7 +1488,7 @@ FN put_8tap, regular, REGULAR, REGULAR
+ %define base 0
+ %endif
+
+-cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
++cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ %assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+@@ -2739,7 +2739,7 @@ FN prep_8tap, regular, REGULAR, REGULAR
+ %define base_reg r7
+ %define base 0
+ %endif
+-cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
++cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ %assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+@@ -3920,26 +3920,26 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ %ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
+-cglobal put_8tap_scaled, 4, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
++cglobal put_8tap_scaled_8bpc, 4, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+-cglobal put_8tap_scaled, 4, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
++cglobal put_8tap_scaled_8bpc, 4, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+ %else
+ %assign isprep 1
+ %if required_stack_alignment <= STACK_ALIGNMENT
+-cglobal prep_8tap_scaled, 4, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
++cglobal prep_8tap_scaled_8bpc, 4, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %xdefine tmp_stridem r14q
+ %else
+-cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
++cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+0x138]
+ %endif
+ %xdefine base_reg r11
+ %define rndshift 6
+ %endif
+- LEA base_reg, %1_8tap_scaled_ssse3
+-%define base base_reg-%1_8tap_scaled_ssse3
++ LEA base_reg, %1_8tap_scaled_8bpc_ssse3
++%define base base_reg-%1_8tap_scaled_8bpc_ssse3
+ tzcnt wd, wm
+ movd m8, dxm
+ movd m14, mxm
+@@ -4001,7 +4001,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+- movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
++ movzx wd, word [base+%1_8tap_scaled_8bpc_ssse3_table+wq*2]
+ add wq, base_reg
+ jmp wq
+ %ifidn %1, put
+@@ -4557,7 +4557,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ mova [rsp+0x80], m4
+ jmp .vloop
+ .dy1:
+- movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
++ movzx wd, word [base+%1_8tap_scaled_8bpc_ssse3_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+ %ifidn %1, put
+@@ -5049,7 +5049,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ mova [rsp+0x80], m7
+ jmp .dy1_vloop
+ .dy2:
+- movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
++ movzx wd, word [base+%1_8tap_scaled_8bpc_ssse3_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+ %ifidn %1, put
+@@ -5522,10 +5522,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %endmacro
+
+ %macro BILIN_SCALED_FN 1
+-cglobal %1_bilin_scaled
++cglobal %1_bilin_scaled_8bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+- jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
++ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
+ %endmacro
+
+ %if ARCH_X86_64
+@@ -5719,15 +5719,15 @@ MC_8TAP_SCALED prep
+
+ %macro WARP_AFFINE_8X8T 0
+ %if ARCH_X86_64
+-cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts
++cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts
+ %else
+-cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
++cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts
+ %if copy_args
+ %define tmpm [esp+stack_size-4*1]
+ %define tsm [esp+stack_size-4*2]
+ %endif
+ %endif
+- call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main
++ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main
+ .loop:
+ %if ARCH_X86_32
+ %define m12 m4
+@@ -5768,24 +5768,24 @@ cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
+ mova [tmpq+tsq*0], m12
+ mova [tmpq+tsq*2], m14
+ dec counterd
+- jz mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end
++ jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end
+ %if ARCH_X86_32
+ mov tmpm, tmpd
+ mov r0, [esp+0x100]
+ mov r1, [esp+0x104]
+ %endif
+- call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2
++ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+ %endmacro
+
+ %macro WARP_AFFINE_8X8 0
+ %if ARCH_X86_64
+-cglobal warp_affine_8x8, 6, 14, 16, 0x90, \
++cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+ %else
+-cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
++cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+ %define alphaq r0
+@@ -6375,12 +6375,12 @@ DECLARE_REG_TMP 6, 7
+ add tmp2q, %1*mmsize
+ %endmacro
+
+-cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+- LEA r6, avg_ssse3_table
++cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
++ LEA r6, avg_8bpc_ssse3_table
+ tzcnt wd, wm ; leading zeros
+ movifnidn hd, hm ; move h(stack) to h(register) if not already that register
+ movsxd wq, dword [r6+wq*4] ; push table entry matching the tile width (tzcnt) in widen reg
+- mova m2, [pw_1024+r6-avg_ssse3_table] ; fill m2 with shift/align
++ mova m2, [pw_1024+r6-avg_8bpc_ssse3_table] ; fill m2 with shift/align
+ add wq, r6
+ BIDIR_FN AVG
+
+@@ -6406,14 +6406,14 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+
+ %define W_AVG_INC_PTR AVG_INC_PTR
+
+-cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+- LEA r6, w_avg_ssse3_table
++cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
++ LEA r6, w_avg_8bpc_ssse3_table
+ tzcnt wd, wm
+ movd m4, r6m
+ movifnidn hd, hm
+ pxor m0, m0
+ movsxd wq, dword [r6+wq*4]
+- mova m5, [pw_2048+r6-w_avg_ssse3_table]
++ mova m5, [pw_2048+r6-w_avg_8bpc_ssse3_table]
+ pshufb m4, m0
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+@@ -6460,14 +6460,14 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+ %endmacro
+
+ %if ARCH_X86_64
+-cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
++cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ movifnidn hd, hm
+ %else
+-cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
++cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
+ %define hd dword r5m
+ %endif
+-%define base r6-mask_ssse3_table
+- LEA r6, mask_ssse3_table
++%define base r6-mask_8bpc_ssse3_table
++ LEA r6, mask_8bpc_ssse3_table
+ tzcnt wd, wm
+ movsxd wq, dword [r6+wq*4]
+ pxor m4, m4
+@@ -6514,13 +6514,13 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
+ W_MASK_420_B (%1*16), %2
+ %endmacro
+
+-%define base r6-w_mask_420_ssse3_table
++%define base r6-w_mask_420_8bpc_ssse3_table
+ %if ARCH_X86_64
+ %define reg_pw_6903 m8
+ %define reg_pw_2048 m9
+ ; args: dst, stride, tmp1, tmp2, w, h, mask, sign
+-cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask
+- lea r6, [w_mask_420_ssse3_table]
++cglobal w_mask_420_8bpc, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask
++ lea r6, [w_mask_420_8bpc_ssse3_table]
+ mov wd, wm
+ tzcnt r7d, wd
+ movd m0, r7m ; sign
+@@ -6540,9 +6540,9 @@ cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask
+ %else
+ %define reg_pw_6903 [base+pw_6903]
+ %define reg_pw_2048 m3
+-cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
++cglobal w_mask_420_8bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+ tzcnt wd, wm
+- LEA r6, w_mask_420_ssse3_table
++ LEA r6, w_mask_420_8bpc_ssse3_table
+ movd m0, r7m ; sign
+ mov maskq, r6mp
+ mov wd, [r6+wq*4]
+@@ -6656,9 +6656,9 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+ BLEND_64M %1, %2, m2, m3
+ %endmacro
+
+-cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
+-%define base r6-blend_ssse3_table
+- LEA r6, blend_ssse3_table
++cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
++%define base r6-blend_8bpc_ssse3_table
++ LEA r6, blend_8bpc_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movifnidn maskq, maskmp
+@@ -6732,15 +6732,15 @@ cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
+ jg .w32
+ RET
+
+-cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
+-%define base r5-blend_v_ssse3_table
+- LEA r5, blend_v_ssse3_table
++cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
++%define base r5-blend_v_8bpc_ssse3_table
++ LEA r5, blend_v_8bpc_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r5+wq*4]
+ mova m5, [base+pw_512]
+ add wq, r5
+- add maskq, obmc_masks-blend_v_ssse3_table
++ add maskq, obmc_masks-blend_v_8bpc_ssse3_table
+ jmp wq
+ .w2:
+ movd m3, [maskq+4]
+@@ -6840,8 +6840,8 @@ cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
+ %endif
+ RET
+
+-cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
+-%define base t0-blend_h_ssse3_table
++cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
++%define base t0-blend_h_8bpc_ssse3_table
+ %if ARCH_X86_32
+ ; We need to keep the PIC pointer for w4, reload wd from stack instead
+ DECLARE_REG_TMP 6
+@@ -6849,7 +6849,7 @@ cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
+ DECLARE_REG_TMP 5
+ mov r6d, wd
+ %endif
+- LEA t0, blend_h_ssse3_table
++ LEA t0, blend_h_8bpc_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, dword [t0+wq*4]
+@@ -6954,7 +6954,7 @@ cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
+ ; bw, bh total filled size
+ ; iw, ih, copied block -> fill bottom, right
+ ; x, y, offset in bw/bh -> fill top, left
+-cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
++cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \
+ y, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+ ; we assume that the buffer (stride) is larger than width, so we can
+@@ -7317,13 +7317,13 @@ cextern resize_filter
+ %endmacro
+
+ %if ARCH_X86_64
+-cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \
++cglobal resize_8bpc, 0, 14, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ %elif STACK_ALIGNMENT >= 16
+-cglobal resize, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
++cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ %else
+-cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
++cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ %endif
+ movifnidn dstq, dstmp
+--
+GitLab
+
+
+From da299eb148a2a799411132166d32613d15586578 Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Mon, 21 Dec 2020 00:38:05 -0500
+Subject: [PATCH 2/7] x86: mc: Add AVX2 implementation of 8tap put/prep for
+ 16bpc
+
+Relative speed-ups over C code (compared with gcc-9.3.0):
+
+ C AVX2
+mc_8tap_regular_w2_0_16bpc: 146.2 17.4 8.40x
+mc_8tap_regular_w4_0_16bpc: 117.1 17.2 6.81x
+mc_8tap_regular_w8_0_16bpc: 128.9 17.2 7.49x
+mc_8tap_regular_w16_0_16bpc: 148.3 24.6 6.03x
+mc_8tap_regular_w32_0_16bpc: 233.3 55.4 4.21x
+mc_8tap_regular_w64_0_16bpc: 571.2 319.2 1.79x
+mc_8tap_regular_w128_0_16bpc: 1027.1 961.1 1.07x
+mc_8tap_regular_w2_h_16bpc: 378.0 36.9 10.24x
+mc_8tap_regular_w4_h_16bpc: 660.2 49.0 13.47x
+mc_8tap_regular_w8_h_16bpc: 1215.7 112.9 10.77x
+mc_8tap_regular_w16_h_16bpc: 2989.1 295.8 10.11x
+mc_8tap_regular_w32_h_16bpc: 8724.6 939.4 9.29x
+mc_8tap_regular_w64_h_16bpc: 29957.4 3296.8 9.09x
+mc_8tap_regular_w128_h_16bpc: 83043.1 9318.0 8.91x
+mc_8tap_regular_w2_hv_16bpc: 829.3 142.3 5.83x
+mc_8tap_regular_w4_hv_16bpc: 1508.8 168.5 8.95x
+mc_8tap_regular_w8_hv_16bpc: 2163.8 232.1 9.32x
+mc_8tap_regular_w16_hv_16bpc: 3710.8 595.0 6.24x
+mc_8tap_regular_w32_hv_16bpc: 10317.3 1814.8 5.69x
+mc_8tap_regular_w64_hv_16bpc: 33509.6 6120.2 5.48x
+mc_8tap_regular_w128_hv_16bpc: 91086.4 17263.0 5.28x
+mc_8tap_regular_w2_v_16bpc: 523.6 69.9 7.49x
+mc_8tap_regular_w4_v_16bpc: 769.4 68.8 11.18x
+mc_8tap_regular_w8_v_16bpc: 1292.3 92.7 13.94x
+mc_8tap_regular_w16_v_16bpc: 3078.1 242.5 12.69x
+mc_8tap_regular_w32_v_16bpc: 8706.3 731.1 11.91x
+mc_8tap_regular_w64_v_16bpc: 28948.7 2593.4 11.16x
+mc_8tap_regular_w128_v_16bpc: 79731.9 7838.7 10.17x
+
+mct_8tap_regular_w4_0_16bpc: 106.6 15.7 6.79x
+mct_8tap_regular_w8_0_16bpc: 120.1 24.7 4.86x
+mct_8tap_regular_w16_0_16bpc: 276.4 43.0 6.43x
+mct_8tap_regular_w32_0_16bpc: 940.5 171.7 5.48x
+mct_8tap_regular_w64_0_16bpc: 2238.6 485.7 4.61x
+mct_8tap_regular_w128_0_16bpc: 5529.9 1113.5 4.97x
+mct_8tap_regular_w4_h_16bpc: 394.6 36.2 10.90x
+mct_8tap_regular_w8_h_16bpc: 1121.1 125.4 8.94x
+mct_8tap_regular_w16_h_16bpc: 3706.9 383.0 9.68x
+mct_8tap_regular_w32_h_16bpc: 13628.9 1554.6 8.77x
+mct_8tap_regular_w64_h_16bpc: 31807.7 3727.5 8.53x
+mct_8tap_regular_w128_h_16bpc: 77388.9 9688.3 7.99x
+mct_8tap_regular_w4_hv_16bpc: 1099.5 118.1 9.31x
+mct_8tap_regular_w8_hv_16bpc: 2280.3 242.0 9.42x
+mct_8tap_regular_w16_hv_16bpc: 4510.8 718.0 6.28x
+mct_8tap_regular_w32_hv_16bpc: 15620.4 2853.5 5.47x
+mct_8tap_regular_w64_hv_16bpc: 35833.0 6572.0 5.45x
+mct_8tap_regular_w128_hv_16bpc: 85563.2 16550.1 5.17x
+mct_8tap_regular_w4_v_16bpc: 578.3 47.9 12.07x
+mct_8tap_regular_w8_v_16bpc: 1470.1 99.0 14.85x
+mct_8tap_regular_w16_v_16bpc: 4165.9 296.8 14.04x
+mct_8tap_regular_w32_v_16bpc: 14660.0 1142.4 12.83x
+mct_8tap_regular_w64_v_16bpc: 33287.1 2787.2 11.94x
+mct_8tap_regular_w128_v_16bpc: 79527.0 7323.4 10.86x
+---
+ src/x86/mc16_avx2.asm | 1553 +++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 1553 insertions(+)
+ create mode 100644 src/x86/mc16_avx2.asm
+
+diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm
+new file mode 100644
+index 00000000..ea6cfdbf
+--- /dev/null
++++ b/src/x86/mc16_avx2.asm
+@@ -0,0 +1,1553 @@
++; Copyright (c) 2017-2020, The rav1e contributors
++; Copyright (c) 2020, Nathan Egge
++; All rights reserved.
++;
++; This source code is subject to the terms of the BSD 2 Clause License and
++; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
++; was not distributed with this source code in the LICENSE file, you can
++; obtain it at www.aomedia.org/license/software. If the Alliance for Open
++; Media Patent License 1.0 was not distributed with this source code in the
++; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
++
++%include "config.asm"
++%include "ext/x86/x86inc.asm"
++
++%if ARCH_X86_64
++
++SECTION_RODATA 32
++
++spf_h_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
++ db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
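++; constant naming (descriptive note): pq_* are 64-bit shift counts, pd_* are dword
++; rounding offsets, nd_* are rounding offsets with the 8192 intermediate bias
++; (pw_8192) already folded in for the prep paths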
++pq_2: dq (6 - 4)
++pq_4: dq (6 - 2)
++pq_8: dq (6 + 2)
++pq_10: dq (6 + 4)
++pd_32: dd (1 << 6 >> 1)
++pd_34: dd (1 << 6 >> 1) + (1 << (6 - 4) >> 1)
++pd_40: dd (1 << 6 >> 1) + (1 << (6 - 2) >> 1)
++pd_2: dd (1 << (6 - 4) >> 1)
++pd_512: dd (1 << (6 + 4) >> 1)
++pd_8: dd (1 << (6 - 2) >> 1)
++pd_128: dd (1 << (6 + 2) >> 1)
++nd_524256: dd (1 << 6 >> 1) - (8192 << 6)
++nd_32766: dd (1 << (6 - 4) >> 1) - (8192 << (6 - 4))
++nd_131064: dd (1 << (6 - 2) >> 1) - (8192 << (6 - 2))
++pw_8192: dw 8192
++
++SECTION .text
++
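++; PUT_4TAP_H: 4-tap horizontal filter of rows %1/%2 using shuffle %3 and filter %4;
++; adds rounding %5 and shifts right by %6, leaving dword results in %1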
++%macro PUT_4TAP_H 6
++ pshufb %1, %3
++ pshufb %2, %3
++ pmaddwd %1, %4
++ pmaddwd %2, %4
++ phaddd %1, %2
++ paddd %1, %5
++ psrad %1, %6
++%endm
++
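++; PUT_8TAP_H: 8-tap horizontal filter of 4 pixels from each of rows srcq and
++; srcq+ssq, starting at byte offset %8 (%5 filter, %6 rounding, %7 shift)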
++%macro PUT_8TAP_H 8
++ movu xm%1, [srcq + %8 + 0]
++ movu xm%3, [srcq + %8 + 2]
++ vinserti128 m%1, [srcq + ssq + %8 + 0], 1
++ vinserti128 m%3, [srcq + ssq + %8 + 2], 1
++ movu xm%2, [srcq + %8 + 4]
++ movu xm%4, [srcq + %8 + 6]
++ vinserti128 m%2, [srcq + ssq + %8 + 4], 1
++ vinserti128 m%4, [srcq + ssq + %8 + 6], 1
++ pmaddwd m%1, %5
++ pmaddwd m%3, %5
++ pmaddwd m%2, %5
++ pmaddwd m%4, %5
++ phaddd m%1, m%3
++ phaddd m%2, m%4
++ phaddd m%1, m%2
++ paddd m%1, %6
++ psrad m%1, %7
++%endm
++
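++; PUT_4TAP_HS1/PUT_4TAP_HS2: 4-tap horizontal passes that pack the intermediates
++; back to signed words for the following vertical pass (hv paths)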
++%macro PUT_4TAP_HS1 5
++ pshufb %1, %2
++ pmaddwd %1, %3
++ phaddd %1, %1
++ paddd %1, %4
++ psrad %1, %5
++ packssdw %1, %1
++%endm
++
++%macro PUT_4TAP_HS2 6
++ pshufb %1, %3
++ pshufb %2, %3
++ pmaddwd %1, %4
++ pmaddwd %2, %4
++ phaddd %1, %1
++ phaddd %2, %2
++ paddd %1, %5
++ paddd %2, %5
++ psrad %1, %6
++ psrad %2, %6
++ packssdw %1, %1
++ packssdw %2, %2
++%endm
++
++%macro PUT_8TAP_HS 7-8
++ movu xm%1, [srcq + %7 + 0]
++ movu xm%3, [srcq + %7 + 2]
++ vinserti128 m%1, [srcq + %7 + 8], 1
++ vinserti128 m%3, [srcq + %7 + 10], 1
++ pmaddwd m%1, %4
++ pmaddwd m%3, %4
++ phaddd m%1, m%3
++ movu xm%2, [srcq + %7 + 4]
++ movu xm%3, [srcq + %7 + 6]
++ vinserti128 m%2, [srcq + %7 + 12], 1
++ vinserti128 m%3, [srcq + %7 + 14], 1
++ pmaddwd m%2, %4
++ pmaddwd m%3, %4
++ phaddd m%2, m%3
++%if %0 > 7
++ vpbroadcastd %5, %8
++%endif
++ phaddd m%1, m%2
++ paddd m%1, %5
++ psrad m%1, %6
++ packssdw m%1, m%1
++%endm
++
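++; LOAD_REGS_2/LOAD_REGS_3: load rows and advance srcq; the 'u' variant applies a
++; vpermq lane shuffle used by the 256-bit vertical loops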
++%macro LOAD_REGS_2 3
++ mov%1 xm%2, [srcq + ssq*0]
++ mov%1 xm%3, [srcq + ssq*1]
++%ifidn %1, u
++ vpermq m%2, m%2, q3120
++ vpermq m%3, m%3, q3120
++%endif
++ lea srcq, [srcq + ssq*2]
++%endm
++
++%macro LOAD_REGS_3 4
++ mov%1 xm%2, [srcq + ssq*0]
++ mov%1 xm%3, [srcq + ssq*1]
++ mov%1 xm%4, [srcq + ssq*2]
++%ifidn %1, u
++ vpermq m%2, m%2, q3120
++ vpermq m%3, m%3, q3120
++ vpermq m%4, m%4, q3120
++%endif
++ add srcq, ss3q
++%endm
++
++%macro LOAD_REGS 3-8
++%if %0 == 3
++ LOAD_REGS_2 %1, %2, %3
++%elif %0 == 4
++ LOAD_REGS_3 %1, %2, %3, %4
++%elif %0 == 5
++ LOAD_REGS_2 %1, %2, %3
++ LOAD_REGS_2 %1, %4, %5
++%elif %0 == 6
++ LOAD_REGS_3 %1, %2, %3, %4
++  LOAD_REGS_2 %1, %5, %6
++%elif %0 == 7
++ LOAD_REGS_3 %1, %2, %3, %4
++ LOAD_REGS_3 %1, %5, %6, %7
++%else
++ LOAD_REGS_3 %1, %2, %3, %4
++ LOAD_REGS_2 %1, %5, %6
++ LOAD_REGS_2 %1, %7, %8
++%endif
++%endm
++
++%macro STORE_REGS 3
++%ifidn %1, u
++ vpermq m%2, m%2, q3120
++ vpermq m%3, m%3, q3120
++%endif
++ mov%1 [dstq + dsq*0], xm%2
++ mov%1 [dstq + dsq*1], xm%3
++ lea dstq, [dstq + dsq*2]
++%endm
++
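++; interleave successive rows (punpckl%1) so pmaddwd can apply a pair of vertical
++; taps per multiply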
++%macro INTERLEAVE_REGS 4-8
++ punpckl%1 %2, %3
++ punpckl%1 %3, %4
++%if %0 > 4
++ punpckl%1 %4, %5
++ punpckl%1 %5, %6
++%endif
++%if %0 > 6
++ punpckl%1 %6, %7
++ punpckl%1 %7, %8
++%endif
++%endm
++
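++; MUL_ADD_R starts vertical accumulators %1/%2 from the interleaved rows %3-%6 with
++; tap pairs %7/%8 and slides the row window; MUL_ACC_R below accumulates one
++; further tap pair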
++%macro MUL_ADD_R 8
++ pmaddwd %3, %7
++ pmaddwd %1, %5, %8
++ paddd %1, %3
++ mova %3, %5
++ pmaddwd %4, %7
++ pmaddwd %2, %6, %8
++ paddd %2, %4
++ mova %4, %6
++%endm
++
++%macro MUL_ACC_R 7
++ pmaddwd %3, %5, %7
++ pmaddwd %4, %6, %7
++ paddd %1, %3
++ paddd %2, %4
++ mova %3, %5
++ mova %4, %6
++%endm
++
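++; output epilogues: RND_SHR_MIN_R rounds, shifts, packs unsigned and clamps to the
++; pixel max (put); RND_SHR_R rounds, shifts and packs signed (prep)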
++%macro RND_SHR_MIN_R 5
++ paddd %1, %3
++ paddd %2, %3
++ psrad %1, %4
++ psrad %2, %4
++ packusdw %1, %1
++ packusdw %2, %2
++ pminuw %1, %5
++ pminuw %2, %5
++%endm
++
++%macro RND_SHR_R 4
++ paddd %1, %3
++ paddd %2, %3
++ psrad %1, %4
++ psrad %2, %4
++ packssdw %1, %1
++ packssdw %2, %2
++%endm
++
++; int8_t subpel_filters[5][15][8]
++%assign FILTER_REGULAR (0*15 << 7) | 3*15
++%assign FILTER_SMOOTH (1*15 << 7) | 4*15
++%assign FILTER_SHARP (2*15 << 7) | 3*15
++
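++; each generated entry point stores its packed h/v filter type in t0d/t1d and jumps
++; to the shared put/prep_8tap_16bpc body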
++%macro make_8tap_fn 4 ; type, op, type_h, type_v
++INIT_XMM avx2
++cglobal %1_8tap_%2_16bpc
++ mov t0d, FILTER_%3
++ mov t1d, FILTER_%4
++ jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX)
++%endmacro
++
++cextern mc_subpel_filters
++
++%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
++
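++; filter_fn is expanded once for put and once for prep (see the instantiations at
++; the end of this file)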
++%macro filter_fn 1
++
++%if WIN64
++%ifidn %1, put
++DECLARE_REG_TMP 5, 4
++%else
++DECLARE_REG_TMP 4, 5
++%endif
++%else
++DECLARE_REG_TMP 7, 8
++%endif
++
++make_8tap_fn %1, regular, REGULAR, REGULAR
++make_8tap_fn %1, regular_smooth, REGULAR, SMOOTH
++make_8tap_fn %1, regular_sharp, REGULAR, SHARP
++make_8tap_fn %1, smooth, SMOOTH, SMOOTH
++make_8tap_fn %1, smooth_regular, SMOOTH, REGULAR
++make_8tap_fn %1, smooth_sharp, SMOOTH, SHARP
++make_8tap_fn %1, sharp, SHARP, SHARP
++make_8tap_fn %1, sharp_regular, SHARP, REGULAR
++make_8tap_fn %1, sharp_smooth, SHARP, SMOOTH
++
++INIT_YMM avx2
++%ifidn %1, put
++cglobal put_8tap_16bpc, 4, 10, 16, dst, ds, src, ss, _w, h, mx, my, bdmax, ss3
++%else
++cglobal prep_8tap_16bpc, 3, 10, 16, dst, src, ss, _w, h, mx, my, bdmax, ds, ss3
++%endif
++
++%ifidn %1, put
++ imul mxd, mxm, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0)
++ add mxd, t0d
++ imul myd, mym, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0)
++ add myd, t1d
++%else
++ imul myd, mym, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0)
++ add myd, t1d
++ imul mxd, mxm, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0)
++ add mxd, t0d
++%endif
++
++ movsxd _wq, _wm
++ movifnidn hd, hm
++
++%ifidn %1, put
++ vpbroadcastw m7, bdmaxm
++%else
++ lea dsq, [_wq*2]
++%endif
++
++ test mxd, (0x7f << 14)
++ jnz .%1_8tap_h_16bpc
++ test myd, (0x7f << 14)
++ jnz .%1_8tap_v_16bpc
++
++; ---- {put,prep}_16bpc ----
++
++INIT_XMM avx2
++.%1_16bpc: ; cglobal put_16bpc, 6, 8, 8, dst, ds, src, ss, w, h
++
++%ifidn %1, prep
++INIT_YMM avx2
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastq m8, [pq_4]
++ vpbroadcastw m9, [pw_8192]
++ cmp bdmaxd, 12
++ jne .prep_bits10
++ vpbroadcastq m8, [pq_2]
++.prep_bits10:
++INIT_XMM avx2
++%endif
++
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, jr, ss3
++%else
++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, jr, ds, ss3
++%endif
++
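++; dispatch on log2(width); the put table also has a w2 entry, hence the different
++; index bias for put vs prep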
++ lea jrq, [.jmp_tbl]
++ tzcnt _wd, _wm
++%ifidn %1, put
++ sub _wd, 1
++%else
++ sub _wd, 2
++%endif
++ movsxd _wq, [jrq + _wq*4]
++ add _wq, jrq
++ jmp _wq
++
++%ifidn %1, put
++.w2: ; 2xN
++ movd m0, [srcq]
++ movd m1, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++ movd [dstq], m0
++ movd [dstq + dsq], m1
++ lea dstq, [dstq + dsq*2]
++ sub hd, 2
++ jg .w2
++ RET
++%endif
++
++.w4: ; 4xN
++ movq m0, [srcq]
++ movq m1, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++%ifidn %1, prep
++ psllw m0, m8
++ psllw m1, m8
++ psubw m0, m9
++ psubw m1, m9
++%endif
++ movq [dstq], m0
++ movq [dstq + dsq], m1
++ lea dstq, [dstq + dsq*2]
++ sub hd, 2
++ jg .w4
++ RET
++
++ ; XXX is unaligned input (but aligned output) a hard requirement, or is checkasm broken?
++.w8: ; 8xN
++ movu m0, [srcq]
++ movu m1, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++%ifidn %1, prep
++ psllw m0, m8
++ psllw m1, m8
++ psubw m0, m9
++ psubw m1, m9
++%endif
++ mova [dstq], m0
++ mova [dstq + dsq], m1
++ lea dstq, [dstq + dsq*2]
++ sub hd, 2
++ jg .w8
++ RET
++
++INIT_YMM avx2
++.w16: ; 16xN
++ movu m0, [srcq]
++ movu m1, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++%ifidn %1, prep
++ psllw m0, xm8
++ psllw m1, xm8
++ psubw m0, m9
++ psubw m1, m9
++%endif
++ mova [dstq], m0
++ mova [dstq + dsq], m1
++ lea dstq, [dstq + dsq*2]
++ sub hd, 2
++ jg .w16
++ RET
++
++.w32: ; 32xN
++ movu m0, [srcq + 32*0]
++ movu m1, [srcq + 32*1]
++ movu m2, [srcq + ssq]
++ movu m3, [srcq + ssq + 32*1]
++ lea srcq, [srcq + ssq*2]
++%ifidn %1, prep
++ psllw m0, xm8
++ psllw m1, xm8
++ psllw m2, xm8
++ psllw m3, xm8
++ psubw m0, m9
++ psubw m1, m9
++ psubw m2, m9
++ psubw m3, m9
++%endif
++ mova [dstq + 32*0], m0
++ mova [dstq + 32*1], m1
++ mova [dstq + dsq + 32*0], m2
++ mova [dstq + dsq + 32*1], m3
++ lea dstq, [dstq + dsq*2]
++ sub hd, 2
++ jg .w32
++ RET
++
++.w64: ; 64xN
++ movu m0, [srcq + 32*0]
++ movu m1, [srcq + 32*1]
++ movu m2, [srcq + 32*2]
++ movu m3, [srcq + 32*3]
++ movu m4, [srcq + ssq + 32*0]
++ movu m5, [srcq + ssq + 32*1]
++ movu m6, [srcq + ssq + 32*2]
++ movu m7, [srcq + ssq + 32*3]
++ lea srcq, [srcq + ssq*2]
++%ifidn %1, prep
++ psllw m0, xm8
++ psllw m1, xm8
++ psllw m2, xm8
++ psllw m3, xm8
++ psllw m4, xm8
++ psllw m5, xm8
++ psllw m6, xm8
++ psllw m7, xm8
++ psubw m0, m9
++ psubw m1, m9
++ psubw m2, m9
++ psubw m3, m9
++ psubw m4, m9
++ psubw m5, m9
++ psubw m6, m9
++ psubw m7, m9
++%endif
++ mova [dstq + 32*0], m0
++ mova [dstq + 32*1], m1
++ mova [dstq + 32*2], m2
++ mova [dstq + 32*3], m3
++ mova [dstq + dsq + 32*0], m4
++ mova [dstq + dsq + 32*1], m5
++ mova [dstq + dsq + 32*2], m6
++ mova [dstq + dsq + 32*3], m7
++ lea dstq, [dstq + dsq*2]
++ sub hd, 2
++ jg .w64
++ RET
++
++.w128: ; 128xN
++ movu m0, [srcq + 32*0]
++ movu m1, [srcq + 32*1]
++ movu m2, [srcq + 32*2]
++ movu m3, [srcq + 32*3]
++ movu m4, [srcq + 32*4]
++ movu m5, [srcq + 32*5]
++ movu m6, [srcq + 32*6]
++ movu m7, [srcq + 32*7]
++ add srcq, ssq
++%ifidn %1, prep
++ psllw m0, xm8
++ psllw m1, xm8
++ psllw m2, xm8
++ psllw m3, xm8
++ psllw m4, xm8
++ psllw m5, xm8
++ psllw m6, xm8
++ psllw m7, xm8
++ psubw m0, m9
++ psubw m1, m9
++ psubw m2, m9
++ psubw m3, m9
++ psubw m4, m9
++ psubw m5, m9
++ psubw m6, m9
++ psubw m7, m9
++%endif
++ mova [dstq + 32*0], m0
++ mova [dstq + 32*1], m1
++ mova [dstq + 32*2], m2
++ mova [dstq + 32*3], m3
++ mova [dstq + 32*4], m4
++ mova [dstq + 32*5], m5
++ mova [dstq + 32*6], m6
++ mova [dstq + 32*7], m7
++ add dstq, dsq
++ dec hd
++ jg .w128
++ RET
++
++.jmp_tbl:
++%ifidn %1, put
++ dd .w2 - .jmp_tbl
++%endif
++ dd .w4 - .jmp_tbl
++ dd .w8 - .jmp_tbl
++ dd .w16 - .jmp_tbl
++ dd .w32 - .jmp_tbl
++ dd .w64 - .jmp_tbl
++ dd .w128 - .jmp_tbl
++
++; ---- {put,prep}_8tap_h_16bpc ----
++
++INIT_XMM avx2
++.%1_8tap_h_16bpc: ; cglobal put_8tap_h_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, bdmax
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, bdmax, ss3
++%else
++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, bdmax, ds, ss3
++%endif
++
++ cmp _wd, 4
++ jle .h_use4tap
++ shr mxd, 7
++.h_use4tap:
++ and mxd, 0x7f
++
++ test myd, (0x7f << 14)
++ jnz .%1_8tap_hv_16bpc
++
++INIT_YMM avx2
++ popcnt bdmaxd, bdmaxm
++%ifidn %1, put
++ vpbroadcastd m6, [pd_34] ; (1 << 6 >> 1) + (1 << (6 - 4) >> 1)
++%else
++ vpbroadcastd m6, [nd_32766] ; (1 << (6 - 4) >> 1) - (8192 << (6 - 4))
++ vpbroadcastq m7, [pq_2] ; (6 - 4)
++%endif
++ cmp bdmaxd, 12
++ jne .h_bits10
++%ifidn %1, put
++ vpbroadcastd m6, [pd_40] ; (1 << 6 >> 1) + (1 << (6 - 2) >> 1)
++%else
++ vpbroadcastd m6, [nd_131064] ; (1 << (6 - 2) >> 1) - (8192 << (6 - 2))
++ vpbroadcastq m7, [pq_4] ; (6 - 2)
++%endif
++.h_bits10:
++INIT_XMM avx2
++
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, w2, jr, ss3
++%else
++ DEFINE_ARGS dst, src, ss, _w, h, mx, w2, jr, ds, ss3
++%endif
++
++ lea w2q, [_wq*2]
++
++ lea jrq, [.h_jmp_tbl]
++ tzcnt _wd, _wm
++%ifidn %1, put
++ sub _wd, 1
++%else
++ sub _wd, 2
++%endif
++ movsxd _wq, [jrq + _wq*4]
++ add _wq, jrq
++ jmp _wq
++
++%ifidn %1, put
++.h_w2:
++ sub srcq, 2
++ mova xm4, [spf_h_shuf]
++ vpbroadcastd m5, [jrq - .h_jmp_tbl + subpel_filters + mxq*8 + 2]
++ vpmovsxbw m5, m5
++
++.h_w2l:
++ movu m0, [srcq]
++ movu m1, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++
++%ifidn %1, put
++ PUT_4TAP_H m0, m1, m4, m5, m6, 6
++ packusdw m0, m0
++ pminuw m0, m7
++%else
++ PUT_4TAP_H m0, m1, m4, m5, m6, m7
++ packssdw m0, m1
++%endif
++
++ movd [dstq], m0
++ pextrd [dstq + dsq], m0, 1
++ lea dstq, [dstq + dsq*2]
++
++ sub hd, 2
++ jg .h_w2l
++ RET
++%endif
++
++INIT_YMM avx2
++.h_w4:
++ sub srcq, 2
++ mova m4, [spf_h_shuf]
++ vpbroadcastd xm5, [jrq - .h_jmp_tbl + subpel_filters + mxq*8 + 2]
++ vpmovsxbw m5, xm5
++
++.h_w4l:
++ vbroadcasti128 m0, [srcq]
++ vbroadcasti128 m1, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++
++%ifidn %1, put
++ PUT_4TAP_H m0, m1, m4, m5, m6, 6
++ packusdw m0, m0
++ pminuw m0, m7
++%else
++ PUT_4TAP_H m0, m1, m4, m5, m6, xm7
++ packssdw m0, m0
++%endif
++
++ vextracti128 xm1, m0, 1
++ movd [dstq], xm0
++ movd [dstq + 4], xm1
++ pextrd [dstq + dsq], xm0, 1
++ pextrd [dstq + dsq + 4], xm1, 1
++ lea dstq, [dstq + dsq*2]
++
++ sub hd, 2
++ jg .h_w4l
++ RET
++
++.h_w8:
++ sub srcq, 6
++ vpbroadcastq xm5, [jrq - .h_jmp_tbl + subpel_filters + mxq*8]
++ vpmovsxbw m5, xm5
++
++.h_w8l:
++ mov _wd, w2d
++
++.h_w8c:
++%ifidn %1, put
++ PUT_8TAP_H 0, 1, 2, 3, m5, m6, 6, 4*0
++ PUT_8TAP_H 1, 2, 3, 4, m5, m6, 6, 4*2
++ packusdw m0, m1
++ pminuw m0, m7
++%else
++ PUT_8TAP_H 0, 1, 2, 3, m5, m6, xm7, 4*0
++ PUT_8TAP_H 1, 2, 3, 4, m5, m6, xm7, 4*2
++ packssdw m0, m1
++%endif
++ add srcq, 8*2
++
++ mova [dstq], xm0
++ vextracti128 [dstq + dsq], m0, 1
++
++ add dstq, 8*2
++ sub _wd, 8*2
++ jg .h_w8c
++
++ sub srcq, w2q
++ sub dstq, w2q
++ lea srcq, [srcq + ssq*2]
++ lea dstq, [dstq + dsq*2]
++ sub hd, 2
++ jg .h_w8l
++ RET
++
++.h_jmp_tbl:
++%ifidn %1, put
++ dd .h_w2 - .h_jmp_tbl
++%endif
++ dd .h_w4 - .h_jmp_tbl
++ dd .h_w8 - .h_jmp_tbl
++ dd .h_w8 - .h_jmp_tbl
++ dd .h_w8 - .h_jmp_tbl
++ dd .h_w8 - .h_jmp_tbl
++ dd .h_w8 - .h_jmp_tbl
++
++; ---- {put,prep}_8tap_v_16bpc ----
++
++INIT_XMM avx2
++.%1_8tap_v_16bpc: ; cglobal put_8tap_v_16bpc, 4, 9, 0, dst, ds, src, ss, _w, h, mx, my, bdmax, ss3
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, bdmax, ss3
++%else
++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, bdmax, ds, ss3
++%endif
++
++ cmp hd, 4
++ jle .v_use4tap
++ shr myd, 7
++.v_use4tap:
++ and myd, 0x7f
++
++INIT_YMM avx2
++%ifidn %1, put
++ vpbroadcastd m6, [pd_32] ; (1 << 6 >> 1)
++%else
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastd m6, [nd_32766] ; (1 << (6 - 4) >> 1) - (8192 << (6 - 4))
++ vpbroadcastq m7, [pq_2] ; (6 - 4)
++ cmp bdmaxd, 12
++ jne .v_bits10
++ vpbroadcastd m6, [nd_131064] ; (1 << (6 - 2) >> 1) - (8192 << (6 - 2))
++ vpbroadcastq m7, [pq_4] ; (6 - 2)
++.v_bits10:
++%endif
++INIT_XMM avx2
++
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, w2, my, jr, ss3
++%else
++ DEFINE_ARGS dst, src, ss, _w, h, w2, my, jr, ds, ss3
++%endif
++
++ lea jrq, [.v_jmp_tbl]
++ lea w2q, [_wq*2]
++ lea ss3q, [ssq*3]
++
++INIT_YMM avx2
++ lea myq, [jrq - .v_jmp_tbl + subpel_filters + myq*8]
++ vpbroadcastw m8, [myq+0]
++ vpbroadcastw m9, [myq+2]
++ vpbroadcastw m10, [myq+4]
++ vpbroadcastw m11, [myq+6]
++ vpmovsxbw m8, xm8
++ vpmovsxbw m9, xm9
++ vpmovsxbw m10, xm10
++ vpmovsxbw m11, xm11
++INIT_XMM avx2
++
++ tzcnt _wd, _wm
++%ifidn %1, put
++ sub _wd, 1
++%else
++ sub _wd, 2
++%endif
++ movsxd _wq, [jrq + _wq*4]
++ add _wq, jrq
++ jmp _wq
++
++%ifidn %1, put
++.v_w2:
++
++ cmp hd, 4
++ jg .v_w28
++
++ sub srcq, ssq
++ LOAD_REGS d, 0, 1, 2
++ INTERLEAVE_REGS wd, m0, m1, m2
++
++.v_w2l: ; 2x2, 2x4
++
++ LOAD_REGS d, 3, 4
++ INTERLEAVE_REGS wd, m2, m3, m4
++
++ MUL_ADD_R m5, m8, m0, m1, m2, m3, m9, m10
++ mova m2, m4
++
++ RND_SHR_MIN_R m5, m8, m6, 6, m7
++ STORE_REGS d, 5, 8
++
++ sub hd, 2
++ jg .v_w2l
++ RET
++
++.v_w28:
++
++ sub srcq, ss3q
++ LOAD_REGS d, 0, 1, 2, 3, 4, 12, 13
++ INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m12, m13
++
++.v_w28l: ; 2x6, 2x8, 2x12, 2x16, 2x24, 2x32
++
++ sub srcq, ssq
++ LOAD_REGS d, 13, 14, 15
++ INTERLEAVE_REGS wd, m13, m14, m15
++
++ MUL_ADD_R m5, m15, m0, m1, m2, m3, m8, m9
++ MUL_ACC_R m5, m15, m2, m3, m4, m12, m10
++ MUL_ACC_R m5, m15, m4, m12, m13, m14, m11
++
++ RND_SHR_MIN_R m5, m15, m6, 6, m7
++ STORE_REGS d, 5, 15
++
++ sub hd, 2
++ jg .v_w28l
++ RET
++%endif
++
++.v_w4:
++
++ cmp hd, 4
++ jg .v_w48
++
++ sub srcq, ssq
++ LOAD_REGS q, 0, 1, 2
++ INTERLEAVE_REGS wd, m0, m1, m2
++
++.v_w4l: ; 4x2 4x4
++
++ LOAD_REGS q, 3, 4
++ INTERLEAVE_REGS wd, m2, m3, m4
++
++ MUL_ADD_R m5, m8, m0, m1, m2, m3, m9, m10
++ mova m2, m4
++
++%ifidn %1, put
++ RND_SHR_MIN_R m5, m8, m6, 6, m7
++%else
++ RND_SHR_R m5, m8, m6, m7
++%endif
++ STORE_REGS q, 5, 8
++
++ sub hd, 2
++ jg .v_w4l
++ RET
++
++.v_w48:
++
++ sub srcq, ss3q
++ LOAD_REGS q, 0, 1, 2, 3, 4, 12, 13
++ INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m12, m13
++
++.v_w48l: ; 4x6, 4x8, 4x12, 4x16, 4x24, 4x32
++
++ sub srcq, ssq
++ LOAD_REGS q, 13, 14, 15
++ INTERLEAVE_REGS wd, m13, m14, m15
++
++ MUL_ADD_R m5, m15, m0, m1, m2, m3, m8, m9
++ MUL_ACC_R m5, m15, m2, m3, m4, m12, m10
++ MUL_ACC_R m5, m15, m4, m12, m13, m14, m11
++
++%ifidn %1, put
++ RND_SHR_MIN_R m5, m15, m6, 6, m7
++%else
++ RND_SHR_R m5, m15, m6, m7
++%endif
++ STORE_REGS q, 5, 15
++
++ sub hd, 2
++ jg .v_w48l
++
++ RET
++
++INIT_YMM avx2
++.v_w8:
++
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, oh, h, w2, tdst, tsrc, ss3
++%elifidn %1, prep
++ DEFINE_ARGS dst, src, ss, oh, h, w2, tdst, tsrc, ds, ss3
++%endif
++
++ mov ohd, hd
++ mov tdstq, dstq
++
++ cmp hd, 4
++ jg .v_w88
++
++ sub srcq, ssq
++ mov tsrcq, srcq
++
++.v_w8l: ; N = 8, 16, 32, 64, 128
++
++ LOAD_REGS u, 0, 1, 2
++ INTERLEAVE_REGS wd, m0, m1, m2
++
++.v_w8c: ; Nx2, Nx4
++
++ LOAD_REGS u, 3, 4
++ INTERLEAVE_REGS wd, m2, m3, m4
++
++ MUL_ADD_R m5, m8, m0, m1, m2, m3, m9, m10
++ mova m2, m4
++
++%ifidn %1, put
++ RND_SHR_MIN_R m5, m8, m6, 6, m7
++%else
++ RND_SHR_R m5, m8, m6, xm7
++%endif
++ STORE_REGS u, 5, 8
++
++ sub hd, 2
++ jg .v_w8c
++
++ add tdstq, 2*8
++ add tsrcq, 2*8
++ mov hd, ohd
++ mov dstq, tdstq
++ mov srcq, tsrcq
++ sub w2d, 2*8
++ jg .v_w8l
++
++ RET
++
++.v_w88:
++
++ sub srcq, ss3q
++ mov tsrcq, srcq
++
++.v_w88l: ; N = 8, 16, 32, 64, 128
++
++ LOAD_REGS u, 0, 1, 2, 3, 4, 12, 13
++ INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m12, m13
++
++.v_w88c: ; Nx6, Nx8, Nx12, Nx16, Nx24, Nx32
++
++ sub srcq, ssq
++
++ LOAD_REGS u, 13, 14, 15
++ INTERLEAVE_REGS wd, m13, m14, m15
++
++ MUL_ADD_R m5, m15, m0, m1, m2, m3, m8, m9
++ MUL_ACC_R m5, m15, m2, m3, m4, m12, m10
++ MUL_ACC_R m5, m15, m4, m12, m13, m14, m11
++
++%ifidn %1, put
++ RND_SHR_MIN_R m5, m15, m6, 6, m7
++%else
++ RND_SHR_R m5, m15, m6, xm7
++%endif
++ STORE_REGS u, 5, 15
++
++ sub hd, 2
++ jg .v_w88c
++
++ add tdstq, 2*8
++ add tsrcq, 2*8
++ mov hd, ohd
++ mov dstq, tdstq
++ mov srcq, tsrcq
++ sub w2d, 2*8
++ jg .v_w88l
++
++ RET
++
++.v_jmp_tbl:
++%ifidn %1, put
++ dd .v_w2 - .v_jmp_tbl
++%endif
++ dd .v_w4 - .v_jmp_tbl
++ dd .v_w8 - .v_jmp_tbl
++ dd .v_w8 - .v_jmp_tbl
++ dd .v_w8 - .v_jmp_tbl
++ dd .v_w8 - .v_jmp_tbl
++ dd .v_w8 - .v_jmp_tbl
++
++; ---- {put,prep}_8tap_hv_16bpc ----
++
++INIT_XMM avx2
++.%1_8tap_hv_16bpc: ; cglobal put_8tap_hv_16bpc, 4, 9, 0, dst, ds, src, ss, _w, h, mx, my, bdmax, ss3
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, bdmax, ss3
++%elifidn %1, prep
++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, bdmax, ds, ss3
++%endif
++
++ cmp hd, 4
++ jle .hv_use4tap
++ shr myd, 7
++.hv_use4tap:
++ and myd, 0x7f
++
++INIT_YMM avx2
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastd m6, [pd_2] ; (1 << (6 - 4) >> 1)
++ movq xm13, [pq_2] ; 6 - 4
++%ifidn %1, put
++ vpbroadcastd m14, [pd_512] ; (1 << (6 + 4) >> 1)
++ movq xm15, [pq_10] ; 6 + 4
++%else
++ vpbroadcastd m14, [nd_524256] ; (1 << 6 >> 1) - (8192 << 6)
++%endif
++ cmp bdmaxd, 12
++ jne .hv_bits10
++ vpbroadcastd m6, [pd_8] ; (1 << (6 - 2) >> 1)
++ movq xm13, [pq_4] ; 6 - 2
++%ifidn %1, put
++ vpbroadcastd m14, [pd_128] ; (1 << (6 + 2) >> 1)
++ movq xm15, [pq_8] ; 6 + 2
++%endif
++.hv_bits10:
++INIT_XMM avx2
++
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, jr, ss3
++%elifidn %1, prep
++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, jr, ds, ss3
++%endif
++
++ lea jrq, [.hv_jmp_tbl]
++
++INIT_YMM avx2
++ lea ss3q, [jrq - .hv_jmp_tbl + subpel_filters + myq*8]
++ vpbroadcastw xm8, [ss3q]
++ vpbroadcastw xm9, [ss3q + 2]
++ vpbroadcastw xm10, [ss3q + 4]
++ vpbroadcastw xm11, [ss3q + 6]
++ vpmovsxbw m8, xm8
++ vpmovsxbw m9, xm9
++ vpmovsxbw m10, xm10
++ vpmovsxbw m11, xm11
++INIT_XMM avx2
++
++  ; Width is needed for filters 8 and larger, see .hv_w8
++ mov ss3q, _wq
++
++ tzcnt _wd, _wm
++%ifidn %1, put
++ sub _wd, 1
++%else
++ sub _wd, 2
++%endif
++ movsxd _wq, [jrq + _wq*4]
++ add _wq, jrq
++ jmp _wq
++
++%ifidn %1, put
++.hv_w2:
++ cmp hd, 4
++ jg .hv_w28
++
++ lea ss3q, [ssq*3]
++
++ mova m8, [spf_h_shuf]
++ vpbroadcastd m5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2]
++ vpmovsxbw m5, m5
++
++ sub srcq, 2
++ sub srcq, ssq
++
++ movu m0, [srcq]
++ movu m1, [srcq + ssq]
++ movu m2, [srcq + ssq*2]
++ add srcq, ss3q
++
++ PUT_4TAP_HS2 m0, m1, m8, m5, m6, m13
++ PUT_4TAP_HS1 m2, m8, m5, m6, m13
++ INTERLEAVE_REGS wd, m0, m1, m2
++
++.hv_w2l:
++
++ movu m3, [srcq]
++ movu m4, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++
++ PUT_4TAP_HS2 m3, m4, m8, m5, m6, m13
++
++ INTERLEAVE_REGS wd, m2, m3, m4
++
++ MUL_ADD_R m11, m12, m0, m1, m2, m3, m9, m10
++ mova m2, m4
++
++ RND_SHR_MIN_R m11, m12, m14, m15, m7
++ STORE_REGS d, 11, 12
++
++ sub hd, 2
++ jg .hv_w2l
++
++ RET
++
++.hv_w28:
++ lea ss3q, [ssq*3]
++
++ mova m8, [spf_h_shuf]
++ vpbroadcastd m5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2]
++ vpmovsxbw m5, m5
++
++ lea myq, [jrq - .hv_jmp_tbl + subpel_filters + myq*8]
++ vpbroadcastd m9, [myq]
++ vpbroadcastd m10, [myq + 4]
++ vpmovsxbw m9, m9
++ vpmovsxbw m10, m10
++
++ sub srcq, 2
++ sub srcq, ss3q
++
++ movu m0, [srcq]
++ movu m1, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++
++ PUT_4TAP_HS2 m0, m1, m8, m5, m6, m13
++
++ movu m4, [srcq]
++ movu m3, [srcq + ssq]
++ movu m2, [srcq + ssq*2]
++ add srcq, ss3q
++
++ PUT_4TAP_HS2 m4, m3, m8, m5, m6, m13
++ PUT_4TAP_HS1 m2, m8, m5, m6, m13
++
++ INTERLEAVE_REGS wd, m0, m1, m4, m3, m2
++ punpckldq m0, m4
++ punpckldq m1, m3
++
++ movu m3, [srcq]
++ movu m4, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++
++ PUT_4TAP_HS2 m3, m4, m8, m5, m6, m13
++
++ INTERLEAVE_REGS wd, m2, m3, m4
++
++.hv_w28l:
++
++ movu m11, [srcq]
++ movu m12, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++
++ PUT_4TAP_HS2 m11, m12, m8, m5, m6, m13
++
++ INTERLEAVE_REGS wd, m4, m11, m12
++ punpckldq m2, m4
++ punpckldq m3, m11
++
++ pmaddwd m11, m0, m9
++ pmaddwd m4, m2, m10
++ pmaddwd m12, m1, m9
++ paddd m11, m4
++ pmaddwd m4, m3, m10
++ paddd m12, m4
++ phaddd m11, m11
++ phaddd m12, m12
++
++ RND_SHR_MIN_R m11, m12, m14, m15, m7
++ STORE_REGS d, 11, 12
++
++ pshufd m0, m0, q2031
++ pshufd m1, m1, q2031
++ pshufd m11, m2, q3120
++ pshufd m12, m3, q3120
++ pshufd m2, m2, q2031
++ pshufd m3, m3, q2031
++
++ mova m4, m3
++ psrad m4, 16
++ packssdw m4, m4
++
++ punpckldq m0, m11
++ punpckldq m1, m12
++
++ sub hd, 2
++ jg .hv_w28l
++
++ RET
++%endif
++
++INIT_YMM avx2
++.hv_w4:
++ cmp hd, 4
++ jg .hv_w48
++
++ lea ss3q, [ssq*3]
++
++ mova m8, [spf_h_shuf]
++ vpbroadcastd xm5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2]
++ vpmovsxbw m5, xm5
++
++ sub srcq, 2
++ sub srcq, ssq
++
++ vbroadcasti128 m0, [srcq]
++ vbroadcasti128 m1, [srcq + ssq]
++ vbroadcasti128 m2, [srcq + ssq*2]
++ add srcq, ss3q
++
++ PUT_4TAP_HS2 m0, m1, m8, m5, m6, xm13
++ PUT_4TAP_HS1 m2, m8, m5, m6, xm13
++ INTERLEAVE_REGS wd, m0, m1, m2
++
++.hv_w4l:
++
++ vbroadcasti128 m3, [srcq]
++ vbroadcasti128 m4, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++
++ PUT_4TAP_HS2 m3, m4, m8, m5, m6, xm13
++
++ INTERLEAVE_REGS wd, m2, m3, m4
++
++ MUL_ADD_R m11, m12, m0, m1, m2, m3, m9, m10
++ mova m2, m4
++
++%ifidn %1, put
++ RND_SHR_MIN_R m11, m12, m14, xm15, m7
++%else
++ RND_SHR_R m11, m12, m14, 6
++%endif
++
++ vextracti128 xm3, m11, 1
++ vextracti128 xm4, m12, 1
++
++ movd [dstq], xm11
++ movd [dstq + 4], xm3
++ movd [dstq + dsq], xm12
++ movd [dstq + dsq + 4], xm4
++ lea dstq, [dstq + dsq*2]
++
++ sub hd, 2
++ jg .hv_w4l
++
++ RET
++
++.hv_w48:
++ lea ss3q, [ssq*3]
++
++ mova m8, [spf_h_shuf]
++ vpbroadcastd xm5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2]
++ vpmovsxbw m5, xm5
++
++ lea myq, [jrq - .hv_jmp_tbl + subpel_filters + myq*8]
++ vpbroadcastd xm9, [myq]
++ vpbroadcastd xm10, [myq + 4]
++ vpmovsxbw m9, xm9
++ vpmovsxbw m10, xm10
++
++ sub srcq, 2
++ sub srcq, ss3q
++
++ vbroadcasti128 m0, [srcq]
++ vbroadcasti128 m1, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++
++ PUT_4TAP_HS2 m0, m1, m8, m5, m6, xm13
++
++ vbroadcasti128 m4, [srcq]
++ vbroadcasti128 m3, [srcq + ssq]
++ vbroadcasti128 m2, [srcq + ssq*2]
++ add srcq, ss3q
++
++ PUT_4TAP_HS2 m4, m3, m8, m5, m6, xm13
++ PUT_4TAP_HS1 m2, m8, m5, m6, xm13
++
++ INTERLEAVE_REGS wd, m0, m1, m4, m3, m2
++ punpckldq m0, m4
++ punpckldq m1, m3
++
++ vbroadcasti128 m3, [srcq]
++ vbroadcasti128 m4, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++
++ PUT_4TAP_HS2 m3, m4, m8, m5, m6, xm13
++
++ INTERLEAVE_REGS wd, m2, m3, m4
++
++.hv_w48l:
++
++ vbroadcasti128 m11, [srcq]
++ vbroadcasti128 m12, [srcq + ssq]
++ lea srcq, [srcq + ssq*2]
++
++ PUT_4TAP_HS2 m11, m12, m8, m5, m6, xm13
++
++ INTERLEAVE_REGS wd, m4, m11, m12
++ punpckldq m2, m4
++ punpckldq m3, m11
++
++ pmaddwd m11, m0, m9
++ pmaddwd m4, m2, m10
++ pmaddwd m12, m1, m9
++ paddd m11, m4
++ pmaddwd m4, m3, m10
++ paddd m12, m4
++ phaddd m11, m11
++ phaddd m12, m12
++
++%ifidn %1, put
++ RND_SHR_MIN_R m11, m12, m14, xm15, m7
++%else
++ RND_SHR_R m11, m12, m14, 6
++%endif
++
++ vextracti128 xm4, m11, 1
++ movd [dstq], xm11
++ movd [dstq + 4], xm4
++ vextracti128 xm4, m12, 1
++ movd [dstq + dsq], xm12
++ movd [dstq + dsq + 4], xm4
++ lea dstq, [dstq + dsq*2]
++
++ pshufd m0, m0, q2031
++ pshufd m1, m1, q2031
++ pshufd m11, m2, q3120
++ pshufd m12, m3, q3120
++ pshufd m2, m2, q2031
++ pshufd m3, m3, q2031
++
++ mova m4, m3
++ psrad m4, 16
++ packssdw m4, m4
++
++ punpckldq m0, m11
++ punpckldq m1, m12
++
++ sub hd, 2
++ jg .hv_w48l
++ RET
++
++.hv_w8:
++ mov _wq, ss3q
++
++ cmp hd, 4
++ jg .hv_w88
++
++ lea ss3q, [ssq*3]
++
++ vpbroadcastq xm5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8]
++ vpmovsxbw m5, xm5
++
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, oh, tdst, tsrc, ss3
++%elifidn %1, prep
++ DEFINE_ARGS dst, src, ss, _w, h, oh, tdst, tsrc, ds, ss3
++%endif
++
++ sub srcq, 6
++ sub srcq, ssq
++
++ mov ohd, hd
++ mov tdstq, dstq
++ mov tsrcq, srcq
++
++.hv_w8l:
++
++ PUT_8TAP_HS 0, 1, 2, m5, m6, xm13, 0*ssq
++ PUT_8TAP_HS 1, 2, 3, m5, m6, xm13, 1*ssq
++ PUT_8TAP_HS 2, 3, 4, m5, m6, xm13, 2*ssq
++ add srcq, ss3q
++
++ INTERLEAVE_REGS wd, m0, m1, m2
++
++.hv_w8c: ; Nx2, Nx4
++
++ PUT_8TAP_HS 3, 8, 11, m5, m6, xm13, 0*ssq
++ PUT_8TAP_HS 4, 8, 11, m5, m6, xm13, 1*ssq
++ lea srcq, [srcq + ssq*2]
++
++ INTERLEAVE_REGS wd, m2, m3, m4
++
++ MUL_ADD_R m8, m11, m0, m1, m2, m3, m9, m10
++ mova m2, m4
++
++%ifidn %1, put
++ RND_SHR_MIN_R m8, m11, m14, xm15, m7
++%else
++ RND_SHR_R m8, m11, m14, 6
++%endif
++
++ vextracti128 xm3, m8, 1
++ vextracti128 xm4, m11, 1
++
++ movq [dstq], xm8
++ movq [dstq + 8], xm3
++ movq [dstq + dsq], xm11
++ movq [dstq + dsq + 8], xm4
++ lea dstq, [dstq + dsq*2]
++
++ sub hd, 2
++ jg .hv_w8c
++
++ add tdstq, 2*8
++ add tsrcq, 2*8
++ mov hd, ohd
++ mov dstq, tdstq
++ mov srcq, tsrcq
++ sub _wd, 8
++ jg .hv_w8l
++ RET
++
++.hv_w88:
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, jr, ss3
++%elifidn %1, prep
++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, jr, ds, ss3
++%endif
++
++ lea ss3q, [ssq*3]
++
++ vpbroadcastq xm7, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8]
++ vpmovsxbw m7, xm7
++
++ sub srcq, 6
++ sub srcq, ss3q
++
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, oh, tdst, bdmax, ss3
++%elifidn %1, prep
++ DEFINE_ARGS dst, src, ss, _w, h, oh, tdst, bdmax, ds, ss3
++%endif
++
++ mov ohd, hd
++ mov tdstq, dstq
++
++ popcnt bdmaxd, bdmaxm
++ cmp bdmaxd, 12
++ je .hv_w88_12bit
++
++%ifidn %1, put
++ DEFINE_ARGS dst, ds, src, ss, _w, h, oh, tdst, tsrc, ss3
++%elifidn %1, prep
++ DEFINE_ARGS dst, src, ss, _w, h, oh, tdst, tsrc, ds, ss3
++%endif
++
++ mov tsrcq, srcq
++
++.hv_w88l_10bit: ; Nx6, Nx8, Nx12, Nx16, Nx24, Nx32:
++
++ vpbroadcastd m15, [pd_2] ; (1 << (6 - 4) >> 1)
++
++ PUT_8TAP_HS 0, 12, 13, m7, m15, 6 - 4, 0*ssq
++ PUT_8TAP_HS 1, 12, 13, m7, m15, 6 - 4, 1*ssq
++ PUT_8TAP_HS 2, 12, 13, m7, m15, 6 - 4, 2*ssq
++ add srcq, ss3q
++
++ PUT_8TAP_HS 3, 12, 13, m7, m15, 6 - 4, 0*ssq
++ PUT_8TAP_HS 4, 12, 13, m7, m15, 6 - 4, 1*ssq
++ lea srcq, [srcq + ssq*2]
++
++ PUT_8TAP_HS 5, 12, 13, m7, m15, 6 - 4, 0*ssq
++ PUT_8TAP_HS 6, 12, 13, m7, m15, 6 - 4, 1*ssq
++ lea srcq, [srcq + ssq*2]
++
++ INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m5, m6
++
++.hv_w88c_10bit:
++
++ PUT_8TAP_HS 12, 14, 15, m7, m15, 6 - 4, 0*ssq, [pd_2]
++ PUT_8TAP_HS 13, 14, 15, m7, m15, 6 - 4, 1*ssq, [pd_2]
++ lea srcq, [srcq + ssq*2]
++
++ INTERLEAVE_REGS wd, m6, m12, m13
++
++ MUL_ADD_R m14, m15, m0, m1, m2, m3, m8, m9
++ MUL_ACC_R m14, m15, m2, m3, m4, m5, m10
++ MUL_ACC_R m14, m15, m4, m5, m6, m12, m11
++
++%ifidn %1, put
++ vpbroadcastd m6, [pd_512] ; (1 << (6 + 4) >> 1)
++ vpbroadcastw m12, tsrcm ; bdmaxm
++ RND_SHR_MIN_R m14, m15, m6, 6 + 4, m12
++%else
++ vpbroadcastd m6, [nd_524256] ; (1 << 6 >> 1) - (8192 << 6)
++ RND_SHR_R m14, m15, m6, 6
++%endif
++
++ mova m6, m13
++
++ vextracti128 xm12, m14, 1
++ vextracti128 xm13, m15, 1
++
++ movq [dstq], xm14
++ movq [dstq + 8], xm12
++ movq [dstq + dsq], xm15
++ movq [dstq + dsq + 8], xm13
++ lea dstq, [dstq + dsq*2]
++
++ sub hd, 2
++ jg .hv_w88c_10bit
++
++ add tdstq, 2*8
++ add tsrcq, 2*8
++ mov hd, ohd
++ mov dstq, tdstq
++ mov srcq, tsrcq
++ sub _wd, 8
++ jg .hv_w88l_10bit
++ RET
++
++.hv_w88_12bit:
++
++ mov tsrcq, srcq
++
++.hv_w88l_12bit: ; Nx6, Nx8, Nx12, Nx16, Nx24, Nx32:
++
++ vpbroadcastd m15, [pd_8] ; (1 << (6 - 2) >> 1)
++
++ PUT_8TAP_HS 0, 12, 13, m7, m15, 6 - 2, 0*ssq
++ PUT_8TAP_HS 1, 12, 13, m7, m15, 6 - 2, 1*ssq
++ PUT_8TAP_HS 2, 12, 13, m7, m15, 6 - 2, 2*ssq
++ add srcq, ss3q
++
++ PUT_8TAP_HS 3, 12, 13, m7, m15, 6 - 2, 0*ssq
++ PUT_8TAP_HS 4, 12, 13, m7, m15, 6 - 2, 1*ssq
++ lea srcq, [srcq + ssq*2]
++
++ PUT_8TAP_HS 5, 12, 13, m7, m15, 6 - 2, 0*ssq
++ PUT_8TAP_HS 6, 12, 13, m7, m15, 6 - 2, 1*ssq
++ lea srcq, [srcq + ssq*2]
++
++ INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m5, m6
++
++.hv_w88c_12bit:
++
++ PUT_8TAP_HS 12, 14, 15, m7, m15, 6 - 2, 0*ssq, [pd_8]
++ PUT_8TAP_HS 13, 14, 15, m7, m15, 6 - 2, 1*ssq, [pd_8]
++ lea srcq, [srcq + ssq*2]
++
++ INTERLEAVE_REGS wd, m6, m12, m13
++
++ MUL_ADD_R m14, m15, m0, m1, m2, m3, m8, m9
++ MUL_ACC_R m14, m15, m2, m3, m4, m5, m10
++ MUL_ACC_R m14, m15, m4, m5, m6, m12, m11
++
++%ifidn %1, put
++ vpbroadcastd m6, [pd_128] ; (1 << (6 + 2) >> 1)
++ vpbroadcastw m12, tsrcm ; bdmaxm
++ RND_SHR_MIN_R m14, m15, m6, 6 + 2, m12
++%else
++ vpbroadcastd m6, [nd_524256] ; (1 << 6 >> 1) - (8192 << 6)
++ RND_SHR_R m14, m15, m6, 6
++%endif
++
++ mova m6, m13
++
++ vextracti128 xm12, m14, 1
++ vextracti128 xm13, m15, 1
++
++ movq [dstq], xm14
++ movq [dstq + 8], xm12
++ movq [dstq + dsq], xm15
++ movq [dstq + dsq + 8], xm13
++ lea dstq, [dstq + dsq*2]
++
++ sub hd, 2
++ jg .hv_w88c_12bit
++
++ add tdstq, 2*8
++ add tsrcq, 2*8
++ mov hd, ohd
++ mov dstq, tdstq
++ mov srcq, tsrcq
++ sub _wd, 8
++ jg .hv_w88l_12bit
++ RET
++
++.hv_jmp_tbl:
++%ifidn %1, put
++ dd .hv_w2 - .hv_jmp_tbl
++%endif
++ dd .hv_w4 - .hv_jmp_tbl
++ dd .hv_w8 - .hv_jmp_tbl
++ dd .hv_w8 - .hv_jmp_tbl
++ dd .hv_w8 - .hv_jmp_tbl
++ dd .hv_w8 - .hv_jmp_tbl
++ dd .hv_w8 - .hv_jmp_tbl
++%endm
++
++filter_fn put
++filter_fn prep
++
++%endif ; ARCH_X86_64
+--
+GitLab
+
+
+From 07a3064c9ebd0827177706e135f25cc8a6c25399 Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Sat, 26 Dec 2020 21:38:58 -0500
+Subject: [PATCH 3/7] Enable AVX2 8tap put/prep HBD assembly
+
+---
+ src/meson.build | 1 +
+ src/x86/mc_init_tmpl.c | 38 ++++++++++++++++++++------------------
+ 2 files changed, 21 insertions(+), 18 deletions(-)
+
+diff --git a/src/meson.build b/src/meson.build
+index f9f5c120..ff62a9d8 100644
+--- a/src/meson.build
++++ b/src/meson.build
+@@ -208,6 +208,7 @@ if is_asm_enabled
+
+ if dav1d_bitdepths.contains('16')
+ libdav1d_sources_asm += files(
++ 'x86/mc16_avx2.asm',
+ )
+ endif
+
+diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c
+index 468069c5..fcfed9be 100644
+--- a/src/x86/mc_init_tmpl.c
++++ b/src/x86/mc_init_tmpl.c
+@@ -279,26 +279,8 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ return;
+
+ #if BITDEPTH == 8
+- init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+- init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+- init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+- init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+- init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+- init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+- init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+- init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+- init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+- init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+- init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+- init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+- init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+- init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+- init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+- init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+- init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+- init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+@@ -340,6 +322,26 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ c->resize = BF(dav1d_resize, avx2);
+ #endif
+
++ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
++ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
++ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
++ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
++ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
++ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
++ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
++ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
++ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
++
++ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
++ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
++ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
++ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
++ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
++ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
++ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
++ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
++ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
++
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+ return;
+
+--
+GitLab
+
+
+From 68aa4049fe160b318f0f037b0f5514aefea8a69b Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Sat, 26 Dec 2020 06:21:44 -0500
+Subject: [PATCH 4/7] x86: mc: Add AVX2 implementation of avg/w_avg/mask for
+ 16bpc
+
+Relative speed-ups over C code (compared with gcc-9.3.0):
+
+ C AVX2
+avg_w4_16bpc: 185.5 13.3 13.95x
+avg_w8_16bpc: 223.1 35.3 6.32x
+avg_w16_16bpc: 626.7 109.1 5.74x
+avg_w32_16bpc: 2284.0 400.8 5.70x
+avg_w64_16bpc: 5294.7 924.6 5.73x
+avg_w128_16bpc: 12887.0 2237.9 5.76x
+
+w_avg_w4_16bpc: 225.6 16.0 14.10x
+w_avg_w8_16bpc: 336.2 36.4 9.24x
+w_avg_w16_16bpc: 975.0 111.5 8.74x
+w_avg_w32_16bpc: 3633.7 403.1 9.01x
+w_avg_w64_16bpc: 8519.2 927.9 9.18x
+w_avg_w128_16bpc: 20873.9 2232.5 9.35x
+
+mask_w4_16bpc: 241.7 21.4 11.29x
+mask_w8_16bpc: 663.5 51.1 12.98x
+mask_w16_16bpc: 736.9 153.2 4.81x
+mask_w32_16bpc: 2650.9 582.7 4.55x
+mask_w64_16bpc: 6075.4 1359.7 4.47x
+mask_w128_16bpc: 14677.0 3330.2 4.41x
+---
+ src/x86/mc16_avx2.asm | 179 ++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 179 insertions(+)
+
+diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm
+index ea6cfdbf..7b4f9cdf 100644
+--- a/src/x86/mc16_avx2.asm
++++ b/src/x86/mc16_avx2.asm
+@@ -19,7 +19,10 @@ SECTION_RODATA 32
+ spf_h_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+ db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
+ pq_2: dq (6 - 4)
++pq_3: dq (6 - 4) + 1
++pq_6: dq (6 - 4) + 4
+ pq_4: dq (6 - 2)
++pq_5: dq (6 - 2) + 1
+ pq_8: dq (6 + 2)
+ pq_10: dq (6 + 4)
+ pd_32: dd (1 << 6 >> 1)
+@@ -32,7 +35,16 @@ pd_128: dd (1 << (6 + 2) >> 1)
+ nd_524256: dd (1 << 6 >> 1) - (8192 << 6)
+ nd_32766: dd (1 << (6 - 4) >> 1) - (8192 << (6 - 4))
+ nd_131064: dd (1 << (6 - 2) >> 1) - (8192 << (6 - 2))
++pd_16388: dd (1 << (6 - 4)) + 8192*2
++pd_16400: dd (1 << (6 - 2)) + 8192*2
++pd_131104: dd ((1 << (6 - 4)) + 8192*2) << 3
++pd_131200: dd ((1 << (6 - 2)) + 8192*2) << 3
++pd_524416: dd ((1 << (6 - 4)) + 8192*2) << 5
++pd_524800: dd ((1 << (6 - 2)) + 8192*2) << 5
+ pw_8192: dw 8192
++pw_1: dw 1
++pw_16: dw 16
++pw_64: dw 64
+
+ SECTION .text
+
+@@ -1550,4 +1562,171 @@ INIT_YMM avx2
+ filter_fn put
+ filter_fn prep
+
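++; AVG: weighted word-pair sum of the two intermediate buffers via pmaddwd (weights
++; in m3, or derived from the mask), then round (m4), shift (xm5), pack and clamp to
++; bdmax (m6)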
++%macro AVG 1
++ mova m0, [p1q]
++ mova m2, [p2q]
++ punpckhwd m1, m0, m2
++ punpcklwd m0, m2
++%ifidn %1, mask
++ mova xm2, [mq]
++ vpmovsxbw m2, xm2
++ vpbroadcastw m7, [pw_64]
++ psubw m7, m2
++ punpckhwd m3, m2, m7
++ punpcklwd m2, m7
++ pmaddwd m0, m2
++%else
++ pmaddwd m0, m3
++%endif
++ pmaddwd m1, m3
++ paddd m0, m4
++ paddd m1, m4
++ psrad m0, xm5
++ psrad m1, xm5
++ packusdw m0, m1
++ pminuw m0, m6
++%endm
++
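++; bilin_fn generates the avg/w_avg/mask entry points; rounding constants and shift
++; are selected per bitdepth via popcnt(bdmax)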
++%macro bilin_fn 1
++%ifidn %1, avg
++cglobal avg_16bpc, 4, 9, 8, dst, ds, p1, p2, w, h, bdmax, ds3, ow
++%elifidn %1, w_avg
++cglobal w_avg_16bpc, 4, 9, 8, dst, ds, p1, p2, w, h, wg, bdmax, ow
++%else
++cglobal mask_16bpc, 4, 9, 8, dst, ds, p1, p2, w, h, m, bdmax, ow
++%endif
++
++ movifnidn hd, hm
++ movifnidn wd, wm
++
++%ifidn %1, avg
++ vpbroadcastw m3, [pw_1]
++ vpbroadcastd m4, [pd_16400] ; (1 << (6 - 2)) + 8192*2
++ movq xm5, [pq_5] ; (6 - 2) + 1
++%elifidn %1, w_avg
++ vpbroadcastw m3, wgm
++ vpbroadcastw m4, [pw_16]
++ psubw m4, m3
++ punpcklwd m3, m4
++ vpbroadcastd m4, [pd_131200] ; ((1 << (6 - 2)) + 8192*2) << 3
++ movq xm5, [pq_8] ; (6 - 2) + 1 + 3
++%else
++ movifnidn mq, mmp
++ vpbroadcastd m4, [pd_524800] ; ((1 << (6 - 2)) + 8192*2) << 5
++ movq xm5, [pq_10] ; (6 - 2) + 1 + 5
++%endif
++
++ popcnt bdmaxd, bdmaxm
++ cmp bdmaxd, 10
++ je .bits10
++
++%ifidn %1, avg
++ vpbroadcastd m4, [pd_16388] ; (1 << (6 - 4)) + 8192*2
++ movq xm5, [pq_3] ; (6 - 4) + 1
++%elifidn %1, w_avg
++ vpbroadcastd m4, [pd_131104] ; ((1 << (6 - 4)) + 8192*2) << 3
++ movq xm5, [pq_6] ; (6 - 4) + 1 + 3
++%else
++ vpbroadcastd m4, [pd_524416] ; ((1 << (6 - 4)) + 8192*2) << 5
++ movq xm5, [pq_8] ; (6 - 4) + 1 + 5
++%endif
++.bits10:
++
++ vpbroadcastw m6, bdmaxm
++
++ lea owd, [2*wd]
++
++DEFINE_ARGS dst, ds, p1, p2, w, h, m, jr, ow
++
++ lea jrq, [.jmp_tbl]
++ tzcnt wd, wm
++ sub wd, 2
++ movsxd wq, [jrq + wq*4]
++ add wq, jrq
++ jmp wq
++
++.w4:
++DEFINE_ARGS dst, ds, p1, p2, w, h, m, ds3, ow
++
++ lea ds3q, [dsq*3]
++
++.w4l:
++ AVG %1
++
++ vextracti128 xm1, m0, 1
++ movq [dstq], xm0
++ pextrq [dstq + dsq], xm0, 1
++ movq [dstq + 2*dsq], xm1
++ pextrq [dstq + ds3q], xm1, 1
++
++ lea dstq, [dstq + 4*dsq]
++ add p1q, 32
++ add p2q, 32
++%ifidn %1, mask
++ add mq, 16
++%endif
++
++ sub hd, 4
++ jg .w4l
++ RET
++
++.w8:
++ AVG %1
++
++ vextracti128 xm1, m0, 1
++ mova [dstq], xm0
++ mova [dstq + dsq], xm1
++
++ lea dstq, [dstq + dsq*2]
++ add p1q, 32
++ add p2q, 32
++%ifidn %1, mask
++ add mq, 16
++%endif
++
++ sub hd, 2
++ jg .w8
++
++ RET
++
++.w16:
++
++  mov wd, owd ; upper 32 bits of wq zeroed by jmp
++ sub dsq, wq
++
++.w16l:
++ AVG %1
++
++ mova [dstq], m0
++
++ add dstq, 32
++ add p1q, 32
++ add p2q, 32
++%ifidn %1, mask
++ add mq, 16
++%endif
++
++ sub wd, 32
++ jg .w16l
++
++ add dstq, dsq
++ mov wd, owd
++ dec hd
++ jg .w16l
++
++ RET
++
++.jmp_tbl:
++ dd .w4 - .jmp_tbl
++ dd .w8 - .jmp_tbl
++ dd .w16 - .jmp_tbl
++ dd .w16 - .jmp_tbl
++ dd .w16 - .jmp_tbl
++ dd .w16 - .jmp_tbl
++%endm
++
++bilin_fn avg
++bilin_fn w_avg
++bilin_fn mask
++
+ %endif ; ARCH_X86_64
+--
+GitLab
+
+
+From 6ba57502ac82b00d1441a36d4e12814eafd37982 Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Sun, 27 Dec 2020 04:11:21 -0500
+Subject: [PATCH 5/7] Enable AVX2 avg/w_avg/mask HBD assembly
+
+---
+ src/x86/mc_init_tmpl.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c
+index fcfed9be..70798047 100644
+--- a/src/x86/mc_init_tmpl.c
++++ b/src/x86/mc_init_tmpl.c
+@@ -305,9 +305,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+- c->avg = BF(dav1d_avg, avx2);
+- c->w_avg = BF(dav1d_w_avg, avx2);
+- c->mask = BF(dav1d_mask, avx2);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
+@@ -342,6 +339,10 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+
++ c->avg = BF(dav1d_avg, avx2);
++ c->w_avg = BF(dav1d_w_avg, avx2);
++ c->mask = BF(dav1d_mask, avx2);
++
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+ return;
+
+--
+GitLab
+
+
+From c18338526a06c14409333e7d6ed34ae60a6dff46 Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Sun, 27 Dec 2020 17:25:54 -0500
+Subject: [PATCH 6/7] x86: mc: Add AVX2 implementation of blend/blend_h/blend_v
+ for 16bpc
+
+Relative speed-ups over C code (compared with gcc-9.3.0):
+
+ C AVX2
+blend_w4_16bpc: 72.3 12.1 5.98x
+blend_w8_16bpc: 218.0 26.2 8.32x
+blend_w16_16bpc: 859.7 53.3 16.13x
+blend_w32_16bpc: 2193.1 137.5 15.95x
+
+blend_h_w2_16bpc: 87.3 21.0 4.16x
+blend_h_w4_16bpc: 137.8 22.8 6.04x
+blend_h_w8_16bpc: 126.5 29.7 4.26x
+blend_h_w16_16bpc: 211.4 28.7 7.37x
+blend_h_w32_16bpc: 385.5 50.9 7.57x
+blend_h_w64_16bpc: 726.2 88.0 8.25x
+blend_h_w128_16bpc: 1728.7 182.6 9.47x
+
+blend_v_w2_16bpc: 74.7 30.6 2.44x
+blend_v_w4_16bpc: 321.2 51.2 6.27x
+blend_v_w8_16bpc: 614.6 69.2 8.88x
+blend_v_w16_16bpc: 1211.0 57.2 21.17x
+blend_v_w32_16bpc: 2406.2 116.8 20.60x
+---
+ src/x86/mc16_avx2.asm | 381 ++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 381 insertions(+)
+
+diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm
+index 7b4f9cdf..ee56da2e 100644
+--- a/src/x86/mc16_avx2.asm
++++ b/src/x86/mc16_avx2.asm
+@@ -1729,4 +1729,385 @@ bilin_fn avg
+ bilin_fn w_avg
+ bilin_fn mask
+
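++; blend: dst += (tmp - dst) * mask / 64, via pmulhrsw of (dst - tmp) with (-mask) << 9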
++INIT_XMM avx2
++cglobal blend_16bpc, 6, 7, 7, dst, ds, tmp, w, h, mask, jr
++ pxor m3, m3
++ lea jrq, [.jmp_tbl]
++ tzcnt wd, wm
++ sub wd, 2
++ movsxd wq, [jrq + wq*4]
++ add wq, jrq
++ jmp wq
++.w4:
++ movq m0, [dstq]
++ pinsrq m0, [dstq + dsq], 1
++ mova m1, [tmpq]
++ movq m2, [maskq]
++ psubb m2, m3, m2
++ pmovsxbw m2, m2
++ psllw m2, 9
++ psubw m1, m0, m1
++ pmulhrsw m1, m2
++ paddw m0, m1
++ movq [dstq], m0
++ pextrq [dstq + dsq], m0, 1
++ add maskq, 8
++ add tmpq, 16
++ lea dstq, [dstq + 2*dsq]
++ sub hd, 2
++ jg .w4
++ RET
++INIT_YMM avx2
++.w8:
++ mova xm0, [dstq]
++ vinserti128 m0, [dstq + dsq], 1
++ mova m1, [tmpq]
++ mova xm2, [maskq]
++ psubb xm2, xm3, xm2
++ pmovsxbw m2, xm2
++ psllw m2, 9
++ psubw m1, m0, m1
++ pmulhrsw m1, m2
++ paddw m0, m1
++ mova [dstq], xm0
++ vextracti128 [dstq + dsq], m0, 1
++ add maskq, 16
++ add tmpq, 32
++ lea dstq, [dstq + 2*dsq]
++ sub hd, 2
++ jg .w8
++ RET
++.w16:
++ mova m0, [dstq]
++ mova m4, [dstq + dsq]
++ mova m1, [tmpq]
++ mova m5, [tmpq + 32]
++ mova xm2, [maskq]
++ mova xm6, [maskq + 16]
++ psubb xm2, xm3, xm2
++ psubb xm6, xm3, xm6
++ pmovsxbw m2, xm2
++ pmovsxbw m6, xm6
++ psllw m2, 9
++ psllw m6, 9
++ psubw m1, m0, m1
++ psubw m5, m4, m5
++ pmulhrsw m1, m2
++ pmulhrsw m5, m6
++ paddw m0, m1
++ paddw m4, m5
++ mova [dstq], m0
++ mova [dstq + dsq], m4
++ add maskq, 32
++ add tmpq, 64
++ lea dstq, [dstq + 2*dsq]
++ sub hd, 2
++ jg .w16
++ RET
++.w32:
++ mova m0, [dstq]
++ mova m4, [dstq + 32]
++ mova m1, [tmpq]
++ mova m5, [tmpq + 32]
++ mova xm2, [maskq]
++ mova xm6, [maskq + 16]
++ psubb xm2, xm3, xm2
++ psubb xm6, xm3, xm6
++ pmovsxbw m2, xm2
++ pmovsxbw m6, xm6
++ psllw m2, 9
++ psllw m6, 9
++ psubw m1, m0, m1
++ psubw m5, m4, m5
++ pmulhrsw m1, m2
++ pmulhrsw m5, m6
++ paddw m0, m1
++ paddw m4, m5
++ mova [dstq], m0
++ mova [dstq + 32], m4
++ add maskq, 32
++ add tmpq, 64
++ add dstq, dsq
++ dec hd
++ jg .w32
++ RET
++.jmp_tbl:
++ dd .w4 - .jmp_tbl
++ dd .w8 - .jmp_tbl
++ dd .w16 - .jmp_tbl
++ dd .w32 - .jmp_tbl
++
++cextern obmc_masks
++
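++; blend_v: vertical OBMC blend; column weights are read from obmc_masks + w and
++; reused for every row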
++INIT_XMM avx2
++cglobal blend_v_16bpc, 5, 7, 7, dst, ds, tmp, w, h, o, jr
++ lea oq, [obmc_masks]
++ pxor m3, m3
++ movsx wq, wd
++ add oq, wq
++ lea jrq, [.jmp_tbl]
++ tzcnt wd, wm
++ sub wd, 1
++ movsxd wq, [jrq + wq*4]
++ add wq, jrq
++ jmp wq
++.w2:
++ vpbroadcastw m2, [oq]
++ psubb m2, m3, m2
++ pmovsxbw m2, m2
++ psllw m2, 9
++.w2l:
++ movd m0, [dstq]
++ movd m1, [tmpq]
++ pinsrd m0, [dstq + dsq], 1
++ pinsrd m1, [tmpq + 4], 1
++ psubw m1, m0, m1
++ pmulhrsw m1, m2
++ paddw m0, m1
++ movd [dstq], m0
++ pextrd [dstq + dsq], m0, 1
++ add tmpq, 8
++ lea dstq, [dstq + 2*dsq]
++ sub hd, 2
++ jg .w2l
++ RET
++.w4:
++ vpbroadcastd m2, [oq]
++ psubb m2, m3, m2
++ pmovsxbw m2, m2
++ psllw m2, 9
++.w4l:
++ movq m0, [dstq]
++ movq m1, [tmpq]
++ pinsrq m0, [dstq + dsq], 1
++ pinsrq m1, [tmpq + 8], 1
++ psubw m1, m0, m1
++ pmulhrsw m1, m2
++ paddw m0, m1
++ movq [dstq], m0
++ pextrq [dstq + dsq], m0, 1
++ add tmpq, 16
++ lea dstq, [dstq + 2*dsq]
++ sub hd, 2
++ jg .w4l
++ RET
++INIT_YMM avx2
++.w8:
++ vpbroadcastq xm2, [oq]
++ psubb xm2, xm3, xm2
++ pmovsxbw m2, xm2
++ psllw m2, 9
++.w8l:
++ mova xm0, [dstq]
++ vinserti128 m0, [dstq + dsq], 1
++ mova m1, [tmpq]
++ psubw m1, m0, m1
++ pmulhrsw m1, m2
++ paddw m0, m1
++ mova [dstq], xm0
++ vextracti128 [dstq + dsq], m0, 1
++ add tmpq, 32
++ lea dstq, [dstq + 2*dsq]
++ sub hd, 2
++ jg .w8l
++ RET
++.w16:
++ mova xm2, [oq]
++ psubb xm2, xm3, xm2
++ pmovsxbw m2, xm2
++ psllw m2, 9
++.w16l:
++ mova m0, [dstq]
++ mova m4, [dstq + dsq]
++ mova m1, [tmpq]
++ mova m5, [tmpq + 32]
++ psubw m1, m0, m1
++ psubw m5, m4, m5
++ pmulhrsw m1, m2
++ pmulhrsw m5, m2
++ paddw m0, m1
++ paddw m4, m5
++ mova [dstq], m0
++ mova [dstq + dsq], m4
++ add tmpq, 64
++ lea dstq, [dstq + 2*dsq]
++ sub hd, 2
++ jg .w16l
++ RET
++.w32:
++ mova xm2, [oq]
++ mova xm6, [oq + 16]
++ psubb xm2, xm3, xm2
++ psubb xm6, xm3, xm6
++ pmovsxbw m2, xm2
++ pmovsxbw m6, xm6
++ psllw m2, 9
++ psllw m6, 9
++.w32l:
++ mova m0, [dstq]
++ mova m4, [dstq + 32]
++ mova m1, [tmpq]
++ mova m5, [tmpq + 32]
++ psubw m1, m0, m1
++ psubw m5, m4, m5
++ pmulhrsw m1, m2
++ pmulhrsw m5, m6
++ paddw m0, m1
++ paddw m4, m5
++ mova [dstq], m0
++ mova [dstq + 32], m4
++ add tmpq, 64
++ add dstq, dsq
++ dec hd
++ jg .w32l
++ RET
++.jmp_tbl:
++ dd .w2 - .jmp_tbl
++ dd .w4 - .jmp_tbl
++ dd .w8 - .jmp_tbl
++ dd .w16 - .jmp_tbl
++ dd .w32 - .jmp_tbl
++
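++; blend_h: horizontal OBMC blend; per-row weights are read from obmc_masks + h, and
++; only the top 3/4 of the rows are blended (h is scaled by 3/4)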
++INIT_XMM avx2
++cglobal blend_h_16bpc, 5, 8, 7, dst, ds, tmp, w, h, o, jr, w2
++ pxor m3, m3
++ lea w2d, [wd*2]
++ lea oq, [obmc_masks]
++ movsx hq, hd
++ add oq, hq
++ imul hq, 3
++ shr hq, 2
++ lea jrq, [.jmp_tbl]
++ tzcnt wd, wm
++ sub wd, 1
++ movsxd wq, [jrq + wq*4]
++ add wq, jrq
++ jmp wq
++.w2:
++ movd m2, [oq]
++ psubb m2, m3, m2
++ punpcklbw m2, m2
++ pmovsxbw m2, m2
++ psllw m2, 9
++ movd m0, [dstq]
++ movd m1, [tmpq]
++ pinsrd m0, [dstq + dsq], 1
++ pinsrd m1, [tmpq + 4], 1
++ psubw m1, m0, m1
++ pmulhrsw m1, m2
++ paddw m0, m1
++ movd [dstq], m0
++ pextrd [dstq + dsq], m0, 1
++ add tmpq, 8
++ lea dstq, [dstq + 2*dsq]
++ add oq, 2
++ sub hd, 2
++ jg .w2
++ RET
++.w4:
++ movd m2, [oq]
++ punpcklbw m2, m2
++ punpcklwd m2, m2
++ psubb m2, m3, m2
++ pmovsxbw m2, m2
++ psllw m2, 9
++ movq m0, [dstq]
++ movq m1, [tmpq]
++ pinsrq m0, [dstq + dsq], 1
++ pinsrq m1, [tmpq + 8], 1
++ psubw m1, m0, m1
++ pmulhrsw m1, m2
++ paddw m0, m1
++ movq [dstq], m0
++ pextrq [dstq + dsq], m0, 1
++ add tmpq, 16
++ lea dstq, [dstq + 2*dsq]
++ add oq, 2
++ sub hd, 2
++ jg .w4
++ RET
++INIT_YMM avx2
++.w8:
++ movd xm2, [oq]
++ psubb xm2, xm3, xm2
++ punpcklbw xm2, xm2
++ punpcklwd xm2, xm2
++ punpckldq xm2, xm2
++ pmovsxbw m2, xm2
++ psllw m2, 9
++ mova xm0, [dstq]
++ vinserti128 m0, [dstq + dsq], 1
++ mova m1, [tmpq]
++ psubw m1, m0, m1
++ pmulhrsw m1, m2
++ paddw m0, m1
++ mova [dstq], xm0
++ vextracti128 [dstq + dsq], m0, 1
++ add tmpq, 32
++ lea dstq, [dstq + 2*dsq]
++ add oq, 2
++ sub hd, 2
++ jg .w8
++ RET
++.w16:
++ vpbroadcastb xm2, [oq]
++ vpbroadcastb xm6, [oq + 1]
++ psubb xm2, xm3, xm2
++ psubb xm6, xm3, xm6
++ pmovsxbw m2, xm2
++ pmovsxbw m6, xm6
++ psllw m2, 9
++ psllw m6, 9
++ mova m0, [dstq]
++ mova m1, [tmpq]
++ mova m4, [dstq + dsq]
++ mova m5, [tmpq + 32]
++ psubw m1, m0, m1
++ psubw m5, m4, m5
++ pmulhrsw m1, m2
++ pmulhrsw m5, m6
++ paddw m0, m1
++ paddw m4, m5
++ mova [dstq], m0
++ mova [dstq + dsq], m4
++ add tmpq, 64
++ lea dstq, [dstq + 2*dsq]
++ add oq, 2
++ sub hd, 2
++ jg .w16
++ RET
++.w32:
++ mov wd, w2d
++ sub dsq, wq
++.w32l:
++ vpbroadcastb xm2, [oq]
++ psubb xm2, xm3, xm2
++ pmovsxbw m2, xm2
++ psllw m2, 9
++ mov wd, w2d
++.w32c:
++ mova m0, [dstq]
++ mova m1, [tmpq]
++ psubw m1, m0, m1
++ pmulhrsw m1, m2
++ paddw m0, m1
++ mova [dstq], m0
++ add dstq, 32
++ add tmpq, 32
++ sub wd, 32
++ jg .w32c
++ add dstq, dsq
++ inc oq
++ dec hd
++ jg .w32l
++ RET
++.jmp_tbl:
++ dd .w2 - .jmp_tbl
++ dd .w4 - .jmp_tbl
++ dd .w8 - .jmp_tbl
++ dd .w16 - .jmp_tbl
++ dd .w32 - .jmp_tbl
++ dd .w32 - .jmp_tbl
++ dd .w32 - .jmp_tbl
++
+ %endif ; ARCH_X86_64
+--
+GitLab
+
+
+From 8770797232201d6e0e4106e799b6d76865feff77 Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Sun, 27 Dec 2020 21:13:15 -0500
+Subject: [PATCH 7/7] Enable AVX2 blend/blend_h/blend_v HBD assembly
+
+---
+ src/x86/mc_init_tmpl.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c
+index 70798047..2dac21cf 100644
+--- a/src/x86/mc_init_tmpl.c
++++ b/src/x86/mc_init_tmpl.c
+@@ -308,9 +308,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
+- c->blend = BF(dav1d_blend, avx2);
+- c->blend_v = BF(dav1d_blend_v, avx2);
+- c->blend_h = BF(dav1d_blend_h, avx2);
+
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
+@@ -342,6 +339,9 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+ c->avg = BF(dav1d_avg, avx2);
+ c->w_avg = BF(dav1d_w_avg, avx2);
+ c->mask = BF(dav1d_mask, avx2);
++ c->blend = BF(dav1d_blend, avx2);
++ c->blend_v = BF(dav1d_blend_v, avx2);
++ c->blend_h = BF(dav1d_blend_h, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+ return;
+--
+GitLab
+
diff --git a/0002-wiener_2.patch b/0002-wiener_2.patch
new file mode 100644
index 000000000000..149be2fe1293
--- /dev/null
+++ b/0002-wiener_2.patch
@@ -0,0 +1,661 @@
+From 269eeaf7c01afc79a53537881ad03185bf491cf6 Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Tue, 29 Dec 2020 06:58:33 -0500
+Subject: [PATCH] Add bpc suffix to lr functions
+
+---
+ src/x86/looprestoration.asm | 36 ++---
+ src/x86/looprestoration_init_tmpl.c | 204 +++++++++++++---------------
+ src/x86/looprestoration_sse.asm | 60 ++++----
+ 3 files changed, 146 insertions(+), 154 deletions(-)
+
+diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm
+index 8ebe230..e077cdd 100644
+--- a/src/x86/looprestoration.asm
++++ b/src/x86/looprestoration.asm
+@@ -66,8 +66,8 @@ SECTION .text
+ DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers
+
+ INIT_YMM avx2
+-cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, flt, h
++cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, flt, h
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+@@ -414,8 +414,8 @@ ALIGN function_align
+ add dstq, dst_strideq
+ ret
+
+-cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, flt, h
++cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, flt, h
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+@@ -532,7 +532,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+ jnz .h_have_right
+ cmp r10d, -33
+ jl .h_have_right
+- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
++ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
+ .h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+@@ -591,7 +591,7 @@ ALIGN function_align
+ jnz .hv_have_right
+ cmp r10d, -33
+ jl .hv_have_right
+- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
++ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
+ .hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+@@ -705,7 +705,7 @@ ALIGN function_align
+ jl .v_loop
+ ret
+
+-cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
++cglobal sgr_box3_h_8bpc, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ mov xlimd, edgem
+ movifnidn wd, wm
+ mov hd, hm
+@@ -805,7 +805,7 @@ cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ RET
+
+ INIT_YMM avx2
+-cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
++cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+ movifnidn edged, edgem
+ mov xq, -2
+ rorx ylimd, edged, 2
+@@ -868,7 +868,7 @@ cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
+ RET
+
+ INIT_YMM avx2
+-cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
++cglobal sgr_calc_ab1_8bpc, 4, 6, 11, a, b, w, h, s
+ sub aq, (384+16-1)*4
+ sub bq, (384+16-1)*2
+ add hd, 2
+@@ -937,8 +937,8 @@ cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
+ RET
+
+ INIT_YMM avx2
+-cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+- tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
++cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
++ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+ movifnidn wd, wm
+ mov hd, hm
+ vpbroadcastd m15, [pw_16]
+@@ -1043,7 +1043,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+ RET
+
+ INIT_YMM avx2
+-cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt
++cglobal sgr_weighted1_8bpc, 4, 6, 6, dst, stride, t, w, h, wt
+ %ifidn wtd, wtm
+ shl wtd, 4
+ movd xm5, wtd
+@@ -1082,7 +1082,7 @@ cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt
+ RET
+
+ INIT_YMM avx2
+-cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
++cglobal sgr_box5_h_8bpc, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ mov edged, edgem
+ movifnidn wd, wm
+ mov hd, hm
+@@ -1200,7 +1200,7 @@ cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli
+ RET
+
+ INIT_YMM avx2
+-cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
++cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+ movifnidn edged, edgem
+ mov xq, -2
+ rorx ylimd, edged, 2
+@@ -1293,7 +1293,7 @@ cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
+ jmp .loop_y_noload
+
+ INIT_YMM avx2
+-cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
++cglobal sgr_calc_ab2_8bpc, 4, 6, 11, a, b, w, h, s
+ sub aq, (384+16-1)*4
+ sub bq, (384+16-1)*2
+ add hd, 2
+@@ -1364,8 +1364,8 @@ cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
+ RET
+
+ INIT_YMM avx2
+-cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \
+- tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
++cglobal sgr_finish_filter2_8bpc, 5, 13, 13, t, src, stride, a, b, w, h, \
++ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+ movifnidn wd, wm
+ mov hd, hm
+ vpbroadcastd m9, [pw_5_6]
+@@ -1483,7 +1483,7 @@ cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \
+ RET
+
+ INIT_YMM avx2
+-cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt
++cglobal sgr_weighted2_8bpc, 4, 7, 11, dst, stride, t1, t2, w, h, wt
+ movifnidn wd, wm
+ movifnidn hd, hm
+ vpbroadcastd m0, wtm
+diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c
+index 5df449c..11ebdd1 100644
+--- a/src/x86/looprestoration_init_tmpl.c
++++ b/src/x86/looprestoration_init_tmpl.c
+@@ -31,148 +31,140 @@
+ #include "common/intops.h"
+ #include "src/tables.h"
+
+-#define WIENER_FILTER(ext) \
+-void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \
+- const pixel (*left)[4], const pixel *lpf, \
+- ptrdiff_t lpf_stride, int w, int h, \
+- const int16_t filter[2][8], \
+- enum LrEdgeFlags edges); \
+-void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \
+- const pixel (*left)[4], const pixel *lpf, \
+- ptrdiff_t lpf_stride, int w, int h, \
+- const int16_t filter[2][8], \
+- enum LrEdgeFlags edges);
++#define decl_wiener_filter_fns(ext) \
++decl_wiener_filter_fn(BF(dav1d_wiener_filter7, ext)); \
++decl_wiener_filter_fn(BF(dav1d_wiener_filter5, ext))
+
+-#define SGR_FILTER(ext) \
+-void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
+- const pixel (*left)[4], \
+- const pixel *src, const ptrdiff_t stride, \
+- const int w, const int h, \
+- const enum LrEdgeFlags edges); \
+-void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \
+- const int w, const int h, \
+- const enum LrEdgeFlags edges); \
+-void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \
+- const int w, const int h, const int strength); \
+-void dav1d_sgr_finish_filter1_##ext(coef *tmp, \
+- const pixel *src, const ptrdiff_t stride, \
+- const int32_t *a, const int16_t *b, \
+- const int w, const int h); \
++#define decl_sgr_filter_fn(ext) \
++void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
++ const pixel (*left)[4], \
++ const pixel *src, const ptrdiff_t stride, \
++ const int w, const int h, \
++ const enum LrEdgeFlags edges); \
++void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
++ const int w, const int h, \
++ const enum LrEdgeFlags edges); \
++void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
++ const int w, const int h, const int strength); \
++void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \
++ const pixel *src, const ptrdiff_t stride, \
++ const int32_t *a, const int16_t *b, \
++ const int w, const int h); \
+ \
+ /* filter with a 3x3 box (radius=1) */ \
+-static void dav1d_sgr_filter1_##ext(coef *tmp, \
+- const pixel *src, const ptrdiff_t stride, \
+- const pixel (*left)[4], \
+- const pixel *lpf, const ptrdiff_t lpf_stride, \
+- const int w, const int h, const int strength, \
+- const enum LrEdgeFlags edges) \
++static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \
++ const pixel *src, const ptrdiff_t stride, \
++ const pixel (*left)[4], \
++ const pixel *lpf, const ptrdiff_t lpf_stride, \
++ const int w, const int h, const int strength, \
++ const enum LrEdgeFlags edges) \
+ { \
+ ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
+ ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
+ \
+- dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
++ BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
+ if (edges & LR_HAVE_TOP) \
+- dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
+- NULL, lpf, lpf_stride, w, 2, edges); \
++ BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
++ NULL, lpf, lpf_stride, w, 2, edges); \
+ \
+ if (edges & LR_HAVE_BOTTOM) \
+- dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
+- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
+- lpf_stride, w, 2, edges); \
++ BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
++ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
++ lpf_stride, w, 2, edges); \
+ \
+- dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \
+- dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \
+- dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \
++ BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \
++ BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \
++ BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \
+ } \
+ \
+-void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \
+- const pixel (*left)[4], \
+- const pixel *src, const ptrdiff_t stride, \
+- const int w, const int h, \
+- const enum LrEdgeFlags edges); \
+-void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \
+- const int w, const int h, \
+- const enum LrEdgeFlags edges); \
+-void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \
+- const int w, const int h, const int strength); \
+-void dav1d_sgr_finish_filter2_##ext(coef *tmp, \
+- const pixel *src, const ptrdiff_t stride, \
+- const int32_t *a, const int16_t *b, \
+- const int w, const int h); \
++void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \
++ const pixel (*left)[4], \
++ const pixel *src, const ptrdiff_t stride, \
++ const int w, const int h, \
++ const enum LrEdgeFlags edges); \
++void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
++ const int w, const int h, \
++ const enum LrEdgeFlags edges); \
++void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
++ const int w, const int h, const int strength); \
++void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \
++ const pixel *src, const ptrdiff_t stride, \
++ const int32_t *a, const int16_t *b, \
++ const int w, const int h); \
+ \
+ /* filter with a 5x5 box (radius=2) */ \
+-static void dav1d_sgr_filter2_##ext(coef *tmp, \
+- const pixel *src, const ptrdiff_t stride, \
+- const pixel (*left)[4], \
+- const pixel *lpf, const ptrdiff_t lpf_stride, \
+- const int w, const int h, const int strength, \
+- const enum LrEdgeFlags edges) \
++static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \
++ const pixel *src, const ptrdiff_t stride, \
++ const pixel (*left)[4], \
++ const pixel *lpf, const ptrdiff_t lpf_stride, \
++ const int w, const int h, const int strength, \
++ const enum LrEdgeFlags edges) \
+ { \
+ ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
+ ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
+ \
+- dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
++ BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
+ if (edges & LR_HAVE_TOP) \
+- dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
+- NULL, lpf, lpf_stride, w, 2, edges); \
++ BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
++ NULL, lpf, lpf_stride, w, 2, edges); \
+ \
+ if (edges & LR_HAVE_BOTTOM) \
+- dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
+- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
+- lpf_stride, w, 2, edges); \
++ BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
++ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
++ lpf_stride, w, 2, edges); \
+ \
+- dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \
+- dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \
+- dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \
++ BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \
++ BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \
++ BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \
+ } \
+ \
+-void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
+- const coef *t1, const int w, const int h, \
+- const int wt); \
+-void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
+- const coef *t1, const coef *t2, \
+- const int w, const int h, \
+- const uint32_t wt); \
++void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
++ const coef *t1, const int w, const int h, \
++ const int wt); \
++void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
++ const coef *t1, const coef *t2, \
++ const int w, const int h, \
++ const uint32_t wt); \
+ \
+-static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
+- const pixel (*const left)[4], \
+- const pixel *lpf, const ptrdiff_t lpf_stride, \
+- const int w, const int h, const int sgr_idx, \
+- const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \
++static void BF(sgr_filter, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
++ const pixel (*const left)[4], \
++ const pixel *lpf, const ptrdiff_t lpf_stride, \
++ const int w, const int h, const int sgr_idx, \
++ const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \
+ { \
+ if (!dav1d_sgr_params[sgr_idx][0]) { \
+ ALIGN_STK_32(coef, tmp, 64 * 384,); \
+- dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
+- w, h, dav1d_sgr_params[sgr_idx][3], edges); \
+- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \
++ BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
++ w, h, dav1d_sgr_params[sgr_idx][3], edges); \
++ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \
+ } else if (!dav1d_sgr_params[sgr_idx][1]) { \
+ ALIGN_STK_32(coef, tmp, 64 * 384,); \
+- dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
+- w, h, dav1d_sgr_params[sgr_idx][2], edges); \
+- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \
++ BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
++ w, h, dav1d_sgr_params[sgr_idx][2], edges); \
++ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, sgr_wt[0]); \
+ } else { \
+ ALIGN_STK_32(coef, tmp1, 64 * 384,); \
+ ALIGN_STK_32(coef, tmp2, 64 * 384,); \
+- dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
+- w, h, dav1d_sgr_params[sgr_idx][2], edges); \
+- dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
+- w, h, dav1d_sgr_params[sgr_idx][3], edges); \
++ BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
++ w, h, dav1d_sgr_params[sgr_idx][2], edges); \
++ BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
++ w, h, dav1d_sgr_params[sgr_idx][3], edges); \
+ const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \
+- dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
++ BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \
+ } \
+ }
+
+ #if BITDEPTH == 8
+-WIENER_FILTER(sse2)
+-WIENER_FILTER(ssse3)
+-SGR_FILTER(ssse3)
++decl_wiener_filter_fns(sse2);
++decl_wiener_filter_fns(ssse3);
++decl_sgr_filter_fn(ssse3)
+ # if ARCH_X86_64
+-WIENER_FILTER(avx2)
+-SGR_FILTER(avx2)
++decl_wiener_filter_fns(avx2);
++decl_sgr_filter_fn(avx2)
+ # endif
+ #endif
+
+@@ -181,21 +173,21 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+ #if BITDEPTH == 8
+- c->wiener[0] = dav1d_wiener_filter7_sse2;
+- c->wiener[1] = dav1d_wiener_filter5_sse2;
++ c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
++ c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
+ #endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+ #if BITDEPTH == 8
+- c->wiener[0] = dav1d_wiener_filter7_ssse3;
+- c->wiener[1] = dav1d_wiener_filter5_ssse3;
+- c->selfguided = sgr_filter_ssse3;
++ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
++ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
++ c->selfguided = BF(sgr_filter, ssse3);
+ #endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+ #if BITDEPTH == 8 && ARCH_X86_64
+- c->wiener[0] = dav1d_wiener_filter7_avx2;
+- c->wiener[1] = dav1d_wiener_filter5_avx2;
+- c->selfguided = sgr_filter_avx2;
++ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
++ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
++ c->selfguided = BF(sgr_filter, avx2);
+ #endif
+ }
+diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm
+index 5d3ca49..4b77138 100644
+--- a/src/x86/looprestoration_sse.asm
++++ b/src/x86/looprestoration_sse.asm
+@@ -97,8 +97,8 @@ SECTION .text
+ %macro WIENER 0
+ %if ARCH_X86_64
+ DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
+-cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, flt, h, x
++cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, flt, h, x
+ %define base 0
+ mov fltq, fltmp
+ mov edged, r8m
+@@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5
+ %define m11 [stk+96]
+ %define stk_off 112
+ %endif
+-cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
++cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
+ %define base r6-pb_right_ext_mask-21
+ %define stk esp
+ %define dstq leftq
+@@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
+ add lpfq, [rsp+gprsize*1]
+ call .hv_bottom
+ .v1:
+- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ RET
+ .no_top:
+ lea t3, [lpfq+lpf_strideq*4]
+@@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
+ dec hd
+ jnz .main
+ .v3:
+- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ .v2:
+- call mangle(private_prefix %+ _wiener_filter7_ssse3).v
++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ jmp .v1
+ .extend_right:
+ movd m2, [lpfq-4]
+@@ -685,8 +685,8 @@ ALIGN function_align
+ %endif
+
+ %if ARCH_X86_64
+-cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+- lpf_stride, w, edge, flt, h, x
++cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
++ lpf_stride, w, edge, flt, h, x
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+@@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+ %define m11 [stk+80]
+ %define stk_off 96
+ %endif
+-cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
++cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
+ %define stk esp
+ %define leftmp [stk+28]
+ %define m8 [base+pw_m16380]
+@@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
+ dec hd
+ jnz .main
+ .v2:
+- call mangle(private_prefix %+ _wiener_filter5_ssse3).v
++ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ add dstq, dst_strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ movifnidn dstmp, dstq
+ .v1:
+- call mangle(private_prefix %+ _wiener_filter5_ssse3).v
++ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ jmp .end
+ .h:
+ %define stk esp+4
+@@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
+ jnz .h_have_right
+ cmp xd, -17
+ jl .h_have_right
+- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
++ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+ .h_have_right:
+ %macro %%h5 0
+ %if cpuflag(ssse3)
+@@ -991,7 +991,7 @@ ALIGN function_align
+ jnz .hv_have_right
+ cmp xd, -17
+ jl .hv_have_right
+- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
++ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+ .hv_have_right:
+ %%h5
+ mova m2, [t3+xq*2]
+@@ -1161,7 +1161,7 @@ WIENER
+ %endmacro
+
+ %if ARCH_X86_64
+-cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
++cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ mov xlimd, edgem
+ movifnidn xd, xm
+ mov hd, hm
+@@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ add xd, xlimd
+ xor xlimd, 2 ; 2*!have_right
+ %else
+-cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
++cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ %define wq r0m
+ %define xlimd r1m
+ %define hd hmp
+@@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ RET
+
+ %if ARCH_X86_64
+-cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
++cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
+ movifnidn edged, edgem
+ %else
+-cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
++cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
+ %define sumsq_baseq dword [esp+0]
+ %define sum_baseq dword [esp+4]
+ %define ylimd dword [esp+8]
+@@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
+ jl .loop_x
+ RET
+
+-cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
++cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s
+ movifnidn sd, sm
+ sub aq, (384+16-1)*4
+ sub bq, (384+16-1)*2
+@@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
+ RET
+
+ %if ARCH_X86_64
+-cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+- tmp_base, src_base, a_base, b_base, x, y
++cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
++ tmp_base, src_base, a_base, b_base, x, y
+ movifnidn wd, wm
+ mov hd, hm
+ mova m15, [pw_16]
+@@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+ mov b_baseq, bq
+ xor xd, xd
+ %else
+-cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
++cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y
+ %define tmp_baseq [esp+8]
+ %define src_baseq [esp+12]
+ %define a_baseq [esp+16]
+@@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
+ jl .loop_x
+ RET
+
+-cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
++cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt
+ movifnidn hd, hm
+ %if ARCH_X86_32
+ SETUP_PIC r6, 0
+@@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
+ RET
+
+ %if ARCH_X86_64
+-cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
++cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ mov edged, edgem
+ movifnidn wd, wm
+ mov hd, hm
+ mova m10, [pb_0]
+ mova m11, [pb_0_1]
+ %else
+-cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
++cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
+ %define edgeb byte edgem
+ %define wd xd
+ %define wq wd
+@@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
+ RET
+
+ %if ARCH_X86_64
+-cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
++cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+ movifnidn edged, edgem
+ mov ylimd, edged
+ %else
+-cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
++cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
+ %define wm [esp+0]
+ %define hm [esp+4]
+ %define edgem [esp+8]
+@@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
+ jmp .sum_loop_y_noload
+ %endif
+
+-cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
++cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s
+ movifnidn sd, sm
+ sub aq, (384+16-1)*4
+ sub bq, (384+16-1)*2
+@@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
+ RET
+
+ %if ARCH_X86_64
+-cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
++cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \
+ tmp_base, src_base, a_base, b_base, x, y
+ movifnidn wd, wm
+ mov hd, hm
+@@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
+ psrlw m11, m12, 1 ; pw_128
+ pxor m13, m13
+ %else
+-cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
++cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y
+ %define tmp_baseq r0m
+ %define src_baseq r1m
+ %define a_baseq r3m
+@@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
+ RET
+
+ %undef t2
+-cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
++cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt
+ movifnidn wd, wm
+ movd m0, wtm
+ %if ARCH_X86_64
+--
+GitLab
+
diff --git a/0003-wiener_3.patch b/0003-wiener_3.patch
new file mode 100644
index 000000000000..b2852c881396
--- /dev/null
+++ b/0003-wiener_3.patch
@@ -0,0 +1,492 @@
+From 43c61c3f259400cde5facbe7ce50769088b5f5b6 Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Sun, 10 Jan 2021 14:12:10 -0500
+Subject: [PATCH] x86: lr: Add AVX2 implementation of wiener filter for 16 bpc
+
+Relative speed-ups over C code (compared with gcc-9.3.0):
+
+ C AVX2
+wiener_5tap_10bpc: 194892.0 14831.9 13.14x
+wiener_5tap_12bpc: 194295.4 14828.9 13.10x
+wiener_7tap_10bpc: 194391.7 19461.4 9.99x
+wiener_7tap_12bpc: 194136.1 19418.7 10.00x
+---
+ src/x86/looprestoration16_avx2.asm | 466 +++++++++++++++++++++++++++++
+ 1 file changed, 466 insertions(+)
+ create mode 100644 src/x86/looprestoration16_avx2.asm
+
+diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm
+new file mode 100644
+index 0000000..2012860
+--- /dev/null
++++ b/src/x86/looprestoration16_avx2.asm
+@@ -0,0 +1,466 @@
++; Copyright (c) 2017-2021, The rav1e contributors
++; Copyright (c) 2021, Nathan Egge
++; All rights reserved.
++;
++; This source code is subject to the terms of the BSD 2 Clause License and
++; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
++; was not distributed with this source code in the LICENSE file, you can
++; obtain it at www.aomedia.org/license/software. If the Alliance for Open
++; Media Patent License 1.0 was not distributed with this source code in the
++; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
++
++%include "config.asm"
++%include "ext/x86/x86inc.asm"
++
++%if ARCH_X86_64
++
++SECTION_RODATA 32
++
++wiener5_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
++wiener5_shufB: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13
++wiener5_shufC: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1
++wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
++pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
++ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++
++wiener7_shufB: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9
++wiener7_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
++wiener7_shufD: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1
++rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
++rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
++wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
++ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
++
++pq_3: dq (6 - 4) + 1
++pq_5: dq (6 - 2) + 1
++pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4))
++pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2))
++
++pq_11: dq 12 - (6 - 4) + 1
++pq_9: dq 12 - (6 - 2) + 1
++nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8))
++nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8))
++
++pb_wiener5_l: times 2 db 2, 3
++pb_wiener5_r: times 2 db -6, -5
++
++pb_wiener7_l: times 2 db 4, 5
++pb_wiener7_m: times 2 db -4, -3
++pb_wiener7_r: times 2 db -8, -7
++
++SECTION .text
++
++INIT_YMM avx2
++cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax
++ movifnidn wd, wm
++ movifnidn hd, hm
++ movifnidn edgeb, edgem
++ vbroadcasti128 m6, [wiener5_shufA]
++ vpbroadcastd m12, [fq + 2]
++ vbroadcasti128 m7, [wiener5_shufB]
++ vpbroadcastw m13, [fq + 6]
++ vbroadcasti128 m8, [wiener5_shufC]
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastd m9, [pd_65540]
++ movq xm10, [pq_3]
++ cmp bdmaxd, 10
++ je .bits10
++ vpbroadcastd m9, [pd_262160]
++ movq xm10, [pq_5]
++.bits10:
++ pxor m11, m11
++ add wq, wq
++ add srcq, wq
++ add dstq, wq
++ neg wq
++ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x
++.v_loop:
++ mov xq, wq
++ test edgeb, 1 ; LR_HAVE_LEFT
++ jz .h_extend_left
++ test leftq, leftq
++ jz .h_loop
++ movd xm4, [leftq + 4]
++ vpblendd m4, [srcq + xq - 4], 0xfe
++ add leftq, 8
++ jmp .h_main
++.h_extend_left:
++ vbroadcasti128 m5, [srcq + xq]
++ mova m4, [srcq + xq]
++ palignr m4, m5, 12
++ pshufb m4, [wiener5_l_shuf]
++ jmp .h_main
++.h_loop:
++ movu m4, [srcq + xq - 4]
++.h_main:
++ movu m5, [srcq + xq + 4]
++ test edgeb, 2 ; LR_HAVE_RIGHT
++ jnz .h_have_right
++ cmp xd, -18*2
++ jl .h_have_right
++ movd xm2, xd
++ vpbroadcastd m0, [pb_wiener5_l]
++ vpbroadcastd m1, [pb_wiener5_r]
++ vpbroadcastb m2, xm2
++ movu m3, [pb_0to31]
++ psubb m0, m2
++ psubb m1, m2
++ pminub m0, m3
++ pminub m1, m3
++ pshufb m4, m0
++ pshufb m5, m1
++.h_have_right:
++ pshufb m0, m4, m6
++ pshufb m2, m4, m7
++ paddw m0, m2
++ pmaddwd m0, m12
++ pshufb m1, m5, m6
++ pshufb m3, m5, m7
++ paddw m1, m3
++ pmaddwd m1, m12
++ pshufb m4, m8
++ pmaddwd m4, m13
++ pshufb m5, m8
++ pmaddwd m5, m13
++ paddd m0, m4
++ paddd m1, m5
++ paddd m0, m9
++ paddd m1, m9
++ psrad m0, xm10
++ psrad m1, xm10
++ packssdw m0, m1
++ pmaxsw m0, m11
++ mova [dstq + xq], m0
++ add xq, 32
++ jl .h_loop
++ add srcq, ssq
++ add dstq, 384*2
++ dec hd
++ jg .v_loop
++ RET
++
++DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14
++
++INIT_YMM avx2
++cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax
++ movifnidn wd, wm
++ movifnidn hd, hm
++ movifnidn edgeb, edgem
++ pxor m6, m6
++ vpbroadcastd m7, [fq + 2]
++ vpbroadcastd m8, [fq + 6]
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastd m9, [nd_1047552]
++ movq xm10, [pq_11]
++ cmp bdmaxd, 10
++ je .bits10
++ vpbroadcastd m9, [nd_1048320]
++ movq xm10, [pq_9]
++.bits10:
++ vpbroadcastw m11, bdmaxm
++ add wq, wq
++ add midq, wq
++ add dstq, wq
++ neg wq
++ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
++ mov msq, 2*384
++ mov t0, midq
++ lea t1, [t0 + msq]
++ lea t2, [t1 + msq]
++ lea t3, [t2 + msq]
++ lea t4, [t3 + msq]
++ test edgeb, 4 ; LR_HAVE_TOP
++ jnz .have_top
++ mov t0, t2
++ mov t1, t2
++.have_top:
++ test edgeb, 8 ; LR_HAVE_BOTTOM
++ jnz .v_loop
++ cmp hd, 2
++ jg .v_loop
++ cmp hd, 1
++ jne .limit_v
++ mov t3, t2
++.limit_v:
++ mov t4, t3
++.v_loop:
++ mov xq, wq
++.h_loop:
++ mova m1, [t0 + xq]
++ mova m2, [t1 + xq]
++ mova m3, [t2 + xq]
++ mova m4, [t3 + xq]
++ mova m5, [t4 + xq]
++ punpcklwd m0, m1, m2
++ pmaddwd m0, m7
++ punpckhwd m1, m2
++ pmaddwd m1, m7
++ punpcklwd m2, m5, m4
++ pmaddwd m2, m7
++ punpckhwd m5, m4
++ pmaddwd m5, m7
++ paddd m0, m2
++ paddd m1, m5
++ punpcklwd m2, m3, m6
++ pmaddwd m2, m8
++ punpckhwd m3, m6
++ pmaddwd m3, m8
++ paddd m0, m2
++ paddd m1, m3
++ paddd m0, m9
++ paddd m1, m9
++ psrad m0, xm10
++ psrad m1, xm10
++ packusdw m0, m1
++ pminuw m0, m11
++ mova [dstq + xq], m0
++ add xq, 32
++ jl .h_loop
++ add dstq, dsq
++ mov t0, t1
++ mov t1, t2
++ mov t2, t3
++ mov t3, t4
++ add t4, msq
++ test edgeb, 8 ; LR_HAVE_BOTTOM
++ jnz .have_bottom
++ cmp hd, 3
++ jg .have_bottom
++ mov t4, t3
++.have_bottom:
++ dec hd
++ jg .v_loop
++ RET
++
++INIT_YMM avx2
++cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh
++ movifnidn wd, wm
++ movifnidn hd, hm
++ movifnidn edgeb, edgem
++ vpbroadcastd m7, [fq]
++ vpbroadcastd m8, [fq + 4]
++ vbroadcasti128 m10, [rev_w]
++ vbroadcasti128 m11, [wiener5_shufA]
++ vbroadcasti128 m12, [wiener7_shufB]
++ vbroadcasti128 m13, [wiener7_shufC]
++ vbroadcasti128 m14, [wiener7_shufD]
++ vbroadcasti128 m15, [rev_d]
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastd m9, [pd_65540]
++ mov rhq, [pq_3]
++ cmp bdmaxd, 10
++ je .bits10
++ vpbroadcastd m9, [pd_262160]
++ mov rhq, [pq_5]
++.bits10:
++ add wq, wq
++ add srcq, wq
++ add dstq, wq
++ neg wq
++ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh
++.v_loop:
++ mov xq, wq
++ test edgeb, 1 ; LR_HAVE_LEFT
++ jz .h_extend_left
++ test leftq, leftq
++ jz .h_loop
++ movq xm4, [leftq + 2]
++ vpblendw xm4, [srcq + xq - 6], 0xf8
++ vinserti128 m4, [srcq + xq + 10], 1
++ add leftq, 8
++ jmp .h_main
++.h_extend_left:
++ vbroadcasti128 m5, [srcq + xq]
++ mova m4, [srcq + xq]
++ palignr m4, m5, 10
++ pshufb m4, [wiener7_l_shuf]
++ jmp .h_main
++.h_loop:
++ movu m4, [srcq + xq - 6]
++.h_main:
++ movu m5, [srcq + xq + 2]
++ movu m6, [srcq + xq + 6]
++ test edgeb, 2 ; LR_HAVE_RIGHT
++ jnz .h_have_right
++ cmp xd, -19*2
++ jl .h_have_right
++ movd xm3, xd
++ vpbroadcastd m0, [pb_wiener7_l]
++ vpbroadcastd m1, [pb_wiener7_m]
++ vpbroadcastd m2, [pb_wiener7_r]
++ vpbroadcastb m3, xm3
++ psubb m0, m3
++ psubb m1, m3
++ psubb m2, m3
++ movu m3, [pb_0to31]
++ pminub m0, m3
++ pminub m1, m3
++ pminub m2, m3
++ pshufb m4, m0
++ pshufb m5, m1
++ pshufb m6, m2
++ cmp xd, -9*2
++ jne .hack
++ vpbroadcastw xm3, [srcq + xq + 16]
++ vinserti128 m5, xm3, 1
++ jmp .h_have_right
++.hack:
++ cmp xd, -1*2
++ jne .h_have_right
++ vpbroadcastw xm5, [srcq + xq]
++.h_have_right:
++ pshufb m6, m10
++ pshufb m0, m4, m11
++ pshufb m2, m5, m12
++ paddw m0, m2
++ pmaddwd m0, m7
++ pshufb m2, m4, m13
++ pshufb m4, m14
++ paddw m2, m4
++ pmaddwd m2, m8
++ pshufb m1, m6, m11
++ pshufb m5, m11
++ pmaddwd m1, m7
++ pmaddwd m5, m7
++ pshufb m3, m6, m13
++ pshufb m6, m14
++ paddw m3, m6
++ pmaddwd m3, m8
++ paddd m0, m2
++ paddd m1, m3
++ pshufb m1, m15
++ paddd m1, m5
++ movq xm4, rhq
++ pxor m5, m5
++ paddd m0, m9
++ paddd m1, m9
++ psrad m0, xm4
++ psrad m1, xm4
++ packssdw m0, m1
++ pmaxsw m0, m5
++ mova [dstq + xq], m0
++ add xq, 32
++ jl .h_loop
++ add srcq, ssq
++ add dstq, 384*2
++ dec hd
++ jg .v_loop
++ RET
++
++INIT_YMM avx2
++cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax
++ movifnidn wd, wm
++ movifnidn hd, hm
++ movifnidn edgeb, edgem
++ pxor m6, m6
++ vpbroadcastd m7, [fq]
++ vpbroadcastw m8, [fq + 4]
++ vpbroadcastd m9, [fq + 6]
++ popcnt bdmaxd, bdmaxm
++ vpbroadcastd m10, [nd_1047552]
++ movq xm11, [pq_11]
++ cmp bdmaxd, 10
++ je .bits10
++ vpbroadcastd m10, [nd_1048320]
++ movq xm11, [pq_9]
++.bits10:
++ vpbroadcastw m12, bdmaxm
++ add wq, wq
++ add midq, wq
++ add dstq, wq
++ neg wq
++ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
++ mov msq, 2*384
++ mov t0, midq
++ mov t1, t0
++ lea t2, [t1 + msq]
++ lea t3, [t2 + msq]
++ lea t4, [t3 + msq]
++ lea t5, [t4 + msq]
++ lea t6, [t5 + msq]
++ test edgeb, 4 ; LR_HAVE_TOP
++ jnz .have_top
++ mov t0, t3
++ mov t1, t3
++ mov t2, t3
++.have_top:
++ cmp hd, 3
++ jg .v_loop
++ test edgeb, 8 ; LR_HAVE_BOTTOM
++ jz .no_bottom0
++ cmp hd, 1
++ jg .v_loop
++ jmp .h3
++.no_bottom0:
++ cmp hd, 2
++ je .h2
++ jns .h3
++.h1:
++ mov t4, t3
++.h2:
++ mov t5, t4
++.h3:
++ mov t6, t5
++.v_loop:
++ mov xq, wq
++.h_loop:
++ mova m1, [t0 + xq]
++ mova m2, [t1 + xq]
++ mova m3, [t5 + xq]
++ mova m4, [t6 + xq]
++ punpcklwd m0, m1, m2
++ pmaddwd m0, m7
++ punpckhwd m1, m2
++ pmaddwd m1, m7
++ punpcklwd m2, m4, m3
++ pmaddwd m2, m7
++ punpckhwd m4, m3
++ pmaddwd m4, m7
++ paddd m0, m2
++ paddd m1, m4
++ mova m3, [t2 + xq]
++ mova m4, [t4 + xq]
++ punpcklwd m2, m3, m4
++ pmaddwd m2, m8
++ punpckhwd m3, m4
++ pmaddwd m3, m8
++ paddd m0, m2
++ paddd m1, m3
++ mova m3, [t3 + xq]
++ punpcklwd m2, m3, m6
++ pmaddwd m2, m9
++ punpckhwd m3, m6
++ pmaddwd m3, m9
++ paddd m0, m2
++ paddd m1, m3
++ paddd m0, m10
++ paddd m1, m10
++ psrad m0, xm11
++ psrad m1, xm11
++ packusdw m0, m1
++ pminuw m0, m12
++ mova [dstq + xq], m0
++ add xq, 32
++ jl .h_loop
++ add dstq, dsq
++ mov t0, t1
++ mov t1, t2
++ mov t2, t3
++ mov t3, t4
++ mov t4, t5
++ mov t5, t6
++ add t6, msq
++ cmp hd, 4
++ jg .next_row
++ test edgeb, 8 ; LR_HAVE_BOTTOM
++ jz .no_bottom
++ cmp hd, 2
++ jg .next_row
++.no_bottom:
++ mov t6, t5
++.next_row:
++ dec hd
++ jg .v_loop
++ RET
++
++%endif ; ARCH_X86_64
+--
+GitLab
+
diff --git a/0004-wiener_4.patch b/0004-wiener_4.patch
new file mode 100644
index 000000000000..1876e7bd1d25
--- /dev/null
+++ b/0004-wiener_4.patch
@@ -0,0 +1,101 @@
+From 2d59aa7b52713b77243bda12066213fca8447f9d Mon Sep 17 00:00:00 2001
+From: "Nathan E. Egge" <unlord@xiph.org>
+Date: Wed, 13 Jan 2021 14:54:42 -0500
+Subject: [PATCH] Enable AVX2 wiener filter HBD assembly
+
+---
+ src/meson.build | 1 +
+ src/x86/looprestoration_init_tmpl.c | 41 +++++++++++++++++++++++++++--
+ 2 files changed, 40 insertions(+), 2 deletions(-)
+
+diff --git a/src/meson.build b/src/meson.build
+index ca0b406..c5c305d 100644
+--- a/src/meson.build
++++ b/src/meson.build
+@@ -209,7 +209,8 @@ if is_asm_enabled
+
+ if dav1d_bitdepths.contains('16')
+ libdav1d_sources_asm += files(
++ 'x86/looprestoration16_avx2.asm',
+ 'x86/mc16_avx2.asm',
+ )
+ endif
+
+diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c
+index 11ebdd1..dfc9f84 100644
+--- a/src/x86/looprestoration_init_tmpl.c
++++ b/src/x86/looprestoration_init_tmpl.c
+@@ -31,9 +31,41 @@
+ #include "common/intops.h"
+ #include "src/tables.h"
+
++#if BITDEPTH != 8
++#undef decl_wiener_filter_fn
++#define decl_wiener_filter_fn(name, ext) \
++void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \
++ ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \
++ int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
++void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \
++ const int16_t fv[7], int w, int h, \
++ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
++static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
++ const pixel (*const left)[4], \
++ const pixel *lpf, const ptrdiff_t lpf_stride, \
++ const int w, const int h, const int16_t filter[2][8], \
++ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \
++ ALIGN_STK_64(int16_t, mid, 68 * 384,); \
++ BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, filter[0], w, h, \
++ edges HIGHBD_TAIL_SUFFIX); \
++ if (edges & LR_HAVE_TOP) { \
++ BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, filter[0], w, 2, \
++ edges HIGHBD_TAIL_SUFFIX); \
++ } \
++ if (edges & LR_HAVE_BOTTOM) { \
++ BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \
++ lpf_stride, filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \
++ } \
++ BF(name##_v, ext)(dst, dst_stride, mid, filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \
++}
++#define decl_wiener_filter_fns(ext) \
++decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \
++decl_wiener_filter_fn(dav1d_wiener_filter5, ext)
++#else
+ #define decl_wiener_filter_fns(ext) \
+ decl_wiener_filter_fn(BF(dav1d_wiener_filter7, ext)); \
+ decl_wiener_filter_fn(BF(dav1d_wiener_filter5, ext))
++#endif
+
+ #define decl_sgr_filter_fn(ext) \
+ void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
+@@ -163,11 +195,14 @@ decl_wiener_filter_fns(sse2);
+ decl_wiener_filter_fns(ssse3);
+ decl_sgr_filter_fn(ssse3)
+ # if ARCH_X86_64
+-decl_wiener_filter_fns(avx2);
+ decl_sgr_filter_fn(avx2)
+ # endif
+ #endif
+
++#if ARCH_X86_64
++decl_wiener_filter_fns(avx2);
++#endif
++
+ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+@@ -185,9 +220,11 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
+ #endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+-#if BITDEPTH == 8 && ARCH_X86_64
++#if ARCH_X86_64
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
++# if BITDEPTH == 8
+ c->selfguided = BF(sgr_filter, avx2);
++# endif
+ #endif
+ }
+--
+GitLab
+
diff --git a/PKGBUILD b/PKGBUILD
index 24b454943efa..0265211a8852 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -1,7 +1,8 @@
# Maintainer: Ben Grant <ben@190n.org>
+_testvideo=Sparks-5994fps-AV1-10bit-1920x1080-film-grain-synthesis-2013kbps.obu
pkgname=dav1d-git-optimized
-pkgver=r1489.ffd052b
+pkgver=r1556.05d05f9
pkgrel=1
license=('BSD')
pkgdesc='AV1 cross-platform Decoder, focused on speed and correctness -- latest git version compiled with optimizations'
@@ -9,22 +10,45 @@ url='https://code.videolan.org/videolan/dav1d'
arch=('x86_64')
provides=('dav1d' 'libdav1d.so')
conflicts=('dav1d' 'dav1d-git')
-makedepends=('meson' 'ninja' 'git' 'nasm')
-source=('git+https://code.videolan.org/videolan/dav1d.git')
-sha256sums=('SKIP')
-options=(!buildflags)
+makedepends=('meson' 'git' 'nasm')
+source=('git+https://code.videolan.org/videolan/dav1d.git'
+ "http://download.opencontent.netflix.com.s3.amazonaws.com/AV1/Sparks/$_testvideo"
+ '0001-1112.patch'
+ '0002-wiener_2.patch'
+ '0003-wiener_3.patch'
+ '0004-wiener_4.patch')
+sha256sums=('SKIP'
+ 'e56e20de5bfad7ec073d3e53ea6c760d0b11ed143f087b1bc32230e4840fea87'
+ '421c4732d3a3fc85428263f4e4419f7b3bfc7059a29c2b81055a6ebf4345d0eb'
+ '32385f2316886cef326e7887a3de96fdada2ee723b269908794ed770da460626'
+ '1cf8db585f98ef8e63bb3f44f11679cdc554377f58964bebc7ca29aa1639d1ea'
+ '5e46d8d6fcf2d2cdb062368b23af534ecf123321594f9d548a6f14d80d16d981')
pkgver () {
cd dav1d
printf "r%s.%s" "$(git rev-list --count HEAD)" "$(git rev-parse --short HEAD)"
}
+prepare () {
+ cd dav1d
+ # from https://code.videolan.org/videolan/dav1d/-/merge_requests/1112
+ patch -Np1 -i ${srcdir}/0001-1112.patch
+ patch -Np1 -i ${srcdir}/0002-wiener_2.patch
+ patch -Np1 -i ${srcdir}/0003-wiener_3.patch
+ patch -Np1 -i ${srcdir}/0004-wiener_4.patch
+}
+
build () {
- export CFLAGS="-flto -O3 -march=native"
- export CXXFLAGS="-flto -O3 -march=native"
- export LDFLAGS="-flto -O3 -march=native"
cd dav1d
- arch-meson build --optimization=3 -Db_lto=true
+ export CC=gcc
+ arch-meson build \
+ -Denable_tests=false \
+ -Dc_args="-march=native -O3 -fuse-ld=bfd" \
+ -Db_lto=false \
+ -Db_pgo=generate
+ ninja -C build
+ ./build/tools/dav1d -i "$srcdir/$_testvideo" --muxer null --framethreads $(nproc) --tilethreads 4
+ meson configure build -Db_pgo=use
ninja -C build
}