author    | Ben Grant | 2021-01-19 10:31:16 -0800
committer | Ben Grant | 2021-01-19 10:31:16 -0800
commit    | 0ef85b5b789f87392cd0ae19291693c6c718b3c3 (patch)
tree      | e6dd5699565b7a8d939c2465ee90047a699e9ab1
parent    | e5a701db715ae63ab1d888a3358221752a696abc (diff)
download  | aur-0ef85b5b789f87392cd0ae19291693c6c718b3c3.tar.gz
Add patches and PGO support from SnoopCat
-rw-r--r-- | 0001-1112.patch     | 3780
-rw-r--r-- | 0002-wiener_2.patch |  661
-rw-r--r-- | 0003-wiener_3.patch |  492
-rw-r--r-- | 0004-wiener_4.patch |  101
-rw-r--r-- | PKGBUILD            |   42
5 files changed, 5067 insertions, 9 deletions
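
The first bundled patch, expanded below, renames dav1d's 8-bit motion-compensation symbols with an `_8bpc` suffix and moves the C init code onto a `BF()` name-building macro, clearing the namespace for the 16bpc AVX2 code added by the later patches. A minimal standalone sketch of that suffixing pattern follows; the expansion used here is an assumption inferred only from the `_8bpc_avx2`-style symbol names visible in the diff, not dav1d's actual bitdepth headers.

```c
/* Hypothetical sketch of the bitdepth-suffix pattern behind the BF() rename
 * in the mc_init_tmpl.c hunk below.  The real macro lives in dav1d's shared
 * bitdepth headers and may differ in detail. */
#include <stdio.h>

#ifndef BITDEPTH
#define BITDEPTH 8                 /* the template file is compiled once per bitdepth */
#endif

#if BITDEPTH == 8
#define BF(name, suffix) name##_8bpc_##suffix
#else
#define BF(name, suffix) name##_16bpc_##suffix
#endif

/* BF(put_bilin, c) expands to put_bilin_8bpc_c (or put_bilin_16bpc_c) */
static void BF(put_bilin, c)(void) {
    printf("resolved to put_bilin_%dbpc_c\n", BITDEPTH);
}

int main(void) {
    void (*mc)(void) = BF(put_bilin, c);   /* a dispatch table entry would store this pointer */
    mc();
    return 0;
}
```

With this pattern the same template source is built once per `BITDEPTH`, and each build registers its own suffixed symbols in the dispatch table, so the 8bpc and 16bpc assembly can coexist in one library.
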
diff --git a/0001-1112.patch b/0001-1112.patch new file mode 100644 index 000000000000..e5ed26b240a9 --- /dev/null +++ b/0001-1112.patch @@ -0,0 +1,3780 @@ +From efd27b6182c04072e1cc4b80b24aa28e78d6bfea Mon Sep 17 00:00:00 2001 +From: "Nathan E. Egge" <unlord@xiph.org> +Date: Mon, 21 Dec 2020 15:38:02 -0500 +Subject: [PATCH 1/7] Add bpc suffix to mc functions + +--- + src/x86/mc_avx2.asm | 132 +++++++-------- + src/x86/mc_avx512.asm | 60 +++---- + src/x86/mc_init_tmpl.c | 362 ++++++++++++++++++++--------------------- + src/x86/mc_sse.asm | 132 +++++++-------- + 4 files changed, 343 insertions(+), 343 deletions(-) + +diff --git a/src/x86/mc_avx2.asm b/src/x86/mc_avx2.asm +index dda8234f..e77a6474 100644 +--- a/src/x86/mc_avx2.asm ++++ b/src/x86/mc_avx2.asm +@@ -110,7 +110,7 @@ cextern resize_filter + %endmacro + + %macro HV_JMP_TABLE 5-* +- %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3) ++ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 +@@ -176,8 +176,8 @@ cextern resize_filter + %endrep + %endmacro + +-%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put) +-%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep) ++%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put) ++%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep) + + %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +@@ -187,22 +187,22 @@ HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 + HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 + HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 + HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 +-SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128 +-SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 +-BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 +-BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 ++SCALED_JMP_TABLE put_8tap_scaled_8bpc_avx2, 2, 4, 8, 16, 32, 64, 128 ++SCALED_JMP_TABLE prep_8tap_scaled_8bpc_avx2, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE avg_8bpc_avx2, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_avg_8bpc_avx2, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE mask_8bpc_avx2, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_mask_420_8bpc_avx2, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_mask_422_8bpc_avx2, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_mask_444_8bpc_avx2, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE blend_8bpc_avx2, 4, 8, 16, 32 ++BIDIR_JMP_TABLE blend_v_8bpc_avx2, 2, 4, 8, 16, 32 ++BIDIR_JMP_TABLE blend_h_8bpc_avx2, 2, 4, 8, 16, 32, 32, 32 + + SECTION .text + + INIT_XMM avx2 +-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy ++cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy + movifnidn mxyd, r6m ; mx + lea r7, [put_avx2] + tzcnt wd, wm +@@ -769,7 +769,7 @@ INIT_YMM avx2 + %endif + RET + +-cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ++cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea r6, [prep%+SUFFIX] + tzcnt wd, wm +@@ -1439,7 +1439,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + 
%assign FILTER_SHARP (2*15 << 16) | 3*15 + + %macro FN 4 ; fn, type, type_h, type_v +-cglobal %1_%2 ++cglobal %1_%2_8bpc + mov t0d, FILTER_%3 + %ifidn %3, %4 + mov t1d, t0d +@@ -1447,7 +1447,7 @@ cglobal %1_%2 + mov t1d, FILTER_%4 + %endif + %ifnidn %2, regular ; skip the jump in the last filter +- jmp mangle(private_prefix %+ _%1 %+ SUFFIX) ++ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) + %endif + %endmacro + +@@ -1469,7 +1469,7 @@ PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR + PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH + PUT_8TAP_FN regular, REGULAR, REGULAR + +-cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 ++cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 +@@ -2135,7 +2135,7 @@ PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR + PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH + PREP_8TAP_FN regular, REGULAR, REGULAR + +-cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 ++cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 +@@ -2725,26 +2725,26 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + %ifidn %1, put + %assign isprep 0 + %if required_stack_alignment <= STACK_ALIGNMENT +-cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy ++cglobal put_8tap_scaled_8bpc, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +-cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy ++cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %xdefine base_reg r12 + %define rndshift 10 + %else + %assign isprep 1 + %if required_stack_alignment <= STACK_ALIGNMENT +-cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy ++cglobal prep_8tap_scaled_8bpc, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy + %xdefine tmp_stridem r14q + %else +-cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy ++cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy + %define tmp_stridem qword [rsp+120] + %endif + %xdefine base_reg r11 + %define rndshift 6 + %endif +- lea base_reg, [%1_8tap_scaled_avx2] +-%define base base_reg-%1_8tap_scaled_avx2 ++ lea base_reg, [%1_8tap_scaled_8bpc_avx2] ++%define base base_reg-%1_8tap_scaled_8bpc_avx2 + tzcnt wd, wm + vpbroadcastd m8, dxm + %if isprep && UNIX64 +@@ -2807,7 +2807,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy + je .dy1 + cmp dyd, 2048 + je .dy2 +- movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] ++ movzx wd, word [base+%1_8tap_scaled_8bpc_avx2_table+wq*2] + add wq, base_reg + jmp wq + %ifidn %1, put +@@ -3280,7 +3280,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy + pmulhrsw m3, m12 + jmp .vloop + .dy1: +- movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] ++ movzx wd, word [base+%1_8tap_scaled_8bpc_avx2_dy1_table+wq*2] + add wq, base_reg + jmp wq + %ifidn %1, put +@@ -3647,7 +3647,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy + pblendw m3, m4, 0xaa + jmp .dy1_vloop + .dy2: +- movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] ++ movzx wd, word [base+%1_8tap_scaled_8bpc_avx2_dy2_table+wq*2] + add wq, base_reg + jmp wq + %ifidn %1, put +@@ -4026,10 +4026,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, 
h, mx, my, dx, dy + %endmacro + + %macro BILIN_SCALED_FN 1 +-cglobal %1_bilin_scaled ++cglobal %1_bilin_scaled_8bpc + mov t0d, (5*15 << 16) | 5*15 + mov t1d, t0d +- jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX) ++ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) + %endmacro + + %if WIN64 +@@ -4114,11 +4114,11 @@ MC_8TAP_SCALED prep + paddd m%1, m0, m%2 + %endmacro + +-cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts ++cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts + %if WIN64 + sub rsp, 0xa0 + %endif +- call mangle(private_prefix %+ _warp_affine_8x8_avx2).main ++ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main + .loop: + psrad m7, 13 + psrad m0, 13 +@@ -4128,12 +4128,12 @@ cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts + mova [tmpq+tsq*0], xm7 + vextracti128 [tmpq+tsq*2], m7, 1 + dec r4d +- jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end +- call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2 ++ jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end ++ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2 + lea tmpq, [tmpq+tsq*4] + jmp .loop + +-cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ ++cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ + beta, filter, tmp1, delta, my, gamma + %if WIN64 + sub rsp, 0xa0 +@@ -4390,9 +4390,9 @@ ALIGN function_align + add tmp2q, %1*32 + %endmacro + +-cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +-%define base r6-avg %+ SUFFIX %+ _table +- lea r6, [avg %+ SUFFIX %+ _table] ++cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 ++%define base r6-avg_8bpc %+ SUFFIX %+ _table ++ lea r6, [avg_8bpc %+ SUFFIX %+ _table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r6+wq*4] +@@ -4420,9 +4420,9 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 + + %define W_AVG_INC_PTR AVG_INC_PTR + +-cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +-%define base r6-w_avg %+ SUFFIX %+ _table +- lea r6, [w_avg %+ SUFFIX %+ _table] ++cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 ++%define base r6-w_avg_8bpc %+ SUFFIX %+ _table ++ lea r6, [w_avg_8bpc %+ SUFFIX %+ _table] + tzcnt wd, wm + movifnidn hd, hm + vpbroadcastw m4, r6m ; weight +@@ -4470,9 +4470,9 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 + add tmp1q, %1*32 + %endmacro + +-cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 +-%define base r7-mask %+ SUFFIX %+ _table +- lea r7, [mask %+ SUFFIX %+ _table] ++cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-mask_8bpc %+ SUFFIX %+ _table ++ lea r7, [mask_8bpc %+ SUFFIX %+ _table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp +@@ -4513,9 +4513,9 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 + packuswb m%1, m1 + %endmacro + +-cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask +-%define base r6-blend_avx2_table +- lea r6, [blend_avx2_table] ++cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask ++%define base r6-blend_8bpc_avx2_table ++ lea r6, [blend_8bpc_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movifnidn maskq, maskmp +@@ -4630,15 +4630,15 @@ ALIGN function_align + jg .w32 + RET + +-cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask +-%define base r5-blend_v_avx2_table +- lea r5, [blend_v_avx2_table] ++cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask ++%define base r5-blend_v_8bpc_avx2_table ++ lea r5, [blend_v_8bpc_avx2_table] + tzcnt wd, wm + 
movifnidn hd, hm + movsxd wq, dword [r5+wq*4] + vpbroadcastd m5, [base+pw_512] + add wq, r5 +- add maskq, obmc_masks-blend_v_avx2_table ++ add maskq, obmc_masks-blend_v_8bpc_avx2_table + jmp wq + .w2: + vpbroadcastd xm2, [maskq+2*2] +@@ -4741,9 +4741,9 @@ ALIGN function_align + jg .w32_loop + RET + +-cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask +-%define base r5-blend_h_avx2_table +- lea r5, [blend_h_avx2_table] ++cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask ++%define base r5-blend_h_8bpc_avx2_table ++ lea r5, [blend_h_8bpc_avx2_table] + mov r6d, wd + tzcnt wd, wd + mov hd, hm +@@ -4867,7 +4867,7 @@ ALIGN function_align + jl .w32_loop0 + RET + +-cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ ++cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ + bottomext, rightext + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes +@@ -5054,7 +5054,7 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ + .end: + RET + +-cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \ ++cglobal resize_8bpc, 6, 14, 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 + sub dword mx0m, 4<<14 + sub dword src_wm, 8 +@@ -5192,9 +5192,9 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \ + jg .loop_y + RET + +-cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 +-%define base r7-w_mask_420_avx2_table +- lea r7, [w_mask_420_avx2_table] ++cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-w_mask_420_8bpc_avx2_table ++ lea r7, [w_mask_420_8bpc_avx2_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm +@@ -5398,9 +5398,9 @@ cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 + jg .w128_loop + RET + +-cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +-%define base r7-w_mask_422_avx2_table +- lea r7, [w_mask_422_avx2_table] ++cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-w_mask_422_8bpc_avx2_table ++ lea r7, [w_mask_422_8bpc_avx2_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm +@@ -5571,9 +5571,9 @@ cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 + jg .w128_loop + RET + +-cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 +-%define base r7-w_mask_444_avx2_table +- lea r7, [w_mask_444_avx2_table] ++cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-w_mask_444_8bpc_avx2_table ++ lea r7, [w_mask_444_8bpc_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp +diff --git a/src/x86/mc_avx512.asm b/src/x86/mc_avx512.asm +index a13c2423..72525f86 100644 +--- a/src/x86/mc_avx512.asm ++++ b/src/x86/mc_avx512.asm +@@ -146,7 +146,7 @@ cextern mc_subpel_filters + %endmacro + + %macro HV_JMP_TABLE 5-* +- %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3) ++ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 +@@ -188,19 +188,19 @@ cextern mc_subpel_filters + %endrep + %endmacro + +-%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep) ++%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep) + + %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + + BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 
32, 64, 128 + HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 + HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE avg_avx512icl, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE w_avg_avx512icl, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE w_mask_422_avx512icl, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE w_mask_444_avx512icl, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE avg_8bpc_avx512icl, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_avg_8bpc_avx512icl, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE mask_8bpc_avx512icl, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_mask_420_8bpc_avx512icl, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_mask_422_8bpc_avx512icl, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_mask_444_8bpc_avx512icl, 4, 8, 16, 32, 64, 128 + + SECTION .text + +@@ -221,7 +221,7 @@ INIT_ZMM cpuname + DECLARE_REG_TMP 3, 5, 6 + + INIT_ZMM avx512icl +-cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ++cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea t2, [prep_avx512icl] + tzcnt wd, wm +@@ -772,7 +772,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + %assign FILTER_SHARP (2*15 << 16) | 3*15 + + %macro FN 4 ; fn, type, type_h, type_v +-cglobal %1_%2 ++cglobal %1_%2_8bpc + mov t0d, FILTER_%3 + %ifidn %3, %4 + mov t1d, t0d +@@ -780,7 +780,7 @@ cglobal %1_%2 + mov t1d, FILTER_%4 + %endif + %ifnidn %2, regular ; skip the jump in the last filter +- jmp mangle(private_prefix %+ _%1 %+ SUFFIX) ++ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) + %endif + %endmacro + +@@ -829,7 +829,7 @@ PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR + PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH + PREP_8TAP_FN regular, REGULAR, REGULAR + +-cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 ++cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 +@@ -1753,9 +1753,9 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + add tmp2q, %1*mmsize + %endmacro + +-cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +-%define base r6-avg_avx512icl_table +- lea r6, [avg_avx512icl_table] ++cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 ++%define base r6-avg_8bpc_avx512icl_table ++ lea r6, [avg_8bpc_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r6+wq*4] +@@ -1783,9 +1783,9 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 + + %define W_AVG_INC_PTR AVG_INC_PTR + +-cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +-%define base r6-w_avg_avx512icl_table +- lea r6, [w_avg_avx512icl_table] ++cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 ++%define base r6-w_avg_8bpc_avx512icl_table ++ lea r6, [w_avg_8bpc_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + vpbroadcastw m4, r6m ; weight +@@ -1837,9 +1837,9 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 + add tmp1q, %1*64 + %endmacro + +-cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 +-%define base r7-mask_avx512icl_table +- lea r7, [mask_avx512icl_table] ++cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-mask_8bpc_avx512icl_table ++ lea r7, [mask_8bpc_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp +@@ -1877,9 +1877,9 @@ cglobal 
mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 + packuswb m%1, m1 + %endmacro + +-cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +-%define base r7-w_mask_420_avx512icl_table +- lea r7, [w_mask_420_avx512icl_table] ++cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-w_mask_420_8bpc_avx512icl_table ++ lea r7, [w_mask_420_8bpc_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm +@@ -2070,9 +2070,9 @@ cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 + jg .w128_loop + RET + +-cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 +-%define base r7-w_mask_422_avx512icl_table +- lea r7, [w_mask_422_avx512icl_table] ++cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-w_mask_422_8bpc_avx512icl_table ++ lea r7, [w_mask_422_8bpc_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm +@@ -2243,9 +2243,9 @@ cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 + jg .w128_loop + RET + +-cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 +-%define base r7-w_mask_444_avx512icl_table +- lea r7, [w_mask_444_avx512icl_table] ++cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-w_mask_444_8bpc_avx512icl_table ++ lea r7, [w_mask_444_8bpc_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r7+wq*4] +diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c +index 47f0104a..468069c5 100644 +--- a/src/x86/mc_init_tmpl.c ++++ b/src/x86/mc_init_tmpl.c +@@ -28,157 +28,157 @@ + #include "src/cpu.h" + #include "src/mc.h" + +-decl_mc_fn(dav1d_put_8tap_regular_avx2); +-decl_mc_fn(dav1d_put_8tap_regular_ssse3); +-decl_mc_fn(dav1d_put_8tap_regular_smooth_avx2); +-decl_mc_fn(dav1d_put_8tap_regular_smooth_ssse3); +-decl_mc_fn(dav1d_put_8tap_regular_sharp_avx2); +-decl_mc_fn(dav1d_put_8tap_regular_sharp_ssse3); +-decl_mc_fn(dav1d_put_8tap_smooth_avx2); +-decl_mc_fn(dav1d_put_8tap_smooth_ssse3); +-decl_mc_fn(dav1d_put_8tap_smooth_regular_avx2); +-decl_mc_fn(dav1d_put_8tap_smooth_regular_ssse3); +-decl_mc_fn(dav1d_put_8tap_smooth_sharp_avx2); +-decl_mc_fn(dav1d_put_8tap_smooth_sharp_ssse3); +-decl_mc_fn(dav1d_put_8tap_sharp_avx2); +-decl_mc_fn(dav1d_put_8tap_sharp_ssse3); +-decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2); +-decl_mc_fn(dav1d_put_8tap_sharp_regular_ssse3); +-decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2); +-decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3); +-decl_mc_fn(dav1d_put_bilin_avx2); +-decl_mc_fn(dav1d_put_bilin_ssse3); +- +-decl_mct_fn(dav1d_prep_8tap_regular_avx512icl); +-decl_mct_fn(dav1d_prep_8tap_regular_avx2); +-decl_mct_fn(dav1d_prep_8tap_regular_ssse3); +-decl_mct_fn(dav1d_prep_8tap_regular_sse2); +-decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl); +-decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2); +-decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3); +-decl_mct_fn(dav1d_prep_8tap_regular_smooth_sse2); +-decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl); +-decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2); +-decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3); +-decl_mct_fn(dav1d_prep_8tap_regular_sharp_sse2); +-decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl); +-decl_mct_fn(dav1d_prep_8tap_smooth_avx2); +-decl_mct_fn(dav1d_prep_8tap_smooth_ssse3); +-decl_mct_fn(dav1d_prep_8tap_smooth_sse2); +-decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl); 
+-decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2); +-decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3); +-decl_mct_fn(dav1d_prep_8tap_smooth_regular_sse2); +-decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl); +-decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2); +-decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3); +-decl_mct_fn(dav1d_prep_8tap_smooth_sharp_sse2); +-decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl); +-decl_mct_fn(dav1d_prep_8tap_sharp_avx2); +-decl_mct_fn(dav1d_prep_8tap_sharp_ssse3); +-decl_mct_fn(dav1d_prep_8tap_sharp_sse2); +-decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl); +-decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2); +-decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3); +-decl_mct_fn(dav1d_prep_8tap_sharp_regular_sse2); +-decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl); +-decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2); +-decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3); +-decl_mct_fn(dav1d_prep_8tap_sharp_smooth_sse2); +-decl_mct_fn(dav1d_prep_bilin_avx512icl); +-decl_mct_fn(dav1d_prep_bilin_avx2); +-decl_mct_fn(dav1d_prep_bilin_ssse3); +-decl_mct_fn(dav1d_prep_bilin_sse2); +- +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_ssse3); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_ssse3); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_ssse3); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_ssse3); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_ssse3); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_ssse3); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_ssse3); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_ssse3); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2); +-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_ssse3); +-decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2); +-decl_mc_scaled_fn(dav1d_put_bilin_scaled_ssse3); +- +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_ssse3); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_ssse3); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_ssse3); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_ssse3); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_ssse3); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_ssse3); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_ssse3); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_ssse3); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2); +-decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_ssse3); +-decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2); 
+-decl_mct_scaled_fn(dav1d_prep_bilin_scaled_ssse3); +- +-decl_avg_fn(dav1d_avg_avx512icl); +-decl_avg_fn(dav1d_avg_avx2); +-decl_avg_fn(dav1d_avg_ssse3); +-decl_w_avg_fn(dav1d_w_avg_avx512icl); +-decl_w_avg_fn(dav1d_w_avg_avx2); +-decl_w_avg_fn(dav1d_w_avg_ssse3); +-decl_mask_fn(dav1d_mask_avx512icl); +-decl_mask_fn(dav1d_mask_avx2); +-decl_mask_fn(dav1d_mask_ssse3); +-decl_w_mask_fn(dav1d_w_mask_420_avx512icl); +-decl_w_mask_fn(dav1d_w_mask_420_avx2); +-decl_w_mask_fn(dav1d_w_mask_420_ssse3); +-decl_w_mask_fn(dav1d_w_mask_422_avx512icl); +-decl_w_mask_fn(dav1d_w_mask_422_avx2); +-decl_w_mask_fn(dav1d_w_mask_444_avx512icl); +-decl_w_mask_fn(dav1d_w_mask_444_avx2); +-decl_blend_fn(dav1d_blend_avx2); +-decl_blend_fn(dav1d_blend_ssse3); +-decl_blend_dir_fn(dav1d_blend_v_avx2); +-decl_blend_dir_fn(dav1d_blend_v_ssse3); +-decl_blend_dir_fn(dav1d_blend_h_avx2); +-decl_blend_dir_fn(dav1d_blend_h_ssse3); +- +-decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2); +-decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4); +-decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3); +-decl_warp8x8_fn(dav1d_warp_affine_8x8_sse2); +-decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2); +-decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4); +-decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3); +-decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2); +- +-decl_emu_edge_fn(dav1d_emu_edge_avx2); +-decl_emu_edge_fn(dav1d_emu_edge_ssse3); +- +-decl_resize_fn(dav1d_resize_avx2); +-decl_resize_fn(dav1d_resize_ssse3); ++decl_mc_fn(BF(dav1d_put_8tap_regular, avx2)); ++decl_mc_fn(BF(dav1d_put_8tap_regular, ssse3)); ++decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, avx2)); ++decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, ssse3)); ++decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, avx2)); ++decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, ssse3)); ++decl_mc_fn(BF(dav1d_put_8tap_smooth, avx2)); ++decl_mc_fn(BF(dav1d_put_8tap_smooth, ssse3)); ++decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, avx2)); ++decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, ssse3)); ++decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, avx2)); ++decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, ssse3)); ++decl_mc_fn(BF(dav1d_put_8tap_sharp, avx2)); ++decl_mc_fn(BF(dav1d_put_8tap_sharp, ssse3)); ++decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, avx2)); ++decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, ssse3)); ++decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, avx2)); ++decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, ssse3)); ++decl_mc_fn(BF(dav1d_put_bilin, avx2)); ++decl_mc_fn(BF(dav1d_put_bilin, ssse3)); ++ ++decl_mct_fn(BF(dav1d_prep_8tap_regular, avx512icl)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular, avx2)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular, ssse3)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular, sse2)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, avx512icl)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, avx2)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, ssse3)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, sse2)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, avx512icl)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, avx2)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, ssse3)); ++decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, sse2)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth, avx512icl)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth, avx2)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth, ssse3)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth, sse2)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, avx512icl)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, avx2)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, ssse3)); 
++decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, sse2)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, avx512icl)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, avx2)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, ssse3)); ++decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, sse2)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp, avx512icl)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp, avx2)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp, ssse3)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp, sse2)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, avx512icl)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, avx2)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, ssse3)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, sse2)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, avx512icl)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, avx2)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, ssse3)); ++decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, sse2)); ++decl_mct_fn(BF(dav1d_prep_bilin, avx512icl)); ++decl_mct_fn(BF(dav1d_prep_bilin, avx2)); ++decl_mct_fn(BF(dav1d_prep_bilin, ssse3)); ++decl_mct_fn(BF(dav1d_prep_bilin, sse2)); ++ ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular, avx2)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular, ssse3)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular_smooth, avx2)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular_smooth, ssse3)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular_sharp, avx2)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_regular_sharp, ssse3)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth, avx2)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth, ssse3)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth_regular, avx2)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth_regular, ssse3)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth_sharp, avx2)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_smooth_sharp, ssse3)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp, avx2)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp, ssse3)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp_regular, avx2)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp_regular, ssse3)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp_smooth, avx2)); ++decl_mc_scaled_fn(BF(dav1d_put_8tap_scaled_sharp_smooth, ssse3)); ++decl_mc_scaled_fn(BF(dav1d_put_bilin_scaled, avx2)); ++decl_mc_scaled_fn(BF(dav1d_put_bilin_scaled, ssse3)); ++ ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular, avx2)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular, ssse3)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular_smooth, avx2)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular_smooth, ssse3)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular_sharp, avx2)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_regular_sharp, ssse3)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth, avx2)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth, ssse3)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth_regular, avx2)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth_regular, ssse3)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth_sharp, avx2)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_smooth_sharp, ssse3)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp, avx2)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp, ssse3)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp_regular, avx2)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp_regular, ssse3)); 
++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp_smooth, avx2)); ++decl_mct_scaled_fn(BF(dav1d_prep_8tap_scaled_sharp_smooth, ssse3)); ++decl_mct_scaled_fn(BF(dav1d_prep_bilin_scaled, avx2)); ++decl_mct_scaled_fn(BF(dav1d_prep_bilin_scaled, ssse3)); ++ ++decl_avg_fn(BF(dav1d_avg, avx512icl)); ++decl_avg_fn(BF(dav1d_avg, avx2)); ++decl_avg_fn(BF(dav1d_avg, ssse3)); ++decl_w_avg_fn(BF(dav1d_w_avg, avx512icl)); ++decl_w_avg_fn(BF(dav1d_w_avg, avx2)); ++decl_w_avg_fn(BF(dav1d_w_avg, ssse3)); ++decl_mask_fn(BF(dav1d_mask, avx512icl)); ++decl_mask_fn(BF(dav1d_mask, avx2)); ++decl_mask_fn(BF(dav1d_mask, ssse3)); ++decl_w_mask_fn(BF(dav1d_w_mask_420, avx512icl)); ++decl_w_mask_fn(BF(dav1d_w_mask_420, avx2)); ++decl_w_mask_fn(BF(dav1d_w_mask_420, ssse3)); ++decl_w_mask_fn(BF(dav1d_w_mask_422, avx512icl)); ++decl_w_mask_fn(BF(dav1d_w_mask_422, avx2)); ++decl_w_mask_fn(BF(dav1d_w_mask_444, avx512icl)); ++decl_w_mask_fn(BF(dav1d_w_mask_444, avx2)); ++decl_blend_fn(BF(dav1d_blend, avx2)); ++decl_blend_fn(BF(dav1d_blend, ssse3)); ++decl_blend_dir_fn(BF(dav1d_blend_v, avx2)); ++decl_blend_dir_fn(BF(dav1d_blend_v, ssse3)); ++decl_blend_dir_fn(BF(dav1d_blend_h, avx2)); ++decl_blend_dir_fn(BF(dav1d_blend_h, ssse3)); ++ ++decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, avx2)); ++decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4)); ++decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, ssse3)); ++decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse2)); ++decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, avx2)); ++decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4)); ++decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, ssse3)); ++decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse2)); ++ ++decl_emu_edge_fn(BF(dav1d_emu_edge, avx2)); ++decl_emu_edge_fn(BF(dav1d_emu_edge, ssse3)); ++ ++decl_resize_fn(BF(dav1d_resize, avx2)); ++decl_resize_fn(BF(dav1d_resize, ssse3)); + + COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + #define init_mc_fn(type, name, suffix) \ +- c->mc[type] = dav1d_put_##name##_##suffix ++ c->mc[type] = BF(dav1d_put_##name, suffix) + #define init_mct_fn(type, name, suffix) \ +- c->mct[type] = dav1d_prep_##name##_##suffix ++ c->mct[type] = BF(dav1d_prep_##name, suffix) + #define init_mc_scaled_fn(type, name, suffix) \ +- c->mc_scaled[type] = dav1d_put_##name##_##suffix ++ c->mc_scaled[type] = BF(dav1d_put_##name, suffix) + #define init_mct_scaled_fn(type, name, suffix) \ +- c->mct_scaled[type] = dav1d_prep_##name##_##suffix ++ c->mct_scaled[type] = BF(dav1d_prep_##name, suffix) + + const unsigned flags = dav1d_get_cpu_flags(); + +@@ -197,8 +197,8 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2); + +- c->warp8x8 = dav1d_warp_affine_8x8_sse2; +- c->warp8x8t = dav1d_warp_affine_8x8t_sse2; ++ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2); ++ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2); + #endif + + if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) +@@ -251,27 +251,27 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); + #endif + +- c->avg = dav1d_avg_ssse3; +- c->w_avg = dav1d_w_avg_ssse3; +- c->mask = dav1d_mask_ssse3; +- c->w_mask[2] = dav1d_w_mask_420_ssse3; +- c->blend = dav1d_blend_ssse3; +- c->blend_v = dav1d_blend_v_ssse3; +- c->blend_h = dav1d_blend_h_ssse3; ++ c->avg = BF(dav1d_avg, ssse3); ++ c->w_avg = BF(dav1d_w_avg, ssse3); ++ c->mask = BF(dav1d_mask, ssse3); ++ c->w_mask[2] = 
BF(dav1d_w_mask_420, ssse3); ++ c->blend = BF(dav1d_blend, ssse3); ++ c->blend_v = BF(dav1d_blend_v, ssse3); ++ c->blend_h = BF(dav1d_blend_h, ssse3); + +- c->warp8x8 = dav1d_warp_affine_8x8_ssse3; +- c->warp8x8t = dav1d_warp_affine_8x8t_ssse3; ++ c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3); ++ c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3); + +- c->emu_edge = dav1d_emu_edge_ssse3; +- c->resize = dav1d_resize_ssse3; ++ c->emu_edge = BF(dav1d_emu_edge, ssse3); ++ c->resize = BF(dav1d_resize, ssse3); + #endif + + if(!(flags & DAV1D_X86_CPU_FLAG_SSE41)) + return; + + #if BITDEPTH == 8 +- c->warp8x8 = dav1d_warp_affine_8x8_sse4; +- c->warp8x8t = dav1d_warp_affine_8x8t_sse4; ++ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4); ++ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4); + #endif + + #if ARCH_X86_64 +@@ -323,21 +323,21 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + +- c->avg = dav1d_avg_avx2; +- c->w_avg = dav1d_w_avg_avx2; +- c->mask = dav1d_mask_avx2; +- c->w_mask[0] = dav1d_w_mask_444_avx2; +- c->w_mask[1] = dav1d_w_mask_422_avx2; +- c->w_mask[2] = dav1d_w_mask_420_avx2; +- c->blend = dav1d_blend_avx2; +- c->blend_v = dav1d_blend_v_avx2; +- c->blend_h = dav1d_blend_h_avx2; +- +- c->warp8x8 = dav1d_warp_affine_8x8_avx2; +- c->warp8x8t = dav1d_warp_affine_8x8t_avx2; +- +- c->emu_edge = dav1d_emu_edge_avx2; +- c->resize = dav1d_resize_avx2; ++ c->avg = BF(dav1d_avg, avx2); ++ c->w_avg = BF(dav1d_w_avg, avx2); ++ c->mask = BF(dav1d_mask, avx2); ++ c->w_mask[0] = BF(dav1d_w_mask_444, avx2); ++ c->w_mask[1] = BF(dav1d_w_mask_422, avx2); ++ c->w_mask[2] = BF(dav1d_w_mask_420, avx2); ++ c->blend = BF(dav1d_blend, avx2); ++ c->blend_v = BF(dav1d_blend_v, avx2); ++ c->blend_h = BF(dav1d_blend_h, avx2); ++ ++ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2); ++ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2); ++ ++ c->emu_edge = BF(dav1d_emu_edge, avx2); ++ c->resize = BF(dav1d_resize, avx2); + #endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) +@@ -355,12 +355,12 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl); + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl); + +- c->avg = dav1d_avg_avx512icl; +- c->w_avg = dav1d_w_avg_avx512icl; +- c->mask = dav1d_mask_avx512icl; +- c->w_mask[0] = dav1d_w_mask_444_avx512icl; +- c->w_mask[1] = dav1d_w_mask_422_avx512icl; +- c->w_mask[2] = dav1d_w_mask_420_avx512icl; ++ c->avg = BF(dav1d_avg, avx512icl); ++ c->w_avg = BF(dav1d_w_avg, avx512icl); ++ c->mask = BF(dav1d_mask, avx512icl); ++ c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl); ++ c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl); ++ c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl); + #endif + #endif + } +diff --git a/src/x86/mc_sse.asm b/src/x86/mc_sse.asm +index edbd1865..8b38daa7 100644 +--- a/src/x86/mc_sse.asm ++++ b/src/x86/mc_sse.asm +@@ -113,13 +113,13 @@ cextern mc_subpel_filters + %endrep + %endmacro + +-BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128 +-BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16 +-BIDIR_JMP_TABLE blend_ssse3, 4, 8, 16, 32 +-BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32 +-BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16 ++BIDIR_JMP_TABLE avg_8bpc_ssse3, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_avg_8bpc_ssse3, 4, 8, 16, 32, 
64, 128 ++BIDIR_JMP_TABLE mask_8bpc_ssse3, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_mask_420_8bpc_ssse3, 4, 8, 16, 16, 16, 16 ++BIDIR_JMP_TABLE blend_8bpc_ssse3, 4, 8, 16, 32 ++BIDIR_JMP_TABLE blend_v_8bpc_ssse3, 2, 4, 8, 16, 32 ++BIDIR_JMP_TABLE blend_h_8bpc_ssse3, 2, 4, 8, 16, 16, 16, 16 + + %macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) +@@ -131,15 +131,15 @@ BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16 + %endrep + %endmacro + +-%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_sse2.prep) +-%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put) +-%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep) ++%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep) ++%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put) ++%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep) + + BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 + BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 + + %macro HV_JMP_TABLE 5-* +- %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3) ++ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 +@@ -202,8 +202,8 @@ HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128 + %endmacro + + %if ARCH_X86_64 +-SCALED_JMP_TABLE put_8tap_scaled_ssse3, 2, 4, 8, 16, 32, 64, 128 +-SCALED_JMP_TABLE prep_8tap_scaled_ssse3, 4, 8, 16, 32, 64, 128 ++SCALED_JMP_TABLE put_8tap_scaled_8bpc_ssse3, 2, 4, 8, 16, 32, 64, 128 ++SCALED_JMP_TABLE prep_8tap_scaled_8bpc_ssse3, 4, 8, 16, 32, 64, 128 + %endif + + %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX +@@ -228,7 +228,7 @@ INIT_XMM ssse3 + %endif + %endmacro + +-cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy ++cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy + movifnidn mxyd, r6m ; mx + LEA t0, put_ssse3 + movifnidn srcq, srcmp +@@ -853,7 +853,7 @@ cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy + %define base 0 + %endif + +-cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ++cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + LEA r6, prep%+SUFFIX + tzcnt wd, wm +@@ -1450,7 +1450,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + %assign FILTER_SHARP (2*15 << 16) | 3*15 + + %macro FN 4 ; prefix, type, type_h, type_v +-cglobal %1_%2 ++cglobal %1_%2_8bpc + mov t0d, FILTER_%3 + %ifidn %3, %4 + mov t1d, t0d +@@ -1458,7 +1458,7 @@ cglobal %1_%2 + mov t1d, FILTER_%4 + %endif + %ifnidn %2, regular ; skip the jump in the last filter +- jmp mangle(private_prefix %+ _%1 %+ SUFFIX) ++ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) + %endif + %endmacro + +@@ -1488,7 +1488,7 @@ FN put_8tap, regular, REGULAR, REGULAR + %define base 0 + %endif + +-cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 ++cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 + %assign org_stack_offset stack_offset + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h +@@ -2739,7 +2739,7 @@ FN prep_8tap, regular, REGULAR, REGULAR + %define base_reg r7 + %define base 0 + %endif +-cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 ++cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 + %assign org_stack_offset stack_offset + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h +@@ -3920,26 +3920,26 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 + %ifidn %1, 
put + %assign isprep 0 + %if required_stack_alignment <= STACK_ALIGNMENT +-cglobal put_8tap_scaled, 4, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy ++cglobal put_8tap_scaled_8bpc, 4, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +-cglobal put_8tap_scaled, 4, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy ++cglobal put_8tap_scaled_8bpc, 4, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %xdefine base_reg r12 + %define rndshift 10 + %else + %assign isprep 1 + %if required_stack_alignment <= STACK_ALIGNMENT +-cglobal prep_8tap_scaled, 4, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy ++cglobal prep_8tap_scaled_8bpc, 4, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %xdefine tmp_stridem r14q + %else +-cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy ++cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %define tmp_stridem qword [rsp+0x138] + %endif + %xdefine base_reg r11 + %define rndshift 6 + %endif +- LEA base_reg, %1_8tap_scaled_ssse3 +-%define base base_reg-%1_8tap_scaled_ssse3 ++ LEA base_reg, %1_8tap_scaled_8bpc_ssse3 ++%define base base_reg-%1_8tap_scaled_8bpc_ssse3 + tzcnt wd, wm + movd m8, dxm + movd m14, mxm +@@ -4001,7 +4001,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + je .dy1 + cmp dyd, 2048 + je .dy2 +- movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] ++ movzx wd, word [base+%1_8tap_scaled_8bpc_ssse3_table+wq*2] + add wq, base_reg + jmp wq + %ifidn %1, put +@@ -4557,7 +4557,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + mova [rsp+0x80], m4 + jmp .vloop + .dy1: +- movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] ++ movzx wd, word [base+%1_8tap_scaled_8bpc_ssse3_dy1_table+wq*2] + add wq, base_reg + jmp wq + %ifidn %1, put +@@ -5049,7 +5049,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + mova [rsp+0x80], m7 + jmp .dy1_vloop + .dy2: +- movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] ++ movzx wd, word [base+%1_8tap_scaled_8bpc_ssse3_dy2_table+wq*2] + add wq, base_reg + jmp wq + %ifidn %1, put +@@ -5522,10 +5522,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %endmacro + + %macro BILIN_SCALED_FN 1 +-cglobal %1_bilin_scaled ++cglobal %1_bilin_scaled_8bpc + mov t0d, (5*15 << 16) | 5*15 + mov t1d, (5*15 << 16) | 5*15 +- jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX) ++ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) + %endmacro + + %if ARCH_X86_64 +@@ -5719,15 +5719,15 @@ MC_8TAP_SCALED prep + + %macro WARP_AFFINE_8X8T 0 + %if ARCH_X86_64 +-cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts ++cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts + %else +-cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts ++cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts + %if copy_args + %define tmpm [esp+stack_size-4*1] + %define tsm [esp+stack_size-4*2] + %endif + %endif +- call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main ++ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main + .loop: + %if ARCH_X86_32 + %define m12 m4 +@@ -5768,24 +5768,24 @@ cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts + mova [tmpq+tsq*0], m12 + mova [tmpq+tsq*2], m14 + dec counterd +- jz mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end ++ jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end + %if ARCH_X86_32 + mov 
tmpm, tmpd + mov r0, [esp+0x100] + mov r1, [esp+0x104] + %endif +- call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2 ++ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2 + lea tmpq, [tmpq+tsq*4] + jmp .loop + %endmacro + + %macro WARP_AFFINE_8X8 0 + %if ARCH_X86_64 +-cglobal warp_affine_8x8, 6, 14, 16, 0x90, \ ++cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \ + dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ + filter, tmp1, delta, my, gamma + %else +-cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \ ++cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \ + dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ + filter, tmp1, delta, my, gamma + %define alphaq r0 +@@ -6375,12 +6375,12 @@ DECLARE_REG_TMP 6, 7 + add tmp2q, %1*mmsize + %endmacro + +-cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +- LEA r6, avg_ssse3_table ++cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 ++ LEA r6, avg_8bpc_ssse3_table + tzcnt wd, wm ; leading zeros + movifnidn hd, hm ; move h(stack) to h(register) if not already that register + movsxd wq, dword [r6+wq*4] ; push table entry matching the tile width (tzcnt) in widen reg +- mova m2, [pw_1024+r6-avg_ssse3_table] ; fill m2 with shift/align ++ mova m2, [pw_1024+r6-avg_8bpc_ssse3_table] ; fill m2 with shift/align + add wq, r6 + BIDIR_FN AVG + +@@ -6406,14 +6406,14 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 + + %define W_AVG_INC_PTR AVG_INC_PTR + +-cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +- LEA r6, w_avg_ssse3_table ++cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 ++ LEA r6, w_avg_8bpc_ssse3_table + tzcnt wd, wm + movd m4, r6m + movifnidn hd, hm + pxor m0, m0 + movsxd wq, dword [r6+wq*4] +- mova m5, [pw_2048+r6-w_avg_ssse3_table] ++ mova m5, [pw_2048+r6-w_avg_8bpc_ssse3_table] + pshufb m4, m0 + psllw m4, 12 ; (weight-16) << 12 when interpreted as signed + add wq, r6 +@@ -6460,14 +6460,14 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 + %endmacro + + %if ARCH_X86_64 +-cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3 + movifnidn hd, hm + %else +-cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3 ++cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3 + %define hd dword r5m + %endif +-%define base r6-mask_ssse3_table +- LEA r6, mask_ssse3_table ++%define base r6-mask_8bpc_ssse3_table ++ LEA r6, mask_8bpc_ssse3_table + tzcnt wd, wm + movsxd wq, dword [r6+wq*4] + pxor m4, m4 +@@ -6514,13 +6514,13 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3 + W_MASK_420_B (%1*16), %2 + %endmacro + +-%define base r6-w_mask_420_ssse3_table ++%define base r6-w_mask_420_8bpc_ssse3_table + %if ARCH_X86_64 + %define reg_pw_6903 m8 + %define reg_pw_2048 m9 + ; args: dst, stride, tmp1, tmp2, w, h, mask, sign +-cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask +- lea r6, [w_mask_420_ssse3_table] ++cglobal w_mask_420_8bpc, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask ++ lea r6, [w_mask_420_8bpc_ssse3_table] + mov wd, wm + tzcnt r7d, wd + movd m0, r7m ; sign +@@ -6540,9 +6540,9 @@ cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask + %else + %define reg_pw_6903 [base+pw_6903] + %define reg_pw_2048 m3 +-cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask ++cglobal w_mask_420_8bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask + tzcnt wd, wm +- LEA r6, 
w_mask_420_ssse3_table ++ LEA r6, w_mask_420_8bpc_ssse3_table + movd m0, r7m ; sign + mov maskq, r6mp + mov wd, [r6+wq*4] +@@ -6656,9 +6656,9 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask + BLEND_64M %1, %2, m2, m3 + %endmacro + +-cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask +-%define base r6-blend_ssse3_table +- LEA r6, blend_ssse3_table ++cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask ++%define base r6-blend_8bpc_ssse3_table ++ LEA r6, blend_8bpc_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movifnidn maskq, maskmp +@@ -6732,15 +6732,15 @@ cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask + jg .w32 + RET + +-cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask +-%define base r5-blend_v_ssse3_table +- LEA r5, blend_v_ssse3_table ++cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask ++%define base r5-blend_v_8bpc_ssse3_table ++ LEA r5, blend_v_8bpc_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r5+wq*4] + mova m5, [base+pw_512] + add wq, r5 +- add maskq, obmc_masks-blend_v_ssse3_table ++ add maskq, obmc_masks-blend_v_8bpc_ssse3_table + jmp wq + .w2: + movd m3, [maskq+4] +@@ -6840,8 +6840,8 @@ cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask + %endif + RET + +-cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask +-%define base t0-blend_h_ssse3_table ++cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask ++%define base t0-blend_h_8bpc_ssse3_table + %if ARCH_X86_32 + ; We need to keep the PIC pointer for w4, reload wd from stack instead + DECLARE_REG_TMP 6 +@@ -6849,7 +6849,7 @@ cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask + DECLARE_REG_TMP 5 + mov r6d, wd + %endif +- LEA t0, blend_h_ssse3_table ++ LEA t0, blend_h_8bpc_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, dword [t0+wq*4] +@@ -6954,7 +6954,7 @@ cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask + ; bw, bh total filled size + ; iw, ih, copied block -> fill bottom, right + ; x, y, offset in bw/bh -> fill top, left +-cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \ ++cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \ + y, dst, dstride, src, sstride, \ + bottomext, rightext, blk + ; we assume that the buffer (stride) is larger than width, so we can +@@ -7317,13 +7317,13 @@ cextern resize_filter + %endmacro + + %if ARCH_X86_64 +-cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \ ++cglobal resize_8bpc, 0, 14, 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 + %elif STACK_ALIGNMENT >= 16 +-cglobal resize, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \ ++cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 + %else +-cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \ ++cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 + %endif + movifnidn dstq, dstmp +-- +GitLab + + +From da299eb148a2a799411132166d32613d15586578 Mon Sep 17 00:00:00 2001 +From: "Nathan E. 
Egge" <unlord@xiph.org> +Date: Mon, 21 Dec 2020 00:38:05 -0500 +Subject: [PATCH 2/7] x86: mc: Add AVX2 implementation of 8tap put/prep for + 16bpc + +Relative speed-ups over C code (compared with gcc-9.3.0): + + C AVX2 +mc_8tap_regular_w2_0_16bpc: 146.2 17.4 8.40x +mc_8tap_regular_w4_0_16bpc: 117.1 17.2 6.81x +mc_8tap_regular_w8_0_16bpc: 128.9 17.2 7.49x +mc_8tap_regular_w16_0_16bpc: 148.3 24.6 6.03x +mc_8tap_regular_w32_0_16bpc: 233.3 55.4 4.21x +mc_8tap_regular_w64_0_16bpc: 571.2 319.2 1.79x +mc_8tap_regular_w128_0_16bpc: 1027.1 961.1 1.07x +mc_8tap_regular_w2_h_16bpc: 378.0 36.9 10.24x +mc_8tap_regular_w4_h_16bpc: 660.2 49.0 13.47x +mc_8tap_regular_w8_h_16bpc: 1215.7 112.9 10.77x +mc_8tap_regular_w16_h_16bpc: 2989.1 295.8 10.11x +mc_8tap_regular_w32_h_16bpc: 8724.6 939.4 9.29x +mc_8tap_regular_w64_h_16bpc: 29957.4 3296.8 9.09x +mc_8tap_regular_w128_h_16bpc: 83043.1 9318.0 8.91x +mc_8tap_regular_w2_hv_16bpc: 829.3 142.3 5.83x +mc_8tap_regular_w4_hv_16bpc: 1508.8 168.5 8.95x +mc_8tap_regular_w8_hv_16bpc: 2163.8 232.1 9.32x +mc_8tap_regular_w16_hv_16bpc: 3710.8 595.0 6.24x +mc_8tap_regular_w32_hv_16bpc: 10317.3 1814.8 5.69x +mc_8tap_regular_w64_hv_16bpc: 33509.6 6120.2 5.48x +mc_8tap_regular_w128_hv_16bpc: 91086.4 17263.0 5.28x +mc_8tap_regular_w2_v_16bpc: 523.6 69.9 7.49x +mc_8tap_regular_w4_v_16bpc: 769.4 68.8 11.18x +mc_8tap_regular_w8_v_16bpc: 1292.3 92.7 13.94x +mc_8tap_regular_w16_v_16bpc: 3078.1 242.5 12.69x +mc_8tap_regular_w32_v_16bpc: 8706.3 731.1 11.91x +mc_8tap_regular_w64_v_16bpc: 28948.7 2593.4 11.16x +mc_8tap_regular_w128_v_16bpc: 79731.9 7838.7 10.17x + +mct_8tap_regular_w4_0_16bpc: 106.6 15.7 6.79x +mct_8tap_regular_w8_0_16bpc: 120.1 24.7 4.86x +mct_8tap_regular_w16_0_16bpc: 276.4 43.0 6.43x +mct_8tap_regular_w32_0_16bpc: 940.5 171.7 5.48x +mct_8tap_regular_w64_0_16bpc: 2238.6 485.7 4.61x +mct_8tap_regular_w128_0_16bpc: 5529.9 1113.5 4.97x +mct_8tap_regular_w4_h_16bpc: 394.6 36.2 10.90x +mct_8tap_regular_w8_h_16bpc: 1121.1 125.4 8.94x +mct_8tap_regular_w16_h_16bpc: 3706.9 383.0 9.68x +mct_8tap_regular_w32_h_16bpc: 13628.9 1554.6 8.77x +mct_8tap_regular_w64_h_16bpc: 31807.7 3727.5 8.53x +mct_8tap_regular_w128_h_16bpc: 77388.9 9688.3 7.99x +mct_8tap_regular_w4_hv_16bpc: 1099.5 118.1 9.31x +mct_8tap_regular_w8_hv_16bpc: 2280.3 242.0 9.42x +mct_8tap_regular_w16_hv_16bpc: 4510.8 718.0 6.28x +mct_8tap_regular_w32_hv_16bpc: 15620.4 2853.5 5.47x +mct_8tap_regular_w64_hv_16bpc: 35833.0 6572.0 5.45x +mct_8tap_regular_w128_hv_16bpc: 85563.2 16550.1 5.17x +mct_8tap_regular_w4_v_16bpc: 578.3 47.9 12.07x +mct_8tap_regular_w8_v_16bpc: 1470.1 99.0 14.85x +mct_8tap_regular_w16_v_16bpc: 4165.9 296.8 14.04x +mct_8tap_regular_w32_v_16bpc: 14660.0 1142.4 12.83x +mct_8tap_regular_w64_v_16bpc: 33287.1 2787.2 11.94x +mct_8tap_regular_w128_v_16bpc: 79527.0 7323.4 10.86x +--- + src/x86/mc16_avx2.asm | 1553 +++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 1553 insertions(+) + create mode 100644 src/x86/mc16_avx2.asm + +diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm +new file mode 100644 +index 00000000..ea6cfdbf +--- /dev/null ++++ b/src/x86/mc16_avx2.asm +@@ -0,0 +1,1553 @@ ++; Copyright (c) 2017-2020, The rav1e contributors ++; Copyright (c) 2020, Nathan Egge ++; All rights reserved. ++; ++; This source code is subject to the terms of the BSD 2 Clause License and ++; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ++; was not distributed with this source code in the LICENSE file, you can ++; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open ++; Media Patent License 1.0 was not distributed with this source code in the ++; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ++ ++%include "config.asm" ++%include "ext/x86/x86inc.asm" ++ ++%if ARCH_X86_64 ++ ++SECTION_RODATA 32 ++ ++spf_h_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 ++ db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 ++pq_2: dq (6 - 4) ++pq_4: dq (6 - 2) ++pq_8: dq (6 + 2) ++pq_10: dq (6 + 4) ++pd_32: dd (1 << 6 >> 1) ++pd_34: dd (1 << 6 >> 1) + (1 << (6 - 4) >> 1) ++pd_40: dd (1 << 6 >> 1) + (1 << (6 - 2) >> 1) ++pd_2: dd (1 << (6 - 4) >> 1) ++pd_512: dd (1 << (6 + 4) >> 1) ++pd_8: dd (1 << (6 - 2) >> 1) ++pd_128: dd (1 << (6 + 2) >> 1) ++nd_524256: dd (1 << 6 >> 1) - (8192 << 6) ++nd_32766: dd (1 << (6 - 4) >> 1) - (8192 << (6 - 4)) ++nd_131064: dd (1 << (6 - 2) >> 1) - (8192 << (6 - 2)) ++pw_8192: dw 8192 ++ ++SECTION .text ++ ++%macro PUT_4TAP_H 6 ++ pshufb %1, %3 ++ pshufb %2, %3 ++ pmaddwd %1, %4 ++ pmaddwd %2, %4 ++ phaddd %1, %2 ++ paddd %1, %5 ++ psrad %1, %6 ++%endm ++ ++%macro PUT_8TAP_H 8 ++ movu xm%1, [srcq + %8 + 0] ++ movu xm%3, [srcq + %8 + 2] ++ vinserti128 m%1, [srcq + ssq + %8 + 0], 1 ++ vinserti128 m%3, [srcq + ssq + %8 + 2], 1 ++ movu xm%2, [srcq + %8 + 4] ++ movu xm%4, [srcq + %8 + 6] ++ vinserti128 m%2, [srcq + ssq + %8 + 4], 1 ++ vinserti128 m%4, [srcq + ssq + %8 + 6], 1 ++ pmaddwd m%1, %5 ++ pmaddwd m%3, %5 ++ pmaddwd m%2, %5 ++ pmaddwd m%4, %5 ++ phaddd m%1, m%3 ++ phaddd m%2, m%4 ++ phaddd m%1, m%2 ++ paddd m%1, %6 ++ psrad m%1, %7 ++%endm ++ ++%macro PUT_4TAP_HS1 5 ++ pshufb %1, %2 ++ pmaddwd %1, %3 ++ phaddd %1, %1 ++ paddd %1, %4 ++ psrad %1, %5 ++ packssdw %1, %1 ++%endm ++ ++%macro PUT_4TAP_HS2 6 ++ pshufb %1, %3 ++ pshufb %2, %3 ++ pmaddwd %1, %4 ++ pmaddwd %2, %4 ++ phaddd %1, %1 ++ phaddd %2, %2 ++ paddd %1, %5 ++ paddd %2, %5 ++ psrad %1, %6 ++ psrad %2, %6 ++ packssdw %1, %1 ++ packssdw %2, %2 ++%endm ++ ++%macro PUT_8TAP_HS 7-8 ++ movu xm%1, [srcq + %7 + 0] ++ movu xm%3, [srcq + %7 + 2] ++ vinserti128 m%1, [srcq + %7 + 8], 1 ++ vinserti128 m%3, [srcq + %7 + 10], 1 ++ pmaddwd m%1, %4 ++ pmaddwd m%3, %4 ++ phaddd m%1, m%3 ++ movu xm%2, [srcq + %7 + 4] ++ movu xm%3, [srcq + %7 + 6] ++ vinserti128 m%2, [srcq + %7 + 12], 1 ++ vinserti128 m%3, [srcq + %7 + 14], 1 ++ pmaddwd m%2, %4 ++ pmaddwd m%3, %4 ++ phaddd m%2, m%3 ++%if %0 > 7 ++ vpbroadcastd %5, %8 ++%endif ++ phaddd m%1, m%2 ++ paddd m%1, %5 ++ psrad m%1, %6 ++ packssdw m%1, m%1 ++%endm ++ ++%macro LOAD_REGS_2 3 ++ mov%1 xm%2, [srcq + ssq*0] ++ mov%1 xm%3, [srcq + ssq*1] ++%ifidn %1, u ++ vpermq m%2, m%2, q3120 ++ vpermq m%3, m%3, q3120 ++%endif ++ lea srcq, [srcq + ssq*2] ++%endm ++ ++%macro LOAD_REGS_3 4 ++ mov%1 xm%2, [srcq + ssq*0] ++ mov%1 xm%3, [srcq + ssq*1] ++ mov%1 xm%4, [srcq + ssq*2] ++%ifidn %1, u ++ vpermq m%2, m%2, q3120 ++ vpermq m%3, m%3, q3120 ++ vpermq m%4, m%4, q3120 ++%endif ++ add srcq, ss3q ++%endm ++ ++%macro LOAD_REGS 3-8 ++%if %0 == 3 ++ LOAD_REGS_2 %1, %2, %3 ++%elif %0 == 4 ++ LOAD_REGS_3 %1, %2, %3, %4 ++%elif %0 == 5 ++ LOAD_REGS_2 %1, %2, %3 ++ LOAD_REGS_2 %1, %4, %5 ++%elif %0 == 6 ++ LOAD_REGS_3 %1, %2, %3, %4 ++ LOAD_RESG_2 %1, %5, %6 ++%elif %0 == 7 ++ LOAD_REGS_3 %1, %2, %3, %4 ++ LOAD_REGS_3 %1, %5, %6, %7 ++%else ++ LOAD_REGS_3 %1, %2, %3, %4 ++ LOAD_REGS_2 %1, %5, %6 ++ LOAD_REGS_2 %1, %7, %8 ++%endif ++%endm ++ ++%macro STORE_REGS 3 ++%ifidn %1, u ++ vpermq m%2, m%2, q3120 ++ vpermq m%3, m%3, q3120 ++%endif ++ mov%1 [dstq + dsq*0], xm%2 ++ mov%1 [dstq + dsq*1], xm%3 ++ lea dstq, 
[dstq + dsq*2] ++%endm ++ ++%macro INTERLEAVE_REGS 4-8 ++ punpckl%1 %2, %3 ++ punpckl%1 %3, %4 ++%if %0 > 4 ++ punpckl%1 %4, %5 ++ punpckl%1 %5, %6 ++%endif ++%if %0 > 6 ++ punpckl%1 %6, %7 ++ punpckl%1 %7, %8 ++%endif ++%endm ++ ++%macro MUL_ADD_R 8 ++ pmaddwd %3, %7 ++ pmaddwd %1, %5, %8 ++ paddd %1, %3 ++ mova %3, %5 ++ pmaddwd %4, %7 ++ pmaddwd %2, %6, %8 ++ paddd %2, %4 ++ mova %4, %6 ++%endm ++ ++%macro MUL_ACC_R 7 ++ pmaddwd %3, %5, %7 ++ pmaddwd %4, %6, %7 ++ paddd %1, %3 ++ paddd %2, %4 ++ mova %3, %5 ++ mova %4, %6 ++%endm ++ ++%macro RND_SHR_MIN_R 5 ++ paddd %1, %3 ++ paddd %2, %3 ++ psrad %1, %4 ++ psrad %2, %4 ++ packusdw %1, %1 ++ packusdw %2, %2 ++ pminuw %1, %5 ++ pminuw %2, %5 ++%endm ++ ++%macro RND_SHR_R 4 ++ paddd %1, %3 ++ paddd %2, %3 ++ psrad %1, %4 ++ psrad %2, %4 ++ packssdw %1, %1 ++ packssdw %2, %2 ++%endm ++ ++; int8_t subpel_filters[5][15][8] ++%assign FILTER_REGULAR (0*15 << 7) | 3*15 ++%assign FILTER_SMOOTH (1*15 << 7) | 4*15 ++%assign FILTER_SHARP (2*15 << 7) | 3*15 ++ ++%macro make_8tap_fn 4 ; type, op, type_h, type_v ++INIT_XMM avx2 ++cglobal %1_8tap_%2_16bpc ++ mov t0d, FILTER_%3 ++ mov t1d, FILTER_%4 ++ jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) ++%endmacro ++ ++cextern mc_subpel_filters ++ ++%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) ++ ++%macro filter_fn 1 ++ ++%if WIN64 ++%ifidn %1, put ++DECLARE_REG_TMP 5, 4 ++%else ++DECLARE_REG_TMP 4, 5 ++%endif ++%else ++DECLARE_REG_TMP 7, 8 ++%endif ++ ++make_8tap_fn %1, regular, REGULAR, REGULAR ++make_8tap_fn %1, regular_smooth, REGULAR, SMOOTH ++make_8tap_fn %1, regular_sharp, REGULAR, SHARP ++make_8tap_fn %1, smooth, SMOOTH, SMOOTH ++make_8tap_fn %1, smooth_regular, SMOOTH, REGULAR ++make_8tap_fn %1, smooth_sharp, SMOOTH, SHARP ++make_8tap_fn %1, sharp, SHARP, SHARP ++make_8tap_fn %1, sharp_regular, SHARP, REGULAR ++make_8tap_fn %1, sharp_smooth, SHARP, SMOOTH ++ ++INIT_YMM avx2 ++%ifidn %1, put ++cglobal put_8tap_16bpc, 4, 10, 16, dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 ++%else ++cglobal prep_8tap_16bpc, 3, 10, 16, dst, src, ss, _w, h, mx, my, bdmax, ds, ss3 ++%endif ++ ++%ifidn %1, put ++ imul mxd, mxm, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0) ++ add mxd, t0d ++ imul myd, mym, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0) ++ add myd, t1d ++%else ++ imul myd, mym, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0) ++ add myd, t1d ++ imul mxd, mxm, 0x4081 ; (1 << 14) | (1 << 7) | (1 << 0) ++ add mxd, t0d ++%endif ++ ++ movsxd _wq, _wm ++ movifnidn hd, hm ++ ++%ifidn %1, put ++ vpbroadcastw m7, bdmaxm ++%else ++ lea dsq, [_wq*2] ++%endif ++ ++ test mxd, (0x7f << 14) ++ jnz .%1_8tap_h_16bpc ++ test myd, (0x7f << 14) ++ jnz .%1_8tap_v_16bpc ++ ++; ---- {put,prep}_16bpc ---- ++ ++INIT_XMM avx2 ++.%1_16bpc: ; cglobal put_16bpc, 6, 8, 8, dst, ds, src, ss, w, h ++ ++%ifidn %1, prep ++INIT_YMM avx2 ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastq m8, [pq_4] ++ vpbroadcastw m9, [pw_8192] ++ cmp bdmaxd, 12 ++ jne .prep_bits10 ++ vpbroadcastq m8, [pq_2] ++.prep_bits10: ++INIT_XMM avx2 ++%endif ++ ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, jr, ss3 ++%else ++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, jr, ds, ss3 ++%endif ++ ++ lea jrq, [.jmp_tbl] ++ tzcnt _wd, _wm ++%ifidn %1, put ++ sub _wd, 1 ++%else ++ sub _wd, 2 ++%endif ++ movsxd _wq, [jrq + _wq*4] ++ add _wq, jrq ++ jmp _wq ++ ++%ifidn %1, put ++.w2: ; 2xN ++ movd m0, [srcq] ++ movd m1, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++ movd [dstq], m0 ++ movd [dstq + dsq], m1 ++ lea dstq, [dstq + dsq*2] ++ sub hd, 2 ++ jg .w2 ++ 
RET ++%endif ++ ++.w4: ; 4xN ++ movq m0, [srcq] ++ movq m1, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++%ifidn %1, prep ++ psllw m0, m8 ++ psllw m1, m8 ++ psubw m0, m9 ++ psubw m1, m9 ++%endif ++ movq [dstq], m0 ++ movq [dstq + dsq], m1 ++ lea dstq, [dstq + dsq*2] ++ sub hd, 2 ++ jg .w4 ++ RET ++ ++ ; XXX is unaligned input (but aligned output) a hard requirement, or is checkasm broken? ++.w8: ; 8xN ++ movu m0, [srcq] ++ movu m1, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++%ifidn %1, prep ++ psllw m0, m8 ++ psllw m1, m8 ++ psubw m0, m9 ++ psubw m1, m9 ++%endif ++ mova [dstq], m0 ++ mova [dstq + dsq], m1 ++ lea dstq, [dstq + dsq*2] ++ sub hd, 2 ++ jg .w8 ++ RET ++ ++INIT_YMM avx2 ++.w16: ; 16xN ++ movu m0, [srcq] ++ movu m1, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++%ifidn %1, prep ++ psllw m0, xm8 ++ psllw m1, xm8 ++ psubw m0, m9 ++ psubw m1, m9 ++%endif ++ mova [dstq], m0 ++ mova [dstq + dsq], m1 ++ lea dstq, [dstq + dsq*2] ++ sub hd, 2 ++ jg .w16 ++ RET ++ ++.w32: ; 32xN ++ movu m0, [srcq + 32*0] ++ movu m1, [srcq + 32*1] ++ movu m2, [srcq + ssq] ++ movu m3, [srcq + ssq + 32*1] ++ lea srcq, [srcq + ssq*2] ++%ifidn %1, prep ++ psllw m0, xm8 ++ psllw m1, xm8 ++ psllw m2, xm8 ++ psllw m3, xm8 ++ psubw m0, m9 ++ psubw m1, m9 ++ psubw m2, m9 ++ psubw m3, m9 ++%endif ++ mova [dstq + 32*0], m0 ++ mova [dstq + 32*1], m1 ++ mova [dstq + dsq + 32*0], m2 ++ mova [dstq + dsq + 32*1], m3 ++ lea dstq, [dstq + dsq*2] ++ sub hd, 2 ++ jg .w32 ++ RET ++ ++.w64: ; 64xN ++ movu m0, [srcq + 32*0] ++ movu m1, [srcq + 32*1] ++ movu m2, [srcq + 32*2] ++ movu m3, [srcq + 32*3] ++ movu m4, [srcq + ssq + 32*0] ++ movu m5, [srcq + ssq + 32*1] ++ movu m6, [srcq + ssq + 32*2] ++ movu m7, [srcq + ssq + 32*3] ++ lea srcq, [srcq + ssq*2] ++%ifidn %1, prep ++ psllw m0, xm8 ++ psllw m1, xm8 ++ psllw m2, xm8 ++ psllw m3, xm8 ++ psllw m4, xm8 ++ psllw m5, xm8 ++ psllw m6, xm8 ++ psllw m7, xm8 ++ psubw m0, m9 ++ psubw m1, m9 ++ psubw m2, m9 ++ psubw m3, m9 ++ psubw m4, m9 ++ psubw m5, m9 ++ psubw m6, m9 ++ psubw m7, m9 ++%endif ++ mova [dstq + 32*0], m0 ++ mova [dstq + 32*1], m1 ++ mova [dstq + 32*2], m2 ++ mova [dstq + 32*3], m3 ++ mova [dstq + dsq + 32*0], m4 ++ mova [dstq + dsq + 32*1], m5 ++ mova [dstq + dsq + 32*2], m6 ++ mova [dstq + dsq + 32*3], m7 ++ lea dstq, [dstq + dsq*2] ++ sub hd, 2 ++ jg .w64 ++ RET ++ ++.w128: ; 128xN ++ movu m0, [srcq + 32*0] ++ movu m1, [srcq + 32*1] ++ movu m2, [srcq + 32*2] ++ movu m3, [srcq + 32*3] ++ movu m4, [srcq + 32*4] ++ movu m5, [srcq + 32*5] ++ movu m6, [srcq + 32*6] ++ movu m7, [srcq + 32*7] ++ add srcq, ssq ++%ifidn %1, prep ++ psllw m0, xm8 ++ psllw m1, xm8 ++ psllw m2, xm8 ++ psllw m3, xm8 ++ psllw m4, xm8 ++ psllw m5, xm8 ++ psllw m6, xm8 ++ psllw m7, xm8 ++ psubw m0, m9 ++ psubw m1, m9 ++ psubw m2, m9 ++ psubw m3, m9 ++ psubw m4, m9 ++ psubw m5, m9 ++ psubw m6, m9 ++ psubw m7, m9 ++%endif ++ mova [dstq + 32*0], m0 ++ mova [dstq + 32*1], m1 ++ mova [dstq + 32*2], m2 ++ mova [dstq + 32*3], m3 ++ mova [dstq + 32*4], m4 ++ mova [dstq + 32*5], m5 ++ mova [dstq + 32*6], m6 ++ mova [dstq + 32*7], m7 ++ add dstq, dsq ++ dec hd ++ jg .w128 ++ RET ++ ++.jmp_tbl: ++%ifidn %1, put ++ dd .w2 - .jmp_tbl ++%endif ++ dd .w4 - .jmp_tbl ++ dd .w8 - .jmp_tbl ++ dd .w16 - .jmp_tbl ++ dd .w32 - .jmp_tbl ++ dd .w64 - .jmp_tbl ++ dd .w128 - .jmp_tbl ++ ++; ---- {put,prep}_8tap_h_16bpc ---- ++ ++INIT_XMM avx2 ++.%1_8tap_h_16bpc: ; cglobal put_8tap_h_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, bdmax ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 ++%else ++ DEFINE_ARGS 
dst, src, ss, _w, h, mx, my, bdmax, ds, ss3 ++%endif ++ ++ cmp _wd, 4 ++ jle .h_use4tap ++ shr mxd, 7 ++.h_use4tap: ++ and mxd, 0x7f ++ ++ test myd, (0x7f << 14) ++ jnz .%1_8tap_hv_16bpc ++ ++INIT_YMM avx2 ++ popcnt bdmaxd, bdmaxm ++%ifidn %1, put ++ vpbroadcastd m6, [pd_34] ; (1 << 6 >> 1) + (1 << (6 - 4) >> 1) ++%else ++ vpbroadcastd m6, [nd_32766] ; (1 << (6 - 4) >> 1) - (8192 << (6 - 4)) ++ vpbroadcastq m7, [pq_2] ; (6 - 4) ++%endif ++ cmp bdmaxd, 12 ++ jne .h_bits10 ++%ifidn %1, put ++ vpbroadcastd m6, [pd_40] ; (1 << 6 >> 1) + (1 << (6 - 2) >> 1) ++%else ++ vpbroadcastd m6, [nd_131064] ; (1 << (6 - 2) >> 1) - (8192 << (6 - 2)) ++ vpbroadcastq m7, [pq_4] ; (6 - 2) ++%endif ++.h_bits10: ++INIT_XMM avx2 ++ ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, w2, jr, ss3 ++%else ++ DEFINE_ARGS dst, src, ss, _w, h, mx, w2, jr, ds, ss3 ++%endif ++ ++ lea w2q, [_wq*2] ++ ++ lea jrq, [.h_jmp_tbl] ++ tzcnt _wd, _wm ++%ifidn %1, put ++ sub _wd, 1 ++%else ++ sub _wd, 2 ++%endif ++ movsxd _wq, [jrq + _wq*4] ++ add _wq, jrq ++ jmp _wq ++ ++%ifidn %1, put ++.h_w2: ++ sub srcq, 2 ++ mova xm4, [spf_h_shuf] ++ vpbroadcastd m5, [jrq - .h_jmp_tbl + subpel_filters + mxq*8 + 2] ++ vpmovsxbw m5, m5 ++ ++.h_w2l: ++ movu m0, [srcq] ++ movu m1, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++ ++%ifidn %1, put ++ PUT_4TAP_H m0, m1, m4, m5, m6, 6 ++ packusdw m0, m0 ++ pminuw m0, m7 ++%else ++ PUT_4TAP_H m0, m1, m4, m5, m6, m7 ++ packssdw m0, m1 ++%endif ++ ++ movd [dstq], m0 ++ pextrd [dstq + dsq], m0, 1 ++ lea dstq, [dstq + dsq*2] ++ ++ sub hd, 2 ++ jg .h_w2l ++ RET ++%endif ++ ++INIT_YMM avx2 ++.h_w4: ++ sub srcq, 2 ++ mova m4, [spf_h_shuf] ++ vpbroadcastd xm5, [jrq - .h_jmp_tbl + subpel_filters + mxq*8 + 2] ++ vpmovsxbw m5, xm5 ++ ++.h_w4l: ++ vbroadcasti128 m0, [srcq] ++ vbroadcasti128 m1, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++ ++%ifidn %1, put ++ PUT_4TAP_H m0, m1, m4, m5, m6, 6 ++ packusdw m0, m0 ++ pminuw m0, m7 ++%else ++ PUT_4TAP_H m0, m1, m4, m5, m6, xm7 ++ packssdw m0, m0 ++%endif ++ ++ vextracti128 xm1, m0, 1 ++ movd [dstq], xm0 ++ movd [dstq + 4], xm1 ++ pextrd [dstq + dsq], xm0, 1 ++ pextrd [dstq + dsq + 4], xm1, 1 ++ lea dstq, [dstq + dsq*2] ++ ++ sub hd, 2 ++ jg .h_w4l ++ RET ++ ++.h_w8: ++ sub srcq, 6 ++ vpbroadcastq xm5, [jrq - .h_jmp_tbl + subpel_filters + mxq*8] ++ vpmovsxbw m5, xm5 ++ ++.h_w8l: ++ mov _wd, w2d ++ ++.h_w8c: ++%ifidn %1, put ++ PUT_8TAP_H 0, 1, 2, 3, m5, m6, 6, 4*0 ++ PUT_8TAP_H 1, 2, 3, 4, m5, m6, 6, 4*2 ++ packusdw m0, m1 ++ pminuw m0, m7 ++%else ++ PUT_8TAP_H 0, 1, 2, 3, m5, m6, xm7, 4*0 ++ PUT_8TAP_H 1, 2, 3, 4, m5, m6, xm7, 4*2 ++ packssdw m0, m1 ++%endif ++ add srcq, 8*2 ++ ++ mova [dstq], xm0 ++ vextracti128 [dstq + dsq], m0, 1 ++ ++ add dstq, 8*2 ++ sub _wd, 8*2 ++ jg .h_w8c ++ ++ sub srcq, w2q ++ sub dstq, w2q ++ lea srcq, [srcq + ssq*2] ++ lea dstq, [dstq + dsq*2] ++ sub hd, 2 ++ jg .h_w8l ++ RET ++ ++.h_jmp_tbl: ++%ifidn %1, put ++ dd .h_w2 - .h_jmp_tbl ++%endif ++ dd .h_w4 - .h_jmp_tbl ++ dd .h_w8 - .h_jmp_tbl ++ dd .h_w8 - .h_jmp_tbl ++ dd .h_w8 - .h_jmp_tbl ++ dd .h_w8 - .h_jmp_tbl ++ dd .h_w8 - .h_jmp_tbl ++ ++; ---- {put,prep}_8tap_v_16bpc ---- ++ ++INIT_XMM avx2 ++.%1_8tap_v_16bpc: ; cglobal put_8tap_v_16bpc, 4, 9, 0, dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 ++%else ++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, bdmax, ds, ss3 ++%endif ++ ++ cmp hd, 4 ++ jle .v_use4tap ++ shr myd, 7 ++.v_use4tap: ++ and myd, 0x7f ++ ++INIT_YMM avx2 ++%ifidn %1, put ++ vpbroadcastd m6, [pd_32] 
; (1 << 6 >> 1) ++%else ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastd m6, [nd_32766] ; (1 << (6 - 4) >> 1) - (8192 << (6 - 4)) ++ vpbroadcastq m7, [pq_2] ; (6 - 4) ++ cmp bdmaxd, 12 ++ jne .v_bits10 ++ vpbroadcastd m6, [nd_131064] ; (1 << (6 - 2) >> 1) - (8192 << (6 - 2)) ++ vpbroadcastq m7, [pq_4] ; (6 - 2) ++.v_bits10: ++%endif ++INIT_XMM avx2 ++ ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, w2, my, jr, ss3 ++%else ++ DEFINE_ARGS dst, src, ss, _w, h, w2, my, jr, ds, ss3 ++%endif ++ ++ lea jrq, [.v_jmp_tbl] ++ lea w2q, [_wq*2] ++ lea ss3q, [ssq*3] ++ ++INIT_YMM avx2 ++ lea myq, [jrq - .v_jmp_tbl + subpel_filters + myq*8] ++ vpbroadcastw m8, [myq+0] ++ vpbroadcastw m9, [myq+2] ++ vpbroadcastw m10, [myq+4] ++ vpbroadcastw m11, [myq+6] ++ vpmovsxbw m8, xm8 ++ vpmovsxbw m9, xm9 ++ vpmovsxbw m10, xm10 ++ vpmovsxbw m11, xm11 ++INIT_XMM avx2 ++ ++ tzcnt _wd, _wm ++%ifidn %1, put ++ sub _wd, 1 ++%else ++ sub _wd, 2 ++%endif ++ movsxd _wq, [jrq + _wq*4] ++ add _wq, jrq ++ jmp _wq ++ ++%ifidn %1, put ++.v_w2: ++ ++ cmp hd, 4 ++ jg .v_w28 ++ ++ sub srcq, ssq ++ LOAD_REGS d, 0, 1, 2 ++ INTERLEAVE_REGS wd, m0, m1, m2 ++ ++.v_w2l: ; 2x2, 2x4 ++ ++ LOAD_REGS d, 3, 4 ++ INTERLEAVE_REGS wd, m2, m3, m4 ++ ++ MUL_ADD_R m5, m8, m0, m1, m2, m3, m9, m10 ++ mova m2, m4 ++ ++ RND_SHR_MIN_R m5, m8, m6, 6, m7 ++ STORE_REGS d, 5, 8 ++ ++ sub hd, 2 ++ jg .v_w2l ++ RET ++ ++.v_w28: ++ ++ sub srcq, ss3q ++ LOAD_REGS d, 0, 1, 2, 3, 4, 12, 13 ++ INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m12, m13 ++ ++.v_w28l: ; 2x6, 2x8, 2x12, 2x16, 2x24, 2x32 ++ ++ sub srcq, ssq ++ LOAD_REGS d, 13, 14, 15 ++ INTERLEAVE_REGS wd, m13, m14, m15 ++ ++ MUL_ADD_R m5, m15, m0, m1, m2, m3, m8, m9 ++ MUL_ACC_R m5, m15, m2, m3, m4, m12, m10 ++ MUL_ACC_R m5, m15, m4, m12, m13, m14, m11 ++ ++ RND_SHR_MIN_R m5, m15, m6, 6, m7 ++ STORE_REGS d, 5, 15 ++ ++ sub hd, 2 ++ jg .v_w28l ++ RET ++%endif ++ ++.v_w4: ++ ++ cmp hd, 4 ++ jg .v_w48 ++ ++ sub srcq, ssq ++ LOAD_REGS q, 0, 1, 2 ++ INTERLEAVE_REGS wd, m0, m1, m2 ++ ++.v_w4l: ; 4x2 4x4 ++ ++ LOAD_REGS q, 3, 4 ++ INTERLEAVE_REGS wd, m2, m3, m4 ++ ++ MUL_ADD_R m5, m8, m0, m1, m2, m3, m9, m10 ++ mova m2, m4 ++ ++%ifidn %1, put ++ RND_SHR_MIN_R m5, m8, m6, 6, m7 ++%else ++ RND_SHR_R m5, m8, m6, m7 ++%endif ++ STORE_REGS q, 5, 8 ++ ++ sub hd, 2 ++ jg .v_w4l ++ RET ++ ++.v_w48: ++ ++ sub srcq, ss3q ++ LOAD_REGS q, 0, 1, 2, 3, 4, 12, 13 ++ INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m12, m13 ++ ++.v_w48l: ; 4x6, 4x8, 4x12, 4x16, 4x24, 4x32 ++ ++ sub srcq, ssq ++ LOAD_REGS q, 13, 14, 15 ++ INTERLEAVE_REGS wd, m13, m14, m15 ++ ++ MUL_ADD_R m5, m15, m0, m1, m2, m3, m8, m9 ++ MUL_ACC_R m5, m15, m2, m3, m4, m12, m10 ++ MUL_ACC_R m5, m15, m4, m12, m13, m14, m11 ++ ++%ifidn %1, put ++ RND_SHR_MIN_R m5, m15, m6, 6, m7 ++%else ++ RND_SHR_R m5, m15, m6, m7 ++%endif ++ STORE_REGS q, 5, 15 ++ ++ sub hd, 2 ++ jg .v_w48l ++ ++ RET ++ ++INIT_YMM avx2 ++.v_w8: ++ ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, oh, h, w2, tdst, tsrc, ss3 ++%elifidn %1, prep ++ DEFINE_ARGS dst, src, ss, oh, h, w2, tdst, tsrc, ds, ss3 ++%endif ++ ++ mov ohd, hd ++ mov tdstq, dstq ++ ++ cmp hd, 4 ++ jg .v_w88 ++ ++ sub srcq, ssq ++ mov tsrcq, srcq ++ ++.v_w8l: ; N = 8, 16, 32, 64, 128 ++ ++ LOAD_REGS u, 0, 1, 2 ++ INTERLEAVE_REGS wd, m0, m1, m2 ++ ++.v_w8c: ; Nx2, Nx4 ++ ++ LOAD_REGS u, 3, 4 ++ INTERLEAVE_REGS wd, m2, m3, m4 ++ ++ MUL_ADD_R m5, m8, m0, m1, m2, m3, m9, m10 ++ mova m2, m4 ++ ++%ifidn %1, put ++ RND_SHR_MIN_R m5, m8, m6, 6, m7 ++%else ++ RND_SHR_R m5, m8, m6, xm7 ++%endif ++ STORE_REGS u, 5, 8 ++ ++ sub hd, 2 ++ jg .v_w8c ++ ++ 
add tdstq, 2*8 ++ add tsrcq, 2*8 ++ mov hd, ohd ++ mov dstq, tdstq ++ mov srcq, tsrcq ++ sub w2d, 2*8 ++ jg .v_w8l ++ ++ RET ++ ++.v_w88: ++ ++ sub srcq, ss3q ++ mov tsrcq, srcq ++ ++.v_w88l: ; N = 8, 16, 32, 64, 128 ++ ++ LOAD_REGS u, 0, 1, 2, 3, 4, 12, 13 ++ INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m12, m13 ++ ++.v_w88c: ; Nx6, Nx8, Nx12, Nx16, Nx24, Nx32 ++ ++ sub srcq, ssq ++ ++ LOAD_REGS u, 13, 14, 15 ++ INTERLEAVE_REGS wd, m13, m14, m15 ++ ++ MUL_ADD_R m5, m15, m0, m1, m2, m3, m8, m9 ++ MUL_ACC_R m5, m15, m2, m3, m4, m12, m10 ++ MUL_ACC_R m5, m15, m4, m12, m13, m14, m11 ++ ++%ifidn %1, put ++ RND_SHR_MIN_R m5, m15, m6, 6, m7 ++%else ++ RND_SHR_R m5, m15, m6, xm7 ++%endif ++ STORE_REGS u, 5, 15 ++ ++ sub hd, 2 ++ jg .v_w88c ++ ++ add tdstq, 2*8 ++ add tsrcq, 2*8 ++ mov hd, ohd ++ mov dstq, tdstq ++ mov srcq, tsrcq ++ sub w2d, 2*8 ++ jg .v_w88l ++ ++ RET ++ ++.v_jmp_tbl: ++%ifidn %1, put ++ dd .v_w2 - .v_jmp_tbl ++%endif ++ dd .v_w4 - .v_jmp_tbl ++ dd .v_w8 - .v_jmp_tbl ++ dd .v_w8 - .v_jmp_tbl ++ dd .v_w8 - .v_jmp_tbl ++ dd .v_w8 - .v_jmp_tbl ++ dd .v_w8 - .v_jmp_tbl ++ ++; ---- {put,prep}_8tap_hv_16bpc ---- ++ ++INIT_XMM avx2 ++.%1_8tap_hv_16bpc: ; cglobal put_8tap_hv_16bpc, 4, 9, 0, dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, bdmax, ss3 ++%elifidn %1, prep ++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, bdmax, ds, ss3 ++%endif ++ ++ cmp hd, 4 ++ jle .hv_use4tap ++ shr myd, 7 ++.hv_use4tap: ++ and myd, 0x7f ++ ++INIT_YMM avx2 ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastd m6, [pd_2] ; (1 << (6 - 4) >> 1) ++ movq xm13, [pq_2] ; 6 - 4 ++%ifidn %1, put ++ vpbroadcastd m14, [pd_512] ; (1 << (6 + 4) >> 1) ++ movq xm15, [pq_10] ; 6 + 4 ++%else ++ vpbroadcastd m14, [nd_524256] ; (1 << 6 >> 1) - (8192 << 6) ++%endif ++ cmp bdmaxd, 12 ++ jne .hv_bits10 ++ vpbroadcastd m6, [pd_8] ; (1 << (6 - 2) >> 1) ++ movq xm13, [pq_4] ; 6 - 2 ++%ifidn %1, put ++ vpbroadcastd m14, [pd_128] ; (1 << (6 + 2) >> 1) ++ movq xm15, [pq_8] ; 6 + 2 ++%endif ++.hv_bits10: ++INIT_XMM avx2 ++ ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, jr, ss3 ++%elifidn %1, prep ++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, jr, ds, ss3 ++%endif ++ ++ lea jrq, [.hv_jmp_tbl] ++ ++INIT_YMM avx2 ++ lea ss3q, [jrq - .hv_jmp_tbl + subpel_filters + myq*8] ++ vpbroadcastw xm8, [ss3q] ++ vpbroadcastw xm9, [ss3q + 2] ++ vpbroadcastw xm10, [ss3q + 4] ++ vpbroadcastw xm11, [ss3q + 6] ++ vpmovsxbw m8, xm8 ++ vpmovsxbw m9, xm9 ++ vpmovsxbw m10, xm10 ++ vpmovsxbw m11, xm11 ++INIT_XMM avx2 ++ ++ ; Width is need for for filters 8 and larger, see .hv_w8 ++ mov ss3q, _wq ++ ++ tzcnt _wd, _wm ++%ifidn %1, put ++ sub _wd, 1 ++%else ++ sub _wd, 2 ++%endif ++ movsxd _wq, [jrq + _wq*4] ++ add _wq, jrq ++ jmp _wq ++ ++%ifidn %1, put ++.hv_w2: ++ cmp hd, 4 ++ jg .hv_w28 ++ ++ lea ss3q, [ssq*3] ++ ++ mova m8, [spf_h_shuf] ++ vpbroadcastd m5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2] ++ vpmovsxbw m5, m5 ++ ++ sub srcq, 2 ++ sub srcq, ssq ++ ++ movu m0, [srcq] ++ movu m1, [srcq + ssq] ++ movu m2, [srcq + ssq*2] ++ add srcq, ss3q ++ ++ PUT_4TAP_HS2 m0, m1, m8, m5, m6, m13 ++ PUT_4TAP_HS1 m2, m8, m5, m6, m13 ++ INTERLEAVE_REGS wd, m0, m1, m2 ++ ++.hv_w2l: ++ ++ movu m3, [srcq] ++ movu m4, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++ ++ PUT_4TAP_HS2 m3, m4, m8, m5, m6, m13 ++ ++ INTERLEAVE_REGS wd, m2, m3, m4 ++ ++ MUL_ADD_R m11, m12, m0, m1, m2, m3, m9, m10 ++ mova m2, m4 ++ ++ RND_SHR_MIN_R m11, m12, m14, m15, m7 ++ STORE_REGS d, 11, 12 ++ ++ sub hd, 2 ++ jg .hv_w2l ++ ++ RET ++ 
++.hv_w28: ++ lea ss3q, [ssq*3] ++ ++ mova m8, [spf_h_shuf] ++ vpbroadcastd m5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2] ++ vpmovsxbw m5, m5 ++ ++ lea myq, [jrq - .hv_jmp_tbl + subpel_filters + myq*8] ++ vpbroadcastd m9, [myq] ++ vpbroadcastd m10, [myq + 4] ++ vpmovsxbw m9, m9 ++ vpmovsxbw m10, m10 ++ ++ sub srcq, 2 ++ sub srcq, ss3q ++ ++ movu m0, [srcq] ++ movu m1, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++ ++ PUT_4TAP_HS2 m0, m1, m8, m5, m6, m13 ++ ++ movu m4, [srcq] ++ movu m3, [srcq + ssq] ++ movu m2, [srcq + ssq*2] ++ add srcq, ss3q ++ ++ PUT_4TAP_HS2 m4, m3, m8, m5, m6, m13 ++ PUT_4TAP_HS1 m2, m8, m5, m6, m13 ++ ++ INTERLEAVE_REGS wd, m0, m1, m4, m3, m2 ++ punpckldq m0, m4 ++ punpckldq m1, m3 ++ ++ movu m3, [srcq] ++ movu m4, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++ ++ PUT_4TAP_HS2 m3, m4, m8, m5, m6, m13 ++ ++ INTERLEAVE_REGS wd, m2, m3, m4 ++ ++.hv_w28l: ++ ++ movu m11, [srcq] ++ movu m12, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++ ++ PUT_4TAP_HS2 m11, m12, m8, m5, m6, m13 ++ ++ INTERLEAVE_REGS wd, m4, m11, m12 ++ punpckldq m2, m4 ++ punpckldq m3, m11 ++ ++ pmaddwd m11, m0, m9 ++ pmaddwd m4, m2, m10 ++ pmaddwd m12, m1, m9 ++ paddd m11, m4 ++ pmaddwd m4, m3, m10 ++ paddd m12, m4 ++ phaddd m11, m11 ++ phaddd m12, m12 ++ ++ RND_SHR_MIN_R m11, m12, m14, m15, m7 ++ STORE_REGS d, 11, 12 ++ ++ pshufd m0, m0, q2031 ++ pshufd m1, m1, q2031 ++ pshufd m11, m2, q3120 ++ pshufd m12, m3, q3120 ++ pshufd m2, m2, q2031 ++ pshufd m3, m3, q2031 ++ ++ mova m4, m3 ++ psrad m4, 16 ++ packssdw m4, m4 ++ ++ punpckldq m0, m11 ++ punpckldq m1, m12 ++ ++ sub hd, 2 ++ jg .hv_w28l ++ ++ RET ++%endif ++ ++INIT_YMM avx2 ++.hv_w4: ++ cmp hd, 4 ++ jg .hv_w48 ++ ++ lea ss3q, [ssq*3] ++ ++ mova m8, [spf_h_shuf] ++ vpbroadcastd xm5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2] ++ vpmovsxbw m5, xm5 ++ ++ sub srcq, 2 ++ sub srcq, ssq ++ ++ vbroadcasti128 m0, [srcq] ++ vbroadcasti128 m1, [srcq + ssq] ++ vbroadcasti128 m2, [srcq + ssq*2] ++ add srcq, ss3q ++ ++ PUT_4TAP_HS2 m0, m1, m8, m5, m6, xm13 ++ PUT_4TAP_HS1 m2, m8, m5, m6, xm13 ++ INTERLEAVE_REGS wd, m0, m1, m2 ++ ++.hv_w4l: ++ ++ vbroadcasti128 m3, [srcq] ++ vbroadcasti128 m4, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++ ++ PUT_4TAP_HS2 m3, m4, m8, m5, m6, xm13 ++ ++ INTERLEAVE_REGS wd, m2, m3, m4 ++ ++ MUL_ADD_R m11, m12, m0, m1, m2, m3, m9, m10 ++ mova m2, m4 ++ ++%ifidn %1, put ++ RND_SHR_MIN_R m11, m12, m14, xm15, m7 ++%else ++ RND_SHR_R m11, m12, m14, 6 ++%endif ++ ++ vextracti128 xm3, m11, 1 ++ vextracti128 xm4, m12, 1 ++ ++ movd [dstq], xm11 ++ movd [dstq + 4], xm3 ++ movd [dstq + dsq], xm12 ++ movd [dstq + dsq + 4], xm4 ++ lea dstq, [dstq + dsq*2] ++ ++ sub hd, 2 ++ jg .hv_w4l ++ ++ RET ++ ++.hv_w48: ++ lea ss3q, [ssq*3] ++ ++ mova m8, [spf_h_shuf] ++ vpbroadcastd xm5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8 + 2] ++ vpmovsxbw m5, xm5 ++ ++ lea myq, [jrq - .hv_jmp_tbl + subpel_filters + myq*8] ++ vpbroadcastd xm9, [myq] ++ vpbroadcastd xm10, [myq + 4] ++ vpmovsxbw m9, xm9 ++ vpmovsxbw m10, xm10 ++ ++ sub srcq, 2 ++ sub srcq, ss3q ++ ++ vbroadcasti128 m0, [srcq] ++ vbroadcasti128 m1, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++ ++ PUT_4TAP_HS2 m0, m1, m8, m5, m6, xm13 ++ ++ vbroadcasti128 m4, [srcq] ++ vbroadcasti128 m3, [srcq + ssq] ++ vbroadcasti128 m2, [srcq + ssq*2] ++ add srcq, ss3q ++ ++ PUT_4TAP_HS2 m4, m3, m8, m5, m6, xm13 ++ PUT_4TAP_HS1 m2, m8, m5, m6, xm13 ++ ++ INTERLEAVE_REGS wd, m0, m1, m4, m3, m2 ++ punpckldq m0, m4 ++ punpckldq m1, m3 ++ ++ vbroadcasti128 m3, [srcq] ++ vbroadcasti128 m4, [srcq + ssq] ++ lea srcq, [srcq + 
ssq*2] ++ ++ PUT_4TAP_HS2 m3, m4, m8, m5, m6, xm13 ++ ++ INTERLEAVE_REGS wd, m2, m3, m4 ++ ++.hv_w48l: ++ ++ vbroadcasti128 m11, [srcq] ++ vbroadcasti128 m12, [srcq + ssq] ++ lea srcq, [srcq + ssq*2] ++ ++ PUT_4TAP_HS2 m11, m12, m8, m5, m6, xm13 ++ ++ INTERLEAVE_REGS wd, m4, m11, m12 ++ punpckldq m2, m4 ++ punpckldq m3, m11 ++ ++ pmaddwd m11, m0, m9 ++ pmaddwd m4, m2, m10 ++ pmaddwd m12, m1, m9 ++ paddd m11, m4 ++ pmaddwd m4, m3, m10 ++ paddd m12, m4 ++ phaddd m11, m11 ++ phaddd m12, m12 ++ ++%ifidn %1, put ++ RND_SHR_MIN_R m11, m12, m14, xm15, m7 ++%else ++ RND_SHR_R m11, m12, m14, 6 ++%endif ++ ++ vextracti128 xm4, m11, 1 ++ movd [dstq], xm11 ++ movd [dstq + 4], xm4 ++ vextracti128 xm4, m12, 1 ++ movd [dstq + dsq], xm12 ++ movd [dstq + dsq + 4], xm4 ++ lea dstq, [dstq + dsq*2] ++ ++ pshufd m0, m0, q2031 ++ pshufd m1, m1, q2031 ++ pshufd m11, m2, q3120 ++ pshufd m12, m3, q3120 ++ pshufd m2, m2, q2031 ++ pshufd m3, m3, q2031 ++ ++ mova m4, m3 ++ psrad m4, 16 ++ packssdw m4, m4 ++ ++ punpckldq m0, m11 ++ punpckldq m1, m12 ++ ++ sub hd, 2 ++ jg .hv_w48l ++ RET ++ ++.hv_w8: ++ mov _wq, ss3q ++ ++ cmp hd, 4 ++ jg .hv_w88 ++ ++ lea ss3q, [ssq*3] ++ ++ vpbroadcastq xm5, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8] ++ vpmovsxbw m5, xm5 ++ ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, oh, tdst, tsrc, ss3 ++%elifidn %1, prep ++ DEFINE_ARGS dst, src, ss, _w, h, oh, tdst, tsrc, ds, ss3 ++%endif ++ ++ sub srcq, 6 ++ sub srcq, ssq ++ ++ mov ohd, hd ++ mov tdstq, dstq ++ mov tsrcq, srcq ++ ++.hv_w8l: ++ ++ PUT_8TAP_HS 0, 1, 2, m5, m6, xm13, 0*ssq ++ PUT_8TAP_HS 1, 2, 3, m5, m6, xm13, 1*ssq ++ PUT_8TAP_HS 2, 3, 4, m5, m6, xm13, 2*ssq ++ add srcq, ss3q ++ ++ INTERLEAVE_REGS wd, m0, m1, m2 ++ ++.hv_w8c: ; Nx2, Nx4 ++ ++ PUT_8TAP_HS 3, 8, 11, m5, m6, xm13, 0*ssq ++ PUT_8TAP_HS 4, 8, 11, m5, m6, xm13, 1*ssq ++ lea srcq, [srcq + ssq*2] ++ ++ INTERLEAVE_REGS wd, m2, m3, m4 ++ ++ MUL_ADD_R m8, m11, m0, m1, m2, m3, m9, m10 ++ mova m2, m4 ++ ++%ifidn %1, put ++ RND_SHR_MIN_R m8, m11, m14, xm15, m7 ++%else ++ RND_SHR_R m8, m11, m14, 6 ++%endif ++ ++ vextracti128 xm3, m8, 1 ++ vextracti128 xm4, m11, 1 ++ ++ movq [dstq], xm8 ++ movq [dstq + 8], xm3 ++ movq [dstq + dsq], xm11 ++ movq [dstq + dsq + 8], xm4 ++ lea dstq, [dstq + dsq*2] ++ ++ sub hd, 2 ++ jg .hv_w8c ++ ++ add tdstq, 2*8 ++ add tsrcq, 2*8 ++ mov hd, ohd ++ mov dstq, tdstq ++ mov srcq, tsrcq ++ sub _wd, 8 ++ jg .hv_w8l ++ RET ++ ++.hv_w88: ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, mx, my, jr, ss3 ++%elifidn %1, prep ++ DEFINE_ARGS dst, src, ss, _w, h, mx, my, jr, ds, ss3 ++%endif ++ ++ lea ss3q, [ssq*3] ++ ++ vpbroadcastq xm7, [jrq - .hv_jmp_tbl + subpel_filters + mxq*8] ++ vpmovsxbw m7, xm7 ++ ++ sub srcq, 6 ++ sub srcq, ss3q ++ ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, oh, tdst, bdmax, ss3 ++%elifidn %1, prep ++ DEFINE_ARGS dst, src, ss, _w, h, oh, tdst, bdmax, ds, ss3 ++%endif ++ ++ mov ohd, hd ++ mov tdstq, dstq ++ ++ popcnt bdmaxd, bdmaxm ++ cmp bdmaxd, 12 ++ je .hv_w88_12bit ++ ++%ifidn %1, put ++ DEFINE_ARGS dst, ds, src, ss, _w, h, oh, tdst, tsrc, ss3 ++%elifidn %1, prep ++ DEFINE_ARGS dst, src, ss, _w, h, oh, tdst, tsrc, ds, ss3 ++%endif ++ ++ mov tsrcq, srcq ++ ++.hv_w88l_10bit: ; Nx6, Nx8, Nx12, Nx16, Nx24, Nx32: ++ ++ vpbroadcastd m15, [pd_2] ; (1 << (6 - 4) >> 1) ++ ++ PUT_8TAP_HS 0, 12, 13, m7, m15, 6 - 4, 0*ssq ++ PUT_8TAP_HS 1, 12, 13, m7, m15, 6 - 4, 1*ssq ++ PUT_8TAP_HS 2, 12, 13, m7, m15, 6 - 4, 2*ssq ++ add srcq, ss3q ++ ++ PUT_8TAP_HS 3, 12, 13, m7, m15, 6 - 4, 0*ssq ++ PUT_8TAP_HS 4, 12, 13, 
m7, m15, 6 - 4, 1*ssq ++ lea srcq, [srcq + ssq*2] ++ ++ PUT_8TAP_HS 5, 12, 13, m7, m15, 6 - 4, 0*ssq ++ PUT_8TAP_HS 6, 12, 13, m7, m15, 6 - 4, 1*ssq ++ lea srcq, [srcq + ssq*2] ++ ++ INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m5, m6 ++ ++.hv_w88c_10bit: ++ ++ PUT_8TAP_HS 12, 14, 15, m7, m15, 6 - 4, 0*ssq, [pd_2] ++ PUT_8TAP_HS 13, 14, 15, m7, m15, 6 - 4, 1*ssq, [pd_2] ++ lea srcq, [srcq + ssq*2] ++ ++ INTERLEAVE_REGS wd, m6, m12, m13 ++ ++ MUL_ADD_R m14, m15, m0, m1, m2, m3, m8, m9 ++ MUL_ACC_R m14, m15, m2, m3, m4, m5, m10 ++ MUL_ACC_R m14, m15, m4, m5, m6, m12, m11 ++ ++%ifidn %1, put ++ vpbroadcastd m6, [pd_512] ; (1 << (6 + 4) >> 1) ++ vpbroadcastw m12, tsrcm ; bdmaxm ++ RND_SHR_MIN_R m14, m15, m6, 6 + 4, m12 ++%else ++ vpbroadcastd m6, [nd_524256] ; (1 << 6 >> 1) - (8192 << 6) ++ RND_SHR_R m14, m15, m6, 6 ++%endif ++ ++ mova m6, m13 ++ ++ vextracti128 xm12, m14, 1 ++ vextracti128 xm13, m15, 1 ++ ++ movq [dstq], xm14 ++ movq [dstq + 8], xm12 ++ movq [dstq + dsq], xm15 ++ movq [dstq + dsq + 8], xm13 ++ lea dstq, [dstq + dsq*2] ++ ++ sub hd, 2 ++ jg .hv_w88c_10bit ++ ++ add tdstq, 2*8 ++ add tsrcq, 2*8 ++ mov hd, ohd ++ mov dstq, tdstq ++ mov srcq, tsrcq ++ sub _wd, 8 ++ jg .hv_w88l_10bit ++ RET ++ ++.hv_w88_12bit: ++ ++ mov tsrcq, srcq ++ ++.hv_w88l_12bit: ; Nx6, Nx8, Nx12, Nx16, Nx24, Nx32: ++ ++ vpbroadcastd m15, [pd_8] ; (1 << (6 - 2) >> 1) ++ ++ PUT_8TAP_HS 0, 12, 13, m7, m15, 6 - 2, 0*ssq ++ PUT_8TAP_HS 1, 12, 13, m7, m15, 6 - 2, 1*ssq ++ PUT_8TAP_HS 2, 12, 13, m7, m15, 6 - 2, 2*ssq ++ add srcq, ss3q ++ ++ PUT_8TAP_HS 3, 12, 13, m7, m15, 6 - 2, 0*ssq ++ PUT_8TAP_HS 4, 12, 13, m7, m15, 6 - 2, 1*ssq ++ lea srcq, [srcq + ssq*2] ++ ++ PUT_8TAP_HS 5, 12, 13, m7, m15, 6 - 2, 0*ssq ++ PUT_8TAP_HS 6, 12, 13, m7, m15, 6 - 2, 1*ssq ++ lea srcq, [srcq + ssq*2] ++ ++ INTERLEAVE_REGS wd, m0, m1, m2, m3, m4, m5, m6 ++ ++.hv_w88c_12bit: ++ ++ PUT_8TAP_HS 12, 14, 15, m7, m15, 6 - 2, 0*ssq, [pd_8] ++ PUT_8TAP_HS 13, 14, 15, m7, m15, 6 - 2, 1*ssq, [pd_8] ++ lea srcq, [srcq + ssq*2] ++ ++ INTERLEAVE_REGS wd, m6, m12, m13 ++ ++ MUL_ADD_R m14, m15, m0, m1, m2, m3, m8, m9 ++ MUL_ACC_R m14, m15, m2, m3, m4, m5, m10 ++ MUL_ACC_R m14, m15, m4, m5, m6, m12, m11 ++ ++%ifidn %1, put ++ vpbroadcastd m6, [pd_128] ; (1 << (6 + 2) >> 1) ++ vpbroadcastw m12, tsrcm ; bdmaxm ++ RND_SHR_MIN_R m14, m15, m6, 6 + 2, m12 ++%else ++ vpbroadcastd m6, [nd_524256] ; (1 << 6 >> 1) - (8192 << 6) ++ RND_SHR_R m14, m15, m6, 6 ++%endif ++ ++ mova m6, m13 ++ ++ vextracti128 xm12, m14, 1 ++ vextracti128 xm13, m15, 1 ++ ++ movq [dstq], xm14 ++ movq [dstq + 8], xm12 ++ movq [dstq + dsq], xm15 ++ movq [dstq + dsq + 8], xm13 ++ lea dstq, [dstq + dsq*2] ++ ++ sub hd, 2 ++ jg .hv_w88c_12bit ++ ++ add tdstq, 2*8 ++ add tsrcq, 2*8 ++ mov hd, ohd ++ mov dstq, tdstq ++ mov srcq, tsrcq ++ sub _wd, 8 ++ jg .hv_w88l_12bit ++ RET ++ ++.hv_jmp_tbl: ++%ifidn %1, put ++ dd .hv_w2 - .hv_jmp_tbl ++%endif ++ dd .hv_w4 - .hv_jmp_tbl ++ dd .hv_w8 - .hv_jmp_tbl ++ dd .hv_w8 - .hv_jmp_tbl ++ dd .hv_w8 - .hv_jmp_tbl ++ dd .hv_w8 - .hv_jmp_tbl ++ dd .hv_w8 - .hv_jmp_tbl ++%endm ++ ++filter_fn put ++filter_fn prep ++ ++%endif ; ARCH_X86_64 +-- +GitLab + + +From 07a3064c9ebd0827177706e135f25cc8a6c25399 Mon Sep 17 00:00:00 2001 +From: "Nathan E. 
Egge" <unlord@xiph.org> +Date: Sat, 26 Dec 2020 21:38:58 -0500 +Subject: [PATCH 3/7] Enable AVX2 8tap put/prep HBD assembly + +--- + src/meson.build | 1 + + src/x86/mc_init_tmpl.c | 38 ++++++++++++++++++++------------------ + 2 files changed, 21 insertions(+), 18 deletions(-) + +diff --git a/src/meson.build b/src/meson.build +index f9f5c120..ff62a9d8 100644 +--- a/src/meson.build ++++ b/src/meson.build +@@ -208,6 +208,7 @@ if is_asm_enabled + + if dav1d_bitdepths.contains('16') + libdav1d_sources_asm += files( ++ 'x86/mc16_avx2.asm', + ) + endif + +diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c +index 468069c5..fcfed9be 100644 +--- a/src/x86/mc_init_tmpl.c ++++ b/src/x86/mc_init_tmpl.c +@@ -279,26 +279,8 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + return; + + #if BITDEPTH == 8 +- init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); +- init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); +- init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); +- init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); +- init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); +- init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); +- init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); +- init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); +- init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2); + +- init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); +- init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); +- init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); +- init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); +- init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); +- init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); +- init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); +- init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); +- init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); + + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); +@@ -340,6 +322,26 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + c->resize = BF(dav1d_resize, avx2); + #endif + ++ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); ++ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); ++ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); ++ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); ++ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); ++ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); ++ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); ++ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); ++ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); ++ ++ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); ++ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); ++ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); ++ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); ++ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); ++ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); ++ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); ++ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); ++ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); ++ 
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) + return; + +-- +GitLab + + +From 68aa4049fe160b318f0f037b0f5514aefea8a69b Mon Sep 17 00:00:00 2001 +From: "Nathan E. Egge" <unlord@xiph.org> +Date: Sat, 26 Dec 2020 06:21:44 -0500 +Subject: [PATCH 4/7] x86: mc: Add AVX2 implementation of avg/w_avg/mask for + 16bpc + +Relative speed-ups over C code (compared with gcc-9.3.0): + + C AVX2 +avg_w4_16bpc: 185.5 13.3 13.95x +avg_w8_16bpc: 223.1 35.3 6.32x +avg_w16_16bpc: 626.7 109.1 5.74x +avg_w32_16bpc: 2284.0 400.8 5.70x +avg_w64_16bpc: 5294.7 924.6 5.73x +avg_w128_16bpc: 12887.0 2237.9 5.76x + +w_avg_w4_16bpc: 225.6 16.0 14.10x +w_avg_w8_16bpc: 336.2 36.4 9.24x +w_avg_w16_16bpc: 975.0 111.5 8.74x +w_avg_w32_16bpc: 3633.7 403.1 9.01x +w_avg_w64_16bpc: 8519.2 927.9 9.18x +w_avg_w128_16bpc: 20873.9 2232.5 9.35x + +mask_w4_16bpc: 241.7 21.4 11.29x +mask_w8_16bpc: 663.5 51.1 12.98x +mask_w16_16bpc: 736.9 153.2 4.81x +mask_w32_16bpc: 2650.9 582.7 4.55x +mask_w64_16bpc: 6075.4 1359.7 4.47x +mask_w128_16bpc: 14677.0 3330.2 4.41x +--- + src/x86/mc16_avx2.asm | 179 ++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 179 insertions(+) + +diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm +index ea6cfdbf..7b4f9cdf 100644 +--- a/src/x86/mc16_avx2.asm ++++ b/src/x86/mc16_avx2.asm +@@ -19,7 +19,10 @@ SECTION_RODATA 32 + spf_h_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 + pq_2: dq (6 - 4) ++pq_3: dq (6 - 4) + 1 ++pq_6: dq (6 - 4) + 4 + pq_4: dq (6 - 2) ++pq_5: dq (6 - 2) + 1 + pq_8: dq (6 + 2) + pq_10: dq (6 + 4) + pd_32: dd (1 << 6 >> 1) +@@ -32,7 +35,16 @@ pd_128: dd (1 << (6 + 2) >> 1) + nd_524256: dd (1 << 6 >> 1) - (8192 << 6) + nd_32766: dd (1 << (6 - 4) >> 1) - (8192 << (6 - 4)) + nd_131064: dd (1 << (6 - 2) >> 1) - (8192 << (6 - 2)) ++pd_16388: dd (1 << (6 - 4)) + 8192*2 ++pd_16400: dd (1 << (6 - 2)) + 8192*2 ++pd_131104: dd ((1 << (6 - 4)) + 8192*2) << 3 ++pd_131200: dd ((1 << (6 - 2)) + 8192*2) << 3 ++pd_524416: dd ((1 << (6 - 4)) + 8192*2) << 5 ++pd_524800: dd ((1 << (6 - 2)) + 8192*2) << 5 + pw_8192: dw 8192 ++pw_1: dw 1 ++pw_16: dw 16 ++pw_64: dw 64 + + SECTION .text + +@@ -1550,4 +1562,171 @@ INIT_YMM avx2 + filter_fn put + filter_fn prep + ++%macro AVG 1 ++ mova m0, [p1q] ++ mova m2, [p2q] ++ punpckhwd m1, m0, m2 ++ punpcklwd m0, m2 ++%ifidn %1, mask ++ mova xm2, [mq] ++ vpmovsxbw m2, xm2 ++ vpbroadcastw m7, [pw_64] ++ psubw m7, m2 ++ punpckhwd m3, m2, m7 ++ punpcklwd m2, m7 ++ pmaddwd m0, m2 ++%else ++ pmaddwd m0, m3 ++%endif ++ pmaddwd m1, m3 ++ paddd m0, m4 ++ paddd m1, m4 ++ psrad m0, xm5 ++ psrad m1, xm5 ++ packusdw m0, m1 ++ pminuw m0, m6 ++%endm ++ ++%macro bilin_fn 1 ++%ifidn %1, avg ++cglobal avg_16bpc, 4, 9, 8, dst, ds, p1, p2, w, h, bdmax, ds3, ow ++%elifidn %1, w_avg ++cglobal w_avg_16bpc, 4, 9, 8, dst, ds, p1, p2, w, h, wg, bdmax, ow ++%else ++cglobal mask_16bpc, 4, 9, 8, dst, ds, p1, p2, w, h, m, bdmax, ow ++%endif ++ ++ movifnidn hd, hm ++ movifnidn wd, wm ++ ++%ifidn %1, avg ++ vpbroadcastw m3, [pw_1] ++ vpbroadcastd m4, [pd_16400] ; (1 << (6 - 2)) + 8192*2 ++ movq xm5, [pq_5] ; (6 - 2) + 1 ++%elifidn %1, w_avg ++ vpbroadcastw m3, wgm ++ vpbroadcastw m4, [pw_16] ++ psubw m4, m3 ++ punpcklwd m3, m4 ++ vpbroadcastd m4, [pd_131200] ; ((1 << (6 - 2)) + 8192*2) << 3 ++ movq xm5, [pq_8] ; (6 - 2) + 1 + 3 ++%else ++ movifnidn mq, mmp ++ vpbroadcastd m4, [pd_524800] ; ((1 << (6 - 2)) + 8192*2) << 5 ++ movq xm5, [pq_10] ; (6 - 2) + 1 + 5 ++%endif ++ ++ popcnt bdmaxd, bdmaxm ++ cmp bdmaxd, 10 ++ je .bits10 
++ ++%ifidn %1, avg ++ vpbroadcastd m4, [pd_16388] ; (1 << (6 - 4)) + 8192*2 ++ movq xm5, [pq_3] ; (6 - 4) + 1 ++%elifidn %1, w_avg ++ vpbroadcastd m4, [pd_131104] ; ((1 << (6 - 4)) + 8192*2) << 3 ++ movq xm5, [pq_6] ; (6 - 4) + 1 + 3 ++%else ++ vpbroadcastd m4, [pd_524416] ; ((1 << (6 - 4)) + 8192*2) << 5 ++ movq xm5, [pq_8] ; (6 - 4) + 1 + 5 ++%endif ++.bits10: ++ ++ vpbroadcastw m6, bdmaxm ++ ++ lea owd, [2*wd] ++ ++DEFINE_ARGS dst, ds, p1, p2, w, h, m, jr, ow ++ ++ lea jrq, [.jmp_tbl] ++ tzcnt wd, wm ++ sub wd, 2 ++ movsxd wq, [jrq + wq*4] ++ add wq, jrq ++ jmp wq ++ ++.w4: ++DEFINE_ARGS dst, ds, p1, p2, w, h, m, ds3, ow ++ ++ lea ds3q, [dsq*3] ++ ++.w4l: ++ AVG %1 ++ ++ vextracti128 xm1, m0, 1 ++ movq [dstq], xm0 ++ pextrq [dstq + dsq], xm0, 1 ++ movq [dstq + 2*dsq], xm1 ++ pextrq [dstq + ds3q], xm1, 1 ++ ++ lea dstq, [dstq + 4*dsq] ++ add p1q, 32 ++ add p2q, 32 ++%ifidn %1, mask ++ add mq, 16 ++%endif ++ ++ sub hd, 4 ++ jg .w4l ++ RET ++ ++.w8: ++ AVG %1 ++ ++ vextracti128 xm1, m0, 1 ++ mova [dstq], xm0 ++ mova [dstq + dsq], xm1 ++ ++ lea dstq, [dstq + dsq*2] ++ add p1q, 32 ++ add p2q, 32 ++%ifidn %1, mask ++ add mq, 16 ++%endif ++ ++ sub hd, 2 ++ jg .w8 ++ ++ RET ++ ++.w16: ++ ++ mov wd, owd ; upper 32-bits of wq zerod by jmp ++ sub dsq, wq ++ ++.w16l: ++ AVG %1 ++ ++ mova [dstq], m0 ++ ++ add dstq, 32 ++ add p1q, 32 ++ add p2q, 32 ++%ifidn %1, mask ++ add mq, 16 ++%endif ++ ++ sub wd, 32 ++ jg .w16l ++ ++ add dstq, dsq ++ mov wd, owd ++ dec hd ++ jg .w16l ++ ++ RET ++ ++.jmp_tbl: ++ dd .w4 - .jmp_tbl ++ dd .w8 - .jmp_tbl ++ dd .w16 - .jmp_tbl ++ dd .w16 - .jmp_tbl ++ dd .w16 - .jmp_tbl ++ dd .w16 - .jmp_tbl ++%endm ++ ++bilin_fn avg ++bilin_fn w_avg ++bilin_fn mask ++ + %endif ; ARCH_X86_64 +-- +GitLab + + +From 6ba57502ac82b00d1441a36d4e12814eafd37982 Mon Sep 17 00:00:00 2001 +From: "Nathan E. Egge" <unlord@xiph.org> +Date: Sun, 27 Dec 2020 04:11:21 -0500 +Subject: [PATCH 5/7] Enable AVX2 avg/w_avg/mask HBD assembly + +--- + src/x86/mc_init_tmpl.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c +index fcfed9be..70798047 100644 +--- a/src/x86/mc_init_tmpl.c ++++ b/src/x86/mc_init_tmpl.c +@@ -305,9 +305,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + +- c->avg = BF(dav1d_avg, avx2); +- c->w_avg = BF(dav1d_w_avg, avx2); +- c->mask = BF(dav1d_mask, avx2); + c->w_mask[0] = BF(dav1d_w_mask_444, avx2); + c->w_mask[1] = BF(dav1d_w_mask_422, avx2); + c->w_mask[2] = BF(dav1d_w_mask_420, avx2); +@@ -342,6 +339,10 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + ++ c->avg = BF(dav1d_avg, avx2); ++ c->w_avg = BF(dav1d_w_avg, avx2); ++ c->mask = BF(dav1d_mask, avx2); ++ + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) + return; + +-- +GitLab + + +From c18338526a06c14409333e7d6ed34ae60a6dff46 Mon Sep 17 00:00:00 2001 +From: "Nathan E. 
Egge" <unlord@xiph.org> +Date: Sun, 27 Dec 2020 17:25:54 -0500 +Subject: [PATCH 6/7] x86: mc: Add AVX2 implementation of blend/blend_h/blend_v + for 16bpc + + Relative speed-ups over C code (compared with gcc-9.3.0): + + C AVX2 +blend_w4_16bpc: 72.3 12.1 5.98x +blend_w8_16bpc: 218.0 26.2 8.32x +blend_w16_16bpc: 859.7 53.3 16.13x +blend_w32_16bpc: 2193.1 137.5 15.95x + +blend_h_w2_16bpc: 87.3 21.0 4.16x +blend_h_w4_16bpc: 137.8 22.8 6.04x +blend_h_w8_16bpc: 126.5 29.7 4.26x +blend_h_w16_16bpc: 211.4 28.7 7.37x +blend_h_w32_16bpc: 385.5 50.9 7.57x +blend_h_w64_16bpc: 726.2 88.0 8.25x +blend_h_w128_16bpc: 1728.7 182.6 9.47x + +blend_v_w2_16bpc: 74.7 30.6 2.44x +blend_v_w4_16bpc: 321.2 51.2 6.27x +blend_v_w8_16bpc: 614.6 69.2 8.88x +blend_v_w16_16bpc: 1211.0 57.2 21.17x +blend_v_w32_16bpc: 2406.2 116.8 20.60x +--- + src/x86/mc16_avx2.asm | 381 ++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 381 insertions(+) + +diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm +index 7b4f9cdf..ee56da2e 100644 +--- a/src/x86/mc16_avx2.asm ++++ b/src/x86/mc16_avx2.asm +@@ -1729,4 +1729,385 @@ bilin_fn avg + bilin_fn w_avg + bilin_fn mask + ++INIT_XMM avx2 ++cglobal blend_16bpc, 6, 7, 7, dst, ds, tmp, w, h, mask, jr ++ pxor m3, m3 ++ lea jrq, [.jmp_tbl] ++ tzcnt wd, wm ++ sub wd, 2 ++ movsxd wq, [jrq + wq*4] ++ add wq, jrq ++ jmp wq ++.w4: ++ movq m0, [dstq] ++ pinsrq m0, [dstq + dsq], 1 ++ mova m1, [tmpq] ++ movq m2, [maskq] ++ psubb m2, m3, m2 ++ pmovsxbw m2, m2 ++ psllw m2, 9 ++ psubw m1, m0, m1 ++ pmulhrsw m1, m2 ++ paddw m0, m1 ++ movq [dstq], m0 ++ pextrq [dstq + dsq], m0, 1 ++ add maskq, 8 ++ add tmpq, 16 ++ lea dstq, [dstq + 2*dsq] ++ sub hd, 2 ++ jg .w4 ++ RET ++INIT_YMM avx2 ++.w8: ++ mova xm0, [dstq] ++ vinserti128 m0, [dstq + dsq], 1 ++ mova m1, [tmpq] ++ mova xm2, [maskq] ++ psubb xm2, xm3, xm2 ++ pmovsxbw m2, xm2 ++ psllw m2, 9 ++ psubw m1, m0, m1 ++ pmulhrsw m1, m2 ++ paddw m0, m1 ++ mova [dstq], xm0 ++ vextracti128 [dstq + dsq], m0, 1 ++ add maskq, 16 ++ add tmpq, 32 ++ lea dstq, [dstq + 2*dsq] ++ sub hd, 2 ++ jg .w8 ++ RET ++.w16: ++ mova m0, [dstq] ++ mova m4, [dstq + dsq] ++ mova m1, [tmpq] ++ mova m5, [tmpq + 32] ++ mova xm2, [maskq] ++ mova xm6, [maskq + 16] ++ psubb xm2, xm3, xm2 ++ psubb xm6, xm3, xm6 ++ pmovsxbw m2, xm2 ++ pmovsxbw m6, xm6 ++ psllw m2, 9 ++ psllw m6, 9 ++ psubw m1, m0, m1 ++ psubw m5, m4, m5 ++ pmulhrsw m1, m2 ++ pmulhrsw m5, m6 ++ paddw m0, m1 ++ paddw m4, m5 ++ mova [dstq], m0 ++ mova [dstq + dsq], m4 ++ add maskq, 32 ++ add tmpq, 64 ++ lea dstq, [dstq + 2*dsq] ++ sub hd, 2 ++ jg .w16 ++ RET ++.w32: ++ mova m0, [dstq] ++ mova m4, [dstq + 32] ++ mova m1, [tmpq] ++ mova m5, [tmpq + 32] ++ mova xm2, [maskq] ++ mova xm6, [maskq + 16] ++ psubb xm2, xm3, xm2 ++ psubb xm6, xm3, xm6 ++ pmovsxbw m2, xm2 ++ pmovsxbw m6, xm6 ++ psllw m2, 9 ++ psllw m6, 9 ++ psubw m1, m0, m1 ++ psubw m5, m4, m5 ++ pmulhrsw m1, m2 ++ pmulhrsw m5, m6 ++ paddw m0, m1 ++ paddw m4, m5 ++ mova [dstq], m0 ++ mova [dstq + 32], m4 ++ add maskq, 32 ++ add tmpq, 64 ++ add dstq, dsq ++ dec hd ++ jg .w32 ++ RET ++.jmp_tbl: ++ dd .w4 - .jmp_tbl ++ dd .w8 - .jmp_tbl ++ dd .w16 - .jmp_tbl ++ dd .w32 - .jmp_tbl ++ ++cextern obmc_masks ++ ++INIT_XMM avx2 ++cglobal blend_v_16bpc, 5, 7, 7, dst, ds, tmp, w, h, o, jr ++ lea oq, [obmc_masks] ++ pxor m3, m3 ++ movsx wq, wd ++ add oq, wq ++ lea jrq, [.jmp_tbl] ++ tzcnt wd, wm ++ sub wd, 1 ++ movsxd wq, [jrq + wq*4] ++ add wq, jrq ++ jmp wq ++.w2: ++ vpbroadcastw m2, [oq] ++ psubb m2, m3, m2 ++ pmovsxbw m2, m2 ++ psllw m2, 9 ++.w2l: ++ movd m0, [dstq] 
++ movd m1, [tmpq] ++ pinsrd m0, [dstq + dsq], 1 ++ pinsrd m1, [tmpq + 4], 1 ++ psubw m1, m0, m1 ++ pmulhrsw m1, m2 ++ paddw m0, m1 ++ movd [dstq], m0 ++ pextrd [dstq + dsq], m0, 1 ++ add tmpq, 8 ++ lea dstq, [dstq + 2*dsq] ++ sub hd, 2 ++ jg .w2l ++ RET ++.w4: ++ vpbroadcastd m2, [oq] ++ psubb m2, m3, m2 ++ pmovsxbw m2, m2 ++ psllw m2, 9 ++.w4l: ++ movq m0, [dstq] ++ movq m1, [tmpq] ++ pinsrq m0, [dstq + dsq], 1 ++ pinsrq m1, [tmpq + 8], 1 ++ psubw m1, m0, m1 ++ pmulhrsw m1, m2 ++ paddw m0, m1 ++ movq [dstq], m0 ++ pextrq [dstq + dsq], m0, 1 ++ add tmpq, 16 ++ lea dstq, [dstq + 2*dsq] ++ sub hd, 2 ++ jg .w4l ++ RET ++INIT_YMM avx2 ++.w8: ++ vpbroadcastq xm2, [oq] ++ psubb xm2, xm3, xm2 ++ pmovsxbw m2, xm2 ++ psllw m2, 9 ++.w8l: ++ mova xm0, [dstq] ++ vinserti128 m0, [dstq + dsq], 1 ++ mova m1, [tmpq] ++ psubw m1, m0, m1 ++ pmulhrsw m1, m2 ++ paddw m0, m1 ++ mova [dstq], xm0 ++ vextracti128 [dstq + dsq], m0, 1 ++ add tmpq, 32 ++ lea dstq, [dstq + 2*dsq] ++ sub hd, 2 ++ jg .w8l ++ RET ++.w16: ++ mova xm2, [oq] ++ psubb xm2, xm3, xm2 ++ pmovsxbw m2, xm2 ++ psllw m2, 9 ++.w16l: ++ mova m0, [dstq] ++ mova m4, [dstq + dsq] ++ mova m1, [tmpq] ++ mova m5, [tmpq + 32] ++ psubw m1, m0, m1 ++ psubw m5, m4, m5 ++ pmulhrsw m1, m2 ++ pmulhrsw m5, m2 ++ paddw m0, m1 ++ paddw m4, m5 ++ mova [dstq], m0 ++ mova [dstq + dsq], m4 ++ add tmpq, 64 ++ lea dstq, [dstq + 2*dsq] ++ sub hd, 2 ++ jg .w16l ++ RET ++.w32: ++ mova xm2, [oq] ++ mova xm6, [oq + 16] ++ psubb xm2, xm3, xm2 ++ psubb xm6, xm3, xm6 ++ pmovsxbw m2, xm2 ++ pmovsxbw m6, xm6 ++ psllw m2, 9 ++ psllw m6, 9 ++.w32l: ++ mova m0, [dstq] ++ mova m4, [dstq + 32] ++ mova m1, [tmpq] ++ mova m5, [tmpq + 32] ++ psubw m1, m0, m1 ++ psubw m5, m4, m5 ++ pmulhrsw m1, m2 ++ pmulhrsw m5, m6 ++ paddw m0, m1 ++ paddw m4, m5 ++ mova [dstq], m0 ++ mova [dstq + 32], m4 ++ add tmpq, 64 ++ add dstq, dsq ++ dec hd ++ jg .w32l ++ RET ++.jmp_tbl: ++ dd .w2 - .jmp_tbl ++ dd .w4 - .jmp_tbl ++ dd .w8 - .jmp_tbl ++ dd .w16 - .jmp_tbl ++ dd .w32 - .jmp_tbl ++ ++INIT_XMM avx2 ++cglobal blend_h_16bpc, 5, 8, 7, dst, ds, tmp, w, h, o, jr, w2 ++ pxor m3, m3 ++ lea w2d, [wd*2] ++ lea oq, [obmc_masks] ++ movsx hq, hd ++ add oq, hq ++ imul hq, 3 ++ shr hq, 2 ++ lea jrq, [.jmp_tbl] ++ tzcnt wd, wm ++ sub wd, 1 ++ movsxd wq, [jrq + wq*4] ++ add wq, jrq ++ jmp wq ++.w2: ++ movd m2, [oq] ++ psubb m2, m3, m2 ++ punpcklbw m2, m2 ++ pmovsxbw m2, m2 ++ psllw m2, 9 ++ movd m0, [dstq] ++ movd m1, [tmpq] ++ pinsrd m0, [dstq + dsq], 1 ++ pinsrd m1, [tmpq + 4], 1 ++ psubw m1, m0, m1 ++ pmulhrsw m1, m2 ++ paddw m0, m1 ++ movd [dstq], m0 ++ pextrd [dstq + dsq], m0, 1 ++ add tmpq, 8 ++ lea dstq, [dstq + 2*dsq] ++ add oq, 2 ++ sub hd, 2 ++ jg .w2 ++ RET ++.w4: ++ movd m2, [oq] ++ punpcklbw m2, m2 ++ punpcklwd m2, m2 ++ psubb m2, m3, m2 ++ pmovsxbw m2, m2 ++ psllw m2, 9 ++ movq m0, [dstq] ++ movq m1, [tmpq] ++ pinsrq m0, [dstq + dsq], 1 ++ pinsrq m1, [tmpq + 8], 1 ++ psubw m1, m0, m1 ++ pmulhrsw m1, m2 ++ paddw m0, m1 ++ movq [dstq], m0 ++ pextrq [dstq + dsq], m0, 1 ++ add tmpq, 16 ++ lea dstq, [dstq + 2*dsq] ++ add oq, 2 ++ sub hd, 2 ++ jg .w4 ++ RET ++INIT_YMM avx2 ++.w8: ++ movd xm2, [oq] ++ psubb xm2, xm3, xm2 ++ punpcklbw xm2, xm2 ++ punpcklwd xm2, xm2 ++ punpckldq xm2, xm2 ++ pmovsxbw m2, xm2 ++ psllw m2, 9 ++ mova xm0, [dstq] ++ vinserti128 m0, [dstq + dsq], 1 ++ mova m1, [tmpq] ++ psubw m1, m0, m1 ++ pmulhrsw m1, m2 ++ paddw m0, m1 ++ mova [dstq], xm0 ++ vextracti128 [dstq + dsq], m0, 1 ++ add tmpq, 32 ++ lea dstq, [dstq + 2*dsq] ++ add oq, 2 ++ sub hd, 2 ++ jg .w8 ++ RET ++.w16: ++ vpbroadcastb 
xm2, [oq] ++ vpbroadcastb xm6, [oq + 1] ++ psubb xm2, xm3, xm2 ++ psubb xm6, xm3, xm6 ++ pmovsxbw m2, xm2 ++ pmovsxbw m6, xm6 ++ psllw m2, 9 ++ psllw m6, 9 ++ mova m0, [dstq] ++ mova m1, [tmpq] ++ mova m4, [dstq + dsq] ++ mova m5, [tmpq + 32] ++ psubw m1, m0, m1 ++ psubw m5, m4, m5 ++ pmulhrsw m1, m2 ++ pmulhrsw m5, m6 ++ paddw m0, m1 ++ paddw m4, m5 ++ mova [dstq], m0 ++ mova [dstq + dsq], m4 ++ add tmpq, 64 ++ lea dstq, [dstq + 2*dsq] ++ add oq, 2 ++ sub hd, 2 ++ jg .w16 ++ RET ++.w32: ++ mov wd, w2d ++ sub dsq, wq ++.w32l: ++ vpbroadcastb xm2, [oq] ++ psubb xm2, xm3, xm2 ++ pmovsxbw m2, xm2 ++ psllw m2, 9 ++ mov wd, w2d ++.w32c: ++ mova m0, [dstq] ++ mova m1, [tmpq] ++ psubw m1, m0, m1 ++ pmulhrsw m1, m2 ++ paddw m0, m1 ++ mova [dstq], m0 ++ add dstq, 32 ++ add tmpq, 32 ++ sub wd, 32 ++ jg .w32c ++ add dstq, dsq ++ inc oq ++ dec hd ++ jg .w32l ++ RET ++.jmp_tbl: ++ dd .w2 - .jmp_tbl ++ dd .w4 - .jmp_tbl ++ dd .w8 - .jmp_tbl ++ dd .w16 - .jmp_tbl ++ dd .w32 - .jmp_tbl ++ dd .w32 - .jmp_tbl ++ dd .w32 - .jmp_tbl ++ + %endif ; ARCH_X86_64 +-- +GitLab + + +From 8770797232201d6e0e4106e799b6d76865feff77 Mon Sep 17 00:00:00 2001 +From: "Nathan E. Egge" <unlord@xiph.org> +Date: Sun, 27 Dec 2020 21:13:15 -0500 +Subject: [PATCH 7/7] Enable AVX2 blend/blend_h/blend_v HBD assembly + +--- + src/x86/mc_init_tmpl.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c +index 70798047..2dac21cf 100644 +--- a/src/x86/mc_init_tmpl.c ++++ b/src/x86/mc_init_tmpl.c +@@ -308,9 +308,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + c->w_mask[0] = BF(dav1d_w_mask_444, avx2); + c->w_mask[1] = BF(dav1d_w_mask_422, avx2); + c->w_mask[2] = BF(dav1d_w_mask_420, avx2); +- c->blend = BF(dav1d_blend, avx2); +- c->blend_v = BF(dav1d_blend_v, avx2); +- c->blend_h = BF(dav1d_blend_h, avx2); + + c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2); +@@ -342,6 +339,9 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + c->avg = BF(dav1d_avg, avx2); + c->w_avg = BF(dav1d_w_avg, avx2); + c->mask = BF(dav1d_mask, avx2); ++ c->blend = BF(dav1d_blend, avx2); ++ c->blend_v = BF(dav1d_blend_v, avx2); ++ c->blend_h = BF(dav1d_blend_h, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) + return; +-- +GitLab + diff --git a/0002-wiener_2.patch b/0002-wiener_2.patch new file mode 100644 index 000000000000..149be2fe1293 --- /dev/null +++ b/0002-wiener_2.patch @@ -0,0 +1,661 @@ +From 269eeaf7c01afc79a53537881ad03185bf491cf6 Mon Sep 17 00:00:00 2001 +From: "Nathan E. 
Egge" <unlord@xiph.org> +Date: Tue, 29 Dec 2020 06:58:33 -0500 +Subject: [PATCH] Add bpc suffix to lr functions + +--- + src/x86/looprestoration.asm | 36 ++--- + src/x86/looprestoration_init_tmpl.c | 204 +++++++++++++--------------- + src/x86/looprestoration_sse.asm | 60 ++++---- + 3 files changed, 146 insertions(+), 154 deletions(-) + +diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm +index 8ebe230..e077cdd 100644 +--- a/src/x86/looprestoration.asm ++++ b/src/x86/looprestoration.asm +@@ -66,8 +66,8 @@ SECTION .text + DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers + + INIT_YMM avx2 +-cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, flt, h ++cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, flt, h + mov fltq, fltmp + mov edged, r8m + mov wd, wm +@@ -414,8 +414,8 @@ ALIGN function_align + add dstq, dst_strideq + ret + +-cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, flt, h ++cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, flt, h + mov fltq, fltmp + mov edged, r8m + mov wd, wm +@@ -532,7 +532,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + jnz .h_have_right + cmp r10d, -33 + jl .h_have_right +- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right ++ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right + .h_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 +@@ -591,7 +591,7 @@ ALIGN function_align + jnz .hv_have_right + cmp r10d, -33 + jl .hv_have_right +- call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right ++ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right + .hv_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 +@@ -705,7 +705,7 @@ ALIGN function_align + jl .v_loop + ret + +-cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim ++cglobal sgr_box3_h_8bpc, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov xlimd, edgem + movifnidn wd, wm + mov hd, hm +@@ -805,7 +805,7 @@ cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim + RET + + INIT_YMM avx2 +-cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim ++cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem + mov xq, -2 + rorx ylimd, edged, 2 +@@ -868,7 +868,7 @@ cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, + RET + + INIT_YMM avx2 +-cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s ++cglobal sgr_calc_ab1_8bpc, 4, 6, 11, a, b, w, h, s + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 + add hd, 2 +@@ -937,8 +937,8 @@ cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s + RET + + INIT_YMM avx2 +-cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ +- tmp_ptr, src_ptr, a_ptr, b_ptr, x, y ++cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \ ++ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y + movifnidn wd, wm + mov hd, hm + vpbroadcastd m15, [pw_16] +@@ -1043,7 +1043,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ + RET + + INIT_YMM avx2 +-cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt ++cglobal sgr_weighted1_8bpc, 4, 6, 6, dst, stride, t, w, h, wt + %ifidn wtd, wtm + shl wtd, 4 + movd xm5, wtd +@@ -1082,7 +1082,7 @@ cglobal sgr_weighted1, 4, 6, 6, 
dst, stride, t, w, h, wt + RET + + INIT_YMM avx2 +-cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim ++cglobal sgr_box5_h_8bpc, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov edged, edgem + movifnidn wd, wm + mov hd, hm +@@ -1200,7 +1200,7 @@ cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli + RET + + INIT_YMM avx2 +-cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim ++cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem + mov xq, -2 + rorx ylimd, edged, 2 +@@ -1293,7 +1293,7 @@ cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, + jmp .loop_y_noload + + INIT_YMM avx2 +-cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s ++cglobal sgr_calc_ab2_8bpc, 4, 6, 11, a, b, w, h, s + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 + add hd, 2 +@@ -1364,8 +1364,8 @@ cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s + RET + + INIT_YMM avx2 +-cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \ +- tmp_ptr, src_ptr, a_ptr, b_ptr, x, y ++cglobal sgr_finish_filter2_8bpc, 5, 13, 13, t, src, stride, a, b, w, h, \ ++ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y + movifnidn wd, wm + mov hd, hm + vpbroadcastd m9, [pw_5_6] +@@ -1483,7 +1483,7 @@ cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \ + RET + + INIT_YMM avx2 +-cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt ++cglobal sgr_weighted2_8bpc, 4, 7, 11, dst, stride, t1, t2, w, h, wt + movifnidn wd, wm + movifnidn hd, hm + vpbroadcastd m0, wtm +diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c +index 5df449c..11ebdd1 100644 +--- a/src/x86/looprestoration_init_tmpl.c ++++ b/src/x86/looprestoration_init_tmpl.c +@@ -31,148 +31,140 @@ + #include "common/intops.h" + #include "src/tables.h" + +-#define WIENER_FILTER(ext) \ +-void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \ +- const pixel (*left)[4], const pixel *lpf, \ +- ptrdiff_t lpf_stride, int w, int h, \ +- const int16_t filter[2][8], \ +- enum LrEdgeFlags edges); \ +-void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \ +- const pixel (*left)[4], const pixel *lpf, \ +- ptrdiff_t lpf_stride, int w, int h, \ +- const int16_t filter[2][8], \ +- enum LrEdgeFlags edges); ++#define decl_wiener_filter_fns(ext) \ ++decl_wiener_filter_fn(BF(dav1d_wiener_filter7, ext)); \ ++decl_wiener_filter_fn(BF(dav1d_wiener_filter5, ext)) + +-#define SGR_FILTER(ext) \ +-void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \ +- const pixel (*left)[4], \ +- const pixel *src, const ptrdiff_t stride, \ +- const int w, const int h, \ +- const enum LrEdgeFlags edges); \ +-void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \ +- const int w, const int h, \ +- const enum LrEdgeFlags edges); \ +-void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \ +- const int w, const int h, const int strength); \ +-void dav1d_sgr_finish_filter1_##ext(coef *tmp, \ +- const pixel *src, const ptrdiff_t stride, \ +- const int32_t *a, const int16_t *b, \ +- const int w, const int h); \ ++#define decl_sgr_filter_fn(ext) \ ++void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \ ++ const pixel (*left)[4], \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \ ++ const int w, const int h, \ ++ const 
enum LrEdgeFlags edges); \ ++void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \ ++ const int w, const int h, const int strength); \ ++void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int32_t *a, const int16_t *b, \ ++ const int w, const int h); \ + \ + /* filter with a 3x3 box (radius=1) */ \ +-static void dav1d_sgr_filter1_##ext(coef *tmp, \ +- const pixel *src, const ptrdiff_t stride, \ +- const pixel (*left)[4], \ +- const pixel *lpf, const ptrdiff_t lpf_stride, \ +- const int w, const int h, const int strength, \ +- const enum LrEdgeFlags edges) \ ++static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const pixel (*left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const int strength, \ ++ const enum LrEdgeFlags edges) \ + { \ + ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ + int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ + ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ + int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ + \ +- dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ ++ BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \ + if (edges & LR_HAVE_TOP) \ +- dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ +- NULL, lpf, lpf_stride, w, 2, edges); \ ++ BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ ++ NULL, lpf, lpf_stride, w, 2, edges); \ + \ + if (edges & LR_HAVE_BOTTOM) \ +- dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ +- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ +- lpf_stride, w, 2, edges); \ ++ BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ ++ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ ++ lpf_stride, w, 2, edges); \ + \ +- dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \ +- dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \ +- dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \ ++ BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \ ++ BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \ ++ BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \ + } \ + \ +-void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \ +- const pixel (*left)[4], \ +- const pixel *src, const ptrdiff_t stride, \ +- const int w, const int h, \ +- const enum LrEdgeFlags edges); \ +-void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \ +- const int w, const int h, \ +- const enum LrEdgeFlags edges); \ +-void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \ +- const int w, const int h, const int strength); \ +-void dav1d_sgr_finish_filter2_##ext(coef *tmp, \ +- const pixel *src, const ptrdiff_t stride, \ +- const int32_t *a, const int16_t *b, \ +- const int w, const int h); \ ++void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \ ++ const pixel (*left)[4], \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \ ++ const int w, const int h, const int strength); \ ++void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int32_t *a, const int16_t *b, \ ++ const int w, const int h); \ + \ + /* filter 
with a 5x5 box (radius=2) */ \ +-static void dav1d_sgr_filter2_##ext(coef *tmp, \ +- const pixel *src, const ptrdiff_t stride, \ +- const pixel (*left)[4], \ +- const pixel *lpf, const ptrdiff_t lpf_stride, \ +- const int w, const int h, const int strength, \ +- const enum LrEdgeFlags edges) \ ++static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const pixel (*left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const int strength, \ ++ const enum LrEdgeFlags edges) \ + { \ + ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ + int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ + ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ + int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ + \ +- dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ ++ BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \ + if (edges & LR_HAVE_TOP) \ +- dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ +- NULL, lpf, lpf_stride, w, 2, edges); \ ++ BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ ++ NULL, lpf, lpf_stride, w, 2, edges); \ + \ + if (edges & LR_HAVE_BOTTOM) \ +- dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ +- NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ +- lpf_stride, w, 2, edges); \ ++ BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ ++ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ ++ lpf_stride, w, 2, edges); \ + \ +- dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \ +- dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \ +- dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \ ++ BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \ ++ BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \ ++ BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \ + } \ + \ +-void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \ +- const coef *t1, const int w, const int h, \ +- const int wt); \ +-void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \ +- const coef *t1, const coef *t2, \ +- const int w, const int h, \ +- const uint32_t wt); \ ++void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \ ++ const coef *t1, const int w, const int h, \ ++ const int wt); \ ++void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \ ++ const coef *t1, const coef *t2, \ ++ const int w, const int h, \ ++ const uint32_t wt); \ + \ +-static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ +- const pixel (*const left)[4], \ +- const pixel *lpf, const ptrdiff_t lpf_stride, \ +- const int w, const int h, const int sgr_idx, \ +- const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \ ++static void BF(sgr_filter, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ ++ const pixel (*const left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const int sgr_idx, \ ++ const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \ + { \ + if (!dav1d_sgr_params[sgr_idx][0]) { \ + ALIGN_STK_32(coef, tmp, 64 * 384,); \ +- dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ +- w, h, dav1d_sgr_params[sgr_idx][3], edges); \ +- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \ ++ BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, dav1d_sgr_params[sgr_idx][3], edges); \ ++ 
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \ + } else if (!dav1d_sgr_params[sgr_idx][1]) { \ + ALIGN_STK_32(coef, tmp, 64 * 384,); \ +- dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ +- w, h, dav1d_sgr_params[sgr_idx][2], edges); \ +- dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \ ++ BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, dav1d_sgr_params[sgr_idx][2], edges); \ ++ BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, sgr_wt[0]); \ + } else { \ + ALIGN_STK_32(coef, tmp1, 64 * 384,); \ + ALIGN_STK_32(coef, tmp2, 64 * 384,); \ +- dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ +- w, h, dav1d_sgr_params[sgr_idx][2], edges); \ +- dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ +- w, h, dav1d_sgr_params[sgr_idx][3], edges); \ ++ BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, dav1d_sgr_params[sgr_idx][2], edges); \ ++ BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, dav1d_sgr_params[sgr_idx][3], edges); \ + const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \ +- dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \ ++ BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \ + } \ + } + + #if BITDEPTH == 8 +-WIENER_FILTER(sse2) +-WIENER_FILTER(ssse3) +-SGR_FILTER(ssse3) ++decl_wiener_filter_fns(sse2); ++decl_wiener_filter_fns(ssse3); ++decl_sgr_filter_fn(ssse3) + # if ARCH_X86_64 +-WIENER_FILTER(avx2) +-SGR_FILTER(avx2) ++decl_wiener_filter_fns(avx2); ++decl_sgr_filter_fn(avx2) + # endif + #endif + +@@ -181,21 +173,21 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; + #if BITDEPTH == 8 +- c->wiener[0] = dav1d_wiener_filter7_sse2; +- c->wiener[1] = dav1d_wiener_filter5_sse2; ++ c->wiener[0] = BF(dav1d_wiener_filter7, sse2); ++ c->wiener[1] = BF(dav1d_wiener_filter5, sse2); + #endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + #if BITDEPTH == 8 +- c->wiener[0] = dav1d_wiener_filter7_ssse3; +- c->wiener[1] = dav1d_wiener_filter5_ssse3; +- c->selfguided = sgr_filter_ssse3; ++ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3); ++ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3); ++ c->selfguided = BF(sgr_filter, ssse3); + #endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + #if BITDEPTH == 8 && ARCH_X86_64 +- c->wiener[0] = dav1d_wiener_filter7_avx2; +- c->wiener[1] = dav1d_wiener_filter5_avx2; +- c->selfguided = sgr_filter_avx2; ++ c->wiener[0] = BF(dav1d_wiener_filter7, avx2); ++ c->wiener[1] = BF(dav1d_wiener_filter5, avx2); ++ c->selfguided = BF(sgr_filter, avx2); + #endif + } +diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm +index 5d3ca49..4b77138 100644 +--- a/src/x86/looprestoration_sse.asm ++++ b/src/x86/looprestoration_sse.asm +@@ -97,8 +97,8 @@ SECTION .text + %macro WIENER 0 + %if ARCH_X86_64 + DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers +-cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, flt, h, x ++cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, flt, h, x + %define base 0 + mov fltq, fltmp + mov edged, r8m +@@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5 + %define m11 [stk+96] + %define stk_off 112 + %endif +-cglobal wiener_filter7, 
0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride ++cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride + %define base r6-pb_right_ext_mask-21 + %define stk esp + %define dstq leftq +@@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride + add lpfq, [rsp+gprsize*1] + call .hv_bottom + .v1: +- call mangle(private_prefix %+ _wiener_filter7_ssse3).v ++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v + RET + .no_top: + lea t3, [lpfq+lpf_strideq*4] +@@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride + dec hd + jnz .main + .v3: +- call mangle(private_prefix %+ _wiener_filter7_ssse3).v ++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v + .v2: +- call mangle(private_prefix %+ _wiener_filter7_ssse3).v ++ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v + jmp .v1 + .extend_right: + movd m2, [lpfq-4] +@@ -685,8 +685,8 @@ ALIGN function_align + %endif + + %if ARCH_X86_64 +-cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ +- lpf_stride, w, edge, flt, h, x ++cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ ++ lpf_stride, w, edge, flt, h, x + mov fltq, fltmp + mov edged, r8m + mov wd, wm +@@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + %define m11 [stk+80] + %define stk_off 96 + %endif +-cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride ++cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride + %define stk esp + %define leftmp [stk+28] + %define m8 [base+pw_m16380] +@@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride + dec hd + jnz .main + .v2: +- call mangle(private_prefix %+ _wiener_filter5_ssse3).v ++ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v + add dstq, dst_strideq + mov t4, t3 + mov t3, t2 + mov t2, t1 + movifnidn dstmp, dstq + .v1: +- call mangle(private_prefix %+ _wiener_filter5_ssse3).v ++ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v + jmp .end + .h: + %define stk esp+4 +@@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride + jnz .h_have_right + cmp xd, -17 + jl .h_have_right +- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right ++ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right + .h_have_right: + %macro %%h5 0 + %if cpuflag(ssse3) +@@ -991,7 +991,7 @@ ALIGN function_align + jnz .hv_have_right + cmp xd, -17 + jl .hv_have_right +- call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right ++ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right + .hv_have_right: + %%h5 + mova m2, [t3+xq*2] +@@ -1161,7 +1161,7 @@ WIENER + %endmacro + + %if ARCH_X86_64 +-cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim ++cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + mov xlimd, edgem + movifnidn xd, xm + mov hd, hm +@@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + add xd, xlimd + xor xlimd, 2 ; 2*!have_right + %else +-cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim ++cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + %define wq r0m + %define xlimd r1m + %define hd hmp +@@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 
6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + RET + + %if ARCH_X86_64 +-cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim ++cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim + movifnidn edged, edgem + %else +-cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y ++cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y + %define sumsq_baseq dword [esp+0] + %define sum_baseq dword [esp+4] + %define ylimd dword [esp+8] +@@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y + jl .loop_x + RET + +-cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s ++cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s + movifnidn sd, sm + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 +@@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s + RET + + %if ARCH_X86_64 +-cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ +- tmp_base, src_base, a_base, b_base, x, y ++cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \ ++ tmp_base, src_base, a_base, b_base, x, y + movifnidn wd, wm + mov hd, hm + mova m15, [pw_16] +@@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ + mov b_baseq, bq + xor xd, xd + %else +-cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y ++cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y + %define tmp_baseq [esp+8] + %define src_baseq [esp+12] + %define a_baseq [esp+16] +@@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y + jl .loop_x + RET + +-cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt ++cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt + movifnidn hd, hm + %if ARCH_X86_32 + SETUP_PIC r6, 0 +@@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt + RET + + %if ARCH_X86_64 +-cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim ++cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov edged, edgem + movifnidn wd, wm + mov hd, hm + mova m10, [pb_0] + mova m11, [pb_0_1] + %else +-cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge ++cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge + %define edgeb byte edgem + %define wd xd + %define wq wd +@@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge + RET + + %if ARCH_X86_64 +-cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim ++cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem + mov ylimd, edged + %else +-cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr ++cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr + %define wm [esp+0] + %define hm [esp+4] + %define edgem [esp+8] +@@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr + jmp .sum_loop_y_noload + %endif + +-cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s ++cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s + movifnidn sd, sm + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 +@@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s + RET + + %if ARCH_X86_64 +-cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ ++cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, 
h, \ + tmp_base, src_base, a_base, b_base, x, y + movifnidn wd, wm + mov hd, hm +@@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ + psrlw m11, m12, 1 ; pw_128 + pxor m13, m13 + %else +-cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y ++cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y + %define tmp_baseq r0m + %define src_baseq r1m + %define a_baseq r3m +@@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y + RET + + %undef t2 +-cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt ++cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt + movifnidn wd, wm + movd m0, wtm + %if ARCH_X86_64 +-- +GitLab + diff --git a/0003-wiener_3.patch b/0003-wiener_3.patch new file mode 100644 index 000000000000..b2852c881396 --- /dev/null +++ b/0003-wiener_3.patch @@ -0,0 +1,492 @@ +From 43c61c3f259400cde5facbe7ce50769088b5f5b6 Mon Sep 17 00:00:00 2001 +From: "Nathan E. Egge" <unlord@xiph.org> +Date: Sun, 10 Jan 2021 14:12:10 -0500 +Subject: [PATCH] x86: lr: Add AVX2 implementation of wiener filter for 16 bpc + +Relative speed-ups over C code (compared with gcc-9.3.0): + + C AVX2 +wiener_5tap_10bpc: 194892.0 14831.9 13.14x +wiener_5tap_12bpc: 194295.4 14828.9 13.10x +wiener_7tap_10bpc: 194391.7 19461.4 9.99x +wiener_7tap_12bpc: 194136.1 19418.7 10.00x +--- + src/x86/looprestoration16_avx2.asm | 466 +++++++++++++++++++++++++++++ + 1 file changed, 466 insertions(+) + create mode 100644 src/x86/looprestoration16_avx2.asm + +diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm +new file mode 100644 +index 0000000..2012860 +--- /dev/null ++++ b/src/x86/looprestoration16_avx2.asm +@@ -0,0 +1,466 @@ ++; Copyright (c) 2017-2021, The rav1e contributors ++; Copyright (c) 2021, Nathan Egge ++; All rights reserved. ++; ++; This source code is subject to the terms of the BSD 2 Clause License and ++; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ++; was not distributed with this source code in the LICENSE file, you can ++; obtain it at www.aomedia.org/license/software. If the Alliance for Open ++; Media Patent License 1.0 was not distributed with this source code in the ++; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
++ ++%include "config.asm" ++%include "ext/x86/x86inc.asm" ++ ++%if ARCH_X86_64 ++ ++SECTION_RODATA 32 ++ ++wiener5_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 ++wiener5_shufB: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13 ++wiener5_shufC: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1 ++wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ++ ++wiener7_shufB: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9 ++wiener7_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 ++wiener7_shufD: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1 ++rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 ++rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 ++wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ++ ++pq_3: dq (6 - 4) + 1 ++pq_5: dq (6 - 2) + 1 ++pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4)) ++pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2)) ++ ++pq_11: dq 12 - (6 - 4) + 1 ++pq_9: dq 12 - (6 - 2) + 1 ++nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8)) ++nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8)) ++ ++pb_wiener5_l: times 2 db 2, 3 ++pb_wiener5_r: times 2 db -6, -5 ++ ++pb_wiener7_l: times 2 db 4, 5 ++pb_wiener7_m: times 2 db -4, -3 ++pb_wiener7_r: times 2 db -8, -7 ++ ++SECTION .text ++ ++INIT_YMM avx2 ++cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax ++ movifnidn wd, wm ++ movifnidn hd, hm ++ movifnidn edgeb, edgem ++ vbroadcasti128 m6, [wiener5_shufA] ++ vpbroadcastd m12, [fq + 2] ++ vbroadcasti128 m7, [wiener5_shufB] ++ vpbroadcastw m13, [fq + 6] ++ vbroadcasti128 m8, [wiener5_shufC] ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastd m9, [pd_65540] ++ movq xm10, [pq_3] ++ cmp bdmaxd, 10 ++ je .bits10 ++ vpbroadcastd m9, [pd_262160] ++ movq xm10, [pq_5] ++.bits10: ++ pxor m11, m11 ++ add wq, wq ++ add srcq, wq ++ add dstq, wq ++ neg wq ++ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x ++.v_loop: ++ mov xq, wq ++ test edgeb, 1 ; LR_HAVE_LEFT ++ jz .h_extend_left ++ test leftq, leftq ++ jz .h_loop ++ movd xm4, [leftq + 4] ++ vpblendd m4, [srcq + xq - 4], 0xfe ++ add leftq, 8 ++ jmp .h_main ++.h_extend_left: ++ vbroadcasti128 m5, [srcq + xq] ++ mova m4, [srcq + xq] ++ palignr m4, m5, 12 ++ pshufb m4, [wiener5_l_shuf] ++ jmp .h_main ++.h_loop: ++ movu m4, [srcq + xq - 4] ++.h_main: ++ movu m5, [srcq + xq + 4] ++ test edgeb, 2 ; LR_HAVE_RIGHT ++ jnz .h_have_right ++ cmp xd, -18*2 ++ jl .h_have_right ++ movd xm2, xd ++ vpbroadcastd m0, [pb_wiener5_l] ++ vpbroadcastd m1, [pb_wiener5_r] ++ vpbroadcastb m2, xm2 ++ movu m3, [pb_0to31] ++ psubb m0, m2 ++ psubb m1, m2 ++ pminub m0, m3 ++ pminub m1, m3 ++ pshufb m4, m0 ++ pshufb m5, m1 ++.h_have_right: ++ pshufb m0, m4, m6 ++ pshufb m2, m4, m7 ++ paddw m0, m2 ++ pmaddwd m0, m12 ++ pshufb m1, m5, m6 ++ pshufb m3, m5, m7 ++ paddw m1, m3 ++ pmaddwd m1, m12 ++ pshufb m4, m8 ++ pmaddwd m4, m13 ++ pshufb m5, m8 ++ pmaddwd m5, m13 ++ paddd m0, m4 ++ paddd m1, m5 ++ paddd m0, m9 ++ paddd m1, m9 ++ psrad m0, xm10 ++ psrad m1, xm10 ++ packssdw m0, m1 ++ pmaxsw m0, m11 ++ mova [dstq + xq], m0 ++ add xq, 32 ++ jl .h_loop ++ add srcq, ssq ++ add dstq, 384*2 ++ dec hd ++ jg .v_loop ++ RET ++ ++DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14 ++ ++INIT_YMM avx2 ++cglobal 
wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax ++ movifnidn wd, wm ++ movifnidn hd, hm ++ movifnidn edgeb, edgem ++ pxor m6, m6 ++ vpbroadcastd m7, [fq + 2] ++ vpbroadcastd m8, [fq + 6] ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastd m9, [nd_1047552] ++ movq xm10, [pq_11] ++ cmp bdmaxd, 10 ++ je .bits10 ++ vpbroadcastd m9, [nd_1048320] ++ movq xm10, [pq_9] ++.bits10: ++ vpbroadcastw m11, bdmaxm ++ add wq, wq ++ add midq, wq ++ add dstq, wq ++ neg wq ++ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x ++ mov msq, 2*384 ++ mov t0, midq ++ lea t1, [t0 + msq] ++ lea t2, [t1 + msq] ++ lea t3, [t2 + msq] ++ lea t4, [t3 + msq] ++ test edgeb, 4 ; LR_HAVE_TOP ++ jnz .have_top ++ mov t0, t2 ++ mov t1, t2 ++.have_top: ++ test edgeb, 8 ; LR_HAVE_BOTTOM ++ jnz .v_loop ++ cmp hd, 2 ++ jg .v_loop ++ cmp hd, 1 ++ jne .limit_v ++ mov t3, t2 ++.limit_v: ++ mov t4, t3 ++.v_loop: ++ mov xq, wq ++.h_loop: ++ mova m1, [t0 + xq] ++ mova m2, [t1 + xq] ++ mova m3, [t2 + xq] ++ mova m4, [t3 + xq] ++ mova m5, [t4 + xq] ++ punpcklwd m0, m1, m2 ++ pmaddwd m0, m7 ++ punpckhwd m1, m2 ++ pmaddwd m1, m7 ++ punpcklwd m2, m5, m4 ++ pmaddwd m2, m7 ++ punpckhwd m5, m4 ++ pmaddwd m5, m7 ++ paddd m0, m2 ++ paddd m1, m5 ++ punpcklwd m2, m3, m6 ++ pmaddwd m2, m8 ++ punpckhwd m3, m6 ++ pmaddwd m3, m8 ++ paddd m0, m2 ++ paddd m1, m3 ++ paddd m0, m9 ++ paddd m1, m9 ++ psrad m0, xm10 ++ psrad m1, xm10 ++ packusdw m0, m1 ++ pminuw m0, m11 ++ mova [dstq + xq], m0 ++ add xq, 32 ++ jl .h_loop ++ add dstq, dsq ++ mov t0, t1 ++ mov t1, t2 ++ mov t2, t3 ++ mov t3, t4 ++ add t4, msq ++ test edgeb, 8 ; LR_HAVE_BOTTOM ++ jnz .have_bottom ++ cmp hd, 3 ++ jg .have_bottom ++ mov t4, t3 ++.have_bottom: ++ dec hd ++ jg .v_loop ++ RET ++ ++INIT_YMM avx2 ++cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh ++ movifnidn wd, wm ++ movifnidn hd, hm ++ movifnidn edgeb, edgem ++ vpbroadcastd m7, [fq] ++ vpbroadcastd m8, [fq + 4] ++ vbroadcasti128 m10, [rev_w] ++ vbroadcasti128 m11, [wiener5_shufA] ++ vbroadcasti128 m12, [wiener7_shufB] ++ vbroadcasti128 m13, [wiener7_shufC] ++ vbroadcasti128 m14, [wiener7_shufD] ++ vbroadcasti128 m15, [rev_d] ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastd m9, [pd_65540] ++ mov rhq, [pq_3] ++ cmp bdmaxd, 10 ++ je .bits10 ++ vpbroadcastd m9, [pd_262160] ++ mov rhq, [pq_5] ++.bits10: ++ add wq, wq ++ add srcq, wq ++ add dstq, wq ++ neg wq ++ DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh ++.v_loop: ++ mov xq, wq ++ test edgeb, 1 ; LR_HAVE_LEFT ++ jz .h_extend_left ++ test leftq, leftq ++ jz .h_loop ++ movq xm4, [leftq + 2] ++ vpblendw xm4, [srcq + xq - 6], 0xf8 ++ vinserti128 m4, [srcq + xq + 10], 1 ++ add leftq, 8 ++ jmp .h_main ++.h_extend_left: ++ vbroadcasti128 m5, [srcq + xq] ++ mova m4, [srcq + xq] ++ palignr m4, m5, 10 ++ pshufb m4, [wiener7_l_shuf] ++ jmp .h_main ++.h_loop: ++ movu m4, [srcq + xq - 6] ++.h_main: ++ movu m5, [srcq + xq + 2] ++ movu m6, [srcq + xq + 6] ++ test edgeb, 2 ; LR_HAVE_RIGHT ++ jnz .h_have_right ++ cmp xd, -19*2 ++ jl .h_have_right ++ movd xm3, xd ++ vpbroadcastd m0, [pb_wiener7_l] ++ vpbroadcastd m1, [pb_wiener7_m] ++ vpbroadcastd m2, [pb_wiener7_r] ++ vpbroadcastb m3, xm3 ++ psubb m0, m3 ++ psubb m1, m3 ++ psubb m2, m3 ++ movu m3, [pb_0to31] ++ pminub m0, m3 ++ pminub m1, m3 ++ pminub m2, m3 ++ pshufb m4, m0 ++ pshufb m5, m1 ++ pshufb m6, m2 ++ cmp xd, -9*2 ++ jne .hack ++ vpbroadcastw xm3, [srcq + xq + 16] ++ vinserti128 m5, xm3, 1 ++ jmp .h_have_right ++.hack: ++ cmp xd, -1*2 ++ jne .h_have_right ++ vpbroadcastw xm5, [srcq + xq] 
++.h_have_right: ++ pshufb m6, m10 ++ pshufb m0, m4, m11 ++ pshufb m2, m5, m12 ++ paddw m0, m2 ++ pmaddwd m0, m7 ++ pshufb m2, m4, m13 ++ pshufb m4, m14 ++ paddw m2, m4 ++ pmaddwd m2, m8 ++ pshufb m1, m6, m11 ++ pshufb m5, m11 ++ pmaddwd m1, m7 ++ pmaddwd m5, m7 ++ pshufb m3, m6, m13 ++ pshufb m6, m14 ++ paddw m3, m6 ++ pmaddwd m3, m8 ++ paddd m0, m2 ++ paddd m1, m3 ++ pshufb m1, m15 ++ paddd m1, m5 ++ movq xm4, rhq ++ pxor m5, m5 ++ paddd m0, m9 ++ paddd m1, m9 ++ psrad m0, xm4 ++ psrad m1, xm4 ++ packssdw m0, m1 ++ pmaxsw m0, m5 ++ mova [dstq + xq], m0 ++ add xq, 32 ++ jl .h_loop ++ add srcq, ssq ++ add dstq, 384*2 ++ dec hd ++ jg .v_loop ++ RET ++ ++INIT_YMM avx2 ++cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax ++ movifnidn wd, wm ++ movifnidn hd, hm ++ movifnidn edgeb, edgem ++ pxor m6, m6 ++ vpbroadcastd m7, [fq] ++ vpbroadcastw m8, [fq + 4] ++ vpbroadcastd m9, [fq + 6] ++ popcnt bdmaxd, bdmaxm ++ vpbroadcastd m10, [nd_1047552] ++ movq xm11, [pq_11] ++ cmp bdmaxd, 10 ++ je .bits10 ++ vpbroadcastd m10, [nd_1048320] ++ movq xm11, [pq_9] ++.bits10: ++ vpbroadcastw m12, bdmaxm ++ add wq, wq ++ add midq, wq ++ add dstq, wq ++ neg wq ++ DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x ++ mov msq, 2*384 ++ mov t0, midq ++ mov t1, t0 ++ lea t2, [t1 + msq] ++ lea t3, [t2 + msq] ++ lea t4, [t3 + msq] ++ lea t5, [t4 + msq] ++ lea t6, [t5 + msq] ++ test edgeb, 4 ; LR_HAVE_TOP ++ jnz .have_top ++ mov t0, t3 ++ mov t1, t3 ++ mov t2, t3 ++.have_top: ++ cmp hd, 3 ++ jg .v_loop ++ test edgeb, 8 ; LR_HAVE_BOTTOM ++ jz .no_bottom0 ++ cmp hd, 1 ++ jg .v_loop ++ jmp .h3 ++.no_bottom0: ++ cmp hd, 2 ++ je .h2 ++ jns .h3 ++.h1: ++ mov t4, t3 ++.h2: ++ mov t5, t4 ++.h3: ++ mov t6, t5 ++.v_loop: ++ mov xq, wq ++.h_loop: ++ mova m1, [t0 + xq] ++ mova m2, [t1 + xq] ++ mova m3, [t5 + xq] ++ mova m4, [t6 + xq] ++ punpcklwd m0, m1, m2 ++ pmaddwd m0, m7 ++ punpckhwd m1, m2 ++ pmaddwd m1, m7 ++ punpcklwd m2, m4, m3 ++ pmaddwd m2, m7 ++ punpckhwd m4, m3 ++ pmaddwd m4, m7 ++ paddd m0, m2 ++ paddd m1, m4 ++ mova m3, [t2 + xq] ++ mova m4, [t4 + xq] ++ punpcklwd m2, m3, m4 ++ pmaddwd m2, m8 ++ punpckhwd m3, m4 ++ pmaddwd m3, m8 ++ paddd m0, m2 ++ paddd m1, m3 ++ mova m3, [t3 + xq] ++ punpcklwd m2, m3, m6 ++ pmaddwd m2, m9 ++ punpckhwd m3, m6 ++ pmaddwd m3, m9 ++ paddd m0, m2 ++ paddd m1, m3 ++ paddd m0, m10 ++ paddd m1, m10 ++ psrad m0, xm11 ++ psrad m1, xm11 ++ packusdw m0, m1 ++ pminuw m0, m12 ++ mova [dstq + xq], m0 ++ add xq, 32 ++ jl .h_loop ++ add dstq, dsq ++ mov t0, t1 ++ mov t1, t2 ++ mov t2, t3 ++ mov t3, t4 ++ mov t4, t5 ++ mov t5, t6 ++ add t6, msq ++ cmp hd, 4 ++ jg .next_row ++ test edgeb, 8 ; LR_HAVE_BOTTOM ++ jz .no_bottom ++ cmp hd, 2 ++ jg .next_row ++.no_bottom: ++ mov t6, t5 ++.next_row: ++ dec hd ++ jg .v_loop ++ RET ++ ++%endif ; ARCH_X86_64 +-- +GitLab + diff --git a/0004-wiener_4.patch b/0004-wiener_4.patch new file mode 100644 index 000000000000..1876e7bd1d25 --- /dev/null +++ b/0004-wiener_4.patch @@ -0,0 +1,101 @@ +From 2d59aa7b52713b77243bda12066213fca8447f9d Mon Sep 17 00:00:00 2001 +From: "Nathan E. 
Egge" <unlord@xiph.org> +Date: Wed, 13 Jan 2021 14:54:42 -0500 +Subject: [PATCH] Enable AVX2 wiener filter HBD assembly + +--- + src/meson.build | 1 + + src/x86/looprestoration_init_tmpl.c | 41 +++++++++++++++++++++++++++-- + 2 files changed, 40 insertions(+), 2 deletions(-) + +diff --git a/src/meson.build b/src/meson.build +index ca0b406..c5c305d 100644 +--- a/src/meson.build ++++ b/src/meson.build +@@ -209,7 +209,8 @@ if is_asm_enabled + + if dav1d_bitdepths.contains('16') + libdav1d_sources_asm += files( ++ 'x86/looprestoration16_avx2.asm', + 'x86/mc16_avx2.asm', + ) + endif + +diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c +index 11ebdd1..dfc9f84 100644 +--- a/src/x86/looprestoration_init_tmpl.c ++++ b/src/x86/looprestoration_init_tmpl.c +@@ -31,9 +31,41 @@ + #include "common/intops.h" + #include "src/tables.h" + ++#if BITDEPTH != 8 ++#undef decl_wiener_filter_fn ++#define decl_wiener_filter_fn(name, ext) \ ++void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \ ++ ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \ ++ int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \ ++void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \ ++ const int16_t fv[7], int w, int h, \ ++ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \ ++static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ ++ const pixel (*const left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const int16_t filter[2][8], \ ++ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \ ++ ALIGN_STK_64(int16_t, mid, 68 * 384,); \ ++ BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, filter[0], w, h, \ ++ edges HIGHBD_TAIL_SUFFIX); \ ++ if (edges & LR_HAVE_TOP) { \ ++ BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, filter[0], w, 2, \ ++ edges HIGHBD_TAIL_SUFFIX); \ ++ } \ ++ if (edges & LR_HAVE_BOTTOM) { \ ++ BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \ ++ lpf_stride, filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \ ++ } \ ++ BF(name##_v, ext)(dst, dst_stride, mid, filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \ ++} ++#define decl_wiener_filter_fns(ext) \ ++decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \ ++decl_wiener_filter_fn(dav1d_wiener_filter5, ext) ++#else + #define decl_wiener_filter_fns(ext) \ + decl_wiener_filter_fn(BF(dav1d_wiener_filter7, ext)); \ + decl_wiener_filter_fn(BF(dav1d_wiener_filter5, ext)) ++#endif + + #define decl_sgr_filter_fn(ext) \ + void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \ +@@ -163,11 +195,14 @@ decl_wiener_filter_fns(sse2); + decl_wiener_filter_fns(ssse3); + decl_sgr_filter_fn(ssse3) + # if ARCH_X86_64 +-decl_wiener_filter_fns(avx2); + decl_sgr_filter_fn(avx2) + # endif + #endif + ++#if ARCH_X86_64 ++decl_wiener_filter_fns(avx2); ++#endif ++ + COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + +@@ -185,9 +220,11 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont + #endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; +-#if BITDEPTH == 8 && ARCH_X86_64 ++#if ARCH_X86_64 + c->wiener[0] = BF(dav1d_wiener_filter7, avx2); + c->wiener[1] = BF(dav1d_wiener_filter5, avx2); ++# if BITDEPTH == 8 + c->selfguided = BF(sgr_filter, avx2); ++# endif + #endif + } +-- +GitLab + @@ -1,7 +1,8 @@ # Maintainer: Ben Grant <ben@190n.org> 
+_testvideo=Sparks-5994fps-AV1-10bit-1920x1080-film-grain-synthesis-2013kbps.obu pkgname=dav1d-git-optimized -pkgver=r1489.ffd052b +pkgver=r1556.05d05f9 pkgrel=1 license=('BSD') pkgdesc='AV1 cross-platform Decoder, focused on speed and correctness -- latest git version compiled with optimizations' @@ -9,22 +10,45 @@ url='https://code.videolan.org/videolan/dav1d' arch=('x86_64') provides=('dav1d' 'libdav1d.so') conflicts=('dav1d' 'dav1d-git') -makedepends=('meson' 'ninja' 'git' 'nasm') -source=('git+https://code.videolan.org/videolan/dav1d.git') -sha256sums=('SKIP') -options=(!buildflags) +makedepends=('meson' 'git' 'nasm') +source=('git+https://code.videolan.org/videolan/dav1d.git' + "http://download.opencontent.netflix.com.s3.amazonaws.com/AV1/Sparks/$_testvideo" + '0001-1112.patch' + '0002-wiener_2.patch' + '0003-wiener_3.patch' + '0004-wiener_4.patch') +sha256sums=('SKIP' + 'e56e20de5bfad7ec073d3e53ea6c760d0b11ed143f087b1bc32230e4840fea87' + '421c4732d3a3fc85428263f4e4419f7b3bfc7059a29c2b81055a6ebf4345d0eb' + '32385f2316886cef326e7887a3de96fdada2ee723b269908794ed770da460626' + '1cf8db585f98ef8e63bb3f44f11679cdc554377f58964bebc7ca29aa1639d1ea' + '5e46d8d6fcf2d2cdb062368b23af534ecf123321594f9d548a6f14d80d16d981') pkgver () { cd dav1d printf "r%s.%s" "$(git rev-list --count HEAD)" "$(git rev-parse --short HEAD)" } +prepare () { + cd dav1d + # from https://code.videolan.org/videolan/dav1d/-/merge_requests/1112 + patch -Np1 -i ${srcdir}/0001-1112.patch + patch -Np1 -i ${srcdir}/0002-wiener_2.patch + patch -Np1 -i ${srcdir}/0003-wiener_3.patch + patch -Np1 -i ${srcdir}/0004-wiener_4.patch +} + build () { - export CFLAGS="-flto -O3 -march=native" - export CXXFLAGS="-flto -O3 -march=native" - export LDFLAGS="-flto -O3 -march=native" cd dav1d - arch-meson build --optimization=3 -Db_lto=true + export CC=gcc + arch-meson build \ + -Denable_tests=false \ + -Dc_args="-march=native -O3 -fuse-ld=bfd" \ + -Db_lto=false \ + -Db_pgo=generate + ninja -C build + ./build/tools/dav1d -i "$srcdir/$_testvideo" --muxer null --framethreads $(nproc) --tilethread 4 + meson configure build -Db_pgo=use ninja -C build } |
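Note on the new build procedure: the rewritten build() function replaces the previous single-pass LTO build with a two-pass profile-guided (PGO) build. The tree is first configured with -Db_pgo=generate and compiled, the resulting dav1d binary is run once over the downloaded "Sparks" AV1 sample (which is why that file was added to source= and checksummed above) to collect profile data, and the build directory is then reconfigured with -Db_pgo=use and recompiled so GCC can optimize against the recorded profile. Below is a minimal sketch of the same cycle outside a PKGBUILD, assuming a dav1d checkout and a local sample named test.obu; both the build directory and the sample path are placeholders, not part of this package:

    # configure an instrumented build (profile generation)
    meson setup build -Dc_args="-march=native -O3" -Db_pgo=generate
    ninja -C build
    # run a representative decode workload; this writes profile data into the build tree
    ./build/tools/dav1d -i test.obu --muxer null
    # reconfigure to consume the collected profile and rebuild
    meson configure build -Db_pgo=use
    ninja -C build

The PKGBUILD's profile run additionally passes --framethreads $(nproc) --tilethread 4 so the training workload exercises the threaded code paths. The prepare() step applies the four patches above (from dav1d merge request 1112 plus the follow-up wiener patches) before this PGO cycle runs, so the profiled binary already includes the new code paths.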