diff -Nru dav1d-0.9.0/debian/changelog dav1d-0.9.1/debian/changelog --- dav1d-0.9.0/debian/changelog 2021-05-25 20:05:56.000000000 +0000 +++ dav1d-0.9.1/debian/changelog 2021-07-30 20:28:01.000000000 +0000 @@ -1,12 +1,18 @@ -dav1d (0.9.0-1~20.04.sav0) focal; urgency=medium +dav1d (0.9.1-1~20.04.sav0) focal; urgency=medium * Backport to Focal * debian/control: Set debhelper-compat (= 12) BD - + d/rules: Change override_dh_auto_test to call "meson test" directly, as - dh 13 now uses "meson test" (needed for "-t 10" option) not "ninja test" - (and ensure to change directory to the correct architecture obj-* path) + * d/rules: Change override_dh_auto_test to call "meson test" directly, due + dh >= 13 calling "meson test" (needed for "-t 10" option) not "ninja test" - -- Rob Savoury Tue, 25 May 2021 13:05:56 -0700 + -- Rob Savoury Fri, 30 Jul 2021 13:28:01 -0700 + +dav1d (0.9.1-1) experimental; urgency=medium + + * New upstream release. + * Add more copyright holders in d/copyright. + + -- Dylan Aïssi Fri, 30 Jul 2021 11:21:25 +0200 dav1d (0.9.0-1) experimental; urgency=medium diff -Nru dav1d-0.9.0/debian/copyright dav1d-0.9.1/debian/copyright --- dav1d-0.9.0/debian/copyright 2021-05-17 12:45:02.000000000 +0000 +++ dav1d-0.9.1/debian/copyright 2021-07-30 09:21:25.000000000 +0000 @@ -126,6 +126,9 @@ 2019, B Krishnan Iyer 2019, James Almer 2001-2016, Alliance for Open Media + 2017-2021, The rav1e contributors + 2020, Nathan Egge + 2021, Matthias Dressel License: BSD-2-clause Files: include/compat/getopt.h diff -Nru dav1d-0.9.0/examples/dp_renderer.h dav1d-0.9.1/examples/dp_renderer.h --- dav1d-0.9.0/examples/dp_renderer.h 2021-05-16 16:47:22.518950500 +0000 +++ dav1d-0.9.1/examples/dp_renderer.h 2021-07-28 21:38:28.853851800 +0000 @@ -82,7 +82,7 @@ // Cookie passed to the renderer implementation callbacks void *cookie; // Callback to create the renderer - void* (*create_renderer)(); + void* (*create_renderer)(void); // Callback to destroy the renderer void (*destroy_renderer)(void *cookie); // Callback to the render function that renders a prevously sent frame diff -Nru dav1d-0.9.0/examples/dp_renderer_sdl.c dav1d-0.9.1/examples/dp_renderer_sdl.c --- dav1d-0.9.0/examples/dp_renderer_sdl.c 2021-05-16 16:47:22.518950500 +0000 +++ dav1d-0.9.1/examples/dp_renderer_sdl.c 2021-07-28 21:38:28.853851800 +0000 @@ -43,7 +43,7 @@ SDL_Texture *tex; } Dav1dPlayRendererPrivateContext; -static void *sdl_renderer_create() +static void *sdl_renderer_create(void) { SDL_Window *win = dp_create_sdl_window(0); if (win == NULL) diff -Nru dav1d-0.9.0/include/common/attributes.h dav1d-0.9.1/include/common/attributes.h --- dav1d-0.9.0/include/common/attributes.h 2021-05-16 16:47:22.518950500 +0000 +++ dav1d-0.9.1/include/common/attributes.h 2021-07-28 21:38:28.853851800 +0000 @@ -33,6 +33,14 @@ #include #include +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + #ifdef __GNUC__ #define ATTR_ALIAS __attribute__((may_alias)) #define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__printf__, fmt, attr))) @@ -93,9 +101,11 @@ */ #ifdef _MSC_VER #define NOINLINE __declspec(noinline) -#else /* !_MSC_VER */ +#elif __has_attribute(noclone) +#define NOINLINE __attribute__((noinline, noclone)) +#else #define NOINLINE __attribute__((noinline)) -#endif /* !_MSC_VER */ +#endif #ifdef __clang__ #define NO_SANITIZE(x) __attribute__((no_sanitize(x))) @@ -160,10 +170,6 @@ } #endif /* !_MSC_VER */ -#ifndef __has_feature -#define __has_feature(x) 0 -#endif - 
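(Aside, not part of the patch: the include/common/attributes.h hunk above amounts to the feature-detection pattern sketched below in standalone C. __has_attribute() is stubbed out to 0 on preprocessors that lack it, so the noinline,noclone form of NOINLINE is only chosen when the compiler actually reports support for the noclone attribute; the demo function name is invented here purely for illustration.)

/* Condensed sketch of the resulting selection logic in attributes.h. */
#ifndef __has_attribute
#define __has_attribute(x) 0            /* unknown preprocessor: assume no attribute support */
#endif

#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
#elif __has_attribute(noclone)
#define NOINLINE __attribute__((noinline, noclone)) /* GCC: also forbid cloning the function */
#else
#define NOINLINE __attribute__((noinline))          /* e.g. Clang, which has no noclone */
#endif

/* Hypothetical example use, only to show the macro in context. */
NOINLINE int example_not_inlined(int x) { return x + 1; }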
#ifndef static_assert #define CHECK_OFFSET(type, field, name) \ struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; } diff -Nru dav1d-0.9.0/include/dav1d/dav1d.h dav1d-0.9.1/include/dav1d/dav1d.h --- dav1d-0.9.0/include/dav1d/dav1d.h 2021-05-16 16:47:22.518950500 +0000 +++ dav1d-0.9.1/include/dav1d/dav1d.h 2021-07-28 21:38:28.853851800 +0000 @@ -105,7 +105,12 @@ * @param buf The data to be parser. * @param sz Size of the data. * - * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error. + * @return + * 0: Success, and out is filled with the parsed Sequence Header + * OBU parameters. + * DAV1D_ERR(ENOENT): No Sequence Header OBUs were found in the buffer. + * other negative DAV1D_ERR codes: Invalid data in the buffer, invalid passed-in + * arguments, and other errors during parsing. * * @note It is safe to feed this function data containing other OBUs than a * Sequence Header, as they will simply be ignored. If there is more than diff -Nru dav1d-0.9.0/meson.build dav1d-0.9.1/meson.build --- dav1d-0.9.0/meson.build 2021-05-16 16:47:22.518950500 +0000 +++ dav1d-0.9.1/meson.build 2021-07-28 21:38:28.853851800 +0000 @@ -23,14 +23,14 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. project('dav1d', ['c'], - version: '0.9.0', + version: '0.9.1', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', 'b_ndebug=if-release'], meson_version: '>= 0.49.0') -dav1d_soname_version = '5.1.0' +dav1d_soname_version = '5.1.1' dav1d_api_version_array = dav1d_soname_version.split('.') dav1d_api_version_major = dav1d_api_version_array[0] dav1d_api_version_minor = dav1d_api_version_array[1] @@ -173,16 +173,16 @@ # Header checks -stdatomic_dependency = [] +stdatomic_dependencies = [] if not cc.check_header('stdatomic.h') if cc.get_id() == 'msvc' # we have a custom replacement for MSVC - stdatomic_dependency = declare_dependency( + stdatomic_dependencies += declare_dependency( include_directories : include_directories('include/compat/msvc'), ) elif cc.compiles('''int main() { int v = 0; return __atomic_fetch_add(&v, 1, __ATOMIC_SEQ_CST); }''', name : 'GCC-style atomics', args : test_args) - stdatomic_dependency = declare_dependency( + stdatomic_dependencies += declare_dependency( include_directories : include_directories('include/compat/gcc'), ) else @@ -190,6 +190,11 @@ endif endif +if host_machine.cpu_family().startswith('wasm') + # enable atomics + bulk-memory features + stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true) +endif + if cc.check_header('unistd.h') cdata.set('HAVE_UNISTD_H', 1) endif @@ -247,6 +252,7 @@ '-Wno-maybe-uninitialized', '-Wno-missing-field-initializers', '-Wno-unused-parameter', + '-Wstrict-prototypes', '-Werror=missing-prototypes', '-Wshorten-64-to-32', ] diff -Nru dav1d-0.9.0/NEWS dav1d-0.9.1/NEWS --- dav1d-0.9.0/NEWS 2021-05-16 16:47:22.514950500 +0000 +++ dav1d-0.9.1/NEWS 2021-07-28 21:38:28.849851600 +0000 @@ -1,3 +1,16 @@ +Changes for 0.9.1 'Golden Eagle': +--------------------------------- + +0.9.1 is a middle-size revision of dav1d, adding notably 10b acceleration for SSSE3: + - 10/12b SSSE3 optimizations for mc (avg, w_avg, mask, w_mask, emu_edge), + prep/put_bilin, prep/put_8tap, ipred (dc/h/v, paeth, smooth, pal, filter), wiener, + sgr (10b), warp8x8, deblock, film_grain, cfl_ac/pred for 32bit and 64bit x86 processors + - Film grain NEON for fguv 10/12b, fgy/fguv 8b and fgy/fguv 10/12 arm32 + - Fixes for filmgrain on ARM + - itx 10bit optimizations for 4x4/x8/x16, 8x4/x8/x16 
for SSE4 + - Misc improvements on SSE2, SSE4 + + Changes for 0.9.0 'Golden Eagle': --------------------------------- diff -Nru dav1d-0.9.0/README.md dav1d-0.9.1/README.md --- dav1d-0.9.0/README.md 2021-05-16 16:47:22.514950500 +0000 +++ dav1d-0.9.1/README.md 2021-07-28 21:38:28.849851600 +0000 @@ -60,7 +60,7 @@ Notably, the codebase is in pure C and asm. -We are on IRC, on the **#dav1d** channel on *Freenode*. +We are on IRC, on the **#dav1d** channel on [*Libera.chat*](http://libera.chat/). If you do not have an IRC Client at hand, use [KiwiIRC Web Interface](https://kiwiirc.com/nextclient/#ircs://irc.libera.chat/#dav1d). See the [contributions document](CONTRIBUTING.md). diff -Nru dav1d-0.9.0/src/arm/32/film_grain16.S dav1d-0.9.1/src/arm/32/film_grain16.S --- dav1d-0.9.0/src/arm/32/film_grain16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/film_grain16.S 2021-07-28 21:38:28.857851700 +0000 @@ -0,0 +1,949 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 + +.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off + vmov.u16 r11, \src1[0+\off] + vmov.u16 r12, \src3[0+\off] + add r11, r11, r3 + vmov.u16 lr, \src1[2+\off] + add r12, r12, r3 + vld1.8 {\dst1[0+\off]}, [r11] + vmov.u16 r11, \src3[2+\off] + add lr, lr, r3 + vld1.8 {\dst2[0+\off]}, [r12] + vmov.u16 r12, \src2[0+\off] + add r11, r11, r3 + vld1.8 {\dst1[2+\off]}, [lr] + vmov.u16 lr, \src4[0+\off] + add r12, r12, r3 + vld1.8 {\dst2[2+\off]}, [r11] + vmov.u16 r11, \src2[2+\off] + add lr, lr, r3 + vld1.8 {\dst1[4+\off]}, [r12] + vmov.u16 r12, \src4[2+\off] + add r11, r11, r3 + vld1.8 {\dst2[4+\off]}, [lr] + add r12, r12, r3 + vld1.8 {\dst1[6+\off]}, [r11] + vld1.8 {\dst2[6+\off]}, [r12] +.endm + +.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8 + gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0 + gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1 + gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0 + gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1 +.endm + +function gather32_neon + push {r11-r12,lr} + gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7 + pop {r11-r12,pc} +endfunc + +function gather16_neon + push {r11-r12,lr} + gather_interleaved d8, d9, d0, d1, d2, d3, 0 + gather_interleaved d8, d9, d0, d1, d2, d3, 1 + pop {r11-r12,pc} +endfunc + +const overlap_coeffs_0, align=4 + .short 27, 17, 0, 0 + .short 17, 27, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .short 23, 0, 0, 0 + .short 22, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, lsl #1 // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type, +// const int bitdepth_max); +function fgy_32x32_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut + ldrd r6, r7, [sp, #108] // offsets, h + ldr r8, [sp, #116] // clip + mov r9, #GRAIN_WIDTH*2 // grain_lut stride + ldr r10, [sp, #124] // bitdepth_max + + eor r4, r4, #15 // 15 - scaling_shift + vdup.16 q6, r10 // bitdepth_max + clz r10, r10 + vdup.16 q13, r4 // 15 - scaling_shift + rsb r10, r10, #24 // bitdepth_min_8 + cmp r8, #0 + vdup.16 q12, r10 // bitdepth_min_8 + + movrel_local r12, overlap_coeffs_0 + + beq 1f + // clip + vmov.i16 q14, #16 + vmov.i16 q15, #235 + vshl.s16 q14, q14, q12 + vshl.s16 q15, q15, q12 + b 2f +1: + // no clip + vmov.i16 q14, #0 + vmov q15, q6 +2: + vshr.u16 q6, q6, #1 // grain_max + + vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs + + add r5, r5, #18 // grain_lut += 9 + add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r9 // grain_lut += grain_stride + + ldr r10, [r6, #8] // offsets[1][0] + calc_offset r10, r4, r10, 0, 0 + add_offset r4, r10, r4, r5, r9 + ldr r10, [r6, #4] // offsets[0][1] + 
calc_offset r10, r11, r10, 0, 0 + add_offset r11, r10, r11, r5, r9 + ldr r10, [r6, #12] // offsets[1][1] + calc_offset r10, r8, r10, 0, 0 + add_offset r8, r10, r8, r5, r9 + ldr r6, [r6] // offsets[0][0] + calc_offset r6, lr, r6, 0, 0 + add_offset r5, r6, lr, r5, r9 + + add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + ldr r10, [sp, #120] // type + adr r11, L(fgy_loop_tbl) + + tst r10, #1 + ldr r10, [r11, r10, lsl #2] + + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx + + add r11, r11, r10 + + beq 1f + // y overlap + vdup.16 d14, d24[0] + vdup.16 d15, d24[1] + mov r10, r7 // backup actual h + mov r7, #2 +1: + sub r2, r2, #32 // src_stride -= 32 + sub r9, r9, #32 // grain_stride -= 32 + bx r11 +endfunc + +function fgy_loop_neon +L(fgy_loop_tbl): + .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB + +.macro fgy ox, oy +L(loop_\ox\oy): +1: +.if \ox + vld1.16 {d0}, [r4], r9 // grain_lut old +.endif +.if \oy + vld1.16 {q2, q3}, [r6]! // grain_lut top +.endif +.if \ox && \oy + vld1.16 {d2}, [r8], r9 // grain_lut top old +.endif +.if \oy + vld1.16 {q4, q5}, [r6], r9 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r1, :128]! // src +.endif + vld1.16 {q8, q9}, [r5]! // grain_lut +.if !\ox && !\oy + vld1.16 {q2, q3}, [r1, :128], r2 // src +.endif +.if !\oy + vmvn.i16 q5, #0xf000 // 0x0fff +.endif + vld1.16 {q10, q11}, [r5], r9 // grain_lut + +.if \ox + add r4, r4, #32 + vmull.s16 q0, d0, d24 + vmlal.s16 q0, d16, d25 +.endif + +.if \oy +.if \ox + add r8, r8, #32 + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vmvn d0, d12 // grain_min + vqrshrn.s32 d4, q1, #5 + vmin.s16 d16, d16, d12 + vmin.s16 d4, d4, d12 + vmax.s16 d16, d16, d0 + vmax.s16 d4, d4, d0 +.endif + + vmull.s16 q0, d4, d14 + vmull.s16 q1, d5, d14 + vmull.s16 q2, d6, d14 + vmull.s16 q3, d7, d14 + vmlal.s16 q0, d16, d15 + vmlal.s16 q1, d17, d15 + vmlal.s16 q2, d18, d15 + vmlal.s16 q3, d19, d15 + vmull.s16 q8, d20, d15 + vmull.s16 q9, d21, d15 + vmull.s16 q10, d22, d15 + vmull.s16 q11, d23, d15 + vmlal.s16 q8, d8, d14 + vmlal.s16 q9, d9, d14 + vmlal.s16 q10, d10, d14 + vmlal.s16 q11, d11, d14 + vmvn q4, q6 // grain_min + vqrshrn.s32 d0, q0, #5 + vqrshrn.s32 d1, q1, #5 + vqrshrn.s32 d2, q2, #5 + vqrshrn.s32 d3, q3, #5 + vqrshrn.s32 d4, q8, #5 + vqrshrn.s32 d5, q9, #5 + vqrshrn.s32 d6, q10, #5 + vqrshrn.s32 d7, q11, #5 + vmin.s16 q8, q0, q6 + vmin.s16 q9, q1, q6 + vld1.16 {q0, q1}, [r1, :128]! // src + vmin.s16 q10, q2, q6 + vmin.s16 q11, q3, q6 + vmax.s16 q8, q8, q4 + vmax.s16 q9, q9, q4 + vld1.16 {q2, q3}, [r1, :128], r2 // src + vmvn.i16 q5, #0xf000 // 0x0fff + vmax.s16 q10, q10, q4 + vmax.s16 q11, q11, q4 +.elseif \ox + vmvn d4, d12 // grain_min + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r1, :128]! // src + vmin.s16 d16, d16, d12 + vmax.s16 d16, d16, d4 + vld1.16 {q2, q3}, [r1, :128], r2 // src +.endif + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
+ vand q0, q0, q5 + vand q1, q1, q5 + vand q2, q2, q5 + vand q3, q3, q5 + + bl gather32_neon + +.if \ox || \oy + vpush {q6-q7} +.endif + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + vshl.u16 q4, q4, q13 + vshl.u16 q5, q5, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + vqrdmulh.s16 q10, q10, q4 + vqrdmulh.s16 q11, q11, q5 + +.if \ox || \oy + vpop {q6-q7} +.endif + + vqadd.s16 q0, q0, q8 // *src + noise + vqadd.s16 q1, q1, q9 + vqadd.s16 q2, q2, q10 + vqadd.s16 q3, q3, q11 + + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + + vst1.16 {q0, q1}, [r0, :128]! // dst + subs r7, r7, #1 +.if \oy + vdup.16 d14, d25[0] + vdup.16 d15, d25[1] +.endif + vst1.16 {q2, q3}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r10, #2 + sub r7, r10, #2 // restore actual remaining h + bgt L(loop_\ox\()0) +.endif + vpop {q4-q7} + pop {r4-r11,pc} +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 +endfunc + +// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type, +// const int bitdepth_max); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // data, grain_lut + ldrd r10, r11, [sp, #124] // uv, is_id + ldr r6, [sp, #136] // bitdepth_max + + clz r7, r6 + rsb r7, r7, #24 // bitdepth_min_8 + + // !csfl + add r10, r4, r10, lsl #2 // + 4*uv + add r12, r10, #FGD_UV_LUMA_MULT + add lr, r10, #FGD_UV_MULT + ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset + vld1.16 {d30[]}, [r12] // uv_luma_mult + lsl r10, r10, r7 // uv_offset << bitdepth_min_8 + vld1.16 {d30[1]}, [lr] // uv_mult + + ldr lr, [r4, #FGD_SCALING_SHIFT] + ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] + eor lr, lr, #15 // 15 - scaling_shift + + vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8 + + cmp r12, #0 + vdup.16 q13, lr // 15 - scaling_shift + + beq 1f + // clip + cmp r11, #0 + mov r8, #16 + mov r9, #240 + lsl r8, r8, r7 + lsl r9, r9, r7 + beq 2f + // is_id + mov r9, #235 + lsl r9, r9, r7 + b 2f +1: + // no clip + mov r8, #0 + mov r9, r6 // bitdepth_max +2: + vmov.16 d30[3], r6 // bitdepth_max + vdup.16 d31, r8 // clip_min + + mov r10, #GRAIN_WIDTH*2 // grain_lut stride + +.if \sy + mov r6, #23 + mov r7, #22 +.else + mov r6, #27 + mov r7, #17 +.endif + vmov.16 d31[1], r9 // clip_max + + ldrd r8, r9, [sp, #116] // offsets, h + + add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 +.if \sy + add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride + add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride +.else + add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r10 // grain_lut += grain_stride +.endif + vmov.16 d31[2], r6 // overlap y [0] + + ldr r12, [r8, #8] // offsets[1][0] + calc_offset r12, r4, r12, \sx, \sy + add_offset r4, r12, r4, r5, r10 + + ldr r12, [r8, #4] // offsets[0][1] + calc_offset r12, lr, r12, \sx, \sy + add_offset lr, r12, lr, r5, 
r10 + + ldr r12, [r8, #12] // offsets[1][1] + calc_offset r12, r11, r12, \sx, \sy + add_offset r11, r12, r11, r5, r10 + + ldr r8, [r8] // offsets[0][0] + calc_offset r8, r12, r8, \sx, \sy + add_offset r5, r8, r12, r5, r10 + + vmov.16 d31[3], r7 // overlap y [1] + + add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + movrel_local r12, overlap_coeffs_\sx + ldr lr, [sp, #132] // type + ldrd r6, r7, [sp, #108] // luma_row, luma_stride + + vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs + + movrel_local r12, L(fguv_loop_sx\sx\()_tbl) +#if CONFIG_THUMB + // This uses movrel_local instead of adr above, because the target + // can be out of range for adr. But movrel_local leaves the thumb bit + // set on COFF (but probably wouldn't if building for thumb on ELF), + // thus try to clear the bit for robustness. + bic r12, r12, #1 +#endif + + tst lr, #1 + ldr lr, [r12, lr, lsl #2] + + add r12, r12, lr + + beq 1f + // y overlap + sub lr, r9, #(2 >> \sy) // backup remaining h + mov r9, #(2 >> \sy) + +1: +.if \sy + add r7, r7, r7 // luma_stride *= 2 +.endif + sub r7, r7, #32 // luma_stride -= 32 + + bx r12 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +L(fguv_loop_sx0_tbl): + .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): + sub r2, r2, #32 // src_stride -= 32 + sub r10, r10, #32 // grain_stride -= 32 +.if \oy + mov r12, lr +.endif +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart): +1: +.if \ox + vld1.16 {d0}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.16 {q2, q3}, [r8]! // grain_lut top +.endif +.if \ox && \oy + vld1.16 {d2}, [r11], r10 // grain_lut top old +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r6, :128]! // luma +.endif + vld1.16 {q8, q9}, [r5]! 
// grain_lut +.if \oy + vld1.16 {q4, q5}, [r8], r10 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.endif +.if \oy + vdup.16 d28, d31[2] // overlap y coeff + vdup.16 d29, d31[3] // overlap y coeff +.endif + vld1.16 {q10, q11}, [r5], r10 // grain_lut + +.if \ox + vdup.16 q7, d30[3] // bitdepth_max + add r4, r4, #32 + vmull.s16 q0, d0, d24 + vshr.u16 q7, q7, #1 // grain_max + vmlal.s16 q0, d16, d25 + vmvn q6, q7 // grain_min +.endif + +.if \oy +.if \ox + add r11, r11, #32 + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d4, q1, #5 + vmin.s16 d4, d4, d14 + vmin.s16 d16, d16, d14 + vmax.s16 d4, d4, d12 + vmax.s16 d16, d16, d12 +.endif + + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 +.if !\ox + vdup.16 q7, d30[3] // bitdepth_max +.endif + vmlal.s16 q0, d16, d29 + vmlal.s16 q1, d17, d29 + vmlal.s16 q2, d18, d29 + vmlal.s16 q3, d19, d29 +.if !\ox + vshr.u16 q7, q7, #1 // grain_max +.endif + vmull.s16 q8, d20, d29 + vmull.s16 q9, d21, d29 + vmull.s16 q10, d22, d29 + vmull.s16 q11, d23, d29 +.if !\ox + vmvn q6, q7 // grain_min +.endif + vmlal.s16 q8, d8, d28 + vmlal.s16 q9, d9, d28 + vmlal.s16 q10, d10, d28 + vmlal.s16 q11, d11, d28 + vqrshrn.s32 d0, q0, #5 + vqrshrn.s32 d1, q1, #5 + vqrshrn.s32 d2, q2, #5 + vqrshrn.s32 d3, q3, #5 + vqrshrn.s32 d4, q8, #5 + vqrshrn.s32 d5, q9, #5 + vqrshrn.s32 d6, q10, #5 + vqrshrn.s32 d7, q11, #5 + vmin.s16 q8, q0, q7 + vmin.s16 q9, q1, q7 + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 q10, q2, q7 + vmin.s16 q11, q3, q7 + vmax.s16 q8, q8, q6 + vmax.s16 q9, q9, q6 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 q10, q10, q6 + vmax.s16 q11, q11, q6 +.elseif \ox + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 d16, d16, d14 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 d16, d16, d12 +.endif + +.if !\csfl + vdup.16 d28, d30[0] // uv_luma_mult + vld1.16 {q4, q5}, [r1, :128]! // src + vdup.16 d29, d30[1] // uv_mult + vmull.s16 q6, d0, d28 + vmull.s16 q7, d1, d28 + vmull.s16 q0, d2, d28 + vmull.s16 q1, d3, d28 + vmlal.s16 q6, d8, d29 + vmlal.s16 q7, d9, d29 + vmlal.s16 q0, d10, d29 + vmlal.s16 q1, d11, d29 + vld1.16 {q4, q5}, [r1, :128] // src + sub r1, r1, #32 + vshrn.s32 d12, q6, #6 + vshrn.s32 d13, q7, #6 + vshrn.s32 d14, q0, #6 + vshrn.s32 d15, q1, #6 + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 + vmlal.s16 q0, d8, d29 + vmlal.s16 q1, d9, d29 + vmlal.s16 q2, d10, d29 + vmlal.s16 q3, d11, d29 + vdup.16 q14, d30[2] // uv_offset + vshrn.s32 d0, q0, #6 + vshrn.s32 d1, q1, #6 + vshrn.s32 d2, q2, #6 + vshrn.s32 d3, q3, #6 + vdup.16 q4, d30[3] // bitdepth_max + vmov.i16 q5, #0 + vadd.i16 q6, q6, q14 + vadd.i16 q7, q7, q14 + vadd.i16 q2, q0, q14 + vadd.i16 q3, q1, q14 + vmin.s16 q0, q6, q4 + vmin.s16 q1, q7, q4 + vmin.s16 q2, q2, q4 + vmin.s16 q3, q3, q4 + vmax.s16 q0, q0, q5 + vmax.s16 q1, q1, q5 + vmax.s16 q2, q2, q5 + vmax.s16 q3, q3, q5 +.else + vdup.16 q14, d30[3] // bitdepth_max + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q14 + vand q1, q1, q14 + vand q2, q2, q14 + vand q3, q3, q14 +.endif + + bl gather32_neon + + vld1.16 {q0, q1}, [r1, :128]! 
// src + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vld1.16 {q2, q3}, [r1, :128], r2 // src + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + vshl.u16 q4, q4, q13 + vshl.u16 q5, q5, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + vqrdmulh.s16 q10, q10, q4 + vqrdmulh.s16 q11, q11, q5 + + + vdup.16 q4, d31[0] // clip_min + vdup.16 q5, d31[1] // clip_max + + vqadd.s16 q0, q0, q8 // *src + noise + vqadd.s16 q1, q1, q9 + vqadd.s16 q2, q2, q10 + vqadd.s16 q3, q3, q11 + +.if \oy + vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x +.endif + + vmax.s16 q0, q0, q4 + vmax.s16 q1, q1, q4 + vmax.s16 q2, q2, q4 + vmax.s16 q3, q3, q4 + vmin.s16 q0, q0, q5 + vmin.s16 q1, q1, q5 + vmin.s16 q2, q2, q5 + vmin.s16 q3, q3, q5 + + vst1.16 {q0, q1}, [r0, :128]! // dst + + subs r9, r9, #1 +.if \oy + vmov.32 d31[1], lr // new coeffs for overlap y +.endif + + vst1.16 {q2, q3}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function fguv_loop_sx1_neon +L(fguv_loop_sx1_tbl): + .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.16 {d0}, [r4], r10 // grain_lut old +.endif +.if \ox && \oy + vld1.16 {d2}, [r11], r10 // grain_lut top old +.endif +.if \oy + vld1.16 {q2, q3}, [r8], r10 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r6, :128]! // luma +.endif + vld1.16 {q8, q9}, [r5], r10 // grain_lut +.if \oy + vdup.16 d28, d31[2] // overlap y coeff + vdup.16 d29, d31[3] // overlap y coeff +.endif +.if !\ox && !\oy + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.endif + +.if \ox + vdup.16 q7, d30[3] // bitdepth_max + vmull.s16 q0, d0, d24 + vshr.u16 q7, q7, #1 // grain_max + vmlal.s16 q0, d16, d25 + vmvn q6, q7 // grain_min +.endif + +.if \oy +.if \ox + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d4, q1, #5 + vmin.s16 d4, d4, d14 + vmin.s16 d16, d16, d14 + vmax.s16 d4, d4, d12 + vmax.s16 d16, d16, d12 +.endif + + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 +.if !\ox + vdup.16 q7, d30[3] // bitdepth_max +.endif + vmlal.s16 q0, d16, d29 + vmlal.s16 q1, d17, d29 + vmlal.s16 q2, d18, d29 + vmlal.s16 q3, d19, d29 +.if !\ox + vshr.u16 q7, q7, #1 // grain_max +.endif + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d17, q1, #5 + vqrshrn.s32 d18, q2, #5 + vqrshrn.s32 d19, q3, #5 +.if !\ox + vmvn q6, q7 // grain_min +.endif + vld1.16 {q0, q1}, [r6, :128]! 
// luma + vmin.s16 q8, q8, q7 + vmin.s16 q9, q9, q7 + vmax.s16 q8, q8, q6 + vmax.s16 q9, q9, q6 + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.elseif \ox + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 d16, d16, d14 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 d16, d16, d12 +.endif + + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vrshr.u16 q0, q0, #1 + vrshr.u16 q1, q1, #1 +.if !\csfl + vdup.16 d28, d30[0] // uv_luma_mult + vld1.16 {q2, q3}, [r1, :128], r2 // src + vdup.16 d29, d30[1] // uv_mult + vmull.s16 q6, d0, d28 + vmull.s16 q7, d1, d28 + vmull.s16 q0, d2, d28 + vmull.s16 q1, d3, d28 + vmlal.s16 q6, d4, d29 + vmlal.s16 q7, d5, d29 + vmlal.s16 q0, d6, d29 + vmlal.s16 q1, d7, d29 + vshrn.s32 d12, q6, #6 + vshrn.s32 d13, q7, #6 + vshrn.s32 d14, q0, #6 + vshrn.s32 d15, q1, #6 + vdup.16 q14, d30[2] // uv_offset + vdup.16 q4, d30[3] // bitdepth_max + vmov.i16 q5, #0 + vadd.i16 q6, q6, q14 + vadd.i16 q7, q7, q14 + vmin.s16 q0, q6, q4 + vmin.s16 q1, q7, q4 + vmax.s16 q0, q0, q5 + vmax.s16 q1, q1, q5 +.else + vdup.16 q14, d30[3] // bitdepth_max + vld1.16 {q2, q3}, [r1, :128], r2 // src + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q14 + vand q1, q1, q14 +.endif + + bl gather16_neon + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + + + vdup.16 q4, d31[0] // clip_min + vdup.16 q5, d31[1] // clip_max + + vqadd.s16 q0, q2, q8 // *src + noise + vqadd.s16 q1, q3, q9 + +.if \oy + // Swap the two last coefficients of d31, place them first in d28 + vrev64.16 d28, d31 +.endif + + vmax.s16 q0, q0, q4 + vmax.s16 q1, q1, q4 + vmin.s16 q0, q0, q5 + vmin.s16 q1, q1, q5 + + subs r9, r9, #1 +.if \oy + // Take the first two 16 bit coefficients of d28 and place them at the + // end of d31 + vtrn.32 d31, d28 +.endif + + vst1.16 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff -Nru dav1d-0.9.0/src/arm/32/film_grain.S dav1d-0.9.1/src/arm/32/film_grain.S --- dav1d-0.9.0/src/arm/32/film_grain.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/film_grain.S 2021-07-28 21:38:28.857851700 +0000 @@ -0,0 +1,714 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 + +.macro gather_interleaved dst1, dst2, src1, src2, off + vmov.u8 r11, \src1[0+\off] + vmov.u8 r12, \src2[0+\off] + add r11, r11, r3 + vmov.u8 lr, \src1[2+\off] + add r12, r12, r3 + vld1.8 {\dst1[0+\off]}, [r11] + vmov.u8 r11, \src2[2+\off] + add lr, lr, r3 + vld1.8 {\dst2[0+\off]}, [r12] + vmov.u8 r12, \src1[4+\off] + add r11, r11, r3 + vld1.8 {\dst1[2+\off]}, [lr] + vmov.u8 lr, \src2[4+\off] + add r12, r12, r3 + vld1.8 {\dst2[2+\off]}, [r11] + vmov.u8 r11, \src1[6+\off] + add lr, lr, r3 + vld1.8 {\dst1[4+\off]}, [r12] + vmov.u8 r12, \src2[6+\off] + add r11, r11, r3 + vld1.8 {\dst2[4+\off]}, [lr] + add r12, r12, r3 + vld1.8 {\dst1[6+\off]}, [r11] + vld1.8 {\dst2[6+\off]}, [r12] +.endm + +.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4 + gather_interleaved \dst1, \dst3, \src1, \src3, 0 + gather_interleaved \dst1, \dst3, \src1, \src3, 1 + gather_interleaved \dst2, \dst4, \src2, \src4, 0 + gather_interleaved \dst2, \dst4, \src2, \src4, 1 +.endm + +function gather32_neon + push {r11-r12,lr} + gather d8, d9, d10, d11, d0, d1, d2, d3 + pop {r11-r12,pc} +endfunc + +function gather16_neon + push {r11-r12,lr} + gather_interleaved d8, d9, d0, d1, 0 + gather_interleaved d8, d9, d0, d1, 1 + pop {r11-r12,pc} +endfunc + +const overlap_coeffs_0, align=4 + .byte 27, 17, 0, 0, 0, 0, 0, 0 + .byte 17, 27, 32, 32, 32, 32, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .byte 23, 0, 0, 0, 0, 0, 0, 0 + .byte 22, 32, 32, 32, 32, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type); +function fgy_32x32_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut + ldrd r6, r7, [sp, #108] // offsets, h + ldr r8, [sp, #116] // clip + mov r9, #GRAIN_WIDTH // grain_lut stride + + neg r4, r4 + vdup.16 q13, r4 // -scaling_shift + cmp r8, #0 + + movrel_local r12, overlap_coeffs_0 + + beq 1f + // clip + vmov.i8 q14, #16 + vmov.i8 q15, #235 + b 2f +1: 
+ // no clip + vmov.i8 q14, #0 + vmov.i8 q15, #255 +2: + + vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs + + add r5, r5, #9 // grain_lut += 9 + add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r9 // grain_lut += grain_stride + + ldr r10, [r6, #8] // offsets[1][0] + calc_offset r10, r4, r10, 0, 0 + add_offset r4, r10, r4, r5, r9 + ldr r10, [r6, #4] // offsets[0][1] + calc_offset r10, r11, r10, 0, 0 + add_offset r11, r10, r11, r5, r9 + ldr r10, [r6, #12] // offsets[1][1] + calc_offset r10, r8, r10, 0, 0 + add_offset r8, r10, r8, r5, r9 + ldr r6, [r6] // offsets[0][0] + calc_offset r6, lr, r6, 0, 0 + add_offset r5, r6, lr, r5, r9 + + add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + ldr r10, [sp, #120] // type + adr r11, L(fgy_loop_tbl) + + tst r10, #1 + ldr r10, [r11, r10, lsl #2] + + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx + + add r11, r11, r10 + + beq 1f + // y overlap + vdup.8 d14, d24[0] + vdup.8 d15, d24[1] + mov r10, r7 // backup actual h + mov r7, #2 +1: + bx r11 +endfunc + +function fgy_loop_neon +L(fgy_loop_tbl): + .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB + +.macro fgy ox, oy +L(loop_\ox\oy): +1: +.if \ox + vld1.8 {d8}, [r4], r9 // grain_lut old +.endif +.if \oy + vld1.8 {q2, q3}, [r6], r9 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r8], r9 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r1, :128], r2 // src + vld1.8 {q10, q11}, [r5], r9 // grain_lut + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d4, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d4, q5, #5 +.endif + + vmull.s8 q4, d20, d15 + vmull.s8 q5, d21, d15 + vmull.s8 q8, d22, d15 + vmull.s8 q9, d23, d15 + vmlal.s8 q4, d4, d14 + vmlal.s8 q5, d5, d14 + vmlal.s8 q8, d6, d14 + vmlal.s8 q9, d7, d14 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 + vqrshrn.s16 d22, q8, #5 + vqrshrn.s16 d23, q9, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif + + bl gather32_neon + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + + vmovl.u8 q2, d8 // scaling + vmovl.u8 q3, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vmul.i16 q8, q8, q2 // scaling * grain + vmul.i16 q9, q9, q3 + vmul.i16 q10, q10, q4 + vmul.i16 q11, q11, q5 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + vrshl.s16 q10, q10, q13 + vrshl.s16 q11, q11, q13 + + vaddw.u8 q8, q8, d0 // *src + noise + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + + vmax.u8 q0, q0, q14 + vmax.u8 q1, q1, q14 + vmin.u8 q0, q0, q15 + vmin.u8 q1, q1, q15 + + subs r7, r7, #1 +.if \oy + vdup.8 d14, d25[0] + vdup.8 d15, d25[1] +.endif + vst1.8 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r10, #2 + sub r7, r10, #2 // restore actual remaining h + bgt L(loop_\ox\()0) +.endif + vpop {q4-q7} + pop {r4-r11,pc} +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 +endfunc + +// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const 
entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // data, grain_lut + ldrd r6, r7, [sp, #108] // luma_row, luma_stride + ldrd r8, r9, [sp, #116] // offsets, h + ldrd r10, r11, [sp, #124] // uv, is_id + + // !csfl + add r10, r4, r10, lsl #2 // + 4*uv + add r12, r10, #FGD_UV_LUMA_MULT + add lr, r10, #FGD_UV_MULT + add r10, r10, #FGD_UV_OFFSET + vld1.16 {d4[]}, [r12] // uv_luma_mult + vld1.16 {d4[2]}, [r10] // uv_offset + vld1.16 {d4[1]}, [lr] // uv_mult + + ldr lr, [r4, #FGD_SCALING_SHIFT] + ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] + neg lr, lr // -scaling_shift + + cmp r12, #0 + vdup.16 q13, lr // -scaling_shift + + beq 1f + // clip + cmp r11, #0 + vmov.i8 q14, #16 + vmov.i8 q15, #240 + beq 2f + // is_id + vmov.i8 q15, #235 + b 2f +1: + // no clip + vmov.i8 q14, #0 + vmov.i8 q15, #255 +2: + + mov r10, #GRAIN_WIDTH // grain_lut stride + + add r5, r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 +.if \sy + add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride + add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride +.else + add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r10 // grain_lut += grain_stride +.endif + + ldr r12, [r8, #8] // offsets[1][0] + calc_offset r12, r4, r12, \sx, \sy + add_offset r4, r12, r4, r5, r10 + + ldr r12, [r8, #4] // offsets[0][1] + calc_offset r12, lr, r12, \sx, \sy + add_offset lr, r12, lr, r5, r10 + + ldr r12, [r8, #12] // offsets[1][1] + calc_offset r12, r11, r12, \sx, \sy + add_offset r11, r12, r11, r5, r10 + + ldr r8, [r8] // offsets[0][0] + calc_offset r8, r12, r8, \sx, \sy + add_offset r5, r8, r12, r5, r10 + + add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + movrel_local r12, overlap_coeffs_\sx + ldr lr, [sp, #132] // type + + vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs + + movrel_local r12, L(fguv_loop_sx\sx\()_tbl) +#if CONFIG_THUMB + // This uses movrel_local instead of adr above, because the target + // can be out of range for adr. But movrel_local leaves the thumb bit + // set on COFF (but probably wouldn't if building for thumb on ELF), + // thus try to clear the bit for robustness. 
+ bic r12, r12, #1 +#endif + + tst lr, #1 + ldr lr, [r12, lr, lsl #2] + + add r12, r12, lr + + beq 1f + // y overlap + sub lr, r9, #(2 >> \sy) // backup remaining h + mov r9, #(2 >> \sy) + +1: + +.if \sy + vmov.i8 d6, #23 + vmov.i8 d7, #22 +.else + vmov.i8 d6, #27 + vmov.i8 d7, #17 +.endif + +.if \sy + add r7, r7, r7 // luma_stride *= 2 +.endif + + bx r12 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +L(fguv_loop_sx0_tbl): + .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.8 {d8}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.8 {q8, q9}, [r8], r10 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r11], r10 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r6, :128], r7 // luma + vld1.8 {q10, q11}, [r5], r10 // grain_lut + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d16, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d16, q5, #5 +.endif + + vmull.s8 q4, d20, d7 + vmull.s8 q5, d21, d7 + vmull.s8 q6, d22, d7 + vmull.s8 q7, d23, d7 + vmlal.s8 q4, d16, d6 + vmlal.s8 q5, d17, d6 + vmlal.s8 q6, d18, d6 + vmlal.s8 q7, d19, d6 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 + vqrshrn.s16 d22, q6, #5 + vqrshrn.s16 d23, q7, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif +.if !\csfl + vld1.8 {q8, q9}, [r1, :128] // src + vmovl.u8 q4, d0 + vmovl.u8 q5, d1 + vmovl.u8 q6, d2 + vmovl.u8 q7, d3 + vmovl.u8 q0, d16 + vmovl.u8 q1, d17 + vmovl.u8 q8, d18 + vmovl.u8 q9, d19 + vmul.i16 q4, q4, d4[0] + vmul.i16 q5, q5, d4[0] + vmul.i16 q6, q6, d4[0] + vmul.i16 q7, q7, d4[0] + vmul.i16 q0, q0, d4[1] + vmul.i16 q1, q1, d4[1] + vmul.i16 q8, q8, d4[1] + vmul.i16 q9, q9, d4[1] + vqadd.s16 q4, q4, q0 + vqadd.s16 q5, q5, q1 + vqadd.s16 q6, q6, q8 + vqadd.s16 q7, q7, q9 + vdup.16 q0, d4[2] + vshr.s16 q4, q4, #6 + vshr.s16 q5, q5, #6 + vshr.s16 q6, q6, #6 + vshr.s16 q7, q7, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vadd.i16 q6, q6, q0 + vadd.i16 q7, q7, q0 + vqmovun.s16 d0, q4 + vqmovun.s16 d1, q5 + vqmovun.s16 d2, q6 + vqmovun.s16 d3, q7 +.endif + + bl gather32_neon + + vld1.8 {q0, q1}, [r1, :128], r2 // src + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vmul.i16 q8, q8, q6 // scaling * grain + vmul.i16 q9, q9, q7 + vmul.i16 q10, q10, q4 + vmul.i16 q11, q11, q5 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + vrshl.s16 q10, q10, q13 + vrshl.s16 q11, q11, q13 + + vaddw.u8 q8, q8, d0 // *src + noise + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + + vmax.u8 q0, q0, q14 + vmax.u8 q1, q1, q14 + vmin.u8 q0, q0, q15 + vmin.u8 q1, q1, q15 + + subs r9, r9, 
#1 +.if \oy + vdup.8 d6, d25[0] + vdup.8 d7, d25[1] +.endif + + vst1.8 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function fguv_loop_sx1_neon +L(fguv_loop_sx1_tbl): + .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.8 {d8}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.8 {q8}, [r8], r10 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r11], r10 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r6, :128], r7 // luma + vld1.8 {q10}, [r5], r10 // grain_lut + vld1.8 {q11}, [r1, :128], r2 // src + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d16, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d16, q5, #5 +.endif + + vmull.s8 q4, d20, d7 + vmull.s8 q5, d21, d7 + vmlal.s8 q4, d16, d6 + vmlal.s8 q5, d17, d6 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif +.if \csfl + vrshrn.u16 d0, q0, #1 + vrshrn.u16 d1, q1, #1 +.else + vrshr.u16 q4, q0, #1 + vrshr.u16 q5, q1, #1 + vmovl.u8 q0, d22 + vmovl.u8 q1, d23 + vmul.i16 q4, q4, d4[0] + vmul.i16 q5, q5, d4[0] + vmul.i16 q0, q0, d4[1] + vmul.i16 q1, q1, d4[1] + vqadd.s16 q4, q4, q0 + vqadd.s16 q5, q5, q1 + vdup.16 q0, d4[2] + vshr.s16 q4, q4, #6 + vshr.s16 q5, q5, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vqmovun.s16 d0, q4 + vqmovun.s16 d1, q5 +.endif + + bl gather16_neon + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + + vmul.i16 q8, q8, q6 // scaling * grain + vmul.i16 q9, q9, q7 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + + vaddw.u8 q8, q8, d22 // *src + noise + vaddw.u8 q9, q9, d23 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + + vmax.u8 q0, q0, q14 + vmin.u8 q0, q0, q15 + + subs r9, r9, #1 +.if \oy + vswp d6, d7 +.endif + vst1.8 {q0}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff -Nru dav1d-0.9.0/src/arm/64/film_grain16.S dav1d-0.9.1/src/arm/64/film_grain16.S --- dav1d-0.9.0/src/arm/64/film_grain16.S 2021-05-16 16:47:22.526950600 +0000 +++ dav1d-0.9.1/src/arm/64/film_grain16.S 2021-07-28 
21:38:28.865851900 +0000 @@ -27,6 +27,7 @@ #include "src/arm/asm.S" #include "util.S" +#include "src/arm/asm-offsets.h" #define GRAIN_WIDTH 82 @@ -64,11 +65,18 @@ gather_interleaved \dst2, \dst1, \src4, \src2, 8 .endm -function gather_neon +function gather32_neon gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h ret endfunc +function gather16_neon + gather_interleaved v6.b, v7.b, v0.h, v1.h, 0 + gather_interleaved v7.b, v6.b, v1.h, v0.h, 0 + ins v6.d[1], v7.d[0] + ret +endfunc + const overlap_coeffs_0, align=4 .short 27, 17, 0, 0 .short 17, 27, 32, 32 @@ -110,6 +118,7 @@ stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] str d14, [sp, #64] + eor w4, w4, #15 // 15 - scaling_shift ldr w11, [x6, #8] // offsets[1][0] ldr w13, [x6, #4] // offsets[0][1] ldr w15, [x6, #12] // offsets[1][1] @@ -122,8 +131,7 @@ mov x9, #GRAIN_WIDTH*2 // grain_lut stride neg w10, w10 // bitdepth_min_8 - neg w4, w4 - dup v29.4s, w4 // -scaling_shift + dup v29.8h, w4 // 15 - scaling_shift dup v27.8h, w10 // bitdepth_min_8 movrel x16, overlap_coeffs_0 @@ -207,7 +215,7 @@ and v1.16b, v1.16b, v4.16b and v2.16b, v2.16b, v4.16b and v3.16b, v3.16b, v4.16b - bl gather_neon + bl gather32_neon .if \ox smull v20.4s, v20.4h, v27.4h @@ -268,7 +276,7 @@ smax v19.8h, v19.8h, v25.8h .endif - uxtl v4.8h, v6.8b // scaling + uxtl v4.8h, v6.8b // scaling .if \ox && !\oy sqrshrn v20.4h, v20.4s, #5 .endif @@ -281,37 +289,18 @@ smax v20.4h, v20.4h, v25.4h .endif uxtl2 v7.8h, v7.16b - .if \ox && !\oy - smull v20.4s, v20.4h, v4.4h // scaling * grain -.else - smull v20.4s, v16.4h, v4.4h + ins v16.d[0], v20.d[0] .endif - smull2 v21.4s, v16.8h, v4.8h - smull v22.4s, v17.4h, v5.4h - smull2 v23.4s, v17.8h, v5.8h - smull v16.4s, v18.4h, v6.4h - smull2 v17.4s, v18.8h, v6.8h - smull v18.4s, v19.4h, v7.4h - smull2 v19.4s, v19.8h, v7.8h - - srshl v20.4s, v20.4s, v29.4s // round2(scaling * grain, scaling_shift) - srshl v21.4s, v21.4s, v29.4s - srshl v22.4s, v22.4s, v29.4s - srshl v23.4s, v23.4s, v29.4s - srshl v16.4s, v16.4s, v29.4s - srshl v17.4s, v17.4s, v29.4s - srshl v18.4s, v18.4s, v29.4s - srshl v19.4s, v19.4s, v29.4s - - sqxtn v20.4h, v20.4s - sqxtn2 v20.8h, v21.4s - sqxtn v21.4h, v22.4s - sqxtn2 v21.8h, v23.4s - sqxtn v22.4h, v16.4s - sqxtn2 v22.8h, v17.4s - sqxtn v23.4h, v18.4s - sqxtn2 v23.8h, v19.4s + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v21.8h, v17.8h, v5.8h + sqrdmulh v22.8h, v18.8h, v6.8h + sqrdmulh v23.8h, v19.8h, v7.8h usqadd v0.8h, v20.8h // *src + noise usqadd v1.8h, v21.8h @@ -359,3 +348,506 @@ .hword L(fgy_loop_tbl) - L(loop_10) .hword L(fgy_loop_tbl) - L(loop_11) endfunc + +// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type, +// const int bitdepth_max); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_16bpc_neon, export=1 + str x30, [sp, #-80]! 
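(Aside, not part of the patch: the 16 bpc fgy hunk above swaps the old 32-bit multiply + srshl + sqxtn sequence for a single sqrdmulh per vector by pre-shifting the scaling value left by (15 - scaling_shift), as its "round2((scaling << (15 - scaling_shift) * grain, 15)" comment notes. The scalar C sketch below illustrates why the two forms agree; the helper names are invented for this sketch, saturation is ignored, and an arithmetic right shift plus the bounded grain/scaling ranges used here are assumed.)

#include <assert.h>
#include <stdint.h>

/* SQRDMULH without the saturating corner case: (2*a*b + 2^15) >> 16, rounded. */
static int16_t sqrdmulh16(int16_t a, int16_t b) {
    return (int16_t)(((int64_t)2 * a * b + (1 << 15)) >> 16);
}

/* Check round2(grain * scaling, shift) == sqrdmulh(grain, scaling << (15 - shift)). */
static void check(int16_t grain, uint8_t scaling, int scaling_shift) {
    int16_t ref = (int16_t)((grain * scaling + (1 << (scaling_shift - 1))) >> scaling_shift);
    int16_t opt = sqrdmulh16(grain, (int16_t)(scaling << (15 - scaling_shift)));
    assert(ref == opt);
}

/* e.g. check(-412, 183, 10); check(511, 255, 8); */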
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + + ldp x8, x9, [sp, #80] // offsets, h + ldp x10, x11, [sp, #96] // uv, is_id + ldr w16, [sp, #120] // bitdepth_max + + ldr w13, [x4, #FGD_SCALING_SHIFT] + ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] + dup v23.8h, w16 // bitdepth_max + clz w16, w16 + eor w13, w13, #15 // 15 - scaling_shift + sub w16, w16, #24 // -bitdepth_min_8 + + // !csfl + add x10, x4, x10, lsl #2 // + 4*uv + add x14, x10, #FGD_UV_LUMA_MULT + add x15, x10, #FGD_UV_MULT + add x10, x10, #FGD_UV_OFFSET + neg w16, w16 // bitdepth_min_8 + ld1r {v8.8h}, [x14] // uv_luma_mult + ld1r {v24.8h}, [x10] // uv_offset + ld1r {v9.8h}, [x15] // uv_mult + + dup v29.8h, w13 // 15 - scaling_shift + dup v27.8h, w16 // bitdepth_min_8 + + cbz w12, 1f + // clip + movi v30.8h, #16 + movi v31.8h, #240 + sshl v30.8h, v30.8h, v27.8h + sshl v31.8h, v31.8h, v27.8h + cbz w11, 2f + // is_id + movi v31.8h, #235 + sshl v31.8h, v31.8h, v27.8h + b 2f +1: + // no clip + movi v30.8h, #0 + mov v31.16b, v23.16b // bitdepth_max +2: + + ushr v15.8h, v23.8h, #1 // grain_max + sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8 + not v14.16b, v15.16b // grain_min + + ldr w12, [x8, #8] // offsets[1][0] + ldr w14, [x8, #4] // offsets[0][1] + ldr w16, [x8, #12] // offsets[1][1] + ldr w8, [x8] // offsets[0][0] + + mov x10, #GRAIN_WIDTH*2 // grain_lut stride + + add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 +.if \sy + add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride + add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride +.else + add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x10 // grain_lut += grain_stride +.endif + + calc_offset w12, w13, w12, \sx, \sy + calc_offset w14, w15, w14, \sx, \sy + calc_offset w16, w17, w16, \sx, \sy + calc_offset w8, w11, w8, \sx, \sy + + add_offset x13, w12, x13, x5, x10 + add_offset x15, w14, x15, x5, x10 + add_offset x17, w16, x17, x5, x10 + add_offset x5, w8, x11, x5, x10 + + add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + ldr w13, [sp, #112] // type + + movrel x16, overlap_coeffs_\sx + adr x14, L(fguv_loop_sx\sx\()_tbl) + + ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs + tst w13, #1 + ldrh w13, [x14, w13, uxtw #1] + + b.eq 1f + // y overlap + sub w12, w9, #(2 >> \sy) // backup remaining h + mov w9, #(2 >> \sy) + +1: + sub x13, x14, w13, uxtw + +.if \sy + movi v25.8h, #23 + movi v26.8h, #22 +.else + movi v25.8h, #27 + movi v26.8h, #17 +.endif + +.if \sy + add x7, x7, x7 // luma_stride *= 2 +.endif + + br x13 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): +1: +.if \ox + ld1 {v4.4h}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v5.4h}, [x11], x10 // grain_lut top old +.endif + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut + +.if \ox + smull v4.4s, v4.4h, v27.4h + smlal v4.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v5.4s, v5.4h, v27.4h + smlal v5.4s, v0.4h, v28.4h + sqrshrn v4.4h, v4.4s, #5 + sqrshrn v5.4h, v5.4s, #5 + smin v4.4h, v4.4h, v15.4h + smin v5.4h, v5.4h, v15.4h + smax v4.4h, v4.4h, v14.4h + 
smax v5.4h, v5.4h, v14.4h + ins v16.d[0], v4.d[0] + ins v0.d[0], v5.d[0] +.endif + + smull v6.4s, v16.4h, v26.4h + smull2 v7.4s, v16.8h, v26.8h + smull v10.4s, v17.4h, v26.4h + smull2 v11.4s, v17.8h, v26.8h + smull v16.4s, v18.4h, v26.4h + smull2 v17.4s, v18.8h, v26.8h + smull v18.4s, v19.4h, v26.4h + smull2 v19.4s, v19.8h, v26.8h + smlal v6.4s, v0.4h, v25.4h + smlal2 v7.4s, v0.8h, v25.8h + smlal v10.4s, v1.4h, v25.4h + smlal2 v11.4s, v1.8h, v25.8h + smlal v16.4s, v2.4h, v25.4h + smlal2 v17.4s, v2.8h, v25.8h + smlal v18.4s, v3.4h, v25.4h + smlal2 v19.4s, v3.8h, v25.8h + sqrshrn v6.4h, v6.4s, #5 + sqrshrn2 v6.8h, v7.4s, #5 + sqrshrn v7.4h, v10.4s, #5 + sqrshrn2 v7.8h, v11.4s, #5 + sqrshrn v10.4h, v16.4s, #5 + sqrshrn2 v10.8h, v17.4s, #5 + sqrshrn v11.4h, v18.4s, #5 + sqrshrn2 v11.8h, v19.4s, #5 +.endif + +.if \ox && !\oy + sqrshrn v4.4h, v4.4s, #5 + smin v4.4h, v4.4h, v15.4h +.endif + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma +.if \oy + smin v16.8h, v6.8h, v15.8h + smin v17.8h, v7.8h, v15.8h + smin v18.8h, v10.8h, v15.8h + smin v19.8h, v11.8h, v15.8h + smax v16.8h, v16.8h, v14.8h + smax v17.8h, v17.8h, v14.8h + smax v18.8h, v18.8h, v14.8h + smax v19.8h, v19.8h, v14.8h +.endif + +.if \ox && !\oy + smax v4.4h, v4.4h, v14.4h +.endif + ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src +.if \ox && !\oy + ins v16.d[0], v4.d[0] +.endif + +.if !\csfl + smull v4.4s, v0.4h, v8.4h + smull2 v5.4s, v0.8h, v8.8h + smull v6.4s, v1.4h, v8.4h + smull2 v7.4s, v1.8h, v8.8h + smull v0.4s, v2.4h, v8.4h + smull2 v1.4s, v2.8h, v8.8h + smull v2.4s, v3.4h, v8.4h + smull2 v3.4s, v3.8h, v8.8h + smlal v4.4s, v10.4h, v9.4h + smlal2 v5.4s, v10.8h, v9.8h + smlal v6.4s, v11.4h, v9.4h + smlal2 v7.4s, v11.8h, v9.8h + smlal v0.4s, v12.4h, v9.4h + smlal2 v1.4s, v12.8h, v9.8h + smlal v2.4s, v13.4h, v9.4h + smlal2 v3.4s, v13.8h, v9.8h + shrn v4.4h, v4.4s, #6 + shrn2 v4.8h, v5.4s, #6 + shrn v5.4h, v6.4s, #6 + shrn2 v5.8h, v7.4s, #6 + shrn v6.4h, v0.4s, #6 + shrn2 v6.8h, v1.4s, #6 + shrn v7.4h, v2.4s, #6 + shrn2 v7.8h, v3.4s, #6 + add v0.8h, v4.8h, v24.8h + add v1.8h, v5.8h, v24.8h + add v2.8h, v6.8h, v24.8h + add v3.8h, v7.8h, v24.8h + movi v20.8h, #0 + smin v0.8h, v0.8h, v23.8h + smin v1.8h, v1.8h, v23.8h + smin v2.8h, v2.8h, v23.8h + smin v3.8h, v3.8h, v23.8h + smax v0.8h, v0.8h, v20.8h + smax v1.8h, v1.8h, v20.8h + smax v2.8h, v2.8h, v20.8h + smax v3.8h, v3.8h, v20.8h +.else + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
+ and v0.16b, v0.16b, v23.16b + and v1.16b, v1.16b, v23.16b + and v2.16b, v2.16b, v23.16b + and v3.16b, v3.16b, v23.16b +.endif + + bl gather32_neon + + uxtl v4.8h, v6.8b // scaling + uxtl2 v5.8h, v6.16b + uxtl v6.8h, v7.8b + uxtl2 v7.8h, v7.16b + + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v17.8h, v17.8h, v5.8h + sqrdmulh v18.8h, v18.8h, v6.8h + sqrdmulh v19.8h, v19.8h, v7.8h + + usqadd v10.8h, v16.8h // *src + noise + usqadd v11.8h, v17.8h + usqadd v12.8h, v18.8h + usqadd v13.8h, v19.8h + + umax v0.8h, v10.8h, v30.8h + umax v1.8h, v11.8h, v30.8h + umax v2.8h, v12.8h, v30.8h + umax v3.8h, v13.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + + subs w9, w9, #1 +.if \oy + dup v25.8h, v28.h[0] + dup v26.8h, v28.h[1] +.endif + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + ret + +L(fguv_loop_sx0_tbl): + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) +endfunc + +function fguv_loop_sx1_neon +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +1: +.if \ox + ld1 {v18.4h}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v19.4h}, [x11], x10 // grain_lut top old +.endif + ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut + +.if \ox + smull v18.4s, v18.4h, v27.4h + smlal v18.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v19.4s, v19.4h, v27.4h + smlal v19.4s, v20.4h, v28.4h + sqrshrn v18.4h, v18.4s, #5 + sqrshrn v19.4h, v19.4s, #5 + smin v18.4h, v18.4h, v15.4h + smin v19.4h, v19.4h, v15.4h + smax v18.4h, v18.4h, v14.4h + smax v19.4h, v19.4h, v14.4h + ins v16.d[0], v18.d[0] + ins v20.d[0], v19.d[0] +.endif + + smull v0.4s, v16.4h, v26.4h + smull2 v1.4s, v16.8h, v26.8h + smull v2.4s, v17.4h, v26.4h + smull2 v3.4s, v17.8h, v26.8h + smlal v0.4s, v20.4h, v25.4h + smlal2 v1.4s, v20.8h, v25.8h + smlal v2.4s, v21.4h, v25.4h + smlal2 v3.4s, v21.8h, v25.8h + sqrshrn v16.4h, v0.4s, #5 + sqrshrn2 v16.8h, v1.4s, #5 + sqrshrn v17.4h, v2.4s, #5 + sqrshrn2 v17.8h, v3.4s, #5 +.endif + +.if \ox && !\oy + sqrshrn v18.4h, v18.4s, #5 + smin v18.4h, v18.4h, v15.4h +.endif + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma +.if \oy + smin v16.8h, v16.8h, v15.8h + smin v17.8h, v17.8h, v15.8h + smax v16.8h, v16.8h, v14.8h + smax v17.8h, v17.8h, v14.8h +.endif + +.if \ox && !\oy + smax v18.4h, v18.4h, v14.4h +.endif + ld1 {v10.8h, v11.8h}, [x1], x2 // src +.if \ox && !\oy + ins 
v16.d[0], v18.d[0] +.endif + addp v0.8h, v0.8h, v1.8h + addp v1.8h, v2.8h, v3.8h + urshr v0.8h, v0.8h, #1 + urshr v1.8h, v1.8h, #1 +.if !\csfl + smull v2.4s, v0.4h, v8.4h + smull2 v3.4s, v0.8h, v8.8h + smull v0.4s, v1.4h, v8.4h + smull2 v1.4s, v1.8h, v8.8h + smlal v2.4s, v10.4h, v9.4h + smlal2 v3.4s, v10.8h, v9.8h + smlal v0.4s, v11.4h, v9.4h + smlal2 v1.4s, v11.8h, v9.8h + shrn v2.4h, v2.4s, #6 + shrn2 v2.8h, v3.4s, #6 + shrn v3.4h, v0.4s, #6 + shrn2 v3.8h, v1.4s, #6 + add v0.8h, v2.8h, v24.8h + add v1.8h, v3.8h, v24.8h + movi v2.8h, #0 + smin v0.8h, v0.8h, v23.8h + smin v1.8h, v1.8h, v23.8h + smax v0.8h, v0.8h, v2.8h + smax v1.8h, v1.8h, v2.8h +.else + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + and v0.16b, v0.16b, v23.16b + and v1.16b, v1.16b, v23.16b +.endif + + bl gather16_neon + + uxtl v4.8h, v6.8b // scaling + uxtl2 v5.8h, v6.16b + + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + + sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v17.8h, v17.8h, v5.8h + + usqadd v10.8h, v16.8h // *src + noise + usqadd v11.8h, v17.8h + + umax v0.8h, v10.8h, v30.8h + umax v1.8h, v11.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + +.if \oy + mov v16.16b, v25.16b +.endif + subs w9, w9, #1 +.if \oy + mov v25.16b, v26.16b + mov v26.16b, v16.16b +.endif + st1 {v0.8h, v1.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + ret + +L(fguv_loop_sx1_tbl): + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) +endfunc diff -Nru dav1d-0.9.0/src/arm/64/film_grain.S dav1d-0.9.1/src/arm/64/film_grain.S --- dav1d-0.9.0/src/arm/64/film_grain.S 2021-05-16 16:47:22.526950600 +0000 +++ dav1d-0.9.1/src/arm/64/film_grain.S 2021-07-28 21:38:28.865851900 +0000 @@ -232,12 +232,14 @@ smull2 v5.8h, v0.16b, v27.16b smull v6.8h, v1.8b, v29.8b smull2 v7.8h, v1.16b, v29.16b - add v2.8h, v2.8h, v4.8h - add v3.8h, v3.8h, v5.8h - saddl v4.4s, v2.4h, v6.4h - saddl2 v5.4s, v2.8h, v6.8h - saddl v6.4s, v3.4h, v7.4h - saddl2 v7.4s, v3.8h, v7.8h + saddl v0.4s, v2.4h, v4.4h + saddl2 v1.4s, v2.8h, v4.8h + saddl v2.4s, v3.4h, v5.4h + saddl2 v3.4s, v3.8h, v5.8h + saddw v4.4s, v0.4s, v6.4h + saddw2 v5.4s, v1.4s, v6.8h + saddw v6.4s, v2.4s, v7.4h + saddw2 v7.4s, v3.4s, v7.8h ret endfunc @@ -450,14 +452,18 @@ smull2 v7.8h, v0.16b, v28.16b smull v0.8h, v1.8b, v29.8b smull2 v1.8h, v1.16b, v29.16b - add v2.8h, v2.8h, v4.8h - add v3.8h, v3.8h, v5.8h - add v0.8h, v0.8h, v6.8h - add v1.8h, v1.8h, v7.8h - saddl v4.4s, v2.4h, v0.4h - saddl2 v5.4s, v2.8h, v0.8h - saddl v6.4s, v3.4h, v1.4h - saddl2 v7.4s, v3.8h, v1.8h + saddl 
v22.4s, v2.4h, v4.4h + saddl2 v23.4s, v2.8h, v4.8h + saddl v26.4s, v3.4h, v5.4h + saddl2 v27.4s, v3.8h, v5.8h + saddl v2.4s, v0.4h, v6.4h + saddl2 v3.4s, v0.8h, v6.8h + saddl v6.4s, v1.4h, v7.4h + saddl2 v7.4s, v1.8h, v7.8h + add v4.4s, v22.4s, v2.4s + add v5.4s, v23.4s, v3.4s + add v6.4s, v26.4s, v6.4s + add v7.4s, v27.4s, v7.4s ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid dup v26.16b, v30.b[5] @@ -476,14 +482,18 @@ smull2 v27.8h, v0.16b, v28.16b smull v28.8h, v1.8b, v29.8b smull2 v29.8h, v1.16b, v29.16b - add v2.8h, v2.8h, v22.8h - add v3.8h, v3.8h, v23.8h - add v26.8h, v26.8h, v28.8h - add v27.8h, v27.8h, v29.8h - saddl v0.4s, v2.4h, v26.4h - saddl2 v1.4s, v2.8h, v26.8h - saddl v2.4s, v3.4h, v27.4h - saddl2 v3.4s, v3.8h, v27.8h + saddl v0.4s, v2.4h, v22.4h + saddl2 v1.4s, v2.8h, v22.8h + saddl v2.4s, v3.4h, v23.4h + saddl2 v3.4s, v3.8h, v23.8h + saddl v22.4s, v26.4h, v28.4h + saddl2 v23.4s, v26.8h, v28.8h + saddl v26.4s, v27.4h, v29.4h + saddl2 v27.4s, v27.8h, v29.8h + add v0.4s, v0.4s, v22.4s + add v1.4s, v1.4s, v23.4s + add v2.4s, v2.4s, v26.4s + add v3.4s, v3.4s, v27.4s dup v26.16b, v30.b[2] dup v27.16b, v30.b[7] smull v22.8h, v17.8b, v26.8b @@ -498,14 +508,16 @@ mov v16.16b, v17.16b mov v17.16b, v18.16b - add v22.8h, v22.8h, v24.8h - add v23.8h, v23.8h, v25.8h + saddl v0.4s, v22.4h, v24.4h + saddl2 v1.4s, v22.8h, v24.8h + saddl v2.4s, v23.4h, v25.4h + saddl2 v3.4s, v23.8h, v25.8h mov v19.16b, v20.16b mov v20.16b, v21.16b - saddw v4.4s, v4.4s, v22.4h - saddw2 v5.4s, v5.4s, v22.8h - saddw v6.4s, v6.4s, v23.4h - saddw2 v7.4s, v7.4s, v23.8h + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s ret endfunc @@ -711,32 +723,38 @@ smull2 v3.8h, v9.16b, v23.16b smull v8.8h, v10.8b, v24.8b smull2 v9.8h, v10.16b, v24.16b - add v0.8h, v0.8h, v2.8h - add v1.8h, v1.8h, v3.8h smull v10.8h, v11.8b, v26.8b smull2 v11.8h, v11.16b, v26.16b - smull v2.8h, v12.8b, v27.8b - smull2 v3.8h, v12.16b, v27.16b - add v8.8h, v8.8h, v10.8h - add v9.8h, v9.8h, v11.8h + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b smull v10.8h, v13.8b, v28.8b smull2 v11.8h, v13.16b, v28.16b - saddl v4.4s, v0.4h, v8.4h - saddl2 v5.4s, v0.8h, v8.8h - saddl v6.4s, v1.4h, v9.4h - saddl2 v7.4s, v1.8h, v9.8h - smull v8.8h, v14.8b, v25.8b - smull2 v9.8h, v14.16b, v25.16b - add v2.8h, v2.8h, v10.8h - add v3.8h, v3.8h, v11.8h - saddl v0.4s, v2.4h, v8.4h - saddl2 v1.4s, v2.8h, v8.8h - saddl v2.4s, v3.4h, v9.4h - saddl2 v3.4s, v3.8h, v9.8h + smull v12.8h, v14.8b, v25.8b + smull2 v13.8h, v14.16b, v25.16b + add v4.4s, v22.4s, v0.4s + add v5.4s, v23.4s, v1.4s + add v6.4s, v24.4s, v2.4s + add v7.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h add v4.4s, v4.4s, v0.4s add v5.4s, v5.4s, v1.4s add v6.4s, v6.4s, v2.4s add v7.4s, v7.4s, v3.4s + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v13.4h + saddw2 v7.4s, v7.4s, v13.8h ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid dup v22.16b, v29.b[7] @@ -758,36 +776,42 @@ smull2 v3.8h, v9.16b, v23.16b smull v8.8h, v10.8b, v24.8b smull2 v9.8h, v10.16b, v24.16b - add v0.8h, v0.8h, v2.8h - add v1.8h, v1.8h, v3.8h smull v10.8h, v11.8b, v26.8b smull2 v11.8h, v11.16b, v26.16b - 
smull v2.8h, v12.8b, v27.8b - smull2 v3.8h, v12.16b, v27.16b - add v8.8h, v8.8h, v10.8h - add v9.8h, v9.8h, v11.8h + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b smull v10.8h, v13.8b, v28.8b smull2 v11.8h, v13.16b, v28.16b - saddl v12.4s, v0.4h, v8.4h - saddl2 v13.4s, v0.8h, v8.8h - saddl v0.4s, v1.4h, v9.4h - saddl2 v1.4s, v1.8h, v9.8h - smull v8.8h, v17.8b, v25.8b - smull2 v9.8h, v17.16b, v25.16b - add v2.8h, v2.8h, v10.8h - add v3.8h, v3.8h, v11.8h - add v4.4s, v4.4s, v12.4s - add v5.4s, v5.4s, v13.4s - add v6.4s, v6.4s, v0.4s - add v7.4s, v7.4s, v1.4s - saddl v0.4s, v2.4h, v8.4h - saddl2 v1.4s, v2.8h, v8.8h - saddl v2.4s, v3.4h, v9.4h - saddl2 v3.4s, v3.8h, v9.8h + smull v12.8h, v17.8b, v25.8b + smull2 v13.8h, v17.16b, v25.16b + add v22.4s, v22.4s, v0.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v2.4s + add v26.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v22.4s + add v5.4s, v5.4s, v23.4s + add v6.4s, v6.4s, v24.4s + add v7.4s, v7.4s, v26.4s add v4.4s, v4.4s, v0.4s add v5.4s, v5.4s, v1.4s add v6.4s, v6.4s, v2.4s add v7.4s, v7.4s, v3.4s + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v13.4h + saddw2 v7.4s, v7.4s, v13.8h ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid dup v22.16b, v29.b[14] @@ -809,42 +833,46 @@ smull2 v3.8h, v9.16b, v23.16b smull v8.8h, v10.8b, v24.8b smull2 v9.8h, v10.16b, v24.16b - add v0.8h, v0.8h, v2.8h - add v1.8h, v1.8h, v3.8h smull v10.8h, v11.8b, v26.8b smull2 v11.8h, v11.16b, v26.16b - smull v2.8h, v12.8b, v27.8b - smull2 v3.8h, v12.16b, v27.16b - add v8.8h, v8.8h, v10.8h - add v9.8h, v9.8h, v11.8h + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b smull v10.8h, v13.8b, v28.8b smull2 v11.8h, v13.16b, v28.16b - saddl v12.4s, v0.4h, v8.4h - saddl2 v13.4s, v0.8h, v8.8h - saddl v0.4s, v1.4h, v9.4h - saddl2 v1.4s, v1.8h, v9.8h - smull v8.8h, v20.8b, v25.8b - smull2 v9.8h, v20.16b, v25.16b - add v2.8h, v2.8h, v10.8h - add v3.8h, v3.8h, v11.8h - add v4.4s, v4.4s, v12.4s - add v5.4s, v5.4s, v13.4s - add v6.4s, v6.4s, v0.4s - add v7.4s, v7.4s, v1.4s - saddl v0.4s, v2.4h, v8.4h - saddl2 v1.4s, v2.8h, v8.8h - saddl v2.4s, v3.4h, v9.4h - saddl2 v3.4s, v3.8h, v9.8h + smull v12.8h, v20.8b, v25.8b + smull2 v19.8h, v20.16b, v25.16b + add v22.4s, v22.4s, v0.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v2.4s + add v26.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v22.4s + add v5.4s, v5.4s, v23.4s + add v6.4s, v6.4s, v24.4s + add v7.4s, v7.4s, v26.4s + mov v13.16b, v14.16b + mov v14.16b, v15.16b add v4.4s, v4.4s, v0.4s add v5.4s, v5.4s, v1.4s add v6.4s, v6.4s, v2.4s add v7.4s, v7.4s, v3.4s - - mov v13.16b, v14.16b - mov v14.16b, v15.16b - mov v16.16b, v17.16b mov v17.16b, v18.16b + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v19.4h + saddw2 v7.4s, v7.4s, v19.8h mov v19.16b, v20.16b 
mov v20.16b, v21.16b @@ -1483,43 +1511,50 @@ .macro gather_interleaved dst1, dst2, src1, src2, off umov w14, \src1[0+\off] - umov w15, \src2[1+\off] + umov w15, \src2[8+\off] umov w16, \src1[2+\off] add x14, x14, x3 - umov w17, \src2[3+\off] + umov w17, \src2[10+\off] add x15, x15, x3 - ld1 {\dst1}[0+\off], [x14] + ld1 {\dst1}[0+\off], [x14] umov w14, \src1[4+\off] add x16, x16, x3 - ld1 {\dst2}[1+\off], [x15] - umov w15, \src2[5+\off] + ld1 {\dst2}[8+\off], [x15] + umov w15, \src2[12+\off] add x17, x17, x3 - ld1 {\dst1}[2+\off], [x16] + ld1 {\dst1}[2+\off], [x16] umov w16, \src1[6+\off] add x14, x14, x3 - ld1 {\dst2}[3+\off], [x17] - umov w17, \src2[7+\off] + ld1 {\dst2}[10+\off], [x17] + umov w17, \src2[14+\off] add x15, x15, x3 - ld1 {\dst1}[4+\off], [x14] + ld1 {\dst1}[4+\off], [x14] add x16, x16, x3 - ld1 {\dst2}[5+\off], [x15] + ld1 {\dst2}[12+\off], [x15] add x17, x17, x3 - ld1 {\dst1}[6+\off], [x16] - ld1 {\dst2}[7+\off], [x17] + ld1 {\dst1}[6+\off], [x16] + ld1 {\dst2}[14+\off], [x17] .endm .macro gather dst1, dst2, src1, src2 gather_interleaved \dst1, \dst2, \src1, \src2, 0 gather_interleaved \dst2, \dst1, \src2, \src1, 0 - gather_interleaved \dst1, \dst2, \src1, \src2, 8 - gather_interleaved \dst2, \dst1, \src2, \src1, 8 + gather_interleaved \dst1, \dst2, \src1, \src2, 1 + gather_interleaved \dst2, \dst1, \src2, \src1, 1 .endm -function gather_neon +function gather32_neon gather v4.b, v5.b, v0.b, v1.b ret endfunc +function gather16_neon + gather_interleaved v4.b, v5.b, v0.b, v0.b, 0 + gather_interleaved v4.b, v5.b, v0.b, v0.b, 1 + ins v4.d[1], v5.d[1] + ret +endfunc + const overlap_coeffs_0, align=4 .byte 27, 17, 0, 0, 0, 0, 0, 0 .byte 17, 27, 32, 32, 32, 32, 32, 32 @@ -1564,7 +1599,7 @@ mov x9, #GRAIN_WIDTH // grain_lut stride neg w4, w4 - dup v29.8h, w4 // -scaling_shift + dup v29.8h, w4 // -scaling_shift movrel x16, overlap_coeffs_0 @@ -1635,7 +1670,7 @@ .endif ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut - bl gather_neon + bl gather32_neon .if \ox smull v20.8h, v20.8b, v27.8b @@ -1765,7 +1800,7 @@ .macro fguv layout, sx, sy function fguv_32x32_\layout\()_8bpc_neon, export=1 str x30, [sp, #-32]! 
- stp d8, d9, [sp, #16] + str d8, [sp, #16] ldp x8, x9, [sp, #32] // offsets, h ldp x10, x11, [sp, #48] // uv, is_id @@ -1778,11 +1813,11 @@ add x14, x10, #FGD_UV_LUMA_MULT add x15, x10, #FGD_UV_MULT add x10, x10, #FGD_UV_OFFSET - ld1 {v8.h}[0], [x14] // uv_luma_mult - ld1r {v24.8h}, [x10] // uv_offset - ld1 {v8.h}[1], [x15] // uv_mult + ld1 {v8.h}[0], [x14] // uv_luma_mult + ld1r {v24.8h}, [x10] // uv_offset + ld1 {v8.h}[1], [x15] // uv_mult - dup v29.8h, w13 // -scaling_shift + dup v29.8h, w13 // -scaling_shift cbz w12, 1f // clip @@ -1918,7 +1953,7 @@ sqxtun2 v1.16b, v5.8h .endif - bl gather_neon + bl gather32_neon .if \ox smull v20.8h, v20.8b, v27.8b @@ -2029,7 +2064,7 @@ fguv_loop_sx0 1, 1, 1 9: - ldp d8, d9, [sp, #16] + ldr d8, [sp, #16] ldr x30, [sp], #32 ret @@ -2085,7 +2120,7 @@ sqxtun2 v0.16b, v3.8h .endif - bl gather_neon + bl gather16_neon .if \ox smull v20.8h, v20.8b, v27.8b @@ -2176,7 +2211,7 @@ fguv_loop_sx1 1, 1, 1 9: - ldp d8, d9, [sp, #16] + ldr d8, [sp, #16] ldr x30, [sp], #32 ret diff -Nru dav1d-0.9.0/src/arm/film_grain_init_tmpl.c dav1d-0.9.1/src/arm/film_grain_init_tmpl.c --- dav1d-0.9.0/src/arm/film_grain_init_tmpl.c 2021-05-16 16:47:22.530950500 +0000 +++ dav1d-0.9.1/src/arm/film_grain_init_tmpl.c 2021-07-28 21:38:28.873851800 +0000 @@ -60,6 +60,7 @@ GEN_GRAIN_UV(420); GEN_GRAIN_UV(422); GEN_GRAIN_UV(444); +#endif // Use ptrdiff_t instead of int for the last few parameters, to get the // same layout of parameters on the stack across platforms. @@ -149,7 +150,6 @@ } } -#if BITDEPTH == 8 #define fguv_ss_fn(nm, sx, sy) \ static void \ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ @@ -204,16 +204,12 @@ fguv_ss_fn(422, 1, 0); fguv_ss_fn(444, 0, 0); -#endif -#endif - COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if ARCH_AARCH64 -#if BITDEPTH == 8 +#if ARCH_AARCH64 && BITDEPTH == 8 c->generate_grain_y = BF(dav1d_generate_grain_y, neon); c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon); c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon); @@ -221,10 +217,7 @@ #endif c->fgy_32x32xn = fgy_32x32xn_neon; -#if BITDEPTH == 8 c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon; c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon; c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon; -#endif -#endif } diff -Nru dav1d-0.9.0/src/decode.c dav1d-0.9.1/src/decode.c --- dav1d-0.9.0/src/decode.c 2021-05-16 16:47:22.534950700 +0000 +++ dav1d-0.9.1/src/decode.c 2021-07-28 21:38:28.877852000 +0000 @@ -2629,7 +2629,7 @@ } } - if (f->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { + if (f->seq_hdr->ref_frame_mvs && f->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { dav1d_refmvs_save_tmvs(&t->rt, ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, t->by >> 1, (t->by + sb_step) >> 1); @@ -3114,7 +3114,7 @@ t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; if (dav1d_decode_tile_sbrow(t)) goto error; } - if (f->frame_thread.pass <= 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { + if (f->seq_hdr->ref_frame_mvs && f->frame_thread.pass <= 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end); } diff -Nru dav1d-0.9.0/src/ext/x86/x86inc.asm dav1d-0.9.1/src/ext/x86/x86inc.asm --- dav1d-0.9.0/src/ext/x86/x86inc.asm 2021-05-16 16:47:22.534950700 +0000 +++ 
dav1d-0.9.1/src/ext/x86/x86inc.asm 2021-07-28 21:38:28.877852000 +0000 @@ -1339,26 +1339,50 @@ %elif %0 >= 9 __instr %6, %7, %8, %9 %elif %0 == 8 - %if avx_enabled && %5 + %if avx_enabled && __sizeofreg >= 16 && %4 == 0 %xdefine __src1 %7 %xdefine __src2 %8 - %ifnum regnumof%7 - %ifnum regnumof%8 - %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 - ; Most VEX-encoded instructions require an additional byte to encode when - ; src2 is a high register (e.g. m8..15). If the instruction is commutative - ; we can swap src1 and src2 when doing so reduces the instruction length. - %xdefine __src1 %8 - %xdefine __src2 %7 + %if %5 + %ifnum regnumof%7 + %ifnum regnumof%8 + %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 + ; Most VEX-encoded instructions require an additional byte to encode when + ; src2 is a high register (e.g. m8..15). If the instruction is commutative + ; we can swap src1 and src2 when doing so reduces the instruction length. + %xdefine __src1 %8 + %xdefine __src2 %7 + %endif %endif + %elifnum regnumof%8 ; put memory operands in src2 when possible + %xdefine __src1 %8 + %xdefine __src2 %7 + %else + %assign __emulate_avx 1 %endif + %elifnnum regnumof%7 + ; EVEX allows imm8 shift instructions to be used with memory operands, + ; but VEX does not. This handles those special cases. + %ifnnum %8 + %assign __emulate_avx 1 + %elif notcpuflag(avx512) + %assign __emulate_avx 1 + %endif + %endif + %if __emulate_avx ; a separate load is required + %if %3 + vmovaps %6, %7 + %else + vmovdqa %6, %7 + %endif + __instr %6, %8 + %else + __instr %6, __src1, __src2 %endif - __instr %6, __src1, __src2 %else __instr %6, %7, %8 %endif %elif %0 == 7 - %if avx_enabled && %5 + %if avx_enabled && __sizeofreg >= 16 && %5 %xdefine __src1 %6 %xdefine __src2 %7 %ifnum regnumof%6 diff -Nru dav1d-0.9.0/src/lib.c dav1d-0.9.1/src/lib.c --- dav1d-0.9.0/src/lib.c 2021-05-16 16:47:22.538950700 +0000 +++ dav1d-0.9.1/src/lib.c 2021-07-28 21:38:28.881852000 +0000 @@ -318,7 +318,7 @@ } if (!c->seq_hdr) { - res = DAV1D_ERR(EINVAL); + res = DAV1D_ERR(ENOENT); goto error; } diff -Nru dav1d-0.9.0/src/meson.build dav1d-0.9.1/src/meson.build --- dav1d-0.9.0/src/meson.build 2021-05-16 16:47:22.538950700 +0000 +++ dav1d-0.9.1/src/meson.build 2021-07-28 21:38:28.885852000 +0000 @@ -144,6 +144,7 @@ if dav1d_bitdepths.contains('8') libdav1d_sources_asm += files( 'arm/32/cdef.S', + 'arm/32/film_grain.S', 'arm/32/ipred.S', 'arm/32/loopfilter.S', 'arm/32/looprestoration.S', @@ -154,6 +155,7 @@ if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( 'arm/32/cdef16.S', + 'arm/32/film_grain16.S', 'arm/32/ipred16.S', 'arm/32/itx16.S', 'arm/32/loopfilter16.S', @@ -193,6 +195,7 @@ 'x86/itx_avx2.asm', 'x86/looprestoration_avx2.asm', 'x86/cdef_sse.asm', + 'x86/itx_sse.asm', ) if dav1d_bitdepths.contains('8') @@ -205,7 +208,6 @@ 'x86/loopfilter_avx2.asm', 'x86/film_grain_sse.asm', 'x86/ipred_sse.asm', - 'x86/itx_sse.asm', 'x86/loopfilter_sse.asm', 'x86/looprestoration_sse.asm', 'x86/mc_sse.asm', @@ -222,6 +224,12 @@ 'x86/looprestoration16_avx2.asm', 'x86/mc16_avx2.asm', 'x86/cdef16_sse.asm', + 'x86/film_grain16_sse.asm', + 'x86/ipred16_sse.asm', + 'x86/itx16_sse.asm', + 'x86/loopfilter16_sse.asm', + 'x86/looprestoration16_sse.asm', + 'x86/mc16_sse.asm', ) endif @@ -274,7 +282,7 @@ rev_target, config_h_target, include_directories : dav1d_inc_dirs, - dependencies: [stdatomic_dependency], + dependencies: [stdatomic_dependencies], c_args : [stackalign_flag, stackrealign_flag, 
api_export_flags], install : false, build_by_default : false, @@ -287,7 +295,7 @@ 'dav1d_bitdepth_@0@'.format(bitdepth), libdav1d_tmpl_sources, config_h_target, include_directories: dav1d_inc_dirs, - dependencies : [stdatomic_dependency], + dependencies : [stdatomic_dependencies], c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag, install : false, build_by_default : false, @@ -300,7 +308,7 @@ 'dav1d_arch_bitdepth_@0@'.format(bitdepth), libdav1d_arch_tmpl_sources, config_h_target, include_directories: dav1d_inc_dirs, - dependencies : [stdatomic_dependency], + dependencies : [stdatomic_dependencies], c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags, install : false, build_by_default : false, @@ -326,7 +334,7 @@ include_directories : dav1d_inc_dirs, dependencies : [ - stdatomic_dependency, + stdatomic_dependencies, thread_dependency, thread_compat_dep, libdl_dependency, diff -Nru dav1d-0.9.0/src/x86/film_grain16_avx2.asm dav1d-0.9.1/src/x86/film_grain16_avx2.asm --- dav1d-0.9.0/src/x86/film_grain16_avx2.asm 2021-05-16 16:47:22.542950600 +0000 +++ dav1d-0.9.1/src/x86/film_grain16_avx2.asm 2021-07-28 21:38:28.893852000 +0000 @@ -29,9 +29,6 @@ %if ARCH_X86_64 SECTION_RODATA 32 -pd_0x10000: times 8 dd 0x10000 -pw_1024: times 16 dw 1024 -pw_23_22: times 8 dw 23, 22 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 pw_seed_xor: times 2 dw 0xb524 @@ -49,6 +46,7 @@ ; these two should be next to each other pw_4: times 2 dw 4 pw_16: times 2 dw 16 +pw_23_22: dw 23, 22, 0, 32 %macro JMP_TABLE 1-* %xdefine %1_table %%table @@ -63,6 +61,8 @@ JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3 struc FGData .seed: resd 1 @@ -404,8 +404,9 @@ %endif RET +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM avx2 -cglobal generate_grain_uv_420_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax +cglobal generate_grain_uv_%1_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax %define base r8-pb_mask lea r8, [pb_mask] movifnidn bdmaxd, bdmaxm @@ -423,10 +424,15 @@ pxor xm0, xm9 vpbroadcastd xm9, [base+pd_m65536] lea r6, [gaussian_sequence] - mov r7d, 38 +%if %2 + mov r7d, 73-35*%3 add bufq, 44*2 .loop_y: mov r5, -44 +%else + mov r5, -82*73 + add bufq, 2*82*73 +%endif .loop_x: pand xm2, xm0, xm1 psrlw xm3, xm2, 10 @@ -452,14 +458,16 @@ movq [bufq+r5*2], xm2 add r5, 4 jl .loop_x +%if %2 add bufq, 82*2 dec r7d jg .loop_y +%endif ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_420_16bpc_avx2_table+r5*4] - lea r5, [r5+base+generate_grain_uv_420_16bpc_avx2_table] + movsxd r5, [base+generate_grain_uv_%1_16bpc_avx2_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_16bpc_avx2_table] jmp r5 .ar0: @@ -474,40 +482,61 @@ pcmpeqw m7, m7 vpbroadcastw m14, xm14 ; max_gain pxor m7, m14 ; min_grain - DEFINE_ARGS buf, bufy, h + DEFINE_ARGS buf, bufy, h, x pmovsxbw xm4, xm4 - vpbroadcastw m6, [hmul_bits+4] +%if %2 + vpbroadcastw m6, [hmul_bits+2+%3*2] +%endif vpbroadcastw m4, xm4 pxor m5, m5 - sub bufq, 2*(82*38+82-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif add bufyq, 2*(3+82*3) - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar0: +%if %2 ; first 32 pixels movu xm8, [bufyq] - movu xm9, [bufyq+82*2] movu xm10, [bufyq+ 16] +%if %3 + movu 
xm9, [bufyq+82*2] movu xm11, [bufyq+82*2+16] +%endif vinserti128 m8, [bufyq+ 32], 1 - vinserti128 m9, [bufyq+82*2+32], 1 vinserti128 m10, [bufyq+ 48], 1 +%if %3 + vinserti128 m9, [bufyq+82*2+32], 1 vinserti128 m11, [bufyq+82*2+48], 1 paddw m8, m9 paddw m10, m11 +%endif phaddw m8, m10 movu xm10, [bufyq+ 64] - movu xm11, [bufyq+82*2+64] movu xm12, [bufyq+ 80] +%if %3 + movu xm11, [bufyq+82*2+64] movu xm13, [bufyq+82*2+80] +%endif vinserti128 m10, [bufyq+ 96], 1 - vinserti128 m11, [bufyq+82*2+96], 1 vinserti128 m12, [bufyq+ 112], 1 +%if %3 + vinserti128 m11, [bufyq+82*2+96], 1 vinserti128 m13, [bufyq+82*2+112], 1 paddw m10, m11 paddw m12, m13 +%endif phaddw m10, m12 pmulhrsw m8, m6 pmulhrsw m10, m6 +%else + xor xd, xd +.x_loop_ar0: + movu m8, [bufyq+xq*2] + movu m10, [bufyq+xq*2+32] +%endif punpckhwd m9, m8, m5 punpcklwd m8, m5 punpckhwd m11, m10, m5 @@ -517,20 +546,28 @@ packssdw m8, m9 packssdw m10, m11 REPX {pmulhrsw x, m3}, m8, m10 +%if %2 paddw m8, [bufq+ 0] paddw m10, [bufq+32] +%else + paddw m8, [bufq+xq*2+ 0] + paddw m10, [bufq+xq*2+32] +%endif pminsw m8, m14 pminsw m10, m14 pmaxsw m8, m7 pmaxsw m10, m7 +%if %2 movu [bufq+ 0], m8 movu [bufq+32], m10 ; last 6 pixels movu xm8, [bufyq+32*4] movu xm10, [bufyq+32*4+16] +%if %3 paddw xm8, [bufyq+32*4+82*2] paddw xm10, [bufyq+32*4+82*2+16] +%endif phaddw xm8, xm10 pmulhrsw xm8, xm6 punpckhwd xm9, xm8, xm5 @@ -545,9 +582,31 @@ pmaxsw xm8, xm7 vpblendw xm0, xm8, xm0, 11000000b movu [bufq+32*2], xm0 +%else + movu [bufq+xq*2+ 0], m8 + movu [bufq+xq*2+32], m10 + add xd, 32 + cmp xd, 64 + jl .x_loop_ar0 + + ; last 12 pixels + movu m8, [bufyq+64*2] + punpckhwd m9, m8, m5 + punpcklwd m8, m5 + REPX {pmaddwd x, m4}, m8, m9 + REPX {psrad x, 5}, m8, m9 + packssdw m8, m9 + pmulhrsw m8, m3 + movu m0, [bufq+64*2] + paddw m8, m0 + pminsw m8, m14 + pmaxsw m8, m7 + vpblendd m0, m8, m0, 11000000b + movu [bufq+64*2], m0 +%endif add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar0 RET @@ -565,26 +624,40 @@ pshufd xm5, xm4, q1111 pshufd xm4, xm4, q0000 pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd - vpbroadcastw xm6, [hmul_bits+4] + vpbroadcastw xm6, [hmul_bits+2+%3*2] vpbroadcastd xm3, xm3 - sub bufq, 2*(82*38+44-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif add bufyq, 2*(79+82*3) - mov hd, 35 + mov hd, 70-35*%3 sar maxd, 1 mov mind, maxd xor mind, -1 .y_loop_ar1: - mov xq, -38 + mov xq, -(76>>%2) movsx val3d, word [bufq+xq*2-2] .x_loop_ar1: movu xm0, [bufq+xq*2-82*2-2] ; top/left +%if %2 movu xm8, [bufyq+xq*4] +%else + movq xm8, [bufyq+xq*2] +%endif psrldq xm2, xm0, 2 ; top psrldq xm1, xm0, 4 ; top/right +%if %2 +%if %3 phaddw xm8, [bufyq+xq*4+82*2] pshufd xm9, xm8, q3232 paddw xm8, xm9 +%else + phaddw xm8, xm8 +%endif pmulhrsw xm8, xm6 +%endif punpcklwd xm0, xm2 punpcklwd xm1, xm8 pmaddwd xm0, xm4 @@ -613,7 +686,7 @@ .x_loop_ar1_end: add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar1 RET @@ -628,7 +701,9 @@ pcmpeqd xm5, xm5 vpbroadcastd xm6, xm6 ; max_grain pxor xm5, xm6 ; min_grain - vpbroadcastw xm7, [base+hmul_bits+4] +%if %2 + vpbroadcastw xm7, [base+hmul_bits+2+%3*2] +%endif vpbroadcastw xm15, [base+round_vals-12+shiftq*2] movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+5] @@ -645,11 +720,15 @@ pshufd m10, m0, q2222 DEFINE_ARGS buf, bufy, fg_data, h, x - sub bufq, 2*(82*38+44-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif add bufyq, 2*(79+82*3) - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar2: - mov xq, 
-38 + mov xq, -(76>>%2) .x_loop_ar2: movu xm0, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] @@ -658,17 +737,27 @@ psrldq m2, m0, 4 ; y=-1/-2,x=[-0,+5] psrldq m3, m0, 6 ; y=-1/-2,x=[+1,+5] +%if %2 movu xm8, [bufyq+xq*4] +%if %3 paddw xm8, [bufyq+xq*4+82*2] +%endif phaddw xm8, xm8 +%else + movq xm8, [bufyq+xq*2] +%endif vinserti128 m4, xm0, 1 ; y=-1,x=[-2,+5] punpcklwd m2, m3 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] punpckhwd m4, m0, m4 ; y=-2/-1 interleaved, x=[+2,+5] punpcklwd m0, m1 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] +%if %2 pmulhrsw xm1, xm8, xm7 punpcklwd xm1, xm15 ; luma, round interleaved +%else + punpcklwd xm1, xm8, xm15 +%endif vpblendd m1, m1, m4, 11110000b pmaddwd m2, m11 @@ -704,7 +793,7 @@ .x_loop_ar2_end: add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar2 RET @@ -731,7 +820,9 @@ pcmpeqd xm13, xm13 vpbroadcastd xm15, xm15 ; max_grain pxor xm13, xm15 ; min_grain - vpbroadcastw xm12, [base+hmul_bits+4] +%if %2 + vpbroadcastw xm12, [base+hmul_bits+2+%3*2] +%endif movq xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma @@ -758,11 +849,15 @@ pinsrw xm11, [base+round_vals-10+shiftq*2], 3 DEFINE_ARGS buf, bufy, fg_data, h, unused, x - sub bufq, 2*(82*38+44-(82*3+41)) +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif add bufyq, 2*(79+82*3) - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar3: - mov xq, -38 + mov xq, -(76>>%2) .x_loop_ar3: movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] @@ -772,16 +867,24 @@ vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] +%if %2 movu xm7, [bufyq+xq*4] +%if %3 paddw xm7, [bufyq+xq*4+82*2] +%endif phaddw xm7, xm7 +%else + movq xm7, [bufyq+xq*2] +%endif palignr m4, m1, m0, 2 ; y=-3/-2,x=[-2,+5] palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] punpckhwd m5, m0, m4 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m0, m4 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] palignr m6, m5, m0, 8 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] +%if %2 pmulhrsw xm7, xm12 +%endif punpcklwd m1, m7 psrldq m3, m2, 2 @@ -831,20 +934,25 @@ .x_loop_ar3_end: add bufq, 82*2 - add bufyq, 82*4 + add bufyq, 82*2<<%3 dec hd jg .y_loop_ar3 %if WIN64 mov rsp, r6 %endif RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 INIT_YMM avx2 cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, grain_lut mov r7d, [fg_dataq+FGData.scaling_shift] lea r8, [pb_mask] %define base r8-pb_mask - vpbroadcastw m11, [base+round_vals+r7*2-12] + vpbroadcastw m11, [base+mul_bits+r7*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] mov r9d, r9m ; bdmax sar r9d, 11 ; is_12bpc @@ -854,7 +962,6 @@ lea r9d, [r6d*2+r9d] vpbroadcastw m12, [base+max+r9*2] vpbroadcastw m10, r9m - mov r9mp, r7 pxor m2, m2 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ @@ -921,27 +1028,17 @@ vpgatherdd m5, [scalingq+m6-3], m3 vpgatherdd m6, [scalingq+m7-3], m9 REPX {psrld x, 24}, m8, m4, m5, m6 - REPX {por x, [pd_0x10000]}, m8, m4, m5, m6 + packssdw m8, m4 + packssdw m5, m6 ; grain = grain_lut[offy+y][offx+x] movu m9, [grain_lutq+offxyq*2] movu m3, [grain_lutq+offxyq*2+32] ; noise = round2(scaling[src] * grain, scaling_shift) - ; the problem here is that since the grain is 10-bits, the product of - ; scaling*grain is 17+sign bits, so we need to unfortunately do some - ; of these steps in 32-bits - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - 
punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r9m}, m9, m7, m3, m8 - packssdw m9, m7 - packssdw m3, m8 + REPX {pmullw x, m11}, m8, m5 + pmulhrsw m9, m8 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -1014,7 +1111,8 @@ vpgatherdd m5, [scalingq+m6-3], m3 vpgatherdd m6, [scalingq+m7-3], m9 REPX {psrld x, 24}, m8, m4, m5, m6 - REPX {por x, [pd_0x10000]}, m8, m4, m5, m6 + packssdw m8, m4 + packssdw m5, m6 ; grain = grain_lut[offy+y][offx+x] movu m9, [grain_lutq+offxyq*2] @@ -1033,17 +1131,9 @@ movu m3, [grain_lutq+offxyq*2+32] ; noise = round2(scaling[src] * grain, scaling_shift) - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r9m}, m9, m7, m3, m8 - packssdw m9, m7 - packssdw m3, m8 + REPX {pmullw x, m11}, m8, m5 + pmulhrsw m9, m8 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -1167,16 +1257,11 @@ vpgatherdd m6, [scalingq+m4-3], m3 vpgatherdd m4, [scalingq+m5-3], m9 REPX {psrld x, 24}, m6, m4 - REPX {por x, [pd_0x10000]}, m6, m4 + packssdw m6, m4 ; noise = round2(scaling[src] * grain, scaling_shift) - punpckhwd m9, m7, m11 - punpcklwd m7, m11 - pmaddwd m6, m7 - pmaddwd m4, m9 - - REPX {psrad x, r9m}, m6, m4 - packssdw m6, m4 + pmullw m6, m11 + pmulhrsw m6, m7 ; same for the other half pminuw m1, m10, [srcq+32] ; m0-1: src as word @@ -1187,16 +1272,11 @@ vpgatherdd m5, [scalingq+m4-3], m3 vpgatherdd m4, [scalingq+m9-3], m7 REPX {psrld x, 24}, m5, m4 - REPX {por x, [pd_0x10000]}, m5, m4 - - punpckhwd m9, m8, m11 - punpcklwd m8, m11 - pmaddwd m5, m8 - pmaddwd m4, m9 - - REPX {psrad x, r9m}, m5, m4 packssdw m5, m4 + pmullw m5, m11 + pmulhrsw m5, m8 + ; dst = clip_pixel(src, noise) paddw m0, m6 paddw m1, m5 @@ -1313,15 +1393,11 @@ pcmpeqw m9, m9 vpgatherdd m4, [scalingq+m5-3], m9 REPX {psrld x, 24}, m6, m4 - REPX {por x, [pd_0x10000]}, m6, m4 + packssdw m6, m4 ; noise = round2(scaling[src] * grain, scaling_shift) - punpckhwd m9, m7, m11 - punpcklwd m7, m11 - pmaddwd m9, m4 - pmaddwd m7, m6 - REPX {psrad x, r9m}, m9, m7 - packssdw m7, m9 + pmullw m6, m11 + pmulhrsw m7, m6 ; other half punpckhwd m5, m1, m2 @@ -1333,15 +1409,11 @@ pcmpeqw m6, m6 vpgatherdd m4, [scalingq+m5-3], m6 REPX {psrld x, 24}, m9, m4 - REPX {por x, [pd_0x10000]}, m9, m4 + packssdw m9, m4 ; noise = round2(scaling[src] * grain, scaling_shift) - punpckhwd m6, m3, m11 - punpcklwd m3, m11 - pmaddwd m6, m4 - pmaddwd m3, m9 - REPX {psrad x, r9m}, m6, m3 - packssdw m3, m6 + pmullw m9, m11 + pmulhrsw m3, m9 ; dst = clip_pixel(src, noise) paddw m0, m7 @@ -1373,12 +1445,13 @@ .end_hv: RET -cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id %define base r8-pb_mask lea r8, [pb_mask] mov r7d, [fg_dataq+FGData.scaling_shift] - vpbroadcastw m11, [base+round_vals+r7*2-12] + vpbroadcastw m11, [base+mul_bits+r7*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] mov r9d, r13m ; bdmax sar r9d, 11 ; is_12bpc @@ -1391,12 +1464,11 @@ vpbroadcastw m12, [base+max+r10*2] vpbroadcastw m10, r13m pxor m2, m2 - mov r13mp, r7 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl -%macro FGUV_32x32xN_LOOP 1 ; not-csfl +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, 
unused, sby, see, overlap %if %1 @@ -1408,8 +1480,12 @@ vpbroadcastd m9, [base+pw_4+r9*4] pmullw m15, m9 %else - vpbroadcastd m14, [pw_1024] - vpbroadcastd m15, [pw_23_22] + vpbroadcastd m14, [pd_16] +%if %2 + vpbroadcastq m15, [pw_23_22] +%else + vpbroadcastq m15, [pw_27_17_17_27] +%endif %endif movifnidn sbyd, sbym @@ -1431,7 +1507,7 @@ mov lstrideq, r10mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] - lea r12, [lumaq+wq*4] + lea r12, [lumaq+wq*(2<<%2)] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 @@ -1452,8 +1528,8 @@ rorx offyd, seed, 8 shr offxd, 12 and offyd, 0xf - imul offyd, 82 - lea offyq, [offyq+offxq+498] ; offy*stride+offx + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, unused2, unused3, luma, lstride @@ -1463,21 +1539,30 @@ %%loop_y: ; src mova m0, [srcq] +%if %2 mova m1, [srcq+strideq] ; m0-1: src as word +%else + mova m1, [srcq+32] +%endif ; luma_src +%if %2 mova xm4, [lumaq+lstrideq*0+ 0] mova xm7, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+32], 1 vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*2+ 0] - mova xm8, [lumaq+lstrideq*2+16] - vinserti128 m6, [lumaq+lstrideq*2+32], 1 - vinserti128 m8, [lumaq+lstrideq*2+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m4, m7 phaddw m6, m8 pavgw m4, m2 pavgw m6, m2 +%else + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif %if %1 punpckhwd m3, m4, m0 @@ -1510,24 +1595,21 @@ vpgatherdd m5, [scalingq+m6-3], m3 vpgatherdd m6, [scalingq+m7-3], m9 REPX {psrld x, 24}, m8, m4, m5, m6 - REPX {por x, [pd_0x10000]}, m8, m4, m5, m6 + packssdw m8, m4 + packssdw m5, m6 ; grain = grain_lut[offy+y][offx+x] movu m9, [grain_lutq+offxyq*2] +%if %2 movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r13m}, m9, m7, m3, m8 - packssdw m9, m7 - packssdw m3, m8 + REPX {pmullw x, m11}, m8, m5 + pmulhrsw m9, m8 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -1537,23 +1619,34 @@ pminsw m0, m12 pminsw m1, m12 mova [dstq], m0 +%if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] - add grain_lutq, 82*4 + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 sub hb, 2 +%else + dec hb +%endif jg %%loop_y - add wq, 16 + add wq, 32>>%2 jge %%end mov srcq, r10mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] cmp byte [fg_dataq+FGData.overlap_flag], 0 je %%loop_x @@ -1573,13 +1666,13 @@ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, luma, lstride - lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx mov offxd, seed rorx offyd, seed, 8 shr offxd, 12 and offyd, 0xf - imul offyd, 82 - lea offyq, [offyq+offxq+498] ; offy*stride+offx + imul offyd, 164>>%3 + lea offyq, 
[offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, luma, lstride @@ -1588,6 +1681,7 @@ mov grain_lutq, grain_lutmp %%loop_y_h_overlap: mova m0, [srcq] +%if %2 mova m1, [srcq+strideq] ; luma_src @@ -1595,14 +1689,21 @@ mova xm7, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+32], 1 vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*2+ 0] - mova xm8, [lumaq+lstrideq*2+16] - vinserti128 m6, [lumaq+lstrideq*2+32], 1 - vinserti128 m8, [lumaq+lstrideq*2+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m4, m7 phaddw m6, m8 pavgw m4, m2 pavgw m6, m2 +%else + mova m1, [srcq+32] + + ; luma_src + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif %if %1 punpckhwd m3, m4, m0 @@ -1622,18 +1723,32 @@ ; grain = grain_lut[offy+y][offx+x] movu m9, [grain_lutq+offxyq*2] +%if %2 movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif movd xm5, [grain_lutq+left_offxyq*2+ 0] - pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 1 ; {left0, left1} - punpcklwd xm7, xm9, xm3 ; {cur0, cur1} +%if %2 + pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} + punpckldq xm7, xm9, xm3 ; {cur0, cur1} punpcklwd xm5, xm7 ; {left0, cur0, left1, cur1} +%else + punpcklwd xm5, xm9 +%endif %if %1 - pmaddwd xm5, [pw_23_22] +%if %2 + vpbroadcastq xm8, [pw_23_22] %else - pmaddwd xm5, xm15 + movq xm8, [pw_27_17_17_27] %endif + pmaddwd xm5, xm8 vpbroadcastd xm8, [pd_16] paddd xm5, xm8 +%else + pmaddwd xm5, xm15 + paddd xm5, xm14 +%endif psrad xm5, 5 packssdw xm5, xm5 pcmpeqw xm8, xm8 @@ -1641,11 +1756,11 @@ pxor xm8, xm7 pmaxsw xm5, xm8 pminsw xm5, xm7 - vpblendw xm7, xm5, xm9, 11111110b - psrldq xm5, 2 - vpblendw xm5, xm3, 11111110b - vpblendd m9, m7, 00001111b - vpblendd m3, m5, 00001111b + vpblendd m9, m9, m5, 00000001b +%if %2 + psrldq xm5, 4 + vpblendd m3, m3, m5, 00000001b +%endif ; scaling[luma_src] punpckhwd m5, m4, m2 @@ -1655,15 +1770,11 @@ pcmpeqw m7, m7 vpgatherdd m4, [scalingq+m5-3], m7 REPX {psrld x, 24}, m8, m4 - REPX {por x, [pd_0x10000]}, m8, m4 + packssdw m8, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - REPX {psrad x, r13m}, m9, m7 - packssdw m9, m7 + pmullw m8, m11 + pmulhrsw m9, m8 ; same for the other half punpckhwd m7, m6, m2 @@ -1673,15 +1784,11 @@ vpgatherdd m5, [scalingq+m6-3], m8 vpgatherdd m6, [scalingq+m7-3], m4 REPX {psrld x, 24}, m5, m6 - REPX {por x, [pd_0x10000]}, m5, m6 + packssdw m5, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r13m}, m3, m8 - packssdw m3, m8 + pmullw m5, m11 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -1691,23 +1798,36 @@ pminsw m0, m12 pminsw m1, m12 mova [dstq], m0 +%if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] - add grain_lutq, 82*4 + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + + add grain_lutq, 82*(2<<%2) +%if %2 sub hb, 2 +%else + dec hb +%endif jg %%loop_y_h_overlap - add wq, 16 + add wq, 32>>%2 jge %%end mov srcq, r10mp mov dstq, r11mp mov lumaq, r12mp lea srcq, 
[srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] ; r8m = sbym cmp dword r8m, 0 @@ -1739,7 +1859,7 @@ mov lstrideq, r10mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] - lea r12, [lumaq+wq*4] + lea r12, [lumaq+wq*(2<<%2)] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 @@ -1766,9 +1886,9 @@ rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f - imul offyd, 82 + imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq+0x10001*498+16*82] + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, top_offxy, unused2, luma, lstride @@ -1776,11 +1896,15 @@ movzx top_offxyd, offxyw shr offxyd, 16 +%if %2 == 0 + lea r10, [pw_27_17_17_27] +%endif mov hd, hm mov grain_lutq, grain_lutmp %%loop_y_v_overlap: ; src mova m0, [srcq] +%if %2 mova m1, [srcq+strideq] ; luma_src @@ -1788,14 +1912,21 @@ mova xm7, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+32], 1 vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*2+ 0] - mova xm8, [lumaq+lstrideq*2+16] - vinserti128 m6, [lumaq+lstrideq*2+32], 1 - vinserti128 m8, [lumaq+lstrideq*2+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m4, m7 phaddw m6, m8 pavgw m4, m2 pavgw m6, m2 +%else + mova m1, [srcq+32] + + ; luma_src + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif %if %1 punpckhwd m3, m4, m0 @@ -1818,20 +1949,59 @@ movu m5, [grain_lutq+top_offxyq*2] punpckhwd m7, m5, m9 punpcklwd m5, m9 ; {top/cur interleaved} +%if %3 + vpbroadcastd m3, [pw_23_22] +%elif %2 + vpbroadcastd m3, [pw_27_17_17_27] +%else + vpbroadcastd m3, [r10] +%endif + REPX {pmaddwd x, m3}, m7, m5 %if %1 - REPX {pmaddwd x, [pw_23_22]}, m7, m5 + vpbroadcastd m8, [pd_16] + REPX {paddd x, m8}, m7, m5 %else - REPX {pmaddwd x, m15}, m7, m5 + REPX {paddd x, m14}, m7, m5 %endif - vpbroadcastd m3, [pd_16] - REPX {paddd x, m3}, m7, m5 REPX {psrad x, 5}, m7, m5 packssdw m9, m5, m7 +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif +%if %3 == 0 +%if %2 + movu m5, [grain_lutq+top_offxyq*2+82*2] +%else + movu m5, [grain_lutq+top_offxyq*2+32] +%endif + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} +%if %2 + vpbroadcastd m3, [pw_27_17_17_27+4] +%else + vpbroadcastd m3, [r10] +%endif + REPX {pmaddwd x, m3}, m7, m5 +%if %1 + REPX {paddd x, m8}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m3, m5, m7 +%endif ; %3 == 0 pcmpeqw m7, m7 psraw m5, m10, 1 pxor m7, m5 +%if %3 pmaxsw m9, m7 pminsw m9, m5 +%else + REPX {pmaxsw x, m7}, m9, m3 + REPX {pminsw x, m5}, m9, m3 +%endif ; scaling[luma_src] punpckhwd m5, m4, m2 @@ -1841,17 +2011,13 @@ pcmpeqw m7, m7 vpgatherdd m4, [scalingq+m5-3], m7 REPX {psrld x, 24}, m8, m4 - REPX {por x, [pd_0x10000]}, m8, m4 + packssdw m8, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - REPX {psrad x, r13m}, m9, m7 - packssdw m9, m7 + pmullw m8, m11 + pmulhrsw m9, m8 - ; same for the other half + ; scaling for the other half punpckhwd m7, m6, m2 punpcklwd m6, m2 ; m4-7: luma_src as dword pcmpeqw m8, m8 @@ -1859,16 +2025,11 @@ vpgatherdd m5, [scalingq+m6-3], m8 vpgatherdd m6, [scalingq+m7-3], m4 REPX {psrld x, 24}, m5, m6 - REPX 
{por x, [pd_0x10000]}, m5, m6 + packssdw m5, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - movu m3, [grain_lutq+offxyq*2+82*2] - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r13m}, m3, m8 - packssdw m3, m8 + pmullw m5, m11 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -1878,25 +2039,43 @@ pminsw m0, m12 pminsw m1, m12 mova [dstq], m0 +%if %2 mova [dstq+strideq], m1 sub hb, 2 +%else + mova [dstq+32], m1 + dec hb +%endif jle %%end_y_v_overlap +%if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] - add grain_lutq, 82*4 + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 jmp %%loop_y +%else + btc hd, 16 + jc %%loop_y + add r10, 4 + jmp %%loop_y_v_overlap +%endif %%end_y_v_overlap: - add wq, 16 + add wq, 32>>%2 jge %%end_hv mov srcq, r10mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to @@ -1919,15 +2098,19 @@ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride - lea topleft_offxyq, [top_offxyq+16] - lea left_offxyq, [offyq+16] +%if %2 == 0 + lea r12, [pw_27_17_17_27] + mov r13mp, r12 +%endif + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f - imul offyd, 82 + imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq+0x10001*498+16*82] + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride @@ -1940,66 +2123,153 @@ %%loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] movd xm5, [grain_lutq+left_offxyq*2] - pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 1 - pinsrw xm5, [grain_lutq+topleft_offxyq*2], 2 ; { left0, left1, top/left } +%if %2 + pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 +%if %3 + vinserti128 m5, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } +%else + ; insert both top/left lines + movd xm9, [grain_lutq+topleft_offxyq*2+82*2] + pinsrw xm9, [grain_lutq+topleft_offxyq*2], 2 + vinserti128 m5, xm9, 1 +%endif +%else + pinsrd xm5, [grain_lutq+topleft_offxyq*2], 1 +%endif movu m9, [grain_lutq+offxyq*2] +%if %2 movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif movu m8, [grain_lutq+top_offxyq*2] - punpcklwd xm7, xm9, xm3 ; { cur0, cur1 } - punpckldq xm7, xm8 ; { cur0, cur1, top0 } - punpcklwd xm5, xm7 ; { cur/left } interleaved - pmaddwd xm5, [pw_23_22] - vpbroadcastd xm0, [pd_16] +%if %2 + punpckldq xm7, xm9, xm3 ; { cur0, cur1 } +%if %3 + vinserti128 m7, xm8, 1 ; { cur0, cur1, top0 } +%else + ; insert both top lines + movu m1, [grain_lutq+top_offxyq*2+82*2] + punpckldq xm0, xm1, xm8 + vinserti128 m7, xm0, 1 +%endif +%else + movu m1, [grain_lutq+top_offxyq*2+32] + punpckldq xm7, xm9, xm8 +%endif + punpcklwd m5, m7 ; { cur/left } interleaved +%if %2 +%if %1 + vpbroadcastq m0, [pw_23_22] + pmaddwd m5, m0 + vpbroadcastd m0, [pd_16] + paddd m5, m0 +%else + pmaddwd m5, m15 + paddd m5, m14 +%endif + psrad m5, 5 + vextracti128 
xm0, m5, 1 + packssdw xm5, xm0 +%else +%if %1 + movddup xm0, [pw_27_17_17_27] + pmaddwd xm5, xm0 + vpbroadcastd m0, [pd_16] paddd xm5, xm0 +%else + pmaddwd xm5, xm15 + paddd xm5, xm14 +%endif psrad xm5, 5 packssdw xm5, xm5 - pcmpeqw xm0, xm0 - psraw xm7, xm10, 1 - pxor xm0, xm7 +%endif + pcmpeqw m0, m0 + psraw m7, m10, 1 + pxor m0, m7 pminsw xm5, xm7 pmaxsw xm5, xm0 - pcmpeqw xm7, xm7 - psrldq xm7, 14 ; 0xffff, 0..... - vpblendvb m9, m5, m7 ; line 0 - psrldq xm5, 2 - vpblendvb m3, m5, m7 ; line 1 - psrldq xm5, 2 - vpblendvb m5, m8, m5, m7 ; top line + vpblendd m9, m9, m5, 00000001b +%if %2 + psrldq xm5, 4 + vpblendd m3, m3, m5, 00000001b +%if %3 == 0 + psrldq xm5, 4 + vpblendd m1, m1, m5, 00000001b +%endif +%endif + psrldq xm5, 4 + vpblendd m5, m8, m5, 00000001b - punpckhwd m7, m5, m9 + punpckhwd m8, m5, m9 punpcklwd m5, m9 ; {top/cur interleaved} +%if %3 + vpbroadcastd m9, [pw_23_22] +%elif %2 + vpbroadcastd m9, [pw_27_17_17_27] +%else + xchg r12, r13mp + vpbroadcastd m9, [r12] +%endif + REPX {pmaddwd x, m9}, m8, m5 %if %1 - REPX {pmaddwd x, [pw_23_22]}, m7, m5 + vpbroadcastd m4, [pd_16] + REPX {paddd x, m4}, m8, m5 %else - REPX {pmaddwd x, m15}, m7, m5 + REPX {paddd x, m14}, m8, m5 %endif - vpbroadcastd m9, [pd_16] - REPX {paddd x, m9}, m5, m7 - REPX {psrad x, 5}, m5, m7 - packssdw m9, m5, m7 - pcmpeqw m5, m5 - psraw m7, m10, 1 - pxor m5, m7 - pmaxsw m9, m5 + REPX {psrad x, 5}, m8, m5 + packssdw m9, m5, m8 +%if %3 pminsw m9, m7 + pmaxsw m9, m0 +%else + punpckhwd m8, m1, m3 + punpcklwd m1, m3 ; {top/cur interleaved} +%if %2 + vpbroadcastd m3, [pw_27_17_17_27+4] +%else + vpbroadcastd m3, [r12] + xchg r12, r13mp +%endif + REPX {pmaddwd x, m3}, m8, m1 +%if %1 + REPX {paddd x, m4}, m8, m1 +%else + REPX {paddd x, m14}, m8, m1 +%endif + REPX {psrad x, 5}, m8, m1 + packssdw m3, m1, m8 + REPX {pminsw x, m7}, m9, m3 + REPX {pmaxsw x, m0}, m9, m3 +%endif ; src mova m0, [srcq] +%if %2 mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif ; luma_src +%if %2 mova xm4, [lumaq+lstrideq*0+ 0] mova xm7, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+32], 1 vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*2+ 0] - mova xm8, [lumaq+lstrideq*2+16] - vinserti128 m6, [lumaq+lstrideq*2+32], 1 - vinserti128 m8, [lumaq+lstrideq*2+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m4, m7 phaddw m6, m8 pavgw m4, m2 pavgw m6, m2 +%else + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif %if %1 punpckhwd m8, m4, m0 @@ -2025,15 +2295,11 @@ pcmpeqw m7, m7 vpgatherdd m4, [scalingq+m5-3], m7 REPX {psrld x, 24}, m8, m4 - REPX {por x, [pd_0x10000]}, m8, m4 + packssdw m8, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m7, m9, m11 - punpcklwd m9, m11 - pmaddwd m9, m8 - pmaddwd m7, m4 - REPX {psrad x, r13m}, m9, m7 - packssdw m9, m7 + pmullw m8, m11 + pmulhrsw m9, m8 ; same for the other half punpckhwd m7, m6, m2 @@ -2043,15 +2309,11 @@ vpgatherdd m5, [scalingq+m6-3], m8 vpgatherdd m6, [scalingq+m7-3], m4 REPX {psrld x, 24}, m5, m6 - REPX {por x, [pd_0x10000]}, m5, m6 + packssdw m5, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) - punpckhwd m8, m3, m11 - punpcklwd m3, m11 - pmaddwd m3, m5 - pmaddwd m8, m6 - REPX {psrad x, r13m}, m3, m8 - packssdw m3, m8 + pmullw m5, m11 + pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m9 @@ -2061,32 +2323,53 @@ pminsw m0, m12 pminsw m1, m12 mova [dstq], m0 +%if %2 mova 
[dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] - add grain_lutq, 82*4 + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 sub hb, 2 jg %%loop_y_h_overlap +%else + dec hb + jle %%end_y_hv_overlap + btc hd, 16 + jc %%loop_y_h_overlap + add r13mp, 4 + jmp %%loop_y_hv_overlap +%endif %%end_y_hv_overlap: - add wq, 16 + add wq, 32>>%2 jge %%end_hv mov srcq, r10mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*4] + lea lumaq, [lumaq+wq*(2<<%2)] jmp %%loop_x_hv_overlap %%end_hv: RET %endmacro - FGUV_32x32xN_LOOP 1 + %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: - FGUV_32x32xN_LOOP 0 + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 %endif ; ARCH_X86_64 diff -Nru dav1d-0.9.0/src/x86/film_grain16_sse.asm dav1d-0.9.1/src/x86/film_grain16_sse.asm --- dav1d-0.9.0/src/x86/film_grain16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/film_grain16_sse.asm 2021-07-28 21:38:28.893852000 +0000 @@ -0,0 +1,3450 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
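
For orientation only: the NEON, AVX2 and SSSE3 film grain kernels in this part of the diff all vectorize the same per-pixel operation that their inline comments spell out, "noise = round2(scaling[src] * grain, scaling_shift)" followed by "dst = clip_pixel(src, noise)". The minimal C sketch below restates that scalar formula under those comments' definitions; it is not dav1d's actual implementation, and the function and parameter names (apply_grain_px, min_value, max_value) are illustrative assumptions.

    #include <stdint.h>

    /* Rounding right shift: (x + 2^(shift-1)) >> shift. */
    static inline int round2(const int x, const int shift) {
        return (x + ((1 << shift) >> 1)) >> shift;
    }

    /* Illustrative scalar sketch of the per-pixel film grain blend the SIMD
     * kernels perform; names are hypothetical, not dav1d's internal API. */
    static inline uint16_t apply_grain_px(const uint16_t src, const int16_t grain,
                                          const uint8_t *const scaling, /* LUT indexed by the source value */
                                          const int scaling_shift,
                                          const int min_value,  /* 0, or 16 << (bd - 8) when clipping to restricted range */
                                          const int max_value)  /* bitdepth_max, or 235/240 << (bd - 8) */
    {
        const int noise = round2(scaling[src] * grain, scaling_shift);
        int dst = src + noise;
        if (dst < min_value) dst = min_value;
        if (dst > max_value) dst = max_value;
        return (uint16_t)dst;
    }
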
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 +pd_16: times 4 dd 16 +pw_1: times 8 dw 1 +pw_16384: times 8 dw 16384 +pw_8192: times 8 dw 8192 +pw_23_22: dw 23, 22 + times 3 dw 0, 32 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +pw_27_17_17_27: dw 27, 17, 17, 27 + times 2 dw 0, 32 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512, 1024 +max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 +min: dw 0, 16*4, 16*16 +; these two should be next to each other +pw_4: times 2 dw 4 +pw_16: times 2 dw 16 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3 + +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%if ARCH_X86_32 +%undef base +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg +%assign %%idx 0 +%define %%tmp %2 +%if %0 == 8 +%define %%tmp %8 +%endif +%rep (%6/2) +%if %%idx == 0 + movd %5 %+ d, %2 + pshuflw %%tmp, %2, q3232 +%else + movd %5 %+ d, %%tmp +%if %6 == 8 +%if %%idx == 2 + punpckhqdq %%tmp, %%tmp +%elif %%idx == 4 + psrlq %%tmp, 32 +%endif +%endif +%endif + movzx %4 %+ d, %5 %+ w + shr %5 %+ d, 16 + +%if %%idx == 0 + movd %1, [%3+%4*%7] +%else + pinsrw %1, [%3+%4*%7], %%idx + 0 +%endif + pinsrw %1, [%3+%5*%7], %%idx + 1 +%assign %%idx %%idx+2 +%endrep +%endmacro + +%macro SPLATD 2 ; dst, src +%ifnidn %1, %2 + movd %1, %2 +%endif + pshufd %1, %1, q0000 +%endmacro + +%macro SPLATW 2 ; dst, src +%ifnidn %1, %2 + movd %1, %2 +%endif + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 +%endmacro + + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax + lea r4, [pb_mask] +%define base r4-pb_mask +%else +cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax + LEA r4, $$ +%define base r4-$$ +%endif + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r3d, [fg_dataq+FGData.grain_scale_shift] + lea r5d, [bdmaxq+1] + shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r3, r5 + SPLATW m6, [base+round+r3*2-2] + mova m5, [base+pb_mask] + SPLATW m0, [fg_dataq+FGData.seed] + mov r3, -73*82*2 + sub bufq, r3 +%if ARCH_X86_64 + lea r6, [gaussian_sequence] +%endif +.loop: + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 
0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m2, m3 + psllq m3, m2, 15 + por m2, m3 ; aggregate each bit into next seed's high bit + pmulhuw m3, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + vpgatherdw m3, m2, r6, r5, r7, 4, 2 +%else + vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2 +%endif + paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw m3, m6 + movq [bufq+r3], m3 + add r3, 4*2 + jl .loop + + ; auto-regression code + movsxd r3, [fg_dataq+FGData.ar_coeff_lag] + movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4] + lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table] + jmp r3 + +.ar1: +%if WIN64 + DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 + lea bufq, [r0-2*(82*73-(82*3+79))] + PUSH r8 +%else +%if ARCH_X86_64 + DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 +%else ; x86-32 + DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0 + PUSH r6 +%define shiftd r1d +%endif + sub bufq, 2*(82*73-(82*3+79)) +%endif + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd m4, [fg_dataq+FGData.ar_coeffs_y] + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] +%if WIN64 + DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 +%elif ARCH_X86_64 + DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 +%else ; x86-32 +%undef shiftd + DEFINE_ARGS buf, shift, min, val3, x, cf3, val0 +%define hd dword r0m +%define maxd dword minm +%endif +%if cpuflag(sse4) + pmovsxbw m4, m4 +%else + pxor m3, m3 + pcmpgtb m3, m4 + punpcklbw m4, m3 +%endif + pinsrw m4, [base+pw_1], 3 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd + mov hd, 70 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -76 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu m0, [bufq+xq*2-82*2-2] ; top/left + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 + punpcklwd m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar1 +%if WIN64 + POP r8 +%elif ARCH_X86_32 + POP r6 +%undef maxd +%undef hd +%endif +.ar0: + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -16*8 +%endif + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m0, [base+round_vals-12+shiftq*2] + pshuflw m0, m0, q0000 + movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 + pxor m2, m2 + punpcklwd m0, m2 + pcmpgtb m2, m6 + punpckhbw m3, m6, m2 + punpcklbw m6, m2 + pshufd m2, m6, q3333 + pshufd m1, m6, q2222 + pshufd m7, m6, q1111 + pshufd m6, m6, q0000 + pshufd m4, m3, q1111 + pshufd m3, m3, q0000 +%if ARCH_X86_64 + SWAP 0, 12 + SWAP 1, 8 + SWAP 2, 9 + SWAP 3, 10 + SWAP 4, 11 +%else +%define m12 [rsp+0*16] +%define m8 [rsp+1*16] +%define m9 [rsp+2*16] +%define m10 [rsp+3*16] +%define m11 [rsp+4*16] + mova m12, m0 + mova m8, m1 + mova m9, m2 + mova m10, m3 + mova m11, m4 + mov bdmaxd, bdmaxm +%endif + sar 
bdmaxd, 1 + SPLATW m0, bdmaxd ; max_grain + pcmpeqw m1, m1 +%if !cpuflag(sse4) + pcmpeqw m2, m2 + psrldq m2, 14 + pslldq m2, 2 + pxor m2, m1 +%endif + pxor m1, m0 ; min_grain +%if ARCH_X86_64 + SWAP 0, 13 + SWAP 1, 14 + SWAP 2, 15 +%else +%define m13 [rsp+5*16] +%define m14 [rsp+6*16] + mova m13, m0 + mova m14, m1 +%if !cpuflag(sse4) +%define m15 [rsp+7*16] + mova m15, m2 +%endif +%endif + sub bufq, 2*(82*73-(82*3+79)) + DEFINE_ARGS buf, fg_data, h, x + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + psrldq m2, m0, 2 + psrldq m3, m0, 4 + psrldq m4, m0, 6 + psrldq m5, m0, 8 + punpcklwd m0, m2 + punpcklwd m3, m4 + punpcklwd m5, m1 + psrldq m2, m1, 2 + psrldq m4, m1, 4 + punpcklwd m2, m4 + psrldq m4, m1, 6 + psrldq m1, 8 + punpcklwd m4, m1 + pmaddwd m0, m6 + pmaddwd m3, m7 + pmaddwd m5, m8 + pmaddwd m2, m9 + pmaddwd m4, m10 + paddd m0, m3 + paddd m5, m2 + paddd m0, m4 + paddd m0, m5 ; accumulated top 2 rows + paddd m0, m12 + + movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd m4, m1, q3321 + pxor m2, m2 + pcmpgtw m2, m4 + punpcklwd m4, m2 ; in dwords, y=0,x=[0,3] +.x_loop_ar2_inner: + pmaddwd m2, m1, m11 + paddd m2, m0 + psrldq m0, 4 ; shift top to next pixel + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + paddd m2, m4 + packssdw m2, m2 + pminsw m2, m13 + pmaxsw m2, m14 + psrldq m4, 4 + pslldq m2, 2 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000010b +%else + pand m1, m15 + pandn m3, m15, m2 + por m1, m3 +%endif + ; overwrite previous pixel, this should be ok + movd [bufq+xq*2-2], m1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar2 +%if ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +.ar3: + DEFINE_ARGS buf, fg_data, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~15 + sub rsp, 64 + %define tmp rsp +%elif ARCH_X86_64 + %define tmp rsp+stack_offset-72 +%else +%assign stack_offset stack_offset_old + ALLOC_STACK -16*12 + %define tmp rsp + mov bdmaxd, bdmaxm +%endif + sar bdmaxd, 1 + SPLATW m7, bdmaxd ; max_grain + pcmpeqw m6, m6 +%if !cpuflag(sse4) + pcmpeqw m4, m4 + psrldq m4, 14 + pslldq m4, 4 + pxor m4, m6 +%endif + pxor m6, m7 ; min_grain + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + +%if ARCH_X86_64 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m14 [rsp+10*16] +%define m15 [esp+11*16] + mova m14, m6 + mova m15, m7 +%endif + + ; build cf0-1 until 18-19 in m5-12 and r0/1 + pxor m1, m1 + movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + +%if cpuflag(sse4) + pshufd m4, m2, q3333 +%else + pshufd m5, m2, q3333 + mova [tmp+48], m5 +%endif + pshufd m3, m2, q2222 + pshufd m1, m2, q0000 + pshufd m2, m2, q1111 + pshufd m7, m0, q2222 + pshufd m6, m0, q1111 + pshufd m5, m0, q0000 + pshufd m0, m0, q3333 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 +%else +%define m8 [rsp+4*16] +%define m9 [esp+5*16] +%define m10 [rsp+6*16] +%define m11 [esp+7*16] +%define m12 [rsp+8*16] + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 +%endif + + ; build cf20,round in r2 + ; build cf21-23,round*2 in m13 + pxor m1, m1 + movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pcmpgtb m1, m0 + punpcklbw m0, m1 + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + mova [tmp+ 0], m1 + mova [tmp+16], m2 + psrldq m3, m0, 10 + pinsrw 
m3, [base+round_vals+shiftq*2-10], 3 + +%if ARCH_X86_64 + SWAP 3, 13 +%else +%define m13 [esp+9*16] + mova m13, m3 +%endif + + pinsrw m0, [base+round_vals+shiftq*2-12], 5 + pshufd m3, m0, q2222 + mova [tmp+32], m3 + + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] + palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] + palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] + punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] + + pmaddwd m0, m5 + pmaddwd m2, m6 + pmaddwd m3, m7 + paddd m0, m2 + paddd m0, m3 + ; m0 = top line first 6 multiplied by cf, m1 = top line last entry + + movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] + movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] + punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] + palignr m4, m3, m2, 2 ; y=-3,x=[-2,+5] + palignr m3, m3, m2, 4 ; y=-3,x=[-1,+6] + punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] + punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + + pmaddwd m1, m8 + pmaddwd m4, m9 + pmaddwd m3, m10 + pmaddwd m2, m11 + paddd m1, m4 + paddd m3, m2 + paddd m0, m1 + paddd m0, m3 + ; m0 = top 2 lines multiplied by cf + + movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] + palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] + palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] + punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] + punpcklwd m2, [base+pw_1] + +%if cpuflag(sse4) + pmaddwd m1, m12 +%else + pmaddwd m1, [tmp+48] +%endif + pmaddwd m3, [tmp+ 0] + pmaddwd m4, [tmp+16] + pmaddwd m2, [tmp+32] + paddd m1, m3 + paddd m4, m2 + paddd m0, m1 + paddd m0, m4 + ; m0 = top 3 lines multiplied by cf plus rounding for downshift + + movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd m2, m1, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + packssdw m2, m2 + pminsw m2, m15 + pmaxsw m2, m14 + pslldq m2, 4 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000100b +%else + pand m1, m12 + pandn m3, m12, m2 + por m1, m3 +%endif + ; overwrite a couple of pixels, should be ok + movq [bufq+xq*2-4], m1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%elif ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg +%define base r8-pb_mask + lea r8, [pb_mask] + movifnidn bdmaxd, bdmaxm + lea r6d, [bdmaxq+1] +%else +cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h +%define base r2-$$ + LEA r2, $$ + mov fg_dataq, r2m + mov r6d, r4m + inc r6d +%endif + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r5, r6 + SPLATW m6, 
[base+round+r5*2-2] + mova m5, [base+pb_mask] + SPLATW m0, [fg_dataq+FGData.seed] +%if ARCH_X86_64 + SPLATW m2, [base+pw_seed_xor+uvq*4] +%else + mov r5d, r3m + SPLATW m2, [base+pw_seed_xor+r5*4] +%endif + pxor m0, m2 +%if ARCH_X86_64 + lea r6, [gaussian_sequence] +%endif +%if %2 + mov hd, 73-35*%3 + add bufq, 44*2 +.loop_y: + mov xq, -44 +%else + mov xq, -82*73 + add bufq, 82*73*2 +%endif +.loop_x: + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m2, m3 + psllq m3, m2, 15 + por m2, m3 ; aggregate each bit into next seed's high bit + pmulhuw m3, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + vpgatherdw m3, m2, r6, r9, r10, 4, 2 +%else + vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 +%endif + paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw m3, m6 + movq [bufq+xq*2], m3 + add xq, 4 + jl .loop_x +%if %2 + add bufq, 82*2 + dec hd + jg .loop_y +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] + jmp r5 + +.ar0: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift +%assign stack_offset_old stack_offset + ALLOC_STACK -16*2 + mov bufyq, r1m + mov uvd, r3m +%endif + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + SPLATW m3, [base+hmul_bits+shiftq*2-10] +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m1, bdmaxd ; max_gain +%else + SPLATW m1, r4m + psraw m1, 1 +%endif + pcmpeqw m7, m7 + pxor m7, m1 ; min_grain +%if ARCH_X86_64 + SWAP 1, 14 + DEFINE_ARGS buf, bufy, h, x +%else +%define m14 [rsp+0*16] + mova m14, m1 + DEFINE_ARGS buf, bufy, pic_reg, h, x +%endif + pxor m5, m5 + pcmpgtb m5, m4 + punpcklbw m4, m5 +%if %2 + SPLATW m6, [base+hmul_bits+2+%3*2] +%endif + SPLATW m4, m4 + pxor m5, m5 +%if %2 +%if !cpuflag(sse4) + pcmpeqw m2, m2 + pslldq m2, 12 +%if ARCH_X86_64 + SWAP 2, 12 +%else +%define m12 [rsp+1*16] + mova m12, m2 +%endif +%endif +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif + add bufyq, 2*(3+82*3) + mov hd, 70-35*%3 +.y_loop_ar0: + ; first 32 pixels + xor xd, xd +.x_loop_ar0: + movu m0, [bufyq+xq*(2<<%2)] +%if %2 +%if %3 + movu m2, [bufyq+xq*4+82*2] + paddw m0, m2 +%endif + movu m1, [bufyq+xq*4 +16] +%if %3 + movu m2, [bufyq+xq*4+82*2+16] + paddw m1, m2 +%endif + phaddw m0, m1 + pmulhrsw m0, m6 +%endif + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + REPX {pmaddwd x, m4}, m0, m1 + REPX {psrad x, 5}, m0, m1 + packssdw m0, m1 + pmulhrsw m0, m3 + movu m1, [bufq+xq*2] + paddw m0, m1 + pminsw m0, m14 + pmaxsw m0, m7 + cmp xd, 72-40*%2 + je .end + movu [bufq+xq*2], m0 + add xd, 8 + jmp .x_loop_ar0 + + ; last 6/4 pixels +.end: +%if %2 +%if cpuflag(sse4) + pblendw m0, m1, 11000000b +%else + pand m1, m12 + pandn m2, m12, m0 + por m0, m1, m2 +%endif + movu [bufq+xq*2], m0 +%else + movq [bufq+xq*2], m0 +%endif + + add bufq, 82*2 + add bufyq, 82*(2<<%3) + dec hd + jg .y_loop_ar0 +%if ARCH_X86_32 +%undef m12 +%undef m14 +%endif + RET + +.ar1: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x +%else +%assign stack_offset stack_offset_old +%xdefine rstk rsp +%assign 
stack_size_padded 0 + DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 + mov bufyq, r1m + mov uvd, r3m +%endif + imul uvd, 28 + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] +%if WIN64 + DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 +%if %2 + lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] +%else + lea bufq, [r0-2*(82*69+3)] +%endif +%else +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 +%else + DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 +%define hd dword r1m +%define mind dword r3m +%define maxd dword r4m +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif +%endif +%if ARCH_X86_64 + mov shiftd, [r2+FGData.ar_coeff_shift] +%else + mov shiftd, [r3+FGData.ar_coeff_shift] +%endif + pxor m5, m5 + pcmpgtb m5, m4 + punpcklbw m4, m5 ; cf0-4 in words + pshuflw m4, m4, q2100 + psrldq m4, 2 ; cf0-3,4 in words + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + movd m3, [base+round_vals+shiftq*2-12] ; rnd + pxor m6, m6 + punpcklwd m3, m6 +%if %2 + SPLATW m6, [base+hmul_bits+2+%3*2] +%endif + SPLATD m3, m3 + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 + sar maxd, 1 +%if ARCH_X86_64 + mov mind, maxd + xor mind, -1 +%else + DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 + mov r2, maxd + xor r2, -1 + mov mind, r2 +%endif +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu m0, [bufq+xq*2-82*2-2] ; top/left +%if %2 + movu m7, [bufyq+xq*4] +%if %3 + movu m1, [bufyq+xq*4+82*2] + phaddw m7, m1 +%else + phaddw m7, m7 +%endif +%else + movq m7, [bufyq+xq*2] +%endif + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 +%if %2 +%if %3 + pshufd m2, m7, q3232 + paddw m7, m2 +%endif + pmulhrsw m7, m6 +%endif + punpcklwd m1, m7 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 + paddd m0, m3 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar1 +%if ARCH_X86_32 +%undef maxd +%undef mind +%undef hd +%endif + RET + +.ar2: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift + ALLOC_STACK -16*8 + mov bufyq, r1m + mov uvd, r3m +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m5, bdmaxd ; max_grain +%else + SPLATW m5, r4m + psraw m5, 1 +%endif + pcmpeqw m6, m6 +%if !cpuflag(sse4) + pcmpeqw m7, m7 + psrldq m7, 14 + pslldq m7, 2 + pxor m7, m6 +%endif + pxor m6, m5 ; min_grain +%if %2 && cpuflag(sse4) + SPLATW m7, [base+hmul_bits+2+%3*2] +%endif + +%if ARCH_X86_64 + SWAP 5, 13 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m13 [rsp+5*16] +%define m14 [rsp+6*16] +%define m15 [rsp+7*16] + mova m13, m5 + mova m14, m6 + mova m15, m7 +%endif + + ; coef values + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pinsrw m2, [base+round_vals-12+shiftq*2], 5 + + pshufd m6, m0, q0000 + pshufd m7, m0, q1111 + pshufd m1, m0, q3333 + pshufd m0, m0, q2222 + pshufd m3, m2, q1111 + pshufd m4, m2, 
q2222 + pshufd m2, m2, q0000 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 +%else +%define m8 [rsp+0*16] +%define m9 [rsp+1*16] +%define m10 [rsp+2*16] +%define m11 [rsp+3*16] +%define m12 [rsp+4*16] + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 +%endif + +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, h, x +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + psrldq m4, m0, 2 ; y=-2,x=[-1,+5] + psrldq m1, m0, 4 ; y=-2,x=[-0,+5] + psrldq m3, m0, 6 ; y=-2,x=[+1,+5] + psrldq m2, m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] + pmaddwd m0, m6 + pmaddwd m1, m7 + pmaddwd m2, m8 + paddd m0, m1 + paddd m0, m2 + psrldq m3, m5, 2 ; y=-1,x=[-1,+5] + psrldq m1, m5, 4 ; y=-1,x=[-0,+5] + psrldq m4, m5, 6 ; y=-1,x=[+1,+5] + psrldq m2, m5, 8 ; y=-1,x=[+2,+5] + punpcklwd m3, m1 + punpcklwd m4, m2 + pmaddwd m3, m9 + pmaddwd m4, m10 + paddd m3, m4 + paddd m0, m3 + + ; luma component & rounding +%if %2 + movu m1, [bufyq+xq*4] +%if %3 + movu m2, [bufyq+xq*4+82*2] + phaddw m1, m2 + pshufd m2, m1, q3232 + paddw m1, m2 +%else + phaddw m1, m1 +%endif +%if cpuflag(sse4) + pmulhrsw m1, m15 +%elif %3 + pmulhrsw m1, [base+pw_8192] +%else + pmulhrsw m1, [base+pw_16384] +%endif +%else + movq m1, [bufyq+xq*2] +%endif + punpcklwd m1, [base+pw_1] + pmaddwd m1, m12 + paddd m0, m1 + + movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd m2, m1, q3321 + pxor m3, m3 + pcmpgtw m3, m2 + punpcklwd m2, m3 ; y=0,x=[0,3] in dword +.x_loop_ar2_inner: + pmaddwd m3, m1, m11 + paddd m3, m0 + psrldq m0, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + ; we do not need to packssdw since we only care about one value + paddd m3, m2 + packssdw m3, m3 + pminsw m3, m13 + pmaxsw m3, m14 + psrldq m1, 2 + pslldq m3, 2 + psrldq m2, 4 +%if cpuflag(sse4) + pblendw m1, m3, 00000010b +%else + pand m1, m15 + pandn m4, m15, m3 + por m1, m4 +%endif + ; overwrite previous pixel, should be ok + movd [bufq+xq*2-2], m1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar2 +%if ARCH_X86_32 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +.ar3: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~15 + sub rsp, 96 + %define tmp rsp +%else + %define tmp rsp+stack_offset-120 +%endif +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift +%assign stack_offset stack_offset_old + ALLOC_STACK -16*14 + mov bufyq, r1m + mov uvd, r3m + %define tmp rsp +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + SPLATW m4, [base+round_vals-12+shiftq*2] + pxor m5, m5 + pcmpgtw m5, m4 + punpcklwd m4, m5 +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m6, bdmaxd ; max_grain +%else + SPLATW m6, r4m + psraw m6, 1 +%endif + pcmpeqw m7, m7 +%if !cpuflag(sse4) + pcmpeqw m3, m3 + psrldq m3, 14 + pslldq m3, 4 + pxor m3, m7 +%endif + pxor m7, m6 ; min_grain +%if %2 && cpuflag(sse4) + SPLATW m3, [base+hmul_bits+2+%3*2] +%endif + +%if ARCH_X86_64 + SWAP 3, 11 + SWAP 4, 12 + SWAP 6, 14 + SWAP 7, 15 
+%else +%define m11 [rsp+ 9*16] +%define m12 [rsp+10*16] +%define m14 [rsp+12*16] +%define m15 [rsp+13*16] + mova m11, m3 + mova m12, m4 + mova m14, m6 + mova m15, m7 +%endif + + ; cf from y=-3,x=-3 until y=-3,x=-2 + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pshufd m1, m0, q0000 + pshufd m3, m0, q1111 + pshufd m4, m0, q2222 + pshufd m0, m0, q3333 + pshufd m5, m2, q0000 + pshufd m6, m2, q1111 + mova [tmp+16*0], m1 + mova [tmp+16*1], m3 + mova [tmp+16*2], m4 + mova [tmp+16*3], m0 + mova [tmp+16*4], m5 + mova [tmp+16*5], m6 + pshufd m6, m2, q2222 + pshufd m7, m2, q3333 + + ; cf from y=-1,x=-1 to y=0,x=-1 + luma component + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 ; luma + punpcklbw m0, m1 + pshufd m3, m0, q3232 + psrldq m5, m0, 10 + ; y=0,x=[-3 to -1] + "1.0" for current pixel + pinsrw m5, [base+round_vals-10+shiftq*2], 3 + ; y=-1,x=[-1 to +2] + pshufd m1, m0, q0000 + pshufd m0, m0, q1111 + ; y=-1,x=+3 + luma + punpcklwd m3, m2 + pshufd m3, m3, q0000 + +%if ARCH_X86_64 + SWAP 1, 8 + SWAP 0, 9 + SWAP 3, 10 + SWAP 5, 13 + DEFINE_ARGS buf, bufy, fg_data, h, x +%else +%define m8 [rsp+ 6*16] +%define m9 [rsp+ 7*16] +%define m10 [rsp+ 8*16] +%define m13 [rsp+11*16] + mova m8, m1 + mova m9, m0 + mova m10, m3 + mova m13, m5 + DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + ; first line + movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] + palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] + palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] + punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] + + pmaddwd m0, [tmp+0*16] + pmaddwd m2, [tmp+1*16] + pmaddwd m3, [tmp+2*16] + paddd m0, m2 + paddd m0, m3 ; first 6 x of top y + + ; second line [m0/1 are busy] + movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] + movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] + punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] + palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5] + palignr m3, m3, m2, 4 ; y=-2,x=[-2,+5] + punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] + punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + shufps m3, m4, m5, q1032 ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, [tmp+3*16] + pmaddwd m4, [tmp+4*16] + pmaddwd m3, [tmp+5*16] + pmaddwd m5, m6 + paddd m1, m4 + paddd m3, m5 + paddd m0, m1 + paddd m0, m3 ; top 2 lines + + ; third line [m0 is busy] & luma + round + movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] +%if %2 + movu m5, [bufyq+xq*4] +%if %3 + movu m4, [bufyq+xq*4+82*2] + phaddw m5, m4 +%else + phaddw m5, m5 +%endif +%else + movq m5, [bufyq+xq*2] +%endif + palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] + palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] +%if %3 + pshufd m4, m5, q3232 + paddw m5, m4 +%endif +%if %2 +%if cpuflag(sse4) + pmulhrsw m5, m11 +%elif %3 + pmulhrsw m5, [base+pw_8192] +%else + pmulhrsw m5, [base+pw_16384] +%endif +%endif + punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] + punpcklwd m2, m5 + pmaddwd m1, m7 + pmaddwd m3, m8 + pmaddwd m4, m9 + 
pmaddwd m2, m10 + paddd m1, m3 + paddd m4, m2 + paddd m0, m12 ; += round + paddd m1, m4 + paddd m0, m1 + + movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd m2, m1, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + packssdw m2, m2 + pminsw m2, m14 + pmaxsw m2, m15 + pslldq m2, 4 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000100b +%else + pand m1, m11 + pandn m3, m11, m2 + por m1, m3 +%endif + ; overwrite previous pixels, should be ok + movq [bufq+xq*2-4], m1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%elif ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \ + dst, src, scaling, unused1, fg_data, picptr, unused2 + ; copy stack arguments to new position post-alignment, so that we + ; don't have to keep the old stack location in a separate register + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov r5, r8m + +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] + + mov r0m, r0 + mov r2m, r1 + mov r4m, r2 + mov r6m, r3 + mov r7m, r4 + mov r8m, r5 +%else +cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \ + dst, src, scaling, unused1, fg_data, picptr, unused2 +%endif + mov srcq, srcm + mov scalingq, r5m + mov fg_dataq, r3m +%if STACK_ALIGNMENT < mmsize + mov r6, r9m + +%define r9m [rsp+8*mmsize+ 4*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] + + mov r9m, r6 +%endif + LEA r5, $$ +%define base r5-$$ + mov r5m, picptrq +%else +cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut + lea r8, [pb_mask] +%define base r8-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + SPLATW m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] +%if ARCH_X86_32 + DECLARE_REG_TMP 0, 3 +%else + DECLARE_REG_TMP 9, 10 +%endif + mov t0d, r9m ; bdmax + sar t0d, 11 ; is_12bpc + inc t0d + mov t1d, r6d + imul t1d, t0d + dec t0d + SPLATW m5, [base+min+t1*2] + lea t0d, [t0d*3] + lea t0d, [r6d*2+t0d] + SPLATW m4, [base+max+t0*2] + SPLATW m2, r9m + + pcmpeqw m1, m1 + psraw m7, m2, 1 ; max_grain + pxor m1, m7 ; min_grain + SPLATD m6, [base+pd_16] + + SCRATCH 1, 9, 0 + SCRATCH 2, 10, 1 + SCRATCH 3, 11, 2 + SCRATCH 4, 12, 3 + SCRATCH 5, 13, 4 + SCRATCH 6, 14, 5 + SCRATCH 7, 15, 6 + + mova m6, [base+pw_27_17_17_27] ; for horizontal filter + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2 + DECLARE_REG_TMP 0 +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see + DECLARE_REG_TMP 7 +%endif + + mov sbyd, r8m + movzx t0d, byte [fg_dataq+FGData.overlap_flag] + test t0d, t0d + jz .no_vertical_overlap + test sbyd, sbyd + jnz 
.vertical_overlap +.no_vertical_overlap: + mov dword r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak +%endif + + lea src_bakq, [srcq+wq*2] + mov r9mp, src_bakq + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r4m, wq +%endif + +.loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak +%endif + +.loop_x_odd: + movzx hd, word r7m + mov grain_lutq, grain_lutmp +.loop_y: + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4 + vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4 +%else + vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4 + vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4 +%endif + REPX {psrlw x, 8}, m2, m3 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+offxyq*2+16] + + ; noise = round2(scaling[src] * grain, scaling_shift) + REPX {pmullw x, m11}, m2, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp ; src += stride + add grain_lutq, 82*2 + dec hd + jg .loop_y + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + btc dword r8m, 2 + jc .next_blk + add offxyd, 16 + test dword r8m, 2 + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r12d, 16 ; top_offxy += 16 +%endif + jmp .loop_x_odd_v_overlap + +.next_blk: + test dword r8m, 1 + jz .loop_x + + ; r8m = sbym + test dword r8m, 2 + jnz .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: +%if ARCH_X86_32 + add offxyd, 16 + mov [rsp+8*mmsize+0*gprsize], offxyd + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + mov seed, r3m +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + + 
mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy +%endif + + mov hd, dword r7m + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m5, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] + movd m4, [grain_lutq+r5*2] +%else + movd m4, [grain_lutq+left_offxyq*2] +%endif + punpcklwd m4, m5 + pmaddwd m4, m6 + paddd m4, m14 + psrad m4, 5 + packssdw m4, m4 + pminsw m4, m15 + pmaxsw m4, m9 + shufps m4, m5, q3210 + + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5 + vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5 + vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5 +%endif + REPX {psrlw x, 8}, m2, m3 + + ; noise = round2(scaling[src] * grain, scaling_shift) + movu m5, [grain_lutq+offxyq*2+16] + REPX {pmullw x, m11}, m2, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hd + jg .loop_y_h_overlap + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + or dword r8m, 4 + add offxyd, 16 + + ; r8m = sbym + test dword r8m, 2 + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r12d, 16 ; top_offxy += 16 +%endif + jmp .loop_x_odd_v_overlap + +.end: + RET + +.vertical_overlap: + or t0d, 2 + mov r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see +%endif + + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul t0d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add t0d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and t0d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, t0d +%if ARCH_X86_32 + xor sbyd, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak +%endif + + lea src_bakq, [srcq+wq*2] + mov r9mp, src_bakq + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r4m, wq +%endif + +.loop_x_v_overlap: +%if ARCH_X86_32 + mov r5, r5m + SPLATD m7, [base+pw_27_17_17_27] + mov seed, r3m +%else + SPLATD m7, [pw_27_17_17_27] +%endif + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + 
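For reference, the block above advances both 16-bit halves of the packed (cur_seed << 16) | top_seed value by one step of the grain RNG, a 16-bit LFSR: or-ing with 0xeff4eff4 leaves only bits 0, 1, 3 and 12 of each half live (the complement of the mask is 0x100b, matching rnd_next_upperbit_mask), so each parity test recovers that half's feedback bit. A scalar sketch of a single step, with an illustrative name:

static inline unsigned lfsr_step(const unsigned s) { /* 16-bit RNG state */
    const unsigned bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
    return ((s >> 1) | (bit << 15)) & 0xffff;
}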
+%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, unused, top_offxy + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, unused, top_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +.loop_x_odd_v_overlap: +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)] + mov hd, dword r7m + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+1*gprsize] + movu m2, [grain_lutq+r5*2] +%else + movu m2, [grain_lutq+top_offxyq*2] +%endif + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + REPX {pmaddwd x, m7}, m4, m2 + REPX {paddd x, m14}, m4, m2 + REPX {psrad x, 5}, m4, m2 + packssdw m2, m4 + pminsw m2, m15 + pmaxsw m2, m9 + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m3, [grain_lutq+r5*2+16] +%else + movu m3, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m5, m3, m4 + punpcklwd m3, m4 + REPX {pmaddwd x, m7}, m5, m3 + REPX {paddd x, m14}, m5, m3 + REPX {psrad x, 5}, m5, m3 + packssdw m3, m5 + pminsw m3, m15 + pmaxsw m3, m9 + + ; src + pand m0, m10, [srcq+ 0] ; m0-1: src as word + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + ; noise = round2(scaling[src] * grain, scaling_shift) +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5 +%endif + psrlw m4, 8 + pmullw m4, m11 + pmulhrsw m4, m2 +%if ARCH_X86_32 + vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2 +%else + vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2 +%endif + psrlw m5, 8 + pmullw m5, m11 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + btc dword r8m, 2 + jc .next_blk_v +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + jmp .loop_x_odd_v_overlap + +.next_blk_v: + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +.loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r0, [rsp+8*mmsize+1*gprsize] + add r3, 16 + add 
r0, 16 + mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy + mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy + + mov seed, r3m + xor r0, r0 +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)] + + movzx hd, word r7m + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m2, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy + mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy + movu m4, [grain_lutq+r0*2] + movd m5, [grain_lutq+r5*2] + mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy + movd m3, [grain_lutq+r5*2] +%else + movu m4, [grain_lutq+top_offxyq*2] + movd m5, [grain_lutq+left_offxyq*2] + movd m3, [grain_lutq+topleft_offxyq*2] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklwd m5, m2 + punpcklwd m3, m4 + REPX {pmaddwd x, m6}, m5, m3 + REPX {paddd x, m14}, m5, m3 + REPX {psrad x, 5}, m5, m3 + packssdw m5, m3 + pminsw m5, m15 + pmaxsw m5, m9 + shufps m3, m5, m2, q3210 + shufps m5, m4, q3232 + ; followed by v interpolation (top | cur -> cur) + movu m0, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m1, [grain_lutq+r0*2+16] +%else + movu m1, [grain_lutq+top_offxyq*2+16] +%endif + punpcklwd m2, m5, m3 + punpckhwd m5, m3 + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + REPX {pmaddwd x, m7}, m2, m5, m3, m1 + REPX {paddd x, m14}, m2, m5, m3, m1 + REPX {psrad x, 5}, m2, m5, m3, m1 + packssdw m2, m5 + packssdw m3, m1 + REPX {pminsw x, m15}, m2, m3 + REPX {pmaxsw x, m9}, m2, m3 + + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + ; noise = round2(scaling[src] * grain, scaling_shift) +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5 +%endif + psrlw m4, 8 + pmullw m4, m11 + pmulhrsw m2, m4 +%if ARCH_X86_32 + vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4 +%else + vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4 +%endif + psrlw m5, 8 + pmullw m5, m11 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 
0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: + or dword r8m, 4 +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov r5, r5m + add offxyd, 16 + add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + add offxyd, 16 + add top_offxyd, 16 + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + jmp .loop_x_odd_v_overlap + +.end_hv: + RET +%if ARCH_X86_32 + DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +%endif + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +INIT_XMM ssse3 +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ + tmp, src, scaling, h, fg_data, picptr, unused + mov r0, r0m + mov r1, r1m + mov r2, r2m + mov r4, r3m + mov r3, r4m + mov r5, r5m +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r1m [rsp+8*mmsize+ 4*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] + mov r0m, r0 + mov r2m, r2 + mov r4m, r3 + mov r5m, r5 + + mov r0, r6m + mov r2, r7m + mov r3, r8m + mov r5, r9m +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] +%define r9m [rsp+8*mmsize+12*gprsize] + mov r6m, r0 + mov r7m, r2 + mov r8m, r3 + mov r9m, r5 + + mov r2, r10m + mov r3, r11m + mov r5, r12m + mov r0, r13m +%define r10m [rsp+8*mmsize+13*gprsize] +%define r11m [rsp+8*mmsize+14*gprsize] +%define r12m [rsp+8*mmsize+15*gprsize] + mov r10m, r2 + mov r11m, r3 + mov r12m, r5 + + SPLATW m2, r13m +%else +cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ + tmp, src, scaling, h, fg_data, picptr, unused + mov srcq, srcm + mov fg_dataq, r3m +%endif + LEA r5, $$ +%define base r5-$$ + + DECLARE_REG_TMP 0, 2, 3 +%else +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r8-pb_mask + lea r8, [pb_mask] + + DECLARE_REG_TMP 9, 10, 11 +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + SPLATW m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] +%if STACK_ALIGNMENT >= mmsize + mov t0d, r13m ; bdmax +%endif + sar t0d, 11 ; is_12bpc + inc t0d + mov t1d, r6d + imul t1d, t0d + dec t0d + SPLATW m5, [base+min+t1*2] + lea t1d, [t0d*3] + mov t2d, r12m + inc t2d + imul r6d, t2d + add t1d, r6d + SPLATW m4, [base+max+t1*2] +%if STACK_ALIGNMENT >= mmsize + SPLATW m2, r13m +%endif + + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + +%define mzero m7 + +%if %3 + SPLATD m2, [base+pw_23_22] +%endif + +%if ARCH_X86_32 + mov scalingq, r5m + mov r5m, r5 +%else + mov r13mp, strideq +%endif + + pcmpeqw m0, m0 + psraw m1, m10, 1 + pxor m0, m1 + + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap + + DECLARE_REG_TMP 0 +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 
+ + DECLARE_REG_TMP 9 +%endif + +%if %1 + mov r6d, r11m + SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] + SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] + punpcklwd m6, m1, m0 + SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] + SPLATD m7, [base+pw_4+t0*4] + pmullw m5, m7 +%else + SPLATD m6, [base+pd_16] +%if %2 + mova m5, [base+pw_23_22] +%else + mova m5, [base+pw_27_17_17_27] +%endif +%endif + + SCRATCH 6, 14, 6 + SCRATCH 5, 15, 7 + +%if ARCH_X86_32 + DECLARE_REG_TMP 0 +%else + DECLARE_REG_TMP 7 +%endif + + mov sbyd, r8m + mov t0d, [fg_dataq+FGData.overlap_flag] + test t0d, t0d + jz %%no_vertical_overlap + test sbyd, sbyd + jnz %%vertical_overlap + +%%no_vertical_overlap: + mov r8m, t0d +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, luma + + mov dstq, r0mp + mov lumaq, r9mp + mov wq, r4m + lea r3, [srcq+wq*2] + mov r1mp, r3 + lea r3, [dstq+wq*2] + mov r11mp, r3 + lea r3, [lumaq+wq*(2<<%2)] + mov r12mp, r3 +%if %3 + shl r10mp, 1 +%endif +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused2, unused3, see, unused4, unused5, unused6, luma, lstride + + mov lstrideq, r10mp +%if %3 + add lstrideq, lstrideq +%endif + mov lumaq, r9mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 +%endif + neg wq +%if ARCH_X86_32 + mov r4mp, wq +%endif + +%%loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, unused2, unused3, luma, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, unused2, unused3, luma, lstride +%endif + +%if %2 == 0 +%%loop_x_odd: +%endif + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y: + ; src + mova m0, [srcq] + mova m1, [srcq+16] ; m0-1: src as word + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9m +%endif + mova m4, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m4, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9m, lumaq +%endif +%if %2 + pavgw m4, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, mzero}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m4, m6 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 + vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m3, m4, 
scalingq-1, r10, r12, 8, 1 + vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m3, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movu m6, [grain_lutq+offxyq*2+16] + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m3, m5 + pmulhrsw m4, m3 + pmulhrsw m6, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hd + jg %%loop_y + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma + + mov wq, r4mp +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0m, dstq + mov r9m, lumaq + mov r4m, wq +%endif +%if %2 == 0 + btc dword r8m, 2 + jc %%next_blk + add offxyd, 16 + test dword r8m, 2 + jz %%loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + jmp %%loop_x_odd_v_overlap +%%next_blk: +%endif + test dword r8m, 1 + je %%loop_x + + ; r8m = sbym + test dword r8m, 2 + jnz %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: +%if ARCH_X86_32 + add offxyd, 16 + mov [rsp+8*mmsize+0*gprsize], offxyd + + DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, luma, lstride + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, luma, lstride +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + mov lumaq, r9m +%endif + mova m4, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m4, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9m, lumaq +%endif +%if %2 + pavgw m4, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, mzero}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m4, m6 +%endif + + ; grain = grain_lut[offy+y][offx+x] + movu m7, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] + movd m5, [grain_lutq+r5*2] +%else + 
movd m5, [grain_lutq+left_offxyq*2+ 0] +%endif + punpcklwd m5, m7 ; {left0, cur0} +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %2 + pmaddwd m5, [PIC_ptr(pw_23_22)] +%else + pmaddwd m5, [PIC_ptr(pw_27_17_17_27)] +%endif + paddd m5, [PIC_ptr(pd_16)] +%else + pmaddwd m5, m15 + paddd m5, m14 +%endif + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, m9 + shufps m5, m7, q3210 + movu m3, [grain_lutq+offxyq*2+16] + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1 + vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 + vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m7, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m4 + pmulhrsw m5, m7 + pmulhrsw m3, m4 + + ; dst = clip_pixel(src, noise) + paddw m0, m5 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hd + jg %%loop_y_h_overlap + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + mov wq, r4mp +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0mp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif + +%if %2 + ; r8m = sbym + test dword r8m, 2 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap +%else + or dword r8m, 4 + add offxyd, 16 + + ; r8m = sbym + test dword r8m, 2 + jz %%loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxy += 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%end: + RET + +%%vertical_overlap: + or t0d, 2 + mov r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, unused1, unused2, unused3, lstride +%endif + + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul t0d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add t0d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and t0d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, t0d +%if ARCH_X86_32 + xor sbyd, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, luma + + mov r3m, seed + mov dstq, r0mp + mov lumaq, r9mp + mov wq, r4m + lea r3, [srcq+wq*2] + mov r1mp, r3 + lea r3, [dstq+wq*2] + mov r11mp, r3 + lea r3, [lumaq+wq*(2<<%2)] + mov r12mp, r3 +%if %3 + shl r10mp, 1 +%endif +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, unused3, unused4, unused5, luma, lstride + + mov lstrideq, r10mp +%if %3 + add lstrideq, lstrideq +%endif + mov lumaq, r9mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 +%endif + neg wq +%if ARCH_X86_32 + mov r4m, wq +%endif + +%%loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m + xor t0d, t0d +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov 
r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, top_offxy, unused2, luma, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, top_offxy, unused2, luma, lstride +%endif + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if %2 == 0 +%%loop_x_odd_v_overlap: +%endif +%if %3 == 0 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)] +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy + movu m5, [grain_lutq+r0*2] +%else + movu m5, [grain_lutq+top_offxyq*2] +%endif + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif + REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m3, m5, m7 + pmaxsw m3, m8 + pminsw m3, m9 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m5, [grain_lutq+r0*2+16] +%else + movu m5, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m7, m5, m4 + punpcklwd m5, m4 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m4, m5, m7 + pmaxsw m4, m8 + pminsw m4, m9 + + ; src + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9mp +%endif + mova m5, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m5, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif +%if %2 + pavgw m5, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m7, m5, m0 + punpcklwd m5, m0 + REPX {pmaddwd x, m14}, m7, m5 + REPX {psrad x, 6}, m7, m5 + packssdw m5, m7 + punpckhwd m7, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m7, m6 + REPX {psrad x, 6}, m7, m6 + packssdw m6, m7 + pxor mzero, mzero + REPX {paddw x, m15}, m5, m6 + REPX {pmaxsw x, mzero}, m5, m6 + REPX {pminsw x, m10}, m5, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m5, m6 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1 + vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1 + vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; noise = 
round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m5 + pmulhrsw m3, m7 + pmulhrsw m4, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + + dec hw + jle %%end_y_v_overlap +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 +%if %3 + jmp %%loop_y +%else + btc hd, 16 + jc %%loop_y +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] + jmp %%loop_y_v_overlap +%endif + +%%end_y_v_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0mp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif + +%if %2 + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%else + btc dword r8m, 2 + jc %%loop_x_hv_overlap + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut + + mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy + add offxyd, 16 + add t0d, 16 + mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd + mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd + + DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m + xor t0d, t0d +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride +%endif + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if %3 == 0 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)] +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy + mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy + movd m5, [grain_lutq+r5*2] +%else + movd 
m5, [grain_lutq+left_offxyq*2] +%endif + movu m7, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+2*gprsize] + movu m4, [grain_lutq+r0*2] +%if %2 + pinsrw m5, [grain_lutq+r5*2], 2 +%else + movd m3, [grain_lutq+r5*2] +%endif +%else + movu m4, [grain_lutq+top_offxyq*2] +%if %2 + pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left } +%else + movd m3, [grain_lutq+topleft_offxyq*2] +%endif +%endif +%if %2 == 0 + punpckldq m5, m3 +%endif + punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 } + punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 } +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %2 + movddup m0, [PIC_ptr(pw_23_22)] +%else + movddup m0, [PIC_ptr(pw_27_17_17_27)] +%endif +%else + pshufd m0, m15, q1010 +%endif + pmaddwd m5, m0 +%if %1 + paddd m5, [PIC_ptr(pd_16)] +%else + paddd m5, m14 +%endif + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, m9 + shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3 + shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter + shufps m5, m4, q3231 ; top0-7 post-h_filter + + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7 +%else + REPX {paddd x, m14}, m5, m7 +%endif + REPX {psrad x, 5}, m5, m7 + packssdw m3, m5, m7 + pmaxsw m3, m8 + pminsw m3, m9 + + ; right half + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m0, [grain_lutq+r0*2+16] +%else + movu m0, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m1, m0, m4 + punpcklwd m0, m4 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m1, m0 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0 +%else + REPX {paddd x, m14}, m1, m0 +%endif + REPX {psrad x, 5}, m1, m0 + packssdw m4, m0, m1 + pmaxsw m4, m8 + pminsw m4, m9 + + ; src + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9mp +%endif + mova m6, [lumaq+ 0] + mova m5, [lumaq+(16<<%2)] +%if %2 + phaddw m6, [lumaq+16] + phaddw m5, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif +%if %2 + pavgw m6, mzero + pavgw m5, mzero +%endif + +%if %1 + punpckhwd m7, m6, m0 + punpcklwd m6, m0 + REPX {pmaddwd x, m14}, m7, m6 + REPX {psrad x, 6}, m7, m6 + packssdw m6, m7 + punpckhwd m7, m5, m1 + punpcklwd m5, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m7, m5 + REPX {psrad x, 6}, m7, m5 + packssdw m5, m7 + pxor mzero, mzero + REPX {paddw x, m15}, m6, m5 + REPX {pmaxsw x, mzero}, m6, m5 + REPX {pminsw x, m10}, m6, m5 ; clip_pixel() +%else + REPX {pand x, m10}, m6, m5 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1 + vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1 +%else +%if %3 == 0 + ; register shortage :) + push r12 +%endif + vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 + vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1 +%if %3 == 0 + pop r12 +%endif +%endif + REPX {psrlw x, 8}, m7, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m6 + pmulhrsw m3, m7 + pmulhrsw m4, m6 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hw +%if %3 + jg %%loop_y_h_overlap +%else + jle 
%%end_y_hv_overlap + btc hd, 16 + jc %%loop_y_h_overlap +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] + jmp %%loop_y_hv_overlap +%%end_y_hv_overlap: +%endif +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov dstmp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif +%if %2 + jmp %%loop_x_hv_overlap +%else + or dword r8m, 4 + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxy += 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 diff -Nru dav1d-0.9.0/src/x86/film_grain_avx2.asm dav1d-0.9.1/src/x86/film_grain_avx2.asm --- dav1d-0.9.0/src/x86/film_grain_avx2.asm 2021-05-16 16:47:22.546950800 +0000 +++ dav1d-0.9.1/src/x86/film_grain_avx2.asm 2021-07-28 21:38:28.893852000 +0000 @@ -1,4 +1,4 @@ -; Copyright © 2019, VideoLAN and dav1d authors +; Copyright © 2019-2021, VideoLAN and dav1d authors ; Copyright © 2019, Two Orioles, LLC ; All rights reserved. ; @@ -38,7 +38,8 @@ pw_seed_xor: times 2 dw 0xb524 times 2 dw 0x49d8 pd_m65536: dd ~0xffff -pb_23_22: times 2 db 23, 22 +pb_23_22: db 23, 22 + times 3 db 0, 32 pb_1: times 4 db 1 hmul_bits: dw 32768, 16384, 8192, 4096 round: dw 2048, 1024, 512 @@ -47,24 +48,25 @@ max: dw 255, 240, 235 min: dw 0, 16 pb_27_17_17_27: db 27, 17, 17, 27 + times 2 db 0, 32 pw_1: dw 1 -%macro JMP_TABLE 1-* - %xdefine %1_table %%table - %xdefine %%base %1_table - %xdefine %%prefix mangle(private_prefix %+ _%1) +%macro JMP_TABLE 2-* + %xdefine %1_8bpc_%2_table %%table + %xdefine %%base %1_8bpc_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: - %rep %0 - 1 - dd %%prefix %+ .ar%2 - %%base + %rep %0 - 2 + dd %%prefix %+ .ar%3 - %%base %rotate 1 %endrep %endmacro ALIGN 4 -JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3 struc FGData .seed: resd 1 @@ -90,8 +92,16 @@ SECTION .text +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + INIT_XMM avx2 -cglobal generate_grain_y, 2, 9, 16, buf, fg_data +cglobal generate_grain_y_8bpc, 2, 9, 16, buf, fg_data lea r4, [pb_mask] %define base r4-pb_mask movq xm1, [base+rnd_next_upperbit_mask] @@ -132,8 +142,8 @@ ; auto-regression code movsxd r2, [fg_dataq+FGData.ar_coeff_lag] - movsxd r2, [base+generate_grain_y_avx2_table+r2*4] - lea r2, [r2+base+generate_grain_y_avx2_table] + movsxd r2, [base+generate_grain_y_8bpc_avx2_table+r2*4] + lea r2, [r2+base+generate_grain_y_8bpc_avx2_table] jmp r2 .ar1: @@ -420,7 +430,7 @@ %macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM avx2 -cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv +cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv 
lea r4, [pb_mask] %define base r4-pb_mask movq xm1, [base+rnd_next_upperbit_mask] @@ -478,8 +488,8 @@ ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_%1_avx2_table+r5*4] - lea r5, [r5+base+generate_grain_uv_%1_avx2_table] + movsxd r5, [base+generate_grain_uv_%1_8bpc_avx2_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_8bpc_avx2_table] jmp r5 .ar0: @@ -975,7 +985,7 @@ generate_grain_uv_fn 444, 0, 0 INIT_YMM avx2 -cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut +cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut pcmpeqw m10, m10 psrld m10, 24 mov r7d, [fg_dataq+FGData.scaling_shift] @@ -1092,12 +1102,12 @@ jz .loop_x ; r8m = sbym - movd xm15, [pb_27_17_17_27] + movq xm15, [pb_27_17_17_27] cmp dword r8m, 0 jne .loop_x_hv_overlap ; horizontal overlap (without vertical overlap) - movd xm14, [pw_1024] + movq xm14, [pw_1024] .loop_x_h_overlap: mov r6d, seed or seed, 0xEFF4 @@ -1156,8 +1166,7 @@ pmaddubsw xm4, xm15, xm4 pmulhrsw xm4, xm14 packsswb xm4, xm4 - vpblendw xm4, xm3, 11111110b - vpblendd m3, m4, 00001111b + vpblendd m3, m3, m4, 00000001b pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 punpckhbw m3, m7 @@ -1329,7 +1338,7 @@ ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap - movd xm15, [pb_27_17_17_27] + movq xm15, [pb_27_17_17_27] .loop_x_hv_overlap: vpbroadcastw m8, [pb_27_17_17_27] @@ -1409,10 +1418,8 @@ pmulhrsw xm7, xm14 packsswb xm4, xm4 packsswb xm7, xm7 - vpblendw xm4, xm3, 11111110b - vpblendw xm7, xm6, 11111110b - vpblendd m3, m4, 00001111b - vpblendd m6, m7, 00001111b + vpblendd m3, m4, 00000001b + vpblendd m6, m7, 00000001b ; followed by v interpolation (top | cur -> cur) punpckhbw m7, m6, m3 punpcklbw m6, m3 @@ -1461,10 +1468,8 @@ RET %macro FGUV_FN 3 ; name, ss_hor, ss_ver -cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id - pcmpeqw m10, m10 - psrld m10, 24 mov r7d, [fg_dataq+FGData.scaling_shift] lea r8, [pb_mask] %define base r8-pb_mask @@ -1490,11 +1495,16 @@ %else vpbroadcastd m14, [pw_1024] %if %2 - vpbroadcastd m15, [pb_23_22] + vpbroadcastq m15, [pb_23_22] %else - vpbroadcastd xm15, [pb_27_17_17_27] + vpbroadcastq xm15, [pb_27_17_17_27] %endif %endif +%if %3 + vpbroadcastw m10, [pb_23_22] +%elif %2 + mova m10, [pb_8x_27_17_8x_17_27] +%endif mov overlapd, [fg_dataq+FGData.overlap_flag] movifnidn sbyd, sbym @@ -1593,16 +1603,13 @@ ; scaling[luma_src] pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 + vpgatherdd m8, [scalingq-3+m4], m3 + vpgatherdd m4, [scalingq-3+m5], m9 pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 + vpgatherdd m5, [scalingq-3+m6], m3 + vpgatherdd m6, [scalingq-3+m7], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 packusdw m8, m4 packusdw m5, m6 @@ -1743,16 +1750,13 @@ ; scaling[luma_src] pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 + vpgatherdd m8, [scalingq-3+m4], m3 + vpgatherdd m4, [scalingq-3+m5], m9 pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 + vpgatherdd m5, [scalingq-3+m6], m3 + vpgatherdd m6, [scalingq-3+m7], m9 + REPX 
{psrld x, 24}, m8, m4, m5, m6 packusdw m8, m4 packusdw m5, m6 @@ -1763,7 +1767,7 @@ ; grain = grain_lut[offy+y][offx+x] %if %2 %if %1 - vpbroadcastd m6, [pb_23_22] ; FIXME + vpbroadcastq m6, [pb_23_22] %endif movu xm3, [grain_lutq+offxyq+ 0] movd xm4, [grain_lutq+left_offxyq+ 0] @@ -1778,12 +1782,10 @@ pmulhrsw m4, m14 %endif packsswb m4, m4 - pcmpeqw m6, m6 ; FIXME - psrldq m6, 15 ; FIXME - vpblendvb m3, m3, m4, m6 + vpblendd m3, m3, m4, 00010001b %else %if %1 - vpbroadcastd xm6, [pb_27_17_17_27] + movq xm6, [pb_27_17_17_27] %endif movu m3, [grain_lutq+offxyq] movd xm4, [grain_lutq+left_offxyq] @@ -1796,9 +1798,7 @@ pmulhrsw xm4, xm14 %endif packsswb xm4, xm4 - pcmpeqw xm6, xm6 - psrldq xm6, 14 - vpblendvb m3, m3, m4, m6 + vpblendd m3, m3, m4, 00000001b %endif pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 @@ -1915,7 +1915,7 @@ mov hd, hm mov grain_lutq, grain_lutmp %if %2 == 0 - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27] %endif %%loop_y_v_overlap: ; src @@ -1966,16 +1966,13 @@ ; scaling[luma_src] pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 + vpgatherdd m8, [scalingq-3+m4], m3 + vpgatherdd m4, [scalingq-3+m5], m9 pcmpeqw m3, m3 pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 + vpgatherdd m5, [scalingq-3+m6], m3 + vpgatherdd m6, [scalingq-3+m7], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 packusdw m8, m4 packusdw m5, m6 @@ -1988,7 +1985,6 @@ ; grain = grain_lut[offy+y][offx+x] %if %3 == 0 %if %2 - mova m6, [pb_8x_27_17_8x_17_27] movu xm3, [grain_lutq+offxyq] movu xm4, [grain_lutq+top_offxyq] vinserti128 m3, [grain_lutq+offxyq+82], 1 @@ -1999,13 +1995,8 @@ %endif punpckhbw m9, m4, m3 punpcklbw m4, m3 -%if %2 - pmaddubsw m9, m6, m9 - pmaddubsw m4, m6, m4 -%else - pmaddubsw m9, m1, m9 - pmaddubsw m4, m1, m4 -%endif + pmaddubsw m9, m10, m9 + pmaddubsw m4, m10, m4 %if %1 pmulhrsw m9, [pw_1024] pmulhrsw m4, [pw_1024] @@ -2015,19 +2006,15 @@ %endif packsswb m3, m4, m9 %else -%if %1 - vpbroadcastd m6, [pb_23_22] -%endif movq xm3, [grain_lutq+offxyq] movq xm4, [grain_lutq+top_offxyq] vinserti128 m3, [grain_lutq+offxyq+8], 1 vinserti128 m4, [grain_lutq+top_offxyq+8], 1 punpcklbw m4, m3 + pmaddubsw m4, m10, m4 %if %1 - pmaddubsw m4, m6, m4 pmulhrsw m4, [pw_1024] %else - pmaddubsw m4, m15, m4 pmulhrsw m4, m14 %endif packsswb m4, m4 @@ -2084,7 +2071,7 @@ %endif add grain_lutq, 82<<%2 %if %2 == 0 - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16] btc hd, 16 jnc %%loop_y_v_overlap %endif @@ -2139,7 +2126,7 @@ mov hd, hm mov grain_lutq, grain_lutmp %if %2 == 0 - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27] %endif %%loop_y_hv_overlap: ; src @@ -2190,16 +2177,13 @@ ; scaling[src] pcmpeqw m9, m9 pcmpeqw m3, m3 - vpgatherdd m8, [scalingq+m4], m9 - vpgatherdd m4, [scalingq+m5], m3 + vpgatherdd m8, [scalingq-3+m4], m9 + vpgatherdd m4, [scalingq-3+m5], m3 pcmpeqw m9, m9 pcmpeqw m3, m3 - vpgatherdd m5, [scalingq+m6], m9 - vpgatherdd m6, [scalingq+m7], m3 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 + vpgatherdd m5, [scalingq-3+m6], m9 + vpgatherdd m6, [scalingq-3+m7], m3 + REPX {psrld x, 24}, m8, m4, m5, m6 packusdw m8, m4 packusdw m5, m6 @@ -2212,9 +2196,9 @@ ; grain = grain_lut[offy+y][offx+x] %if %1 %if %2 - vpbroadcastd m9, [pb_23_22] + vpbroadcastq m9, [pb_23_22] %else - vpbroadcastd xm9, [pb_27_17_17_27] + vpbroadcastq xm9, 
[pb_27_17_17_27] %endif %endif @@ -2252,7 +2236,7 @@ %else punpcklbw m7, m6 %endif - punpcklwd m4, m7 + punpcklqdq m4, m7 %if %1 pmaddubsw m4, m9, m4 pmulhrsw m4, [pw_1024] @@ -2261,18 +2245,17 @@ pmulhrsw m4, m14 %endif packsswb m4, m4 - pcmpeqw m9, m9 ; this is kind of ugly - psrldq m9, 15 - vpblendvb m3, m3, m4, m9 - psrldq m4, 1 + vpblendd m3, m4, 00010001b + psrldq m4, 4 %if %3 - shufpd m9, m9, m9, 1110b ; clear upper lane + vpblendd m6, m6, m4, 00000001b +%else + vpblendd m6, m6, m4, 00010001b %endif - vpblendvb m6, m6, m4, m9 %else punpcklbw xm4, xm3 punpcklbw xm7, xm6 - punpckldq xm4, xm7 + punpcklqdq xm4, xm7 %if %1 pmaddubsw xm4, xm9, xm4 pmulhrsw xm4, [pw_1024] @@ -2281,23 +2264,19 @@ pmulhrsw xm4, xm14 %endif packsswb xm4, xm4 - pcmpeqw xm9, xm9 ; this is kind of ugly - psrldq xm9, 14 - vpblendvb m3, m3, m4, m9 - psrldq xm4, 2 - vpblendvb m6, m6, m4, m9 + vpblendd m3, m3, m4, 00000001b + psrldq xm4, 4 + vpblendd m6, m6, m4, 00000001b %endif ; followed by v interpolation (top | cur -> cur) %if %3 vpermq m9, m3, q3120 punpcklbw m6, m9 + pmaddubsw m6, m10, m6 %if %1 - vpbroadcastd m9, [pb_23_22] - pmaddubsw m6, m9, m6 pmulhrsw m6, [pw_1024] %else - pmaddubsw m6, m15, m6 pmulhrsw m6, m14 %endif packsswb m6, m6 @@ -2306,14 +2285,8 @@ %else punpckhbw m9, m6, m3 punpcklbw m6, m3 -%if %2 - mova m3, [pb_8x_27_17_8x_17_27] - pmaddubsw m9, m3, m9 - pmaddubsw m6, m3, m6 -%else - pmaddubsw m9, m1, m9 - pmaddubsw m6, m1, m6 -%endif + pmaddubsw m9, m10, m9 + pmaddubsw m6, m10, m6 %if %1 pmulhrsw m9, [pw_1024] pmulhrsw m6, [pw_1024] @@ -2373,7 +2346,7 @@ jg %%loop_y_h_overlap %else je %%end_y_hv_overlap - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16] btc hd, 16 jnc %%loop_y_hv_overlap jmp %%loop_y_h_overlap diff -Nru dav1d-0.9.0/src/x86/film_grain_init_tmpl.c dav1d-0.9.1/src/x86/film_grain_init_tmpl.c --- dav1d-0.9.0/src/x86/film_grain_init_tmpl.c 2021-05-16 16:47:22.546950800 +0000 +++ dav1d-0.9.1/src/x86/film_grain_init_tmpl.c 2021-07-28 21:38:28.897852200 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* @@ -28,64 +28,48 @@ #include "src/cpu.h" #include "src/film_grain.h" -decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3); -decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3); -decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3); -decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3); -decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3); -decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3); -decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3); -decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3); - -decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2); -decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2); -decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2); -decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2); -decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2); -decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2); -decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2); -decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2); - -decl_generate_grain_y_fn(dav1d_generate_grain_y_16bpc_avx2); -decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_16bpc_avx2); -decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_16bpc_avx2); -decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_16bpc_avx2); +decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ssse3)); +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ssse3)); +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ssse3)); +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ssse3)); +decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ssse3)); +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ssse3)); +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ssse3)); +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ssse3)); + +decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, avx2)); +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, avx2)); +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, avx2)); +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, avx2)); +decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, avx2)); +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, avx2)); +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, avx2)); +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, avx2)); COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; -#if BITDEPTH == 8 - c->generate_grain_y = dav1d_generate_grain_y_ssse3; - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3; - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_ssse3; - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3; - c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3; -#endif + c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3); + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = 
BF(dav1d_fguv_32x32xn_i422, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3); #if ARCH_X86_64 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; -#if BITDEPTH == 8 - c->generate_grain_y = dav1d_generate_grain_y_avx2; - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2; - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_avx2; - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2; - c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2; -#else - c->generate_grain_y = dav1d_generate_grain_y_16bpc_avx2; - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = - dav1d_generate_grain_uv_420_16bpc_avx2; - c->fgy_32x32xn = dav1d_fgy_32x32xn_16bpc_avx2; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = - dav1d_fguv_32x32xn_i420_16bpc_avx2; -#endif + c->generate_grain_y = BF(dav1d_generate_grain_y, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2); + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2); #endif } diff -Nru dav1d-0.9.0/src/x86/film_grain_sse.asm dav1d-0.9.1/src/x86/film_grain_sse.asm --- dav1d-0.9.0/src/x86/film_grain_sse.asm 2021-05-16 16:47:22.546950800 +0000 +++ dav1d-0.9.1/src/x86/film_grain_sse.asm 2021-07-28 21:38:28.897852200 +0000 @@ -1,4 +1,4 @@ -; Copyright © 2019, VideoLAN and dav1d authors +; Copyright © 2019-2021, VideoLAN and dav1d authors ; Copyright © 2019, Two Orioles, LLC ; All rights reserved. 
; @@ -29,14 +29,18 @@ SECTION_RODATA pw_1024: times 8 dw 1024 +pb_27_17_17_27: db 27, 17, 17, 27 + times 6 db 0, 32 +pb_23_22_h: db 23, 22 + times 7 db 0, 32 pb_27_17: times 8 db 27, 17 pb_17_27: times 8 db 17, 27 +pb_23_22: times 8 db 23, 22 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 pw_seed_xor: times 2 dw 0xb524 times 2 dw 0x49d8 -pb_23_22: times 2 db 23, 22 pb_1: times 4 db 1 hmul_bits: dw 32768, 16384, 8192, 4096 round: dw 2048, 1024, 512 @@ -46,23 +50,21 @@ min: dw 0, 16 pw_1: dw 1 -%define pb_27_17_17_27 pb_17_27 - 2 - -%macro JMP_TABLE 1-* - %xdefine %1_table %%table - %xdefine %%base %1_table - %xdefine %%prefix mangle(private_prefix %+ _%1) +%macro JMP_TABLE 2-* + %xdefine %1_8bpc_%2_table %%table + %xdefine %%base %1_8bpc_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: - %rep %0 - 1 - dd %%prefix %+ .ar%2 - %%base + %rep %0 - 2 + dd %%prefix %+ .ar%3 - %%base %rotate 1 %endrep %endmacro -JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3 struc FGData .seed: resd 1 @@ -88,6 +90,20 @@ SECTION .text +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%if ARCH_X86_32 +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 @@ -98,7 +114,7 @@ %endmacro INIT_XMM ssse3 -cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data +cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data LEA r4, $$ %define base r4-$$ movq m1, [base+rnd_next_upperbit_mask] @@ -164,8 +180,8 @@ ; auto-regression code movsxd r2, [fg_dataq+FGData.ar_coeff_lag] - movsxd r2, [base+generate_grain_y_ssse3_table+r2*4] - lea r2, [r2+base+generate_grain_y_ssse3_table] + movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4] + lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table] jmp r2 .ar1: @@ -507,7 +523,7 @@ %macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM ssse3 -cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv +cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv movifnidn r2, r2mp movifnidn r3, r3mp LEA r4, $$ @@ -606,8 +622,8 @@ ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_%1_ssse3_table+r5*4] - lea r5, [r5+base+generate_grain_uv_%1_ssse3_table] + movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] jmp r5 .ar0: @@ -1284,7 +1300,7 @@ ; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize -cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \ +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \ dst, src, scaling, unused1, fg_data, picptr, unused2 ; copy stack arguments to new position post-alignment, so that we ; don't have to keep the old stack location in a separate register @@ -1295,43 +1311,41 @@ mov r4, r7m mov r5, r8m - mov [rsp+6*mmsize+ 3*gprsize], r0 - mov [rsp+6*mmsize+ 
5*gprsize], r1 - mov [rsp+6*mmsize+ 7*gprsize], r2 - mov [rsp+6*mmsize+ 9*gprsize], r3 - mov [rsp+6*mmsize+10*gprsize], r4 - mov [rsp+6*mmsize+11*gprsize], r5 + mov [rsp+5*mmsize+ 4*gprsize], r0 + mov [rsp+5*mmsize+ 6*gprsize], r1 + mov [rsp+5*mmsize+ 8*gprsize], r2 + mov [rsp+5*mmsize+10*gprsize], r3 + mov [rsp+5*mmsize+11*gprsize], r4 + mov [rsp+5*mmsize+12*gprsize], r5 %else -cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \ +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ dst, src, scaling, unused1, fg_data, picptr, unused2 %endif mov srcq, srcm mov fg_dataq, r3m mov scalingq, r5m %if STACK_ALIGNMENT < mmsize -%define r0m [rsp+6*mmsize+ 3*gprsize] -%define r1m [rsp+6*mmsize+ 4*gprsize] -%define r2m [rsp+6*mmsize+ 5*gprsize] -%define r3m [rsp+6*mmsize+ 6*gprsize] -%define r4m [rsp+6*mmsize+ 7*gprsize] -%define r5m [rsp+6*mmsize+ 8*gprsize] -%define r6m [rsp+6*mmsize+ 9*gprsize] -%define r7m [rsp+6*mmsize+10*gprsize] -%define r8m [rsp+6*mmsize+11*gprsize] +%define r0m [rsp+5*mmsize+ 4*gprsize] +%define r1m [rsp+5*mmsize+ 5*gprsize] +%define r2m [rsp+5*mmsize+ 6*gprsize] +%define r3m [rsp+5*mmsize+ 7*gprsize] +%define r4m [rsp+5*mmsize+ 8*gprsize] +%define r5m [rsp+5*mmsize+ 9*gprsize] +%define r6m [rsp+5*mmsize+10*gprsize] +%define r7m [rsp+5*mmsize+11*gprsize] +%define r8m [rsp+5*mmsize+12*gprsize] %endif LEA r5, pb_mask %define base r5-pb_mask mov r5m, picptrq %else -cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut +cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut lea r7, [pb_mask] %define base r7-pb_mask %endif mov r6d, [fg_dataq+FGData.scaling_shift] movd m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] - pcmpeqw m2, m2 - psrldq m2, 14 movd m4, [base+max+r6*4] movd m5, [base+min+r6*2] punpcklwd m3, m3 @@ -1340,10 +1354,9 @@ pshufd m3, m3, q0000 pshufd m4, m4, q0000 pshufd m5, m5, q0000 - SCRATCH 2, 10, 0 - SCRATCH 3, 11, 1 - SCRATCH 4, 12, 2 - SCRATCH 5, 13, 3 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap @@ -1356,9 +1369,9 @@ test overlapd, overlapd jz .no_vertical_overlap mova m6, [base+pw_1024] - movd m7, [base+pb_27_17_17_27] - SCRATCH 6, 14, 4 - SCRATCH 7, 15, 5 + mova m7, [base+pb_27_17_17_27] + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 test sbyd, sbyd jnz .vertical_overlap ; fall-through @@ -1445,16 +1458,13 @@ ; scaling[src] %if ARCH_X86_32 - vpgatherdw m4, m0, scalingq, r0, r5, m3 - vpgatherdw m5, m1, scalingq, r0, r5, m3 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 %else - vpgatherdw m4, m0, scalingq, r12, r13, m3 - vpgatherdw m5, m1, scalingq, r12, r13, m3 + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 %endif - pcmpeqw m3, m3 - psrlw m3, 8 - pand m4, m3 - pand m5, m3 + REPX {psrlw x, 8}, m4, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] @@ -1504,7 +1514,7 @@ jz .loop_x_odd %if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 + add dword [rsp+5*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxyd %endif @@ -1525,7 +1535,7 @@ DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 add offxyd, 16 ; left_offxyd - mov [rsp+6*mmsize+0*gprsize], offxyd + mov [rsp+5*mmsize+0*gprsize], offxyd DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 @@ -1578,21 +1588,18 @@ ; scaling[src] %if ARCH_X86_32 - vpgatherdw m4, m0, scalingq, r0, r5, m3 - 
vpgatherdw m5, m1, scalingq, r0, r5, m3 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 %else - vpgatherdw m4, m0, scalingq, r12, r13, m3 - vpgatherdw m5, m1, scalingq, r12, r13, m3 + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 %endif - pcmpeqw m3, m3 - psrlw m3, 8 - pand m4, m3 - pand m5, m3 + REPX {psrlw x, 8}, m4, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 - mov r5, [rsp+6*mmsize+0*gprsize] + mov r5, [rsp+5*mmsize+0*gprsize] movd m7, [grain_lutq+r5] %else movd m7, [grain_lutq+left_offxyq] @@ -1601,9 +1608,7 @@ pmaddubsw m6, m15, m7 pmulhrsw m6, m14 packsswb m6, m6 - pand m6, m10 - pandn m7, m10, m3 - por m6, m7 + shufps m6, m3, q3210 pcmpgtb m2, m6 punpcklbw m7, m6, m2 punpckhbw m6, m2 @@ -1649,7 +1654,7 @@ test dword r8m, 2 ; have_top_overlap jz .loop_x_odd %if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 + add dword [rsp+5*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxyd %endif @@ -1754,7 +1759,7 @@ movzx top_offxyd, offxyw %if ARCH_X86_32 - mov [rsp+6*mmsize+1*gprsize], top_offxyd + mov [rsp+5*mmsize+1*gprsize], top_offxyd DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif @@ -1764,7 +1769,7 @@ %if ARCH_X86_32 mov r5, r5m lea r5, [base+pb_27_17] - mov [rsp+5*mmsize+8], r5 + mov [rsp+5*mmsize+12], r5 %else mova m8, [pb_27_17] %endif @@ -1779,21 +1784,18 @@ ; scaling[src] %if ARCH_X86_32 - vpgatherdw m4, m0, scalingq, r0, r5, m3 - vpgatherdw m5, m1, scalingq, r0, r5, m3 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 %else - vpgatherdw m4, m0, scalingq, r12, r13, m3 - vpgatherdw m5, m1, scalingq, r12, r13, m3 + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 %endif - pcmpeqw m3, m3 - psrlw m3, 8 - pand m4, m3 - pand m5, m3 + REPX {psrlw x, 8}, m4, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 - mov r5, [rsp+6*mmsize+1*gprsize] + mov r5, [rsp+5*mmsize+1*gprsize] movu m7, [grain_lutq+r5] %else movu m7, [grain_lutq+top_offxyq] @@ -1801,7 +1803,7 @@ punpckhbw m6, m7, m3 punpcklbw m7, m3 %if ARCH_X86_32 - mov r5, [rsp+5*mmsize+8] + mov r5, [rsp+5*mmsize+12] pmaddubsw m3, [r5], m6 pmaddubsw m6, [r5], m7 %else @@ -1833,7 +1835,7 @@ mova [dstq+srcq], m0 %if ARCH_X86_32 - add dword [rsp+5*mmsize+8], mmsize + add dword [rsp+5*mmsize+12], mmsize %else mova m8, [pb_17_27] %endif @@ -1864,7 +1866,7 @@ jc .loop_x_hv_overlap add offxyd, 16 %if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 + add dword [rsp+5*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif @@ -1874,16 +1876,16 @@ %if ARCH_X86_32 mov r5, r5m lea r5, [base+pb_27_17] - mov [rsp+5*mmsize+8], r5 + mov [rsp+5*mmsize+12], r5 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak - mov r5, [rsp+6*mmsize+1*gprsize] + mov r5, [rsp+5*mmsize+1*gprsize] mov r4, offxyd add r5, 16 add r4, 16 - mov [rsp+6*mmsize+2*gprsize], r5 ; topleft_offxy - mov [rsp+6*mmsize+0*gprsize], r4 ; left_offxy + mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy + mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak @@ -1937,7 +1939,7 @@ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut movzx r5, offxyw ; top_offxy - mov [rsp+6*mmsize+1*gprsize], r5 + mov [rsp+5*mmsize+1*gprsize], r5 %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy @@ -1952,10 +1954,10 @@ ; 
grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 - mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy - mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy + mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy + mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy movu m6, [grain_lutq+r5] - mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy + mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy movd m4, [grain_lutq+r0] movd m7, [grain_lutq+r5] %else @@ -1972,17 +1974,13 @@ pmulhrsw m4, m14 packsswb m2, m2 packsswb m4, m4 - pand m2, m10 - pand m4, m10 - pandn m7, m10, m3 - pandn m3, m10, m6 - por m7, m2 - por m3, m4 + shufps m2, m3, q3210 + shufps m4, m6, q3210 ; followed by v interpolation (top | cur -> cur) - punpckhbw m4, m3, m7 - punpcklbw m3, m7 + punpcklbw m3, m4, m2 + punpckhbw m4, m2 %if ARCH_X86_32 - mov r5, [rsp+5*mmsize+8] + mov r5, [rsp+5*mmsize+12] pmaddubsw m7, [r5], m4 pmaddubsw m4, [r5], m3 %else @@ -2004,16 +2002,13 @@ ; scaling[src] %if ARCH_X86_32 - vpgatherdw m5, m0, scalingq, r0, r5, m7 - vpgatherdw m6, m1, scalingq, r0, r5, m7 + vpgatherdw m5, m0, scalingq-1, r0, r5, m7 + vpgatherdw m6, m1, scalingq-1, r0, r5, m7 %else - vpgatherdw m5, m0, scalingq, r13, r14, m7 - vpgatherdw m6, m1, scalingq, r13, r14, m7 + vpgatherdw m5, m0, scalingq-1, r13, r14, m7 + vpgatherdw m6, m1, scalingq-1, r13, r14, m7 %endif - pcmpeqw m7, m7 - psrlw m7, 8 - pand m5, m7 - pand m6, m7 + REPX {psrlw x, 8}, m5, m6 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m3, m5 @@ -2033,7 +2028,7 @@ mova [dstq+srcq], m0 %if ARCH_X86_32 - add dword [rsp+5*mmsize+8], mmsize + add dword [rsp+5*mmsize+12], mmsize %else mova m8, [pb_17_27] %endif @@ -2063,7 +2058,7 @@ xor dword r8m, 4 add offxyd, 16 %if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 + add dword [rsp+5*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif @@ -2079,61 +2074,60 @@ ; sby, luma, lstride, uv_pl, is_id) %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 -cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ tmp, src, scaling, h, fg_data, picptr, unused mov r0, r0m mov r1, r2m mov r2, r4m mov r3, r6m mov r4, r7m - mov [rsp+8*mmsize+3*gprsize], r0 - mov [rsp+8*mmsize+5*gprsize], r1 - mov [rsp+8*mmsize+7*gprsize], r2 - mov [rsp+8*mmsize+9*gprsize], r3 - mov [rsp+8*mmsize+10*gprsize], r4 + mov [rsp+7*mmsize+3*gprsize], r0 + mov [rsp+7*mmsize+5*gprsize], r1 + mov [rsp+7*mmsize+7*gprsize], r2 + mov [rsp+7*mmsize+9*gprsize], r3 + mov [rsp+7*mmsize+10*gprsize], r4 mov r0, r8m mov r1, r9m mov r2, r10m mov r4, r11m mov r3, r12m - mov [rsp+8*mmsize+11*gprsize], r0 - mov [rsp+8*mmsize+12*gprsize], r1 - mov [rsp+8*mmsize+13*gprsize], r2 - mov [rsp+8*mmsize+14*gprsize], r4 + mov [rsp+7*mmsize+11*gprsize], r0 + mov [rsp+7*mmsize+12*gprsize], r1 + mov [rsp+7*mmsize+13*gprsize], r2 + mov [rsp+7*mmsize+14*gprsize], r4 %else -cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ tmp, src, scaling, h, fg_data, picptr, unused %endif mov srcq, srcm mov fg_dataq, r3m mov scalingq, r5m %if STACK_ALIGNMENT < mmsize -%define r0m [rsp+8*mmsize+ 3*gprsize] -%define r1m [rsp+8*mmsize+ 4*gprsize] -%define r2m [rsp+8*mmsize+ 5*gprsize] -%define r3m [rsp+8*mmsize+ 6*gprsize] -%define r4m [rsp+8*mmsize+ 7*gprsize] -%define r5m [rsp+8*mmsize+ 8*gprsize] -%define r6m [rsp+8*mmsize+ 9*gprsize] -%define r7m [rsp+8*mmsize+10*gprsize] -%define r8m 
[rsp+8*mmsize+11*gprsize] -%define r9m [rsp+8*mmsize+12*gprsize] -%define r10m [rsp+8*mmsize+13*gprsize] -%define r11m [rsp+8*mmsize+14*gprsize] -%define r12m [rsp+8*mmsize+15*gprsize] +%define r0m [rsp+7*mmsize+ 3*gprsize] +%define r1m [rsp+7*mmsize+ 4*gprsize] +%define r2m [rsp+7*mmsize+ 5*gprsize] +%define r3m [rsp+7*mmsize+ 6*gprsize] +%define r4m [rsp+7*mmsize+ 7*gprsize] +%define r5m [rsp+7*mmsize+ 8*gprsize] +%define r6m [rsp+7*mmsize+ 9*gprsize] +%define r7m [rsp+7*mmsize+10*gprsize] +%define r8m [rsp+7*mmsize+11*gprsize] +%define r9m [rsp+7*mmsize+12*gprsize] +%define r10m [rsp+7*mmsize+13*gprsize] +%define r11m [rsp+7*mmsize+14*gprsize] +%define r12m [rsp+7*mmsize+15*gprsize] %endif LEA r5, pb_mask %define base r5-pb_mask mov r5m, r5 %else -cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, tmp, sby, luma, lstride, uv_pl, is_id lea r8, [pb_mask] %define base r8-pb_mask %endif mov r6d, [fg_dataq+FGData.scaling_shift] - pcmpeqw m2, m2 movd m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] lea tmpd, [r6d*2] @@ -2145,17 +2139,15 @@ movd m5, [base+min+r6*2] cmovne r6d, tmpd movd m4, [base+max+r6*2] - psrldq m2, 14+%2 punpcklwd m3, m3 punpcklwd m5, m5 punpcklwd m4, m4 pshufd m3, m3, q0000 pshufd m5, m5, q0000 pshufd m4, m4, q0000 - SCRATCH 2, 10, 0 - SCRATCH 3, 11, 1 - SCRATCH 4, 12, 2 - SCRATCH 5, 13, 3 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl @@ -2177,8 +2169,8 @@ punpcklwd m7, m7 pshufd m6, m6, q0000 pshufd m7, m7, q0000 - SCRATCH 6, 14, 4 - SCRATCH 7, 15, 5 + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 %endif mov sbyd, r8m @@ -2187,22 +2179,21 @@ jz %%no_vertical_overlap %if ARCH_X86_32 %if %2 - movd m1, [base+pb_23_22] + mova m1, [base+pb_23_22_h] %else - movd m1, [base+pb_27_17_17_27] + mova m1, [base+pb_27_17_17_27] %endif mova m0, [base+pw_1024] %else %if %2 - movd m1, [pb_23_22] + mova m1, [pb_23_22_h] %else - movd m1, [pb_27_17_17_27] + mova m1, [pb_27_17_17_27] %endif mova m0, [pw_1024] %endif - pshufd m1, m1, q0000 - SCRATCH 0, 8, 6 - SCRATCH 1, 9, 7 + SCRATCH 0, 8, 5 + SCRATCH 1, 9, 6 test sbyd, sbyd jnz %%vertical_overlap ; fall-through @@ -2347,16 +2338,13 @@ ; scaling[luma_src] %if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 %else - vpgatherdw m7, m4, scalingq, r12, r2 - vpgatherdw m5, m6, scalingq, r12, r2 + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 %endif - pcmpeqw m1, m1 - psrlw m1, 8 - pand m7, m1 - pand m5, m1 + REPX {psrlw x, 8}, m7, m5 ; unpack chroma_source punpckhbw m1, m0, m2 @@ -2426,7 +2414,7 @@ %if %2 == 0 ; adjust top_offxy %if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 + add dword [rsp+7*mmsize+1*gprsize], 16 %else add r11d, 16 %endif @@ -2450,9 +2438,9 @@ %if ARCH_X86_32 %if %2 lea r6, [offxyd+16] - mov [rsp+8*mmsize+0*gprsize], r6 + mov [rsp+7*mmsize+0*gprsize], r6 %else - mov [rsp+8*mmsize+0*gprsize], offxyd + mov [rsp+7*mmsize+0*gprsize], offxyd %endif DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut @@ -2558,36 +2546,31 @@ ; scaling[luma_src] %if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 %else - vpgatherdw m7, m4, scalingq, r12, 
r2 - vpgatherdw m5, m6, scalingq, r12, r2 + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 %endif - pcmpeqw m1, m1 - psrlw m1, 8 - pand m7, m1 - pand m5, m1 + REPX {psrlw x, 8}, m7, m5 ; unpack chroma_source punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq+ 0] + movu m4, [grain_lutq+offxyq+ 0] %if ARCH_X86_32 - mov r0, [rsp+8*mmsize+0*gprsize] - movd m4, [grain_lutq+r0+ 0] + mov r0, [rsp+7*mmsize+0*gprsize] + movd m2, [grain_lutq+r0+ 0] %else - movd m4, [grain_lutq+left_offxyq+ 0] + movd m2, [grain_lutq+left_offxyq+ 0] %endif - punpcklbw m2, m4, m3 - pmaddubsw m4, m9, m2 - pmulhrsw m4, m8 - packsswb m4, m4 - pand m4, m10 - pandn m2, m10, m3 - por m3, m4, m2 + punpcklbw m2, m4 + pmaddubsw m3, m9, m2 + pmulhrsw m3, m8 + packsswb m3, m3 + shufps m3, m4, q3210 pxor m4, m4 pcmpgtb m4, m3 punpcklbw m2, m3, m4 @@ -2652,7 +2635,7 @@ xor dword r8m, 4 ; adjust top_offxyd %if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 + add dword [rsp+7*mmsize+1*gprsize], 16 %else add r11d, 16 %endif @@ -2780,7 +2763,7 @@ movzx top_offxyd, offxyw shr offxyd, 16 %if ARCH_X86_32 - mov [rsp+8*mmsize+1*gprsize], top_offxyd + mov [rsp+7*mmsize+1*gprsize], top_offxyd DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut %endif @@ -2790,9 +2773,11 @@ mov grain_lutq, grain_lutmp %if ARCH_X86_32 mov r5, r5m - mova m1, [base+pb_27_17] +%endif +%if %3 + mova m1, [PIC_ptr(pb_23_22)] %else - mova m1, [pb_27_17] + mova m1, [PIC_ptr(pb_27_17)] %endif %%loop_y_v_overlap: %if ARCH_X86_32 @@ -2848,34 +2833,26 @@ ; scaling[luma_src] %if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 %else - vpgatherdw m7, m4, scalingq, r12, r2 - vpgatherdw m5, m6, scalingq, r12, r2 + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 %endif - pcmpeqw m4, m4 - psrlw m4, 8 - pand m7, m4 - pand m5, m4 + REPX {psrlw x, 8}, m7, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 - mov r0, [rsp+8*mmsize+1*gprsize] + mov r0, [rsp+7*mmsize+1*gprsize] movu m4, [grain_lutq+r0] %else movu m4, [grain_lutq+top_offxyq] %endif punpckhbw m6, m4, m3 punpcklbw m4, m3 -%if %3 - pmaddubsw m2, m9, m6 - pmaddubsw m3, m9, m4 -%else pmaddubsw m2, m1, m6 pmaddubsw m3, m1, m4 -%endif pmulhrsw m2, m8 pmulhrsw m3, m8 packsswb m3, m2 @@ -2928,10 +2905,8 @@ btc hd, 16 %if ARCH_X86_32 mov r5, r5m - mova m1, [base+pb_17_27] -%else - mova m1, [pb_17_27] %endif + mova m1, [PIC_ptr(pb_17_27)] jnc %%loop_y_v_overlap %endif jmp %%loop_y @@ -2963,7 +2938,7 @@ ; h+v overlap %else %if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 + add dword [rsp+7*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif @@ -2976,15 +2951,15 @@ %if ARCH_X86_32 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused - mov r6, [rsp+8*mmsize+1*gprsize] + mov r6, [rsp+7*mmsize+1*gprsize] %if %2 lea r0, [r3d+16] add r6, 16 - mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy + mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy %else - mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy + mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy %endif - mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy + mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused @@ -3048,18 +3023,55 @@ movzx top_offxyd, offxyw shr offxyd, 16 %if ARCH_X86_32 - mov [rsp+8*mmsize+1*gprsize], top_offxyd + mov 
[rsp+7*mmsize+1*gprsize], top_offxyd %endif mov hd, r7m mov grain_lutq, grain_lutmp %if ARCH_X86_32 mov r5, r5m - mova m3, [base+pb_27_17] +%endif +%if %3 + mova m3, [PIC_ptr(pb_23_22)] %else - mova m3, [pb_27_17] + mova m3, [PIC_ptr(pb_27_17)] %endif %%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy + mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy + movd m1, [grain_lutq+r0] + mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy +%else + movd m1, [grain_lutq+topleft_offxyq] +%endif + movu m2, [grain_lutq+offxyq] +%if ARCH_X86_32 + movu m6, [grain_lutq+r5] + movd m4, [grain_lutq+r0] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m1, m6 + punpcklbw m4, m2 + pmaddubsw m0, m9, m1 + pmaddubsw m1, m9, m4 + REPX {pmulhrsw x, m8}, m0, m1 + packsswb m0, m1 + shufps m4, m0, m2, q3232 + shufps m0, m6, q3210 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m2, m0, m4 + punpckhbw m0, m4 + pmaddubsw m4, m3, m0 + pmaddubsw m1, m3, m2 + pmulhrsw m4, m8 + pmulhrsw m1, m8 + packsswb m1, m4 + ; src %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut @@ -3116,69 +3128,20 @@ ; scaling[src] %if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 %else - movd m1, [grain_lutq+topleft_offxyq] %if %3 - vpgatherdw m7, m4, scalingq, r2, r12 - vpgatherdw m5, m6, scalingq, r2, r12 + vpgatherdw m7, m4, scalingq-1, r2, r12 + vpgatherdw m5, m6, scalingq-1, r2, r12 %else - vpgatherdw m7, m4, scalingq, r2, r13 - vpgatherdw m5, m6, scalingq, r2, r13 + vpgatherdw m7, m4, scalingq-1, r2, r13 + vpgatherdw m5, m6, scalingq-1, r2, r13 %endif %endif - pcmpeqw m2, m2 - psrlw m2, 8 - pand m7, m2 - pand m5, m2 + REPX {psrlw x, 8}, m7, m5 - ; grain = grain_lut[offy+y][offx+x] -%if ARCH_X86_32 - mov r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy - mov r5, [rsp+8*mmsize+1*gprsize] ; top_offxy - movd m1, [grain_lutq+r0] - mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy -%endif - movu m2, [grain_lutq+offxyq] -%if ARCH_X86_32 - movu m6, [grain_lutq+r5] - movd m4, [grain_lutq+r0] -%else - movu m6, [grain_lutq+top_offxyq] - movd m4, [grain_lutq+left_offxyq] -%endif - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklbw m1, m6 - punpcklbw m4, m2 -%if %2 - punpcklwd m4, m1 -%else - punpckldq m4, m1 -%endif - pmaddubsw m1, m9, m4 - pmulhrsw m1, m8 - packsswb m1, m1 - pandn m4, m10, m2 - pandn m2, m10, m6 - psrldq m6, m1, 2-%2 - pand m1, m10 - pand m6, m10 - por m4, m1 - por m2, m6 - ; followed by v interpolation (top | cur -> cur) - punpckhbw m1, m2, m4 - punpcklbw m2, m4 -%if %3 - pmaddubsw m4, m9, m1 - pmaddubsw m1, m9, m2 -%else - pmaddubsw m4, m3, m1 - pmaddubsw m1, m3, m2 -%endif - pmulhrsw m4, m8 - pmulhrsw m1, m8 - packsswb m1, m4 + ; unpack grain pxor m4, m4 pcmpgtb m4, m1 punpcklbw m2, m1, m4 @@ -3229,10 +3192,8 @@ jle %%end_y_hv_overlap %if ARCH_X86_32 mov r5, r5m - mova m3, [base+pb_17_27] -%else - mova m3, [pb_17_27] %endif + mova m3, [PIC_ptr(pb_17_27)] btc hd, 16 jnc %%loop_y_hv_overlap %if ARCH_X86_64 @@ -3268,7 +3229,7 @@ jmp %%loop_x_hv_overlap %else %if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 + add dword [rsp+7*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif diff -Nru dav1d-0.9.0/src/x86/ipred16_avx2.asm 
dav1d-0.9.1/src/x86/ipred16_avx2.asm --- dav1d-0.9.0/src/x86/ipred16_avx2.asm 2021-05-16 16:47:22.546950800 +0000 +++ dav1d-0.9.1/src/x86/ipred16_avx2.asm 2021-07-28 21:38:28.897852200 +0000 @@ -26,35 +26,39 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -%if ARCH_X86_64 - SECTION_RODATA 32 -%macro SMOOTH_WEIGHT_TABLE 1-* +%macro SMOOTH_WEIGHTS 1-* +const smooth_weights_1d_16bpc ; sm_weights[] << 7 + %rep %0 + dw %1*128 + %rotate 1 + %endrep +const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[] %rep %0 dw %1, 256-%1 %rotate 1 %endrep %endmacro -; sm_weights[], but modified to precalculate x and 256-x -smooth_weights: SMOOTH_WEIGHT_TABLE \ - 0, 0, 255, 128, 255, 149, 85, 64, \ - 255, 197, 146, 105, 73, 50, 37, 32, \ - 255, 225, 196, 170, 145, 123, 102, 84, \ - 68, 54, 43, 33, 26, 20, 17, 16, \ - 255, 240, 225, 210, 196, 182, 169, 157, \ - 145, 133, 122, 111, 101, 92, 83, 74, \ - 66, 59, 52, 45, 39, 34, 29, 25, \ - 21, 17, 14, 12, 10, 9, 8, 8, \ - 255, 248, 240, 233, 225, 218, 210, 203, \ - 196, 189, 182, 176, 169, 163, 156, 150, \ - 144, 138, 133, 127, 121, 116, 111, 106, \ - 101, 96, 91, 86, 82, 77, 73, 69, \ - 65, 61, 57, 54, 50, 47, 44, 41, \ - 38, 35, 32, 29, 27, 25, 22, 20, \ - 18, 16, 15, 13, 12, 10, 9, 8, \ - 7, 6, 6, 5, 5, 4, 4, 4 +SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +%if ARCH_X86_64 ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 @@ -92,8 +96,6 @@ pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 pd_8: dd 8 -pd_128: dd 128 -pd_256: dd 256 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) @@ -132,8 +134,15 @@ SECTION .text -INIT_YMM avx2 +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro +INIT_YMM avx2 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h movifnidn hd, hm add tlq, 2 @@ -561,7 +570,6 @@ .w8: vbroadcasti128 m2, [tlq+2] movsldup m6, [base+ipred_hv_shuf] - lea r3, [strideq*3] psubw m4, m2, m3 pabsw m5, m4 .w8_loop: @@ -647,268 +655,244 @@ jg .w64_loop RET -%macro SMOOTH 4 ; src[1-2], mul[1-2] - pmaddwd m0, m%3, m%1 - pmaddwd m1, m%4, m%2 - paddd m0, m2 - paddd m1, m2 - psrld m0, 8 - psrld m1, 8 - packssdw m0, m1 -%endmacro - cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_16bpc_avx2_table - lea r6, [ipred_smooth_v_16bpc_avx2_table] - tzcnt wd, wm - mov hd, hm - movsxd wq, [r6+wq*4] - vpbroadcastd m2, [base+pd_128] - lea weightsq, [base+smooth_weights+hq*8] - neg hq - vpbroadcastw m5, [tlq+hq*2] ; bottom - add wq, r6 - jmp wq + lea r6, [ipred_smooth_v_16bpc_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] + neg hq + vpbroadcastw m5, [tlq+hq*2] ; bottom + add wq, r6 + jmp wq .w4: - vpbroadcastq m3, [tlq+2] - punpcklwd m3, m5 ; top, bottom - movshdup m5, [base+ipred_hv_shuf] - lea r3, [strideq*3] - punpcklqdq m4, m5, m5 - punpckhqdq 
m5, m5 + vpbroadcastq m4, [tlq+2] ; top + movsldup m3, [base+ipred_hv_shuf] + lea r6, [strideq*3] + psubw m4, m5 ; top - bottom .w4_loop: - vbroadcasti128 m1, [weightsq+hq*4] - pshufb m0, m1, m4 - pshufb m1, m5 - SMOOTH 3, 3, 0, 1 - vextracti128 xm1, m0, 1 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+r3 ], xm1 - lea dstq, [dstq+strideq*4] - add hq, 4 + vpbroadcastq m0, [weightsq+hq*2] + pshufb m0, m3 + pmulhrsw m0, m4 + paddw m0, m5 + vextracti128 xm1, m0, 1 + movhps [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movq [dstq+r6 ], xm0 + lea dstq, [dstq+strideq*4] + add hq, 4 jl .w4_loop .ret: RET -ALIGN function_align .w8: - vbroadcasti128 m4, [tlq+2] - punpcklwd m3, m4, m5 - punpckhwd m4, m5 - movshdup m5, [base+ipred_hv_shuf] + vbroadcasti128 m4, [tlq+2] + movsldup m3, [base+ipred_hv_shuf] + lea r6, [strideq*3] + psubw m4, m5 .w8_loop: - vpbroadcastq m1, [weightsq+hq*4] - pshufb m1, m5 - SMOOTH 3, 4, 1, 1 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - add hq, 2 + vpbroadcastd m0, [weightsq+hq*2+0] + vpbroadcastd m1, [weightsq+hq*2+4] + pshufb m0, m3 + pshufb m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + vextracti128 [dstq+strideq*0], m0, 1 + mova [dstq+strideq*1], xm0 + vextracti128 [dstq+strideq*2], m1, 1 + mova [dstq+r6 ], xm1 + lea dstq, [dstq+strideq*4] + add hq, 4 jl .w8_loop RET -ALIGN function_align .w16: - movu m4, [tlq+2] - punpcklwd m3, m4, m5 - punpckhwd m4, m5 + movu m4, [tlq+2] + lea r6, [strideq*3] + psubw m4, m5 .w16_loop: - vpbroadcastd m1, [weightsq+hq*4] - vpbroadcastd m5, [weightsq+hq*4+4] - SMOOTH 3, 4, 1, 1 - mova [dstq+strideq*0], m0 - SMOOTH 3, 4, 5, 5 - mova [dstq+strideq*1], m0 - lea dstq, [dstq+strideq*2] - add hq, 2 + vpbroadcastw m0, [weightsq+hq*2+0] + vpbroadcastw m1, [weightsq+hq*2+2] + vpbroadcastw m2, [weightsq+hq*2+4] + vpbroadcastw m3, [weightsq+hq*2+6] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r6 ], m3 + lea dstq, [dstq+strideq*4] + add hq, 4 jl .w16_loop RET -ALIGN function_align .w32: - WIN64_SPILL_XMM 8 - movu m4, [tlq+2] - movu m7, [tlq+34] - punpcklwd m3, m4, m5 - punpckhwd m4, m5 - punpcklwd m6, m7, m5 - punpckhwd m7, m5 + WIN64_SPILL_XMM 7 + movu m4, [tlq+ 2] + movu m6, [tlq+34] + psubw m4, m5 + psubw m6, m5 .w32_loop: - vpbroadcastd m5, [weightsq+hq*4] - SMOOTH 3, 4, 5, 5 - mova [dstq+32*0], m0 - SMOOTH 6, 7, 5, 5 - mova [dstq+32*1], m0 - add dstq, strideq - inc hq + vpbroadcastw m1, [weightsq+hq*2+0] + vpbroadcastw m3, [weightsq+hq*2+2] + pmulhrsw m0, m4, m1 + pmulhrsw m1, m6 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m6 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, [dstq+strideq*2] + add hq, 2 jl .w32_loop RET -ALIGN function_align .w64: - WIN64_SPILL_XMM 12 - movu m4, [tlq+ 2] - movu m7, [tlq+34] - movu m9, [tlq+66] - movu m11, [tlq+98] - punpcklwd m3, m4, m5 - punpckhwd m4, m5 - punpcklwd m6, m7, m5 - punpckhwd m7, m5 - punpcklwd m8, m9, m5 - punpckhwd m9, m5 - punpcklwd m10, m11, m5 - punpckhwd m11, m5 + WIN64_SPILL_XMM 8 + movu m3, [tlq+ 2] + movu m4, [tlq+34] + movu m6, [tlq+66] + movu m7, [tlq+98] + REPX {psubw x, m5}, m3, m4, m6, m7 .w64_loop: - vpbroadcastd m5, [weightsq+hq*4] - SMOOTH 3, 4, 5, 5 - mova 
[dstq+32*0], m0 - SMOOTH 6, 7, 5, 5 - mova [dstq+32*1], m0 - SMOOTH 8, 9, 5, 5 - mova [dstq+32*2], m0 - SMOOTH 10, 11, 5, 5 - mova [dstq+32*3], m0 - add dstq, strideq - inc hq + vpbroadcastw m2, [weightsq+hq*2] + pmulhrsw m0, m3, m2 + pmulhrsw m1, m4, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*0], m0 + pmulhrsw m0, m6, m2 + mova [dstq+32*1], m1 + pmulhrsw m1, m7, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + add dstq, strideq + inc hq jl .w64_loop RET -cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h +cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 %define base r6-ipred_smooth_h_16bpc_avx2_table - lea r6, [ipred_smooth_h_16bpc_avx2_table] - mov wd, wm - mov hd, hm - vpbroadcastw m3, [tlq+wq*2] ; right - tzcnt wd, wd - movsxd wq, [r6+wq*4] - vpbroadcastd m2, [base+pd_128] - add wq, r6 - jmp wq + lea r6, [ipred_smooth_h_16bpc_avx2_table] + mov wd, wm + movifnidn hd, hm + vpbroadcastw m5, [tlq+wq*2] ; right + tzcnt wd, wd + add hd, hd + movsxd wq, [r6+wq*4] + sub tlq, hq + lea stride3q, [strideq*3] + add wq, r6 + jmp wq .w4: - vbroadcasti128 m4, [base+smooth_weights+4*4] - movsldup m5, [base+ipred_hv_shuf] - sub tlq, 8 - sub tlq, hq - sub tlq, hq - lea r3, [strideq*3] + vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2] + movsldup m3, [base+ipred_hv_shuf] .w4_loop: - vpbroadcastq m1, [tlq+hq*2] - pshufb m1, m5 - punpcklwd m0, m1, m3 ; left, right - punpckhwd m1, m3 - SMOOTH 0, 1, 4, 4 - vextracti128 xm1, m0, 1 + vpbroadcastq m0, [tlq+hq-8] ; left + pshufb m0, m3 + psubw m0, m5 ; left - right + pmulhrsw m0, m4 + paddw m0, m5 + vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+r3 ], xm1 - lea dstq, [dstq+strideq*4] - sub hd, 4 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 jg .w4_loop RET -ALIGN function_align .w8: - WIN64_SPILL_XMM 7 - vbroadcasti128 m4, [base+smooth_weights+8*4+16*0] - vbroadcasti128 m5, [base+smooth_weights+8*4+16*1] - movsldup m6, [base+ipred_hv_shuf] - sub tlq, 4 - sub tlq, hq - sub tlq, hq + vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2] + movsldup m3, [base+ipred_hv_shuf] .w8_loop: - vpbroadcastd m1, [tlq+hq*2] - pshufb m1, m6 - punpcklwd m0, m1, m3 - punpckhwd m1, m3 - SMOOTH 0, 1, 4, 5 + vpbroadcastd m0, [tlq+hq-4] + vpbroadcastd m1, [tlq+hq-8] + pshufb m0, m3 + pshufb m1, m3 + psubw m0, m5 + psubw m1, m5 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - sub hq, 2 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 jg .w8_loop RET -ALIGN function_align .w16: - WIN64_SPILL_XMM 6 - mova xm4, [base+smooth_weights+16*4+16*0] - mova xm5, [base+smooth_weights+16*4+16*1] - vinserti128 m4, [base+smooth_weights+16*4+16*2], 1 - vinserti128 m5, [base+smooth_weights+16*4+16*3], 1 - sub tlq, 2 - sub tlq, hq - sub tlq, hq + movu m4, [base+smooth_weights_1d_16bpc+16*2] .w16_loop: - vpbroadcastw m1, [tlq+hq*2] - punpcklwd m0, m1, m3 - punpckhwd m1, m3 - SMOOTH 0, 1, 4, 5 - mova [dstq], m0 - add dstq, strideq - dec hq + vpbroadcastq m3, [tlq+hq-8] + punpcklwd m3, m3 + psubw m3, m5 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova 
[dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 jg .w16_loop RET -ALIGN function_align .w32: - WIN64_SPILL_XMM 10 - mova xm6, [base+smooth_weights+32*4+16*0] - mova xm7, [base+smooth_weights+32*4+16*1] - vinserti128 m6, [base+smooth_weights+32*4+16*2], 1 - vinserti128 m7, [base+smooth_weights+32*4+16*3], 1 - mova xm8, [base+smooth_weights+32*4+16*4] - mova xm9, [base+smooth_weights+32*4+16*5] - vinserti128 m8, [base+smooth_weights+32*4+16*6], 1 - vinserti128 m9, [base+smooth_weights+32*4+16*7], 1 - sub tlq, 2 - sub tlq, hq - sub tlq, hq + WIN64_SPILL_XMM 7 + movu m4, [base+smooth_weights_1d_16bpc+32*2] + movu m6, [base+smooth_weights_1d_16bpc+32*3] .w32_loop: - vpbroadcastw m5, [tlq+hq*2] - punpcklwd m4, m5, m3 - punpckhwd m5, m3 - SMOOTH 4, 5, 6, 7 - mova [dstq+32*0], m0 - SMOOTH 4, 5, 8, 9 - mova [dstq+32*1], m0 - add dstq, strideq - dec hq + vpbroadcastw m1, [tlq+hq-2] + vpbroadcastw m3, [tlq+hq-4] + psubw m1, m5 + psubw m3, m5 + pmulhrsw m0, m4, m1 + pmulhrsw m1, m6 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m6 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 jg .w32_loop RET -ALIGN function_align .w64: -%assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 14 - mova xm6, [base+smooth_weights+64*4+16* 0] - mova xm7, [base+smooth_weights+64*4+16* 1] - vinserti128 m6, [base+smooth_weights+64*4+16* 2], 1 - vinserti128 m7, [base+smooth_weights+64*4+16* 3], 1 - mova xm8, [base+smooth_weights+64*4+16* 4] - mova xm9, [base+smooth_weights+64*4+16* 5] - vinserti128 m8, [base+smooth_weights+64*4+16* 6], 1 - vinserti128 m9, [base+smooth_weights+64*4+16* 7], 1 - mova xm10, [base+smooth_weights+64*4+16* 8] - mova xm11, [base+smooth_weights+64*4+16* 9] - vinserti128 m10, [base+smooth_weights+64*4+16*10], 1 - vinserti128 m11, [base+smooth_weights+64*4+16*11], 1 - mova xm12, [base+smooth_weights+64*4+16*12] - mova xm13, [base+smooth_weights+64*4+16*13] - vinserti128 m12, [base+smooth_weights+64*4+16*14], 1 - vinserti128 m13, [base+smooth_weights+64*4+16*15], 1 - sub tlq, 2 - sub tlq, hq - sub tlq, hq + WIN64_SPILL_XMM 8 + movu m3, [base+smooth_weights_1d_16bpc+32*4] + movu m4, [base+smooth_weights_1d_16bpc+32*5] + movu m6, [base+smooth_weights_1d_16bpc+32*6] + movu m7, [base+smooth_weights_1d_16bpc+32*7] .w64_loop: - vpbroadcastw m5, [tlq+hq*2] - punpcklwd m4, m5, m3 - punpckhwd m5, m3 - SMOOTH 4, 5, 6, 7 - mova [dstq+32*0], m0 - SMOOTH 4, 5, 8, 9 - mova [dstq+32*1], m0 - SMOOTH 4, 5, 10, 11 - mova [dstq+32*2], m0 - SMOOTH 4, 5, 12, 13 - mova [dstq+32*3], m0 - add dstq, strideq - dec hq + vpbroadcastw m2, [tlq+hq-2] + psubw m2, m5 + pmulhrsw m0, m3, m2 + pmulhrsw m1, m4, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*0], m0 + pmulhrsw m0, m6, m2 + mova [dstq+32*1], m1 + pmulhrsw m1, m7, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + add dstq, strideq + sub hq, 1*2 jg .w64_loop RET @@ -917,11 +901,10 @@ pmaddwd m1, m%2, m%4 paddd m0, m%5 paddd m1, m%6 - paddd m0, m5 - paddd m1, m5 - psrld m0, 9 - psrld m1, 9 + psrld m0, 8 + psrld m1, 8 packssdw m0, m1 + pavgw m0, m5 %endmacro cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights @@ -934,9 +917,9 @@ sub tlq, hq sub tlq, hq movsxd wq, [r6+wq*4] - vpbroadcastd m5, [base+pd_256] + pxor m5, m5 add wq, r6 - lea v_weightsq, [base+smooth_weights+hq*4] + lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] jmp wq .w4: 
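; note on the smooth changes above: the 1d smooth_v/smooth_h paths now keep the
; weights pre-scaled by 128 (smooth_weights_1d_16bpc), so a single pmulhrsw of
; (top - bottom) or (left - right) plus a paddw replaces the old
; pmaddwd/paddd/psrld sequence and the pd_128 constant. Likewise the reworked
; SMOOTH_2D_END swaps the "+256, >> 9" rounding for ">> 8, pavgw with zero",
; which computes the identical (x + 256) >> 9 for these non-negative sums and
; lets m5 hold zero instead of the removed pd_256 constant.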
WIN64_SPILL_XMM 11 @@ -944,15 +927,14 @@ vpbroadcastq m6, [tlq+hq*2+2] movsldup m7, [base+ipred_hv_shuf] movshdup m9, [base+ipred_hv_shuf] - vbroadcasti128 m10, [base+smooth_weights+4*4] + vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4] punpcklwd m6, m0 ; top, bottom punpcklqdq m8, m9, m9 punpckhqdq m9, m9 lea r3, [strideq*3] - sub tlq, 8 .w4_loop: + vpbroadcastq m3, [tlq+hq*2-8] vbroadcasti128 m1, [v_weightsq] - vpbroadcastq m3, [tlq+hq*2] pshufb m3, m7 punpcklwd m2, m3, m4 ; left, right punpckhwd m3, m4 @@ -960,7 +942,7 @@ pmaddwd m3, m10 pshufb m0, m1, m8 pshufb m1, m9 - SMOOTH_2D_END 6, 6, 0, 1, 2, 3 + SMOOTH_2D_END 0, 1, 6, 6, 2, 3 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 @@ -971,7 +953,6 @@ sub hd, 4 jg .w4_loop RET -ALIGN function_align .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 @@ -979,21 +960,20 @@ vbroadcasti128 m7, [tlq+hq*2+2] movsldup m8, [base+ipred_hv_shuf] movshdup m9, [base+ipred_hv_shuf] - vbroadcasti128 m10, [base+smooth_weights+8*4+16*0] - vbroadcasti128 m11, [base+smooth_weights+8*4+16*1] + vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0] + vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1] punpcklwd m6, m7, m0 ; top, bottom punpckhwd m7, m0 - sub tlq, 4 .w8_loop: + vpbroadcastd m3, [tlq+hq*2-4] vpbroadcastq m1, [v_weightsq] - vpbroadcastd m3, [tlq+hq*2] pshufb m3, m8 punpcklwd m2, m3, m4 ; left, right punpckhwd m3, m4 pmaddwd m2, m10 pmaddwd m3, m11 pshufb m1, m9 - SMOOTH_2D_END 6, 7, 1, 1, 2, 3 + SMOOTH_2D_END 1, 1, 6, 7, 2, 3 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] @@ -1001,88 +981,79 @@ sub hd, 2 jg .w8_loop RET -ALIGN function_align .w16: %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 14 + WIN64_SPILL_XMM 11 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+2] - mova xm8, [base+smooth_weights+16*4+16*0] - mova xm9, [base+smooth_weights+16*4+16*1] - vinserti128 m8, [base+smooth_weights+16*4+16*2], 1 - vinserti128 m9, [base+smooth_weights+16*4+16*3], 1 + mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0] + mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1] + vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1 + vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1 punpcklwd m6, m7, m0 ; top, bottom punpckhwd m7, m0 - sub tlq, 2 .w16_loop: - vpbroadcastd m10, [v_weightsq+0] - vpbroadcastd m11, [v_weightsq+4] - vpbroadcastw m3, [tlq+hq*2-0] - vpbroadcastw m13, [tlq+hq*2-2] - punpcklwd m2, m3, m4 ; left, right - punpckhwd m3, m4 - punpcklwd m12, m13, m4 - punpckhwd m13, m4 - pmaddwd m2, m8 + vpbroadcastd m3, [tlq+hq*2-4] + vpbroadcastd m1, [v_weightsq+0] + punpcklwd m3, m4 ; left, right + pshufd m2, m3, q1111 + pmaddwd m10, m8, m2 + pmaddwd m2, m9 + pshufd m3, m3, q0000 + SMOOTH_2D_END 1, 1, 6, 7, 10, 2 + vpbroadcastd m1, [v_weightsq+4] + pmaddwd m2, m8, m3 pmaddwd m3, m9 - pmaddwd m12, m8 - pmaddwd m13, m9 - SMOOTH_2D_END 6, 7, 10, 10, 2, 3 mova [dstq+strideq*0], m0 - SMOOTH_2D_END 6, 7, 11, 11, 12, 13 + SMOOTH_2D_END 1, 1, 6, 7, 2, 3 mova [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] add v_weightsq, 8 sub hq, 2 jg .w16_loop RET -ALIGN function_align .w32: %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 16 + WIN64_SPILL_XMM 15 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+ 2] movu m9, [tlq+hq*2+34] - mova xm10, [base+smooth_weights+32*4+16*0] - mova xm11, [base+smooth_weights+32*4+16*1] - vinserti128 m10, [base+smooth_weights+32*4+16*2], 1 - 
vinserti128 m11, [base+smooth_weights+32*4+16*3], 1 - mova xm12, [base+smooth_weights+32*4+16*4] - mova xm13, [base+smooth_weights+32*4+16*5] - vinserti128 m12, [base+smooth_weights+32*4+16*6], 1 - vinserti128 m13, [base+smooth_weights+32*4+16*7], 1 + mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0] + mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1] + vinserti128 m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1 + vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1 + mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4] + mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5] + vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1 + vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1 punpcklwd m6, m7, m0 punpckhwd m7, m0 punpcklwd m8, m9, m0 punpckhwd m9, m0 - sub tlq, 2 .w32_loop: - vpbroadcastw m3, [tlq+hq*2] - punpcklwd m2, m3, m4 - punpckhwd m3, m4 - pmaddwd m14, m2, m10 - pmaddwd m15, m3, m11 - pmaddwd m2, m12 + vpbroadcastw m3, [tlq+hq*2-2] + vpbroadcastd m14, [v_weightsq] + punpcklwd m3, m4 + pmaddwd m1, m10, m3 + pmaddwd m2, m11, m3 + pmaddwd m0, m6, m14 + paddd m0, m1 + pmaddwd m1, m7, m14 + paddd m1, m2 + pmaddwd m2, m12, m3 pmaddwd m3, m13 - vpbroadcastd m1, [v_weightsq] - pmaddwd m0, m6, m1 - paddd m0, m14 - paddd m0, m5 - psrld m0, 9 - pmaddwd m14, m7, m1 - paddd m14, m15 - paddd m14, m5 - psrld m14, 9 - packssdw m0, m14 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pavgw m0, m5 mova [dstq+32*0], m0 - SMOOTH_2D_END 8, 9, 1, 1, 2, 3 + SMOOTH_2D_END 14, 14, 8, 9, 2, 3 mova [dstq+32*1], m0 add dstq, strideq add v_weightsq, 4 dec hd jg .w32_loop RET -ALIGN function_align .w64: %assign stack_offset stack_offset - stack_size_padded PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base @@ -1096,37 +1067,35 @@ vpbroadcastw m0, [tl_baseq] ; bottom movu m7, [tlq+xq*2+ 2] movu m9, [tlq+xq*2+34] - mova xm10, [base+smooth_weights+64*4+16*0] - mova xm11, [base+smooth_weights+64*4+16*1] - vinserti128 m10, [base+smooth_weights+64*4+16*2], 1 - vinserti128 m11, [base+smooth_weights+64*4+16*3], 1 - mova xm12, [base+smooth_weights+64*4+16*4] - mova xm13, [base+smooth_weights+64*4+16*5] - vinserti128 m12, [base+smooth_weights+64*4+16*6], 1 - vinserti128 m13, [base+smooth_weights+64*4+16*7], 1 + mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0] + mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1] + vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1 + vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1 + mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4] + mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5] + vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1 + vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1 punpcklwd m6, m7, m0 punpckhwd m7, m0 punpcklwd m8, m9, m0 punpckhwd m9, m0 lea tlq, [tl_baseq-2] .w64_loop_y: - vpbroadcastd m1, [v_weightsq] vpbroadcastw m3, [tlq+yq*2] - punpcklwd m2, m3, m4 - punpckhwd m3, m4 - pmaddwd m14, m2, m10 - pmaddwd m15, m3, m11 - pmaddwd m2, m12 + vpbroadcastd m1, [v_weightsq] + punpcklwd m3, m4 + pmaddwd m14, m10, m3 + pmaddwd m15, m11, m3 + pmaddwd m2, m12, m3 pmaddwd m3, m13 pmaddwd m0, m6, m1 paddd m0, m14 - paddd m0, m5 - psrld m0, 9 pmaddwd m14, m7, m1 paddd m14, m15 - paddd m14, m5 - psrld m14, 9 + psrld m0, 8 + psrld m14, 8 packssdw m0, m14 + pavgw m0, m5 mova [dstq+32*0], m0 SMOOTH_2D_END 8, 9, 1, 1, 2, 3 mova [dstq+32*1], m0 diff -Nru dav1d-0.9.0/src/x86/ipred16_sse.asm dav1d-0.9.1/src/x86/ipred16_sse.asm --- 
dav1d-0.9.0/src/x86/ipred16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/ipred16_sse.asm 2021-07-28 21:38:28.897852200 +0000 @@ -0,0 +1,1931 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 +pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + +pb_0_1: times 4 db 0, 1 +pb_2_3: times 4 db 2, 3 +pw_1: times 4 dw 1 +pw_2: times 4 dw 2 +pw_4: times 4 dw 4 +pw_512: times 4 dw 512 +pw_2048: times 4 dw 2048 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4) +%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) +%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4) + +JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 +JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ + s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 +JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 +JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 +JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 + +cextern smooth_weights_1d_16bpc +cextern smooth_weights_2d_16bpc +cextern filter_intra_taps + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +INIT_XMM ssse3 +cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h + LEA r5, ipred_dc_left_16bpc_ssse3_table + movd m4, wm + tzcnt wd, wm + add tlq, 2 + movifnidn hd, hm + pxor m3, m3 + pavgw m4, m3 + movd m5, wd + movu m0, [tlq] + movsxd r6, [r5+wq*4] + add r6, r5 + add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table + movsxd wq, 
[r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_left_16bpc_ssse3_table + mov hd, hm + movd m4, hm + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + pxor m3, m3 + sub tlq, hq + pavgw m4, m3 + movd m5, r6d + movu m0, [tlq] + movsxd r6, [r5+r6*4] + add r6, r5 + add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu m2, [tlq+112] + movu m1, [tlq+ 96] + paddw m0, m2 + movu m2, [tlq+ 80] + paddw m1, m2 + movu m2, [tlq+ 64] + paddw m0, m2 + paddw m0, m1 +.h32: + movu m1, [tlq+ 48] + movu m2, [tlq+ 32] + paddw m1, m2 + paddw m0, m1 +.h16: + movu m1, [tlq+ 16] + paddw m0, m1 +.h8: + movhlps m1, m0 + paddw m0, m1 +.h4: + punpcklwd m0, m3 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + lea stride3q, [strideq*3] + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + jmp wq + +cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd m4, r5d + tzcnt r5d, r5d + movd m5, r5d + LEA r5, ipred_dc_16bpc_ssse3_table + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + pxor m3, m3 + psrlw m4, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movq m0, [tlq-8] + jmp wq +.w4: + movq m1, [tlq+2] + paddw m1, m0 + punpckhwd m0, m3 + punpcklwd m1, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + cmp hd, 4 + jg .w4_mul + psrlw m0, 3 + jmp .w4_end +.w4_mul: + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 16 + cmove r2d, r3d + psrld m0, 2 + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w4_end: + pshuflw m0, m0, q0000 +.s4: + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +.h8: + mova m0, [tlq-16] + jmp wq +.w8: + movu m1, [tlq+2] + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 8 + je .w8_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 32 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w8_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +.h16: + mova m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w16: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + paddw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 16 + je .w16_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + test hd, 8|32 + cmovz r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w16_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s16c: + mova m1, m0 +.s16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*2+16*0], m0 + mova [dstq+strideq*2+16*1], m1 + mova [dstq+stride3q +16*0], m0 + mova [dstq+stride3q +16*1], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-48] + paddw m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w32: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + movu 
m2, [tlq+34] + paddw m0, m2 + movu m2, [tlq+50] + paddw m1, m2 + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 32 + je .w32_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 8 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w32_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s32c: + mova m1, m0 + mova m2, m0 + mova m3, m0 +.s32: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + mova [dstq+strideq*0+16*2], m2 + mova [dstq+strideq*0+16*3], m3 + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*3], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s32 + RET +.h64: + mova m0, [tlq-128] + mova m1, [tlq-112] + paddw m0, [tlq- 96] + paddw m1, [tlq- 80] + paddw m0, [tlq- 64] + paddw m1, [tlq- 48] + paddw m0, [tlq- 32] + paddw m1, [tlq- 16] + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 2] + movu m2, [tlq+ 18] + paddw m1, m2 + movu m2, [tlq+ 34] + paddw m0, m2 + movu m2, [tlq+ 50] + paddw m1, m2 + movu m2, [tlq+ 66] + paddw m0, m2 + movu m2, [tlq+ 82] + paddw m1, m2 + movu m2, [tlq+ 98] + paddw m0, m2 + movu m2, [tlq+114] + paddw m1, m2 + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 64 + je .w64_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 16 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w64_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m0 + mova [dstq+16*2], m0 + mova [dstq+16*3], m0 + mova [dstq+16*4], m0 + mova [dstq+16*5], m0 + mova [dstq+16*6], m0 + mova [dstq+16*7], m0 + add dstq, strideq + dec hd + jg .s64 + RET + +cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 + mov r6d, r8m + LEA r5, ipred_dc_128_16bpc_ssse3_table + tzcnt wd, wm + shr r6d, 11 + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_splat_16bpc_ssse3_table + movifnidn hd, hm + movu m0, [tlq+ 2] + movu m1, [tlq+ 18] + movu m2, [tlq+ 34] + movu m3, [tlq+ 50] + cmp wd, 64 + je .w64 + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w64: + WIN64_SPILL_XMM 8 + movu m4, [tlq+ 66] + movu m5, [tlq+ 82] + movu m6, [tlq+ 98] + movu m7, [tlq+114] +.w64_loop: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + mova [dstq+16*4], m4 + mova [dstq+16*5], m5 + mova [dstq+16*6], m6 + mova [dstq+16*7], m7 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 +%define base r5-ipred_h_16bpc_ssse3_table + tzcnt wd, wm + LEA r5, ipred_h_16bpc_ssse3_table + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m2, [base+pb_0_1] + movddup m3, [base+pb_2_3] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w4: + sub tlq, 8 + movq m3, [tlq] + pshuflw m0, m3, q3333 + pshuflw m1, m3, q2222 + pshuflw m2, m3, q1111 + pshuflw m3, m3, q0000 + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m1 + movq [dstq+strideq*2], m2 + movq [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + sub tlq, 8 + movq m3, [tlq] + punpcklwd m3, m3 + 
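; note: punpcklwd m3, m3 above doubles each of the four left-column pixels into
; a dword, so every pshufd below broadcasts one pixel across a full 8-pixel row.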
pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + sub tlq, 4 + movd m1, [tlq] + pshufb m0, m1, m3 + pshufb m1, m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m0 + mova [dstq+strideq*1+16*0], m1 + mova [dstq+strideq*1+16*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16 + RET +.w32: + sub tlq, 4 + movd m1, [tlq] + pshufb m0, m1, m3 + pshufb m1, m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m0 + mova [dstq+strideq*0+16*2], m0 + mova [dstq+strideq*0+16*3], m0 + mova [dstq+strideq*1+16*0], m1 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*1+16*2], m1 + mova [dstq+strideq*1+16*3], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +.w64: + sub tlq, 2 + movd m0, [tlq] + pshufb m0, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m0 + mova [dstq+16*2], m0 + mova [dstq+16*3], m0 + mova [dstq+16*4], m0 + mova [dstq+16*5], m0 + mova [dstq+16*6], m0 + mova [dstq+16*7], m0 + add dstq, strideq + dec hd + jg .w64 + RET + +cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left +%define base r5-ipred_paeth_16bpc_ssse3_table + movifnidn hd, hm + pshuflw m4, [tlq], q0000 + mov leftq, tlq + add hd, hd + punpcklqdq m4, m4 ; topleft + sub leftq, hq + and wd, ~7 + jnz .w8 + movddup m5, [tlq+2] ; top + psubw m6, m5, m4 + pabsw m7, m6 +.w4_loop: + movd m1, [leftq+hq-4] + punpcklwd m1, m1 + punpckldq m1, m1 ; left +%macro PAETH 0 + paddw m0, m6, m1 + psubw m2, m4, m0 ; tldiff + psubw m0, m5 ; tdiff + pabsw m2, m2 + pabsw m0, m0 + pminsw m2, m0 + pcmpeqw m0, m2 + pand m3, m5, m0 + pandn m0, m4 + por m0, m3 + pcmpgtw m3, m7, m2 + pand m0, m3 + pandn m3, m1 + por m0, m3 +%endmacro + PAETH + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2*2 + jg .w4_loop + RET +.w8: +%if ARCH_X86_32 + PUSH r6 + %define r7d hm + %assign regs_used 7 +%elif WIN64 + movaps r4m, m8 + PUSH r7 + %assign regs_used 8 +%endif +%if ARCH_X86_64 + movddup m8, [pb_0_1] +%endif + lea tlq, [tlq+wq*2+2] + neg wq + mov r7d, hd +.w8_loop0: + movu m5, [tlq+wq*2] + mov r6, dstq + add dstq, 16 + psubw m6, m5, m4 + pabsw m7, m6 +.w8_loop: + movd m1, [leftq+hq-2] +%if ARCH_X86_64 + pshufb m1, m8 +%else + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 +%endif + PAETH + mova [r6], m0 + add r6, strideq + sub hd, 1*2 + jg .w8_loop + mov hd, r7d + add wq, 8 + jl .w8_loop0 +%if WIN64 + movaps m8, r4m +%endif + RET + +%if ARCH_X86_64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 4 +%endif + +cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights + LEA weightsq, smooth_weights_1d_16bpc + mov hd, hm + lea weightsq, [weightsq+hq*4] + neg hq + movd m5, [tlq+hq*2] ; bottom + pshuflw m5, m5, q0000 + punpcklqdq m5, m5 + cmp wd, 4 + jne .w8 + movddup m4, [tlq+2] ; top + lea r3, [strideq*3] + psubw m4, m5 ; top - bottom +.w4_loop: + movq m1, [weightsq+hq*2] + punpcklwd m1, m1 + pshufd m0, m1, q1100 + punpckhdq m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w4_loop + RET +.w8: +%if ARCH_X86_32 + PUSH r6 + %assign regs_used 7 + mov hm, hq + %define hq hm +%elif WIN64 + PUSH r7 + %assign regs_used 8 +%endif +.w8_loop0: + mov t0, hq + movu m4, [tlq+2] + add tlq, 
16 + mov r6, dstq + add dstq, 16 + psubw m4, m5 +.w8_loop: + movq m3, [weightsq+t0*2] + punpcklwd m3, m3 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [r6+strideq*0], m0 + mova [r6+strideq*1], m1 + lea r6, [r6+strideq*2] + mova [r6+strideq*0], m2 + mova [r6+strideq*1], m3 + lea r6, [r6+strideq*2] + add t0, 4 + jl .w8_loop + sub wd, 8 + jg .w8_loop0 + RET + +cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights + LEA weightsq, smooth_weights_1d_16bpc + mov wd, wm + movifnidn hd, hm + movd m5, [tlq+wq*2] ; right + sub tlq, 8 + add hd, hd + pshuflw m5, m5, q0000 + sub tlq, hq + punpcklqdq m5, m5 + cmp wd, 4 + jne .w8 + movddup m4, [weightsq+4*2] + lea r3, [strideq*3] +.w4_loop: + movq m1, [tlq+hq] ; left + punpcklwd m1, m1 + psubw m1, m5 ; left - right + pshufd m0, m1, q3322 + punpckldq m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movhps [dstq+strideq*2], m1 + movq [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w4_loop + RET +.w8: + lea weightsq, [weightsq+wq*4] + neg wq +%if ARCH_X86_32 + PUSH r6 + %assign regs_used 7 + %define hd hm +%elif WIN64 + PUSH r7 + %assign regs_used 8 +%endif +.w8_loop0: + mov t0d, hd + mova m4, [weightsq+wq*2] + mov r6, dstq + add dstq, 16 +.w8_loop: + movq m3, [tlq+t0*(1+ARCH_X86_32)] + punpcklwd m3, m3 + psubw m3, m5 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [r6+strideq*0], m0 + mova [r6+strideq*1], m1 + lea r6, [r6+strideq*2] + mova [r6+strideq*0], m2 + mova [r6+strideq*1], m3 + lea r6, [r6+strideq*2] + sub t0d, 4*(1+ARCH_X86_64) + jg .w8_loop + add wq, 8 + jl .w8_loop0 + RET + +%if ARCH_X86_64 +DECLARE_REG_TMP 10 +%else +DECLARE_REG_TMP 3 +%endif + +cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ + h_weights, v_weights, top + LEA h_weightsq, smooth_weights_2d_16bpc + mov wd, wm + mov hd, hm + movd m7, [tlq+wq*2] ; right + lea v_weightsq, [h_weightsq+hq*8] + neg hq + movd m6, [tlq+hq*2] ; bottom + pshuflw m7, m7, q0000 + pshuflw m6, m6, q0000 + cmp wd, 4 + jne .w8 + movq m4, [tlq+2] ; top + mova m5, [h_weightsq+4*4] + punpcklwd m4, m6 ; top, bottom + pxor m6, m6 +.w4_loop: + movq m1, [v_weightsq+hq*4] + sub tlq, 4 + movd m3, [tlq] ; left + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + pmaddwd m0, m4 + punpcklwd m3, m7 ; left, right + pmaddwd m1, m4 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pavgw m0, m6 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w4_loop + RET +.w8: +%if ARCH_X86_32 + lea h_weightsq, [h_weightsq+wq*4] + mov t0, tlq + mov r1m, tlq + mov r2m, hq + %define m8 [h_weightsq+16*0] + %define m9 [h_weightsq+16*1] +%else +%if WIN64 + movaps r4m, m8 + movaps r6m, m9 + PUSH r7 + PUSH r8 +%endif + PUSH r9 + PUSH r10 + %assign regs_used 11 + lea h_weightsq, [h_weightsq+wq*8] + lea topq, [tlq+wq*2] + neg wq + mov r8, tlq + mov r9, hq +%endif + punpcklqdq m6, m6 +.w8_loop0: +%if ARCH_X86_32 + movu m5, [t0+2] + add t0, 16 + mov r0m, t0 +%else + movu m5, [topq+wq*2+2] + mova m8, [h_weightsq+wq*4+16*0] + mova m9, [h_weightsq+wq*4+16*1] +%endif + mov t0, dstq + add dstq, 16 + punpcklwd m4, m5, m6 + 
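; note: this punpcklwd/punpckhwd pair interleaves the top row (m5) with the
; broadcast bottom pixel (m6), so each pmaddwd in the loop below against a
; (w, 256-w) dword pair from smooth_weights_2d_16bpc produces
; top*w + bottom*(256-w) per pixel; the left/right term is formed the same way
; through m8/m9, and psrld 8 followed by pavgw against zero applies the final
; (x + 256) >> 9 rounding.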
punpckhwd m5, m6 +.w8_loop: + movd m1, [v_weightsq+hq*4] + sub tlq, 2 + movd m3, [tlq] ; left + pshufd m1, m1, q0000 + pmaddwd m0, m4, m1 + pshuflw m3, m3, q0000 + pmaddwd m1, m5 + punpcklwd m3, m7 ; left, right + pmaddwd m2, m8, m3 + pmaddwd m3, m9 + paddd m0, m2 + paddd m1, m3 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pxor m1, m1 + pavgw m0, m1 + mova [t0], m0 + add t0, strideq + inc hq + jl .w8_loop +%if ARCH_X86_32 + mov t0, r0m + mov tlq, r1m + add h_weightsq, 16*2 + mov hq, r2m + sub dword wm, 8 + jg .w8_loop0 +%else + mov tlq, r8 + mov hq, r9 + add wq, 8 + jl .w8_loop0 +%endif +%if WIN64 + movaps m8, r4m + movaps m9, r6m +%endif + RET + +%if ARCH_X86_64 +cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter +%else +cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter +%define m8 [esp+16*0] +%define m9 [esp+16*1] +%define m10 [esp+16*2] +%define m11 [esp+16*3] +%define m12 [esp+16*4] +%define m13 [esp+16*5] +%define m14 [esp+16*6] +%define m15 [esp+16*7] +%endif +%define base r6-$$ + movifnidn hd, hm + movd m6, r8m ; bitdepth_max +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + LEA r6, $$ + shl filterd, 6 + movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 + mova m1, [base+filter_intra_taps+filterq+16*0] + mova m2, [base+filter_intra_taps+filterq+16*1] + mova m3, [base+filter_intra_taps+filterq+16*2] + mova m4, [base+filter_intra_taps+filterq+16*3] + pxor m5, m5 +%if ARCH_X86_64 + punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper + punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid + punpcklbw m10, m5, m2 ; having to perform sign-extension. + punpckhbw m11, m5, m2 + punpcklbw m12, m5, m3 + punpckhbw m13, m5, m3 + punpcklbw m14, m5, m4 + punpckhbw m15, m5, m4 +%else + punpcklbw m7, m5, m1 + mova m8, m7 + punpckhbw m7, m5, m1 + mova m9, m7 + punpcklbw m7, m5, m2 + mova m10, m7 + punpckhbw m7, m5, m2 + mova m11, m7 + punpcklbw m7, m5, m3 + mova m12, m7 + punpckhbw m7, m5, m3 + mova m13, m7 + punpcklbw m7, m5, m4 + mova m14, m7 + punpckhbw m7, m5, m4 + mova m15, m7 +%endif + mova m7, [base+filter_shuf] + add hd, hd + mov r5, dstq + pshuflw m6, m6, q0000 + mov r6, tlq + punpcklqdq m6, m6 + sub tlq, hq +.left_loop: + pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __ + pshufd m1, m0, q0000 + pmaddwd m2, m8, m1 + pmaddwd m1, m9 + pshufd m4, m0, q1111 + pmaddwd m3, m10, m4 + pmaddwd m4, m11 + paddd m2, m3 + paddd m1, m4 + pshufd m4, m0, q2222 + pmaddwd m3, m12, m4 + pmaddwd m4, m13 + paddd m2, m3 + paddd m1, m4 + pshufd m3, m0, q3333 + pmaddwd m0, m14, m3 + pmaddwd m3, m15 + paddd m0, m2 + paddd m1, m3 + psrad m0, 11 ; x >> 3 + psrad m1, 11 + packssdw m0, m1 + pmaxsw m0, m5 + pavgw m0, m5 ; (x + 8) >> 4 + pminsw m0, m6 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movlps m0, [tlq+hq-10] + lea dstq, [dstq+strideq*2] + sub hd, 2*2 + jg .left_loop + sub wd, 4 + jz .end + sub tld, r6d ; -h*2 + sub r6, r5 ; tl-dst +.right_loop0: + add r5, 8 + mov hd, tld + movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __ + mov dstq, r5 +.right_loop: + pshufd m2, m0, q0000 + pmaddwd m1, m8, m2 + pmaddwd m2, m9 + pshufd m4, m0, q1111 + pmaddwd m3, m10, m4 + pmaddwd m4, m11 + pinsrw m0, [dstq+strideq*0-2], 5 + paddd m1, m3 + paddd m2, m4 + pshufd m0, m0, q2222 + movddup m4, [dstq+strideq*1-8] + pmaddwd m3, m12, m0 + pmaddwd m0, m13 + paddd m1, m3 + paddd m0, m2 + pshuflw m2, m4, q3333 + punpcklwd m2, m5 + pmaddwd m3, m14, m2 + pmaddwd m2, m15 + paddd m1, m3 + paddd m0, m2 + psrad m1, 11 + psrad m0, 11 + packssdw m0, m1 + 
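; note: as in .left_loop above, the filter taps sit in the high byte of each
; word, so psrad by 11 yields x >> 3; pmaxsw clamps negatives to zero (m5 = 0),
; pavgw then adds the rounding bias to complete (x + 8) >> 4, and pminsw clamps
; the result to bitdepth_max in m6.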
pmaxsw m0, m5 + pavgw m0, m5 + pminsw m0, m6 + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + palignr m0, m4, 14 + lea dstq, [dstq+strideq*2] + add hd, 2*2 + jl .right_loop + sub wd, 4 + jg .right_loop0 +.end: + RET + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac + LEA t0, ipred_cfl_left_16bpc_ssse3_table + movd m4, wd + tzcnt wd, wd + movifnidn hd, hm + add tlq, 2 + movsxd r6, [t0+wq*4] + movd m5, wd + jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start) + +cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + LEA t0, ipred_cfl_left_16bpc_ssse3_table + tzcnt wd, wm + lea r6d, [hq*2] + movd m4, hd + sub tlq, r6 + tzcnt r6d, hd + movd m5, r6d + movsxd r6, [t0+r6*4] +.start: + movd m7, r7m + movu m0, [tlq] + add r6, t0 + add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table + movsxd wq, [t0+wq*4] + pxor m6, m6 + pshuflw m7, m7, q0000 + pcmpeqw m3, m3 + add wq, t0 + movifnidn acq, acmp + pavgw m4, m6 + punpcklqdq m7, m7 + jmp r6 +.h32: + movu m1, [tlq+48] + movu m2, [tlq+32] + paddw m0, m1 + paddw m0, m2 +.h16: + movu m1, [tlq+16] + paddw m0, m1 +.h8: + pshufd m1, m0, q1032 + paddw m0, m1 +.h4: + pmaddwd m0, m3 + psubd m4, m0 + pshuflw m0, m4, q1032 + paddd m0, m4 + psrld m0, m5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + jmp wq + +%macro IPRED_CFL 2 ; dst, src + pabsw m%1, m%2 + pmulhrsw m%1, m2 + psignw m%2, m1 + psignw m%1, m%2 + paddw m%1, m0 + pmaxsw m%1, m6 + pminsw m%1, m7 +%endmacro + +cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd m4, t0d + tzcnt t0d, t0d + movd m5, t0d + LEA t0, ipred_cfl_16bpc_ssse3_table + tzcnt wd, wd + movd m7, r7m + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+4*4] + psrlw m4, 1 + pxor m6, m6 + pshuflw m7, m7, q0000 + add r6, t0 + add wq, t0 + movifnidn acq, acmp + pcmpeqw m3, m3 + punpcklqdq m7, m7 + jmp r6 +.h4: + movq m0, [tlq-8] + jmp wq +.w4: + movq m1, [tlq+2] + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + cmp hd, 4 + jg .w4_mul + psrld m0, 3 + jmp .w4_end +.w4_mul: + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 16 + cmove r6d, r2d + movd m1, r6d + psrld m0, 2 + pmulhuw m0, m1 + psrlw m0, 1 +.w4_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s4: + movd m1, alpham + lea r6, [strideq*3] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + movq [dstq+strideq*0], m3 + movhps [dstq+strideq*1], m3 + movq [dstq+strideq*2], m4 + movhps [dstq+r6 ], m4 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4_loop + RET +.h8: + mova m0, [tlq-16] + jmp wq +.w8: + movu m1, [tlq+2] + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 8 + je .w8_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 32 + cmove r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w8_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s8: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+strideq*0], m3 + mova [dstq+strideq*1], m4 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s8_loop + RET +.h16: + 
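; note: throughout these dc/cfl helpers the 0xAAAB and 0x6667 multipliers are
; Q16 approximations of 2/3 and 2/5: once the power-of-two part of w+h has been
; shifted out, pmulhuw plus a final psrlw 1 divides the running sum by the
; remaining factor of 3 or 5 (w+h = 12, 20, 24, 40, 48, ...).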
mova m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w16: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 16 + je .w16_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + test hd, 8|32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w16_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s16: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*0], m3 + mova [dstq+16*1], m4 + add dstq, strideq + dec hd + jg .s16_loop + RET +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-48] + paddw m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w32: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + movu m2, [tlq+34] + paddw m1, m2 + movu m2, [tlq+50] + paddw m1, m2 + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 32 + je .w32_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 8 + cmove r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w32_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s32: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*0], m3 + mova [dstq+16*1], m4 + mova m4, [acq+16*2] + mova m5, [acq+16*3] + add acq, 16*4 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*2], m3 + mova [dstq+16*3], m4 + add dstq, strideq + dec hd + jg .s32_loop + RET + +cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac + tzcnt wd, wm + LEA t0, ipred_cfl_splat_16bpc_ssse3_table + mov r6d, r7m + movifnidn hd, hm + shr r6d, 11 + movd m7, r7m + movsxd wq, [t0+wq*4] + movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8] + pshuflw m7, m7, q0000 + pxor m6, m6 + add wq, t0 + movifnidn acq, acmp + punpcklqdq m7, m7 + jmp wq + +cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h + movifnidn hpadd, hpadm +%if ARCH_X86_32 && PIC + pcmpeqw m5, m5 + pabsw m5, m5 + paddw m5, m5 +%else + movddup m5, [pw_2] +%endif + mov hd, hm + shl hpadd, 2 + pxor m4, m4 + sub hd, hpadd + cmp dword wm, 8 + mov r5, acq + jg .w16 + je .w8 + lea r3, [strideq*3] +.w4_loop: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + pmaddwd m2, m5, [ypxq+strideq*2] + pmaddwd m3, m5, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + paddd m0, m1 + paddd m2, m3 + paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .dc + punpckhqdq m0, m0 + pslld m2, 2 +.w4_hpad: + mova [acq+16*0], m0 + paddd m4, m2 + mova [acq+16*1], m0 + add acq, 16*2 + sub hpadd, 4 + jg .w4_hpad + jmp .dc +.w8: +%if ARCH_X86_32 + cmp dword wpadm, 0 +%else + test wpadd, wpadd +%endif + jnz .w8_wpad1 +.w8_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m2, m5, [ypxq+strideq*1+16*0] + pmaddwd m1, m5, [ypxq+strideq*0+16*1] + pmaddwd m3, m5, [ypxq+strideq*1+16*1] + lea ypxq, [ypxq+strideq*2] + paddd m0, m2 + paddd m1, m3 + paddd m2, m0, m1 + packssdw m0, m1 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + dec hd + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz .dc + pslld m2, 2 + mova m1, m0 + jmp .hpad +.w8_wpad1: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + lea ypxq, 
[ypxq+strideq*2] + paddd m0, m1 + pshufd m1, m0, q3333 + paddd m2, m0, m1 + packssdw m0, m1 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + dec hd + jg .w8_wpad1 + jmp .w8_hpad +.w16_wpad3: + pshufd m3, m0, q3333 + mova m1, m3 + mova m2, m3 + jmp .w16_wpad_end +.w16_wpad2: + pshufd m1, m3, q3333 + mova m2, m1 + jmp .w16_wpad_end +.w16_wpad1: + pshufd m2, m1, q3333 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + WIN64_SPILL_XMM 7 +.w16_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m6, m5, [ypxq+strideq*1+16*0] + paddd m0, m6 + cmp wpadd, 2 + jg .w16_wpad3 + pmaddwd m3, m5, [ypxq+strideq*0+16*1] + pmaddwd m6, m5, [ypxq+strideq*1+16*1] + paddd m3, m6 + je .w16_wpad2 + pmaddwd m1, m5, [ypxq+strideq*0+16*2] + pmaddwd m6, m5, [ypxq+strideq*1+16*2] + paddd m1, m6 + jp .w16_wpad1 + pmaddwd m2, m5, [ypxq+strideq*0+16*3] + pmaddwd m6, m5, [ypxq+strideq*1+16*3] + paddd m2, m6 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + paddd m6, m0, m3 + packssdw m0, m3 + paddd m6, m1 + mova [acq+16*0], m0 + packssdw m1, m2 + paddd m2, m6 + mova [acq+16*1], m1 + add acq, 16*2 + paddd m4, m2 + dec hd + jg .w16_loop + WIN64_RESTORE_XMM + add hpadd, hpadd + jz .dc + paddd m2, m2 +.hpad: + mova [acq+16*0], m0 + mova [acq+16*1], m1 + paddd m4, m2 + mova [acq+16*2], m0 + mova [acq+16*3], m1 + add acq, 16*4 + sub hpadd, 4 + jg .hpad +.dc: + sub r5, acq ; -w*h*2 + pshufd m2, m4, q1032 + tzcnt r1d, r5d + paddd m2, m4 + sub r1d, 2 + pshufd m4, m2, q2301 + movd m0, r1d + paddd m2, m4 + psrld m2, m0 + pxor m0, m0 + pavgw m2, m0 + packssdw m2, m2 +.dc_loop: + mova m0, [acq+r5+16*0] + mova m1, [acq+r5+16*1] + psubw m0, m2 + psubw m1, m2 + mova [acq+r5+16*0], m0 + mova [acq+r5+16*1], m1 + add r5, 16*2 + jl .dc_loop + RET + +cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h + movifnidn hpadd, hpadm +%if ARCH_X86_32 && PIC + pcmpeqw m5, m5 + pabsw m5, m5 + psllw m5, 2 +%else + movddup m5, [pw_4] +%endif + mov hd, hm + shl hpadd, 2 + pxor m4, m4 + sub hd, hpadd + cmp dword wm, 8 + mov r5, acq + jg .w16 + je .w8 + lea r3, [strideq*3] +.w4_loop: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m3, m5, [ypxq+strideq*1] + pmaddwd m1, m5, [ypxq+strideq*2] + pmaddwd m2, m5, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + paddd m4, m0 + packssdw m0, m3 + paddd m3, m1 + packssdw m1, m2 + paddd m4, m2 + paddd m4, m3 + mova [acq+16*0], m0 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + punpckhqdq m1, m1 + pslld m2, 3 + mova [acq+16*0], m1 + mova [acq+16*1], m1 + paddd m4, m2 + mova [acq+16*2], m1 + mova [acq+16*3], m1 + add acq, 16*4 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w8: +%if ARCH_X86_32 + cmp dword wpadm, 0 +%else + test wpadd, wpadd +%endif + jnz .w8_wpad1 +.w8_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m2, m5, [ypxq+strideq*0+16*1] + pmaddwd m1, m5, [ypxq+strideq*1+16*0] + pmaddwd m3, m5, [ypxq+strideq*1+16*1] + lea ypxq, [ypxq+strideq*2] + paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + mova [acq+16*0], m0 + paddd m2, m1, m3 + packssdw m1, m3 + paddd m4, m2 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 2 + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + pslld m2, 2 + mova m0, m1 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w8_wpad1: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + pshufd m2, m0, q3333 + pshufd m3, m1, q3333 + 
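; note: for width-padded blocks pshufd q3333 replicates the last computed dword
; (the rightmost pair of AC values) across the padded half, matching cfl's
; repeat-last-column padding, and the replicated values are still accumulated
; into m4 so the later DC average covers the padded area too.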
paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + paddd m2, m1, m3 + packssdw m1, m3 + paddd m4, m2 + mova [acq+16*0], m0 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 2 + jg .w8_wpad1 + jmp .w8_hpad +.w16_wpad3: + pshufd m3, m0, q3333 + mova m1, m3 + mova m2, m3 + jmp .w16_wpad_end +.w16_wpad2: + pshufd m1, m3, q3333 + mova m2, m1 + jmp .w16_wpad_end +.w16_wpad1: + pshufd m2, m1, q3333 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + WIN64_SPILL_XMM 7 +.w16_loop: + pmaddwd m0, m5, [ypxq+16*0] + cmp wpadd, 2 + jg .w16_wpad3 + pmaddwd m3, m5, [ypxq+16*1] + je .w16_wpad2 + pmaddwd m1, m5, [ypxq+16*2] + jp .w16_wpad1 + pmaddwd m2, m5, [ypxq+16*3] +.w16_wpad_end: + add ypxq, strideq + paddd m6, m0, m3 + packssdw m0, m3 + mova [acq+16*0], m0 + paddd m6, m1 + packssdw m1, m2 + paddd m2, m6 + mova [acq+16*1], m1 + add acq, 16*2 + paddd m4, m2 + dec hd + jg .w16_loop + WIN64_RESTORE_XMM + add hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + paddd m2, m2 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad + +cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h +%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table + LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table + tzcnt wd, wm + movifnidn hpadd, hpadm + pxor m4, m4 + movsxd wq, [r6+wq*4] + movddup m5, [base+pw_1] + add wq, r6 + mov hd, hm + shl hpadd, 2 + sub hd, hpadd + jmp wq +.w4: + lea r3, [strideq*3] + mov r5, acq +.w4_loop: + movq m0, [ypxq+strideq*0] + movhps m0, [ypxq+strideq*1] + movq m1, [ypxq+strideq*2] + movhps m1, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + psllw m0, 3 + psllw m1, 3 + mova [acq+16*0], m0 + pmaddwd m0, m5 + mova [acq+16*1], m1 + pmaddwd m2, m5, m1 + add acq, 16*2 + paddd m4, m0 + paddd m4, m2 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + punpckhqdq m1, m1 + mova [acq+16*0], m1 + pslld m2, 2 + mova [acq+16*1], m1 + punpckhqdq m2, m2 + mova [acq+16*2], m1 + paddd m4, m2 + mova [acq+16*3], m1 + add acq, 16*4 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w8: + mov r5, acq +.w8_loop: + mova m0, [ypxq+strideq*0] + mova m1, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + psllw m0, 3 + psllw m1, 3 + mova [acq+16*0], m0 + pmaddwd m0, m5 + mova [acq+16*1], m1 + pmaddwd m2, m5, m1 + add acq, 16*2 + paddd m4, m0 + paddd m4, m2 + sub hd, 2 + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + pslld m2, 2 + mova m0, m1 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w16_wpad2: + pshufhw m3, m2, q3333 + pshufhw m1, m0, q3333 + punpckhqdq m3, m3 + punpckhqdq m1, m1 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + mov r5, acq +.w16_loop: + mova m2, [ypxq+strideq*0+16*0] + mova m0, [ypxq+strideq*1+16*0] + psllw m2, 3 + psllw m0, 3 + test wpadd, wpadd + jnz .w16_wpad2 + mova m3, [ypxq+strideq*0+16*1] + mova m1, [ypxq+strideq*1+16*1] + psllw m3, 3 + psllw m1, 3 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + mova [acq+16*0], m2 + pmaddwd m2, m5 + mova [acq+16*1], m3 + pmaddwd m3, m5 + paddd m4, m2 + pmaddwd m2, m5, m0 + mova [acq+16*2], m0 + paddd m4, m3 + pmaddwd m3, m5, m1 + mova [acq+16*3], m1 + add acq, 16*4 + paddd m2, m3 + paddd m4, m2 + sub hd, 2 + jg .w16_loop + add hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + paddd m2, m2 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w32_wpad6: + pshufhw m1, m0, q3333 + punpckhqdq m1, m1 + mova m2, m1 + mova m3, 
m1 + jmp .w32_wpad_end +.w32_wpad4: + pshufhw m2, m1, q3333 + punpckhqdq m2, m2 + mova m3, m2 + jmp .w32_wpad_end +.w32_wpad2: + pshufhw m3, m2, q3333 + punpckhqdq m3, m3 + jmp .w32_wpad_end +.w32: + movifnidn wpadd, wpadm + mov r5, acq + WIN64_SPILL_XMM 8 +.w32_loop: + mova m0, [ypxq+16*0] + psllw m0, 3 + cmp wpadd, 4 + jg .w32_wpad6 + mova m1, [ypxq+16*1] + psllw m1, 3 + je .w32_wpad4 + mova m2, [ypxq+16*2] + psllw m2, 3 + jnp .w32_wpad2 + mova m3, [ypxq+16*3] + psllw m3, 3 +.w32_wpad_end: + add ypxq, strideq + pmaddwd m6, m5, m0 + mova [acq+16*0], m0 + pmaddwd m7, m5, m1 + mova [acq+16*1], m1 + paddd m6, m7 + pmaddwd m7, m5, m2 + mova [acq+16*2], m2 + paddd m6, m7 + pmaddwd m7, m5, m3 + mova [acq+16*3], m3 + add acq, 16*4 + paddd m6, m7 + paddd m4, m6 + dec hd + jg .w32_loop +%if WIN64 + mova m5, m6 + WIN64_RESTORE_XMM + SWAP 5, 6 +%endif + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w32_hpad_loop: + mova [acq+16*0], m0 + mova [acq+16*1], m1 + paddd m4, m6 + mova [acq+16*2], m2 + mova [acq+16*3], m3 + add acq, 16*4 + dec hpadd + jg .w32_hpad_loop + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + +cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h +%define base r2-pal_pred_16bpc_ssse3_table +%if ARCH_X86_32 + %define hd r2d +%endif + mova m3, [palq] + LEA r2, pal_pred_16bpc_ssse3_table + tzcnt wd, wm + pshufb m3, [base+pal_pred_shuf] + movsxd wq, [r2+wq*4] + pshufd m4, m3, q1032 + add wq, r2 + movifnidn hd, hm + jmp wq +.w4: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 4 + jg .w4 + RET +.w8: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8 + RET +.w16: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16 + RET +.w32: + mova m0, [idxq+16*0] + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova m2, [idxq+16*1] + add idxq, 16*2 + mova [dstq+16*0], m0 + pshufb m0, m3, m2 + mova [dstq+16*1], m1 + pshufb m1, m4, m2 + punpcklbw m2, m0, m1 + punpckhbw m0, m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m0 + add dstq, strideq + dec hd + jg .w32 + RET +.w64: + mova m0, [idxq+16*0] + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova m2, [idxq+16*1] + mova [dstq+16*0], m0 + pshufb m0, m3, m2 + mova [dstq+16*1], m1 + pshufb m1, m4, m2 + punpcklbw m2, m0, m1 + punpckhbw m0, m1 + mova m1, [idxq+16*2] + mova [dstq+16*2], m2 + pshufb m2, m3, m1 + mova [dstq+16*3], m0 + pshufb m0, m4, m1 + punpcklbw m1, m2, m0 + punpckhbw m2, m0 + mova m0, [idxq+16*3] + add idxq, 16*4 + mova [dstq+16*4], m1 + pshufb m1, m3, m0 + mova [dstq+16*5], m2 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + add dstq, strideq + dec hd + jg .w64 + RET diff -Nru dav1d-0.9.0/src/x86/ipred_avx2.asm dav1d-0.9.1/src/x86/ipred_avx2.asm --- dav1d-0.9.0/src/x86/ipred_avx2.asm 2021-05-16 16:47:22.546950800 +0000 +++ dav1d-0.9.1/src/x86/ipred_avx2.asm 2021-07-28 21:38:28.901852100 +0000 
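; note: the ipred_avx2.asm hunks below give every 8-bit entry point an explicit
; _8bpc suffix (and adjust the mangled JMP_TABLE base to match), mirroring the
; existing _16bpc names; the ipred_init_tmpl.c change further down can then
; drop its BITDEPTH == 8 special case and resolve both variants through BF(),
; which presumably expands to dav1d_<name>_8bpc_<suffix> or
; dav1d_<name>_16bpc_<suffix> depending on the template bitdepth.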
@@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; @@ -141,7 +141,7 @@ %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) - %xdefine %%base mangle(private_prefix %+ _%1_%2) + %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) @@ -178,7 +178,7 @@ SECTION .text INIT_YMM avx2 -cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h +cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h lea r5, [ipred_dc_left_avx2_table] tzcnt wd, wm inc tlq @@ -196,7 +196,7 @@ add wq, r5 jmp r6 -cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq @@ -235,7 +235,7 @@ mova m1, m0 jmp wq -cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd @@ -446,7 +446,7 @@ jg .s64 RET -cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_avx2_table] tzcnt wd, wm movifnidn hd, hm @@ -457,7 +457,7 @@ lea stride3q, [strideq*3] jmp wq -cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_avx2_table] tzcnt wd, wm movu m0, [tlq+ 1] @@ -486,7 +486,7 @@ %endmacro INIT_XMM avx2 -cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3 +cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 lea r5, [ipred_h_avx2_table] tzcnt wd, wm movifnidn hd, hm @@ -543,7 +543,7 @@ vpblendvb m0, m5, m0, m1 %endmacro -cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h +cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h %define base r5-ipred_paeth_avx2_table lea r5, [ipred_paeth_avx2_table] tzcnt wd, wm @@ -677,7 +677,7 @@ packuswb m0, m1 %endmacro -cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights +cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_avx2_table lea r6, [ipred_smooth_v_avx2_table] tzcnt wd, wm @@ -835,7 +835,7 @@ ALLOC_STACK %1, %3 %endmacro -cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h +cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h %define base r6-ipred_smooth_h_avx2_table lea r6, [ipred_smooth_h_avx2_table] mov wd, wm @@ -1045,7 +1045,7 @@ packuswb m0, m1 %endmacro -cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights +cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_avx2_table lea r6, [ipred_smooth_avx2_table] mov wd, wm @@ -1315,7 +1315,7 @@ sub r3, hq ret -cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase +cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z1_avx2_table] tzcnt wd, wm @@ -2144,7 +2144,7 @@ .w64_end: RET -cglobal ipred_z2, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy +cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy %define base r9-z_filter_t0 lea r9, [ipred_z2_avx2_table] tzcnt wd, wm @@ -3000,7 +3000,7 @@ movu [rsp+97], m0 jmp .w32_filter_above -cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase +cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase 
%assign org_stack_offset stack_offset lea r6, [ipred_z3_avx2_table] tzcnt hd, hm @@ -4211,7 +4211,7 @@ ; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ ; 5 8 8 i -cglobal ipred_filter, 3, 7, 0, dst, stride, tl, w, h, filter +cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter %define base r6-ipred_filter_avx2_table lea r6, [filter_intra_taps] tzcnt wd, wm @@ -4435,7 +4435,7 @@ paddw m%1, m0 %endmacro -cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha lea t0, [ipred_cfl_left_avx2_table] tzcnt wd, wm inc tlq @@ -4454,7 +4454,7 @@ movifnidn acq, acmp jmp r6 -cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq @@ -4488,7 +4488,7 @@ vpbroadcastw m0, xm0 jmp wq -cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd @@ -4692,7 +4692,7 @@ jg .s32_loop RET -cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha lea t0, [ipred_cfl_splat_avx2_table] tzcnt wd, wm movifnidn hd, hm @@ -4702,7 +4702,7 @@ movifnidn acq, acmp jmp wq -cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak +cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm @@ -4883,7 +4883,7 @@ jg .sub_loop RET -cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak +cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm @@ -5076,7 +5076,7 @@ jg .sub_loop RET -cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak +cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm @@ -5306,7 +5306,7 @@ jg .sub_loop RET -cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h +cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h vbroadcasti128 m4, [palq] lea r2, [pal_pred_avx2_table] tzcnt wd, wm diff -Nru dav1d-0.9.0/src/x86/ipred_init_tmpl.c dav1d-0.9.1/src/x86/ipred_init_tmpl.c --- dav1d-0.9.0/src/x86/ipred_init_tmpl.c 2021-05-16 16:47:22.546950800 +0000 +++ dav1d-0.9.1/src/x86/ipred_init_tmpl.c 2021-07-28 21:38:28.901852100 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* @@ -28,19 +28,11 @@ #include "src/cpu.h" #include "src/ipred.h" -#if BITDEPTH == 8 -#define decl_fn(type, name) \ - decl_##type##_fn(dav1d_##name##_ssse3); \ - decl_##type##_fn(dav1d_##name##_avx2) -#define init_fn(type0, type1, name, suffix) \ - c->type0[type1] = dav1d_##name##_##suffix -#else #define decl_fn(type, name) \ - decl_##type##_fn(dav1d_##name##_16bpc_ssse3); \ - decl_##type##_fn(dav1d_##name##_16bpc_avx2) + decl_##type##_fn(BF(dav1d_##name, ssse3)); \ + decl_##type##_fn(BF(dav1d_##name, avx2)) #define init_fn(type0, type1, name, suffix) \ - c->type0[type1] = dav1d_##name##_16bpc_##suffix -#endif + c->type0[type1] = BF(dav1d_##name, suffix) #define init_angular_ipred_fn(type, name, suffix) \ init_fn(intra_pred, type, name, suffix) @@ -80,7 +72,6 @@ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; -#if BITDEPTH == 8 init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3); init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3); init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3); @@ -102,8 +93,7 @@ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3); init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3); - c->pal_pred = dav1d_pal_pred_ssse3; -#endif + c->pal_pred = BF(dav1d_pal_pred, ssse3); #if ARCH_X86_64 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; @@ -132,10 +122,7 @@ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2); #if BITDEPTH == 8 init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2); - - c->pal_pred = dav1d_pal_pred_avx2; -#else - c->pal_pred = dav1d_pal_pred_16bpc_avx2; #endif + c->pal_pred = BF(dav1d_pal_pred, avx2); #endif } diff -Nru dav1d-0.9.0/src/x86/ipred_sse.asm dav1d-0.9.1/src/x86/ipred_sse.asm --- dav1d-0.9.0/src/x86/ipred_sse.asm 2021-05-16 16:47:22.546950800 +0000 +++ dav1d-0.9.1/src/x86/ipred_sse.asm 2021-07-28 21:38:28.901852100 +0000 @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. 
; @@ -74,7 +74,7 @@ %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) - %xdefine %%base mangle(private_prefix %+ _%1_%2) + %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) @@ -156,7 +156,7 @@ %endmacro INIT_XMM ssse3 -cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3 +cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3 LEA r5, ipred_h_ssse3_table tzcnt wd, wm movifnidn hd, hm @@ -179,7 +179,7 @@ ;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_ssse3_table tzcnt wd, wm movu m0, [tlq+ 1] @@ -196,7 +196,7 @@ ;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd @@ -438,7 +438,7 @@ ;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_left_ssse3_table mov hd, hm ; zero upper half tzcnt r6d, hd @@ -488,7 +488,7 @@ ;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 +cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_ssse3_table tzcnt wd, wm movifnidn hd, hm @@ -505,7 +505,7 @@ ;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h +cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, ipred_dc_left_ssse3_table tzcnt wd, wm inc tlq @@ -540,7 +540,7 @@ packuswb m6, m0 %endmacro -cglobal ipred_smooth_v, 3, 7, 7, dst, stride, tl, w, h, weights +cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_ssse3_table LEA r6, ipred_smooth_v_ssse3_table tzcnt wd, wm @@ -701,7 +701,7 @@ ;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- -cglobal ipred_smooth_h, 3, 7, 8, dst, stride, tl, w, h +cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h %define base r6-ipred_smooth_h_ssse3_table LEA r6, ipred_smooth_h_ssse3_table mov wd, wm @@ -958,7 +958,7 @@ mova m5, [rsp+16*%12] ; recovery %endmacro -cglobal ipred_smooth, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights +cglobal 
ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_ssse3_table mov wd, wm mov hd, hm @@ -1194,7 +1194,7 @@ ;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal, ; const uint8_t *idx, const int w, const int h); ;--------------------------------------------------------------------------------------- -cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h +cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h mova m4, [palq] LEA r2, pal_pred_ssse3_table tzcnt wd, wm @@ -1295,7 +1295,7 @@ DECLARE_REG_TMP 5 %endif -cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha movifnidn wd, wm movifnidn hd, hm tzcnt r6d, hd @@ -1535,7 +1535,7 @@ ;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- -cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq @@ -1576,7 +1576,7 @@ ;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- -cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha LEA t0, ipred_cfl_left_ssse3_table tzcnt wd, wm inc tlq @@ -1600,7 +1600,7 @@ ;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- -cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha +cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha tzcnt wd, wm movifnidn hd, hm LEA r6, ipred_cfl_splat_ssse3_table @@ -1615,11 +1615,11 @@ %endmacro %if ARCH_X86_64 -cglobal ipred_cfl_ac_420, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak +cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak DECLARE_REG_TMP 7 movddup m2, [pb_2] %else -cglobal ipred_cfl_ac_420, 4, 7, 7, ac, y, stride, wpad, hpad, w, h +cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h DECLARE_REG_TMP 4 %define ac_bakq acmp mov t0d, 0x02020202 @@ -1855,10 +1855,10 @@ RET %if ARCH_X86_64 -cglobal ipred_cfl_ac_422, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak +cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak movddup m2, [pb_4] %else -cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h +cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h mov t0d, 0x04040404 movd m2, t0d pshufd m2, m2, q0000 @@ -2128,10 +2128,10 @@ RET %if ARCH_X86_64 -cglobal ipred_cfl_ac_444, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak +cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak movddup m2, [pb_4] %else -cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h +cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h %define ac_bakq [rsp+16*4] mov t0d, 0x04040404 movd m2, t0d @@ -2769,7 
+2769,7 @@ BLEND m1, m0, m5 %endmacro -cglobal ipred_paeth, 3, 6, 8, -7*16, dst, stride, tl, w, h +cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h %define base r5-ipred_paeth_ssse3_table tzcnt wd, wm movifnidn hd, hm @@ -2937,7 +2937,7 @@ packuswb m%1, m%1 %endmacro -cglobal ipred_filter, 3, 7, 8, dst, stride, tl, w, h, filter +cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter %define base r6-$$ LEA r6, $$ tzcnt wd, wm diff -Nru dav1d-0.9.0/src/x86/itx16_avx2.asm dav1d-0.9.1/src/x86/itx16_avx2.asm --- dav1d-0.9.0/src/x86/itx16_avx2.asm 2021-05-16 16:47:22.550950800 +0000 +++ dav1d-0.9.1/src/x86/itx16_avx2.asm 2021-07-28 21:38:28.901852100 +0000 @@ -105,32 +105,32 @@ cextern pw_2896x8 cextern pd_2048 -cextern idct_4x8_internal_avx2.main -cextern idct_4x16_internal_avx2.main -cextern idct_8x8_internal_avx2.main -cextern idct_8x16_internal_avx2.main -cextern idct_16x4_internal_avx2.main -cextern idct_16x8_internal_avx2.main -cextern idct_16x16_internal_avx2.main -cextern inv_txfm_add_dct_dct_8x32_avx2.main -cextern inv_txfm_add_dct_dct_8x32_avx2.main_fast -cextern inv_txfm_add_dct_dct_16x32_avx2.main_oddhalf -cextern inv_txfm_add_dct_dct_16x32_avx2.main_oddhalf_fast -cextern inv_txfm_add_dct_dct_16x64_avx2.main_part1 -cextern inv_txfm_add_dct_dct_16x64_avx2.main_part2_internal - -cextern iadst_4x4_internal_avx2.main -cextern iadst_4x8_internal_avx2.main_pass2 -cextern iadst_4x16_internal_avx2.main2 -cextern iadst_8x4_internal_avx2.main -cextern iadst_8x8_internal_avx2.main_pass2 -cextern iadst_8x16_internal_avx2.main -cextern iadst_8x16_internal_avx2.main_pass2_end -cextern iadst_16x4_internal_avx2.main -cextern iadst_16x8_internal_avx2.main -cextern iadst_16x8_internal_avx2.main_pass2_end -cextern iadst_16x16_internal_avx2.main -cextern iadst_16x16_internal_avx2.main_pass2_end +cextern idct_4x8_internal_8bpc_avx2.main +cextern idct_4x16_internal_8bpc_avx2.main +cextern idct_8x8_internal_8bpc_avx2.main +cextern idct_8x16_internal_8bpc_avx2.main +cextern idct_16x4_internal_8bpc_avx2.main +cextern idct_16x8_internal_8bpc_avx2.main +cextern idct_16x16_internal_8bpc_avx2.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1 +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal + +cextern iadst_4x4_internal_8bpc_avx2.main +cextern iadst_4x8_internal_8bpc_avx2.main_pass2 +cextern iadst_4x16_internal_8bpc_avx2.main2 +cextern iadst_8x4_internal_8bpc_avx2.main +cextern iadst_8x8_internal_8bpc_avx2.main_pass2 +cextern iadst_8x16_internal_8bpc_avx2.main +cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end +cextern iadst_16x4_internal_8bpc_avx2.main +cextern iadst_16x8_internal_8bpc_avx2.main +cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end +cextern iadst_16x16_internal_8bpc_avx2.main +cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end SECTION .text @@ -384,7 +384,7 @@ .pass2: lea rax, [deint_shuf+128] vextracti128 xm1, m0, 1 - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main .end: vpbroadcastd xm4, [pw_2048] movq xm2, [dstq+strideq*0] @@ -457,7 +457,7 @@ .pass2: lea rax, [deint_shuf+128] vextracti128 xm1, m0, 1 - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main vpbroadcastd xm4, [pw_2048] movq xm3, [dstq+strideq*1] movhps xm3, [dstq+strideq*0] @@ -607,7 +607,7 @@ punpckldq m0, 
m2 ; 0 1 vextracti128 xm2, m0, 1 ; 4 5 vextracti128 xm3, m1, 1 ; 6 7 - call m(idct_4x8_internal).main + call m(idct_4x8_internal_8bpc).main vpbroadcastd xm4, [pw_2048] REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 lea r3, [strideq*3] @@ -697,7 +697,7 @@ vextracti128 xm3, m5, 1 ; 6 7 pshufd xm4, xm4, q1032 ; 1 0 pshufd xm5, xm5, q1032 ; 3 2 - jmp m(iadst_4x8_internal).main_pass2 + jmp m(iadst_4x8_internal_8bpc).main_pass2 ALIGN function_align .main: vbroadcasti128 m0, [cq+16*0] @@ -934,7 +934,7 @@ vextracti128 xm3, m1, 1 ; 6 7 vextracti128 xm6, m4, 1 ; c d vextracti128 xm7, m5, 1 ; e f - call m(idct_4x16_internal).main + call m(idct_4x16_internal_8bpc).main vpbroadcastd m9, [pw_2048] vinserti128 m0, m0, xm1, 1 ; 0 1 3 2 vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 @@ -1054,7 +1054,7 @@ vinserti128 m0, xm3, 1 ; 0 3 2 1 vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ???? vinserti128 m2, xm4, 1 ; b 8 9 a - call m(iadst_4x16_internal).main2 + call m(iadst_4x16_internal_8bpc).main2 vpbroadcastd m5, [pw_2896x8] paddsw m1, m2, m4 psubsw m2, m4 @@ -1434,7 +1434,7 @@ vinserti128 m0, xm2, 1 pshufb m0, m4 pshufb m1, m4 - jmp m(iadst_8x4_internal).main + jmp m(iadst_8x4_internal_8bpc).main ALIGN function_align .main: vpbroadcastd m1, [pd_2896] @@ -1636,7 +1636,7 @@ jmp tx2q .pass2: call .transpose_8x8_packed - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main vpbroadcastd m12, [pw_2048] vpermq m0, m0, q3120 vpermq m1, m1, q2031 @@ -1754,7 +1754,7 @@ call m(idct_8x8_internal_16bpc).transpose_8x8_packed pshufd m4, m0, q1032 pshufd m5, m1, q1032 - call m(iadst_8x8_internal).main_pass2 + call m(iadst_8x8_internal_8bpc).main_pass2 vpbroadcastd m5, [pw_2048] vpbroadcastd xm12, [pw_4096] psubw m12, m5 @@ -1814,7 +1814,7 @@ call m(idct_8x8_internal_16bpc).transpose_8x8_packed pshufd m4, m0, q1032 pshufd m5, m1, q1032 - call m(iadst_8x8_internal).main_pass2 + call m(iadst_8x8_internal_8bpc).main_pass2 vpbroadcastd m12, [pw_2048] vpbroadcastd xm5, [pw_4096] psubw m12, m5 @@ -1971,7 +1971,7 @@ jmp tx2q .pass2: call .transpose - call m(idct_8x16_internal).main + call m(idct_8x16_internal_8bpc).main vpbroadcastd m12, [pw_2048] REPX {vpermq x, x, q3120}, m0, m2, m4, m6 REPX {vpermq x, x, q2031}, m1, m3, m5, m7 @@ -2167,8 +2167,8 @@ jmp tx2q .pass2: call m(idct_8x16_internal_16bpc).transpose - call m(iadst_8x16_internal).main - call m(iadst_8x16_internal).main_pass2_end + call m(iadst_8x16_internal_8bpc).main + call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m8, [pw_2048] vpbroadcastd xm12, [pw_4096] REPX {vpermq x, x, q2031}, m0, m1, m2, m3 @@ -2232,8 +2232,8 @@ jmp tx2q .pass2: call m(idct_8x16_internal_16bpc).transpose - call m(iadst_8x16_internal).main - call m(iadst_8x16_internal).main_pass2_end + call m(iadst_8x16_internal_8bpc).main + call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m12, [pw_2048] vpbroadcastd xm13, [pw_4096] mova m11, m0 @@ -2458,7 +2458,7 @@ .pass2: call .transpose_4x16_packed lea rax, [deint_shuf+128] - call m(idct_16x4_internal).main + call m(idct_16x4_internal_8bpc).main .end: vpbroadcastd m4, [pw_2048] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 @@ -2517,7 +2517,7 @@ .pass2: call m(idct_16x4_internal_16bpc).transpose_4x16_packed lea rax, [deint_shuf+128] - call m(iadst_16x4_internal).main + call m(iadst_16x4_internal_8bpc).main jmp m(idct_16x4_internal_16bpc).end ALIGN function_align .main: @@ -2596,7 +2596,7 @@ .pass2: call m(idct_16x4_internal_16bpc).transpose_4x16_packed lea rax, [deint_shuf+128] - call m(iadst_16x4_internal).main + call 
m(iadst_16x4_internal_8bpc).main vpbroadcastd m4, [pw_2048] pmulhrsw m5, m3, m4 pmulhrsw m6, m2, m4 @@ -2712,7 +2712,7 @@ jmp tx2q .pass2: call .transpose - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main vpbroadcastd m10, [pw_2048] .end: pmulhrsw m0, m10 @@ -2827,8 +2827,8 @@ jmp tx2q .pass2: call m(idct_16x8_internal_16bpc).transpose - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end vpbroadcastd m10, [pw_2048] pxor m11, m11 psubw m11, m10 @@ -3039,8 +3039,8 @@ jmp m(iadst_16x8_internal_16bpc).pass1_end .pass2: call m(idct_16x8_internal_16bpc).transpose - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end vpbroadcastd m10, [pw_2048] pxor m11, m11 psubw m11, m10 @@ -3216,7 +3216,7 @@ call .transpose lea rax, [pw_5+128] mova [rsp], m15 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] .end: call .write_16x16 @@ -3450,8 +3450,8 @@ call m(idct_16x16_internal_16bpc).transpose lea rax, [pw_5+128] mova [rsp], m15 - call m(iadst_16x16_internal).main - call m(iadst_16x16_internal).main_pass2_end + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end mova [rsp+32*0], m8 mova [rsp+32*2], m12 mova [rsp+32*3], m13 @@ -3582,8 +3582,8 @@ call m(idct_16x16_internal_16bpc).transpose lea rax, [pw_5+128] mova [rsp], m15 - call m(iadst_16x16_internal).main - call m(iadst_16x16_internal).main_pass2_end + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end mova [rsp+32*3], m3 mova [rsp+32*2], m2 mova [rsp+32*0], m0 @@ -3740,7 +3740,7 @@ vpbroadcastd m10, [pw_2048] lea rax, [deint_shuf+128] REPX {mova x, m4}, m5, m6, m7 - call m(inv_txfm_add_dct_dct_8x32).main_fast + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast jmp .end .eob107: mova [rsp+32*3], m3 @@ -3778,7 +3778,7 @@ lea rax, [deint_shuf+128] mova m11, [rsp+32*3] ; out13 out15 vpbroadcastd m10, [pw_2048] - call m(inv_txfm_add_dct_dct_8x32).main + call m(inv_txfm_add_dct_dct_8x32_8bpc).main .end: ; [rsp+0*32] = m12 vpbroadcastd m12, [pw_2048] mov cq, r4 @@ -4294,7 +4294,7 @@ RET ALIGN function_align .pass2: - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main REPX {pmulhrsw x, m11}, m0, m1, m2, m3 call m(idct_16x8_internal_16bpc).write_16x4_start pmulhrsw m0, m11, m4 @@ -4404,7 +4404,7 @@ mova m3, [r4+32*3] .fast: lea rax, [pw_5+128] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp .idct16 @@ -4456,7 +4456,7 @@ mova m6, [r4-32*2] mova m7, [r4-32*1] lea rax, [pw_5 + 128] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf lea r3, [rsp+32*8] mova m8, [r3+32*0] mova m9, [r3+32*1] @@ -4477,7 +4477,7 @@ mova m6, [r3-32*2] mova m7, [r3-32*1] mova [rsp], m15 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq @@ -4711,7 +4711,7 @@ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] lea rax, [pw_5+128] mov r7, dstq - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main call .write_16x16 mova m0, [r5+32*3] mova m1, [r5+32*2] @@ -4750,7 +4750,7 @@ call .transpose_16x16 lea rax, [pw_5+128] mov 
r7, dstq - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main call .write_16x16 mova m0, [r5+32*3] mova m1, [r5+32*2] @@ -4764,7 +4764,7 @@ call .transpose_16x16 .end: lea dstq, [r7+32] - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main call .write_16x16 RET ALIGN function_align @@ -5124,7 +5124,7 @@ mova m13, [r3+32*51] ; 27 mova m14, [r3+32*53] ; 29 mova m15, [r3+32*55] ; 31 - jmp m(inv_txfm_add_dct_dct_16x32).main_oddhalf + jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf ALIGN function_align .pass2_evenhalf: mova m0, [r3+32* 0] ; 0 @@ -5144,7 +5144,7 @@ mova m14, [r3+32*52] ; 28 mova m15, [r3+32*54] ; 30 mova [rsp+gprsize], m15 - jmp m(idct_16x16_internal).main + jmp m(idct_16x16_internal_8bpc).main cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 8, 8, dst, stride, c, eob %undef cmp @@ -5300,7 +5300,7 @@ pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] lea r4, [rsp+32*38] mova [r4-32*4], m0 @@ -5330,7 +5330,7 @@ mova m7, [rsp+32*32] ; in30 lea r5, [r4+32*16] add r4, 32*8 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [rsp+32* 3] ; in1 mova m1, [rsp+32*33] ; in31 mova m2, [rsp+32*19] ; in17 @@ -5342,7 +5342,7 @@ lea rax, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [rsp+32* 7] ; in5 mova m1, [rsp+32*29] ; in27 mova m2, [rsp+32*23] ; in21 @@ -5354,7 +5354,7 @@ add rax, 8 add r4, 32*8 sub r5, 32*8 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 lea r8, [strideq*4] lea r9, [strideq*5] lea r3, [r9+strideq*1] ; stride*6 @@ -5449,7 +5449,7 @@ lea r2, [dstq+r7] .main_part2_pass2_loop: vpbroadcastd m14, [pw_m2896_2896] - call m(inv_txfm_add_dct_dct_16x64).main_part2_internal + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal vpbroadcastd m14, [pw_2048] IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 @@ -5648,7 +5648,7 @@ pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] lea r4, [rsp+32*70] mova [r4-32*4], m0 @@ -5678,7 +5678,7 @@ mova m7, [r10+32*56] ; in30 lea r5, [r4+32*16] add r4, 32*8 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [r10+32* 3] ; in1 mova m1, [r10+32*57] ; in31 mova m2, [r10+32*35] ; in17 @@ -5690,7 +5690,7 @@ lea rax, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [r10+32* 7] ; in5 mova m1, [r10+32*53] ; in27 mova m2, [r10+32*39] ; in21 @@ -5702,7 +5702,7 @@ add rax, 8 add r4, 32*8 sub r5, 32*8 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2 add r10, 32*8 sub r4, 32*98 ; rsp+32*16 @@ -5877,7 +5877,7 @@ mova m15, [r7+32*3] sub r7, 32*24 mova [rsp], m15 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] call m(inv_txfm_add_dct_dct_32x16_16bpc).write_16x16 add r5, 32 @@ -6109,7 +6109,7 @@ mova m13, [r7-32* 1] mova m14, [r7+32* 1] 
mova m15, [r7+32* 3] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf mova m0, [r7-32*100] mova m1, [r7-32*98] mova m2, [r7-32*96] @@ -6128,7 +6128,7 @@ mova m15, [r7+32* 2] add r7, 32*8 mova [rsp], m15 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end sub dstq, r3 lea r2, [r2+r3+32] @@ -6248,7 +6248,7 @@ pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [r4-32*4], m0 mova [r4-32*3], m1 @@ -6277,7 +6277,7 @@ mova m7, [r10+32* 2] ; in30 lea r5, [r4+32*16] add r4, 32*8 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [r10-32*99] ; in1 mova m1, [r10+32* 3] ; in31 mova m2, [r10-32*35] ; in17 @@ -6289,7 +6289,7 @@ lea rax, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [r10-32*95] ; in5 mova m1, [r10-32* 1] ; in27 mova m2, [r10-32*31] ; in21 @@ -6301,7 +6301,7 @@ add rax, 8 add r4, 32*8 sub r5, 32*8 - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2 add r10, 32*8 sub dstq, r8 diff -Nru dav1d-0.9.0/src/x86/itx16_sse.asm dav1d-0.9.1/src/x86/itx16_sse.asm --- dav1d-0.9.0/src/x86/itx16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/itx16_sse.asm 2021-07-28 21:38:28.905852000 +0000 @@ -0,0 +1,2345 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; Copyright © 2017-2021, The rav1e contributors +; Copyright © 2020, Nathan Egge +; Copyright © 2021, Matthias Dressel +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA +%macro COEF 1 +pd_%1: times 4 dd %1 +%endmacro + +COEF 201 +COEF 401 +COEF 601 +COEF 799 +COEF 995 +COEF 1189 +COEF 1380 +COEF 1567 +COEF 1751 +COEF 1931 +COEF 2106 +COEF 2276 +COEF 2440 +COEF 2598 +COEF 2751 +COEF 2896 +COEF 3035 +COEF 3166 +COEF 3290 +COEF 3406 +COEF 3513 +COEF 3612 +COEF 3703 +COEF 3784 +COEF 3857 +COEF 3920 +COEF 3973 +COEF 4017 +COEF 4052 +COEF 4076 +COEF 4091 + +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +pd_1321: times 4 dd 1321 +pd_2482: times 4 dd 2482 +pd_m3344: times 4 dd -3344 +pd_2048: times 4 dd 2048 +pw_4x2048_4xm2048: times 4 dw 2048 + times 4 dw -2048 +pw_4xm2048_4x2048: times 4 dw -2048 + times 4 dw 2048 +pw_2048: times 8 dw 2048 +pw_m2048: times 8 dw -2048 +pd_3803: times 4 dd 3803 +pw_4096: times 8 dw 4096 +pd_5793: times 4 dd 5793 +pd_6144: times 4 dd 6144 +pw_1697x8: times 8 dw 1697*8 +pw_2896x8: times 8 dw 2896*8 +pw_1697x16: times 8 dw 1697*16 +pw_16384: times 8 dw 16384 +pixel_10bpc_max: times 8 dw 0x03ff + +pw_1567_3784: times 4 dw 1567, 3784 +pw_m3784_1567: times 4 dw -3784, 1567 + +clip_min: times 4 dd -0x20000 +clip_max: times 4 dd 0x1ffff + +cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3 +cextern iadst_4x4_internal_8bpc_ssse3.main +cextern idct_4x8_internal_8bpc_ssse3.main +cextern iadst_4x8_internal_8bpc_ssse3.main +cextern idct_16x4_internal_8bpc_ssse3.main +cextern iadst_16x4_internal_8bpc_ssse3.main +cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end +cextern idct_8x4_internal_8bpc_ssse3.main +cextern iadst_8x4_internal_8bpc_ssse3.main +cextern idct_8x8_internal_8bpc_ssse3.main +cextern idct_8x8_internal_8bpc_ssse3.pass1_end3 +cextern iadst_8x8_internal_8bpc_ssse3.main +cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end +cextern idct_16x8_internal_8bpc_ssse3.main +cextern iadst_16x8_internal_8bpc_ssse3.main +cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end + +tbl_4x16_2d: db 0, 13, 29, 45 +tbl_4x16_h: db 0, 16, 32, 48 +tbl_4x16_v: db 0, 4, 8, 12 + +tbl_8x16_2d: db 0, 14, 30, 46 +tbl_8x16_v: db 0, 4, 8, 12 +tbl_8x16_h: db 0, 32, 64, 96 + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx) +%define m(x) m_suffix(x, SUFFIX) + +; This refers to the first function in itx_sse i.e. the start of the text section +; which is needed as a base pointer for constants. 
+%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3) + +%if ARCH_X86_64 +%define o(x) x +%else +%define o(x) r6-$$+x ; PIC +%endif + +%macro IWHT4_1D 0 + ; m0 = in0, m1 = in1, m2 = in2, m3 = in3 + paddd m0, m1 ; in0 += in1 + psubd m4, m2, m3 ; tmp0 = in2 - in3 + psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1 + psrad m5, 1 + psubd m2, m5, m1 ; in2 = tmp1 - in1 + psubd m5, m3 ; in1 = tmp1 - in3 + psubd m0, m5 ; in0 -= in1 + paddd m4, m2 ; in3 = tmp0 + in2 + ; m0 = out0, m1 = in1, m2 = out2, m3 = in3 + ; m4 = out3, m5 = out1 +%endmacro + +INIT_XMM sse2 +cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax + mova m0, [cq+16*0] + mova m1, [cq+16*1] + mova m2, [cq+16*2] + mova m3, [cq+16*3] + REPX {psrad x, 2}, m0, m1, m2, m3 + IWHT4_1D + punpckldq m1, m0, m5 + punpckhdq m3, m0, m5 + punpckldq m5, m2, m4 + punpckhdq m2, m4 + punpcklqdq m0, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m4, m3, m2 + punpckhqdq m3, m2 + mova m2, m4 + IWHT4_1D + packssdw m0, m4 ; low: out3, high: out0 + packssdw m2, m5 ; low: out2, high: out1 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + lea r2, [dstq+strideq*2] + movq m1, [dstq+strideq*0] + movhps m1, [r2 +strideq*1] + movq m3, [r2 +strideq*0] + movhps m3, [dstq+strideq*1] + movd m5, bdmaxm + pshuflw m5, m5, q0000 ; broadcast + punpcklqdq m5, m5 ; broadcast + paddsw m0, m1 + paddsw m2, m3 + pmaxsw m0, m4 + pmaxsw m2, m4 + pminsw m0, m5 + pminsw m2, m5 + movhps [r2 +strideq*1], m0 ; write out0 + movhps [dstq+strideq*1], m2 ; write out1 + movq [r2 +strideq*0], m2 ; write out2 + movq [dstq+strideq*0], m0 ; write out3 + RET + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 2 = inv_dst1, 4 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +; %1 dst/src[1] +; %2 dst/src[2] +; %3 tmp[1] +; %4 tmp[2] +; %5 tmp[3] +; %6 rnd +; %7 coef[1] +; %8 coef[2] +; %9 flags +%ifnidn %7,%8 ; optimize when coef1 == coef2 +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else + mova m%3, [o(pd_%8)] + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else + mova m%5, [o(pd_%7)] + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 4 ; invert dst2 + paddd m%4, m%2 + psubd m%2, m%6, m%4 +%else +%ifnum %6 +%ifnidn %7,%8 + paddd m%4, m%6 +%else + paddd m%1, m%6 +%endif +%endif +%ifnidn %7,%8 + paddd m%2, m%4 +%else + mova m%3, m%2 + paddd m%2, m%1 +%endif +%endif +%if %9 & 2 ; invert dst1 + psubd m%3, m%1 + paddd m%1, m%3, m%6 +%else +%ifnum %6 +%ifnidn %7,%8 + paddd m%1, m%6 +%endif +%endif + psubd m%1, m%3 +%endif +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack +cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_16bpc) +%if ARCH_X86_32 + LEA r6, $$ +%endif +%if has_epilogue +%ifidn %1_%2, dct_dct + test eobd, eobd + jz %%end +%endif + lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] +%ifnum %3 +%if %3 + add eobd, %3 +%endif +%else + lea r5, [o(%3)] +%endif + call %%p1 + RET +%%end: +%else + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] +%ifnum %3 +%if %3 + add eobd, %3 +%endif +%else + lea r5, [o(%3)] +%endif +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x4 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + movd m1, [o(pw_2896x8)] + mov [cq], eobd ; 0 + add r5d, 2048 + sar r5d, 12 + movd m0, r5d + packssdw m0, m0 + pmulhrsw m0, m1 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova m1, m0 + TAIL_CALL m(iadst_4x4_internal_16bpc).end +%endif +%endmacro + +%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd + ; butterfly rotation + ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0 + ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3 + ; Hadamard rotation + psubd m%5, m%1, m%2 + paddd m%2, m%1 + paddd m%1, m%3, m%4 + psubd m%3, m%4 + ; %1 (src1) = out0 + ; %2 (src2) = out1 + ; %3 (src3) = out3 + ; $5 (tmp1) = out2 +%endmacro + +INIT_XMM sse4 + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, identity +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst + +cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + mova m2, [cq+16*2] + mova m3, [cq+16*3] + mova m5, [o(pd_2048)] + call .pass1_main + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + ; transpose + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass1_main: + IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5 + ret +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m5 = pd_2048 + mova m4, [o(pw_m3784_1567)] + punpckhwd m2, m1, m0 + psubw m3, m0, m1 + paddw m0, m1 + punpcklqdq m0, m3 + pmaddwd m4, m2 + pmaddwd m2, [o(pw_1567_3784)] + pmulhrsw m0, [o(pw_2896x8)] ; t0 t1 + paddd m4, m5 + paddd m2, m5 + psrad m4, 12 + psrad m2, 12 + packssdw m2, m4 ; t3 t2 + psubsw m1, m0, m2 ; tmp3 tmp2 + paddsw m0, m2 ; tmp0 tmp1 + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + movq m3, [r5 +strideq*1] + movhps m3, [r5 +strideq*0] + mova m5, [o(pixel_10bpc_max)] + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movhps [r5 +strideq*0], m1 + movq [r5 +strideq*1], m1 + RET + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .main + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + ; transpose + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main +.end: + mova m4, [o(pw_2048)] + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + movq m3, [r5 +strideq*0] + movhps m3, [r5 +strideq*1] + mova m5, [o(pixel_10bpc_max)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], 
m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [r5 +strideq*0], m1 + movhps [r5 +strideq*1], m1 + RET +ALIGN function_align +.main: + mova m1, [cq+16*2] + mova m3, [cq+16*3] + mova m5, [cq+16*0] + lea r3, [cq+16*1] +.main2: + mova m0, [o(pd_1321)] ; SINPI_1_9 + mova m2, [o(pd_2482)] ; SINPI_2_9 + mova m6, [o(pd_3803)] ; SINPI_4_9 + pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2] + pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3] + pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2] + pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0] + psubd m1, m3 ; T[2] - T[3] + pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3] + pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0] + paddd m0, m6 ; s[0] += s[3] + paddd m0, m3 ; s[0] += s[5] + mova m3, [o(pd_m3344)] ; -SINPI_3_9 + psubd m2, m4 ; s[1] -= s[4] + psubd m2, m7 ; s[1] -= s[6] + psubd m1, m5 ; -b7 = (T[2] -T[3]) - T[0] + pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7 + pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048 + paddd m4, m0, m2 ; x[3] = s[0] + s[1] + psubd m2, m3 ; x[1] = s[1] + s[3] + psubd m0, m3 ; x[0] = s[0] + s[3] + paddd m4, m3 ; x[3] -= s[3] + paddd m2, m5 ; x[1] + 2048 + REPX {psrad x, 12}, m0, m2, m1, m4 + ret + + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_16bpc).main + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + ; transpose + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main + mova m4, [o(pw_2048)] + movq m3, [dstq+strideq*1] + movhps m3, [dstq+strideq*0] + lea r5, [dstq+strideq*2] + movq m2, [r5 +strideq*1] + movhps m2, [r5 +strideq*0] + mova m5, [o(pixel_10bpc_max)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movhps [dstq+strideq*0], m1 + movq [dstq+strideq*1], m1 + movhps [r5 +strideq*0], m0 + movq [r5 +strideq*1], m0 + RET + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m3, [o(pd_5793)] + pmulld m0, m3, [cq+16*0] + pmulld m1, m3, [cq+16*1] + pmulld m2, m3, [cq+16*2] + pmulld m3, [cq+16*3] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + ; transpose + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m5 = pd_2048 + mova m4, [o(pw_1697x8)] + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + pmulhrsw m3, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m3 + paddsw m1, m4 + movq m3, [r5 +strideq*0] + movhps m3, [r5 +strideq*1] + mova m4, [o(pixel_10bpc_max)] + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pmulhrsw 
m1, m5 + pxor m5, m5 + mova [cq+16*0], m5 + mova [cq+16*1], m5 + mova [cq+16*2], m5 + mova [cq+16*3], m5 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m5 + pmaxsw m1, m5 + pminsw m0, m4 + pminsw m1, m4 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [r5 +strideq*0], m1 + movhps [r5 +strideq*1], m1 + RET + +%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2 + INV_TXFM_FN %1, %2, %3, 4x8 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 2 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + add r5d, 2048 + sar r5d, 12 +.end: + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + pxor m4, m4 + mova m3, [o(pixel_10bpc_max)] + lea r2, [strideq*3] +.loop: + movq m1, [dstq+strideq*0] + movq m2, [dstq+strideq*2] + movhps m1, [dstq+strideq*1] + movhps m2, [dstq+r2] + paddw m1, m0 + paddw m2, m0 + REPX {pminsw x, m3}, m1, m2 + REPX {pmaxsw x, m4}, m1, m2 + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + movq [dstq+strideq*2], m2 + movhps [dstq+r2 ], m2 + lea dstq, [dstq+strideq*4] + dec r3d + jg .loop + RET +%endif +%endmacro + +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, identity, 9 +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp + mova m5, [o(pd_2048)] +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 13 + setge r5b +%else + mov r5d, 1 + cmp eobd, 13 + sbb r5d, 0 +%endif + shl r5d, 4 +.loop_pass1: + mova m3, [o(pd_2896)] + pmulld m0, m3, [cq+32*0+r5] + pmulld m1, m3, [cq+32*1+r5] + pmulld m2, m3, [cq+32*2+r5] + pmulld m3, [cq+32*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + call m(idct_4x4_internal_16bpc).pass1_main + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*0+16], m0 + mova [cq+32*1+16], m4 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova m2, [cq+32*0+16] + mova m6, [cq+32*1+16] + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_4x8_internal_8bpc, _ssse3).main + ; m0-3 is now out0/1,3/2,4/5,7/6 + mova m4, [o(pw_2048)] + shufps m1, m1, q1032 + shufps m3, m3, q1032 +.end: + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + mova m7, [o(pixel_10bpc_max)] + lea r2, [strideq*3] + movq m5, [dstq+strideq*0] + movq m6, [dstq+strideq*2] + movhps m5, [dstq+strideq*1] + movhps m6, [dstq+r2] + lea r4, [dstq+strideq*4] + paddw m0, m5 + paddw m1, m6 + movq m5, [r4+strideq*0] + movq m6, [r4+strideq*2] + movhps m5, [r4+strideq*1] + movhps m6, [r4+r2] + paddw m2, m5 + paddw m3, m6 + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r2 ], m1 + movq [r4 +strideq*0], m2 + movhps [r4 +strideq*1], m2 + movq [r4 +strideq*2], m3 + movhps [r4 +r2 ], m3 + RET + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity, 9 + +cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .pass1_main + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova m2, [cq+32*2+16] + mova m6, [cq+32*3+16] + punpckhwd m4, m2, m6 + punpcklwd 
m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass1_main: +%undef cmp +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 13 + setge r5b +%else + mov r5d, 1 + cmp eobd, 13 + sbb r5d, 0 +%endif + shl r5d, 4 + lea r3, [cq+32*1+16] +.loop_pass1: + mova m0, [o(pd_2048)] + mova m3, [o(pd_2896)] + pmulld m5, m3, [cq+32*0+r5] + pmulld m2, m3, [cq+32*1+r5] + pmulld m1, m3, [cq+32*2+r5] + pmulld m3, [cq+32*3+r5] + REPX {paddd x, m0}, m5, m2, m1, m3 + REPX {psrad x, 12}, m5, m2, m1, m3 + mova [r3], m2 + call m(iadst_4x4_internal_16bpc).main2 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*2+16], m0 + mova [cq+32*3+16], m1 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + ret +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main + mova m4, [o(pw_4x2048_4xm2048)] + jmp m(idct_4x8_internal_16bpc).end + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity, 9 + +cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_16bpc).pass1_main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + mova m6, [cq+32*2+16] + mova m2, [cq+32*3+16] + punpcklwd m4, m2, m6 + punpckhwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main + mova m4, m0 + mova m5, m1 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + pshufd m2, m5, q1032 + pshufd m3, m4, q1032 + mova m4, [o(pw_4xm2048_4x2048)] + jmp m(idct_4x8_internal_16bpc).end + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity, 3 + +cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp + mova m5, [o(pd_2048)] + mova m4, [o(pd_2896)] + mova m6, [o(pd_5793)] + ; clear m7 in case we skip the bottom square + pxor m7, m7 +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 16 + setge r5b +%else + mov r5d, 1 + cmp eobd, 16 + sbb r5d, 0 +%endif + shl r5d, 4 +.loop_pass1: + pmulld m0, m4, [cq+32*0+r5] + pmulld m1, m4, [cq+32*1+r5] + pmulld m2, m4, [cq+32*2+r5] + pmulld m3, m4, [cq+32*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*0+16], m0 + mova m7, m2 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + punpckhwd m4, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m4 + punpcklwd m0, m4 + mova m2, [cq+32*0+16] + punpckhwd m4, m2, m7 + punpcklwd m2, m7 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: + mova m4, [o(pw_4096)] + jmp m(idct_4x8_internal_16bpc).end + +%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2 + INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 4 + add r5d, 6144 + sar r5d, 13 + jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity, v +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal 
idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif + mova m5, [o(pd_2048)] +.loop_pass1: + mova m0, [cq+64*0+r5] + mova m1, [cq+64*1+r5] + mova m2, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(idct_4x4_internal_16bpc).pass1_main + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m1, m4, m2 + REPX {psrad x, 1}, m0, m1, m4, m2 + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + test r5d, r5d + jz .end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.end_pass1: + mova m2, [cq+64*0+16] + mova m3, [cq+64*1+16] + mova m4, [cq+64*0+32] + mova m5, [cq+64*1+32] + mova m6, [cq+64*0+48] + mova m7, [cq+64*1+48] + ; m0-7 = packed & transposed output + jmp tx2q +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_16x4_internal_8bpc, _ssse3).main + ; m0-6 is out0-13 [with odd registers having inversed output] + ; [coeffq+16*7] has out15/14 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [cq+16*7] + REPX {shufps x, x, q1032}, m1, m3, m5, m7 + mova [cq+16*0], m4 + mova [cq+16*1], m5 + mova [cq+16*2], m6 + mova [cq+16*3], m7 +.end: + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + mova m7, [o(pixel_10bpc_max)] + mov r5d, 2 + lea r3, [strideq*3] +.loop: + movq m5, [dstq+strideq*0] + movq m6, [dstq+strideq*2] + movhps m5, [dstq+strideq*1] + movhps m6, [dstq+r3] + lea r4, [dstq+strideq*4] + paddw m0, m5 + paddw m1, m6 + movq m5, [r4+strideq*0] + movq m6, [r4+strideq*2] + movhps m5, [r4+strideq*1] + movhps m6, [r4+r3] + paddw m2, m5 + paddw m3, m6 + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r3 ], m1 + movq [r4 +strideq*0], m2 + movhps [r4 +strideq*1], m2 + movq [r4 +strideq*2], m3 + movhps [r4 +r3 ], m3 + dec r5d + jz .end2 + lea dstq, [dstq+strideq*8] + mova m0, [cq+0*16] + mova m1, [cq+1*16] + mova m2, [cq+2*16] + mova m3, [cq+3*16] + REPX {mova [cq+x*16], m4}, 0, 1, 2, 3 + jmp .loop +.end2: + RET + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity, v + +cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r6+r5] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif +.loop_pass1: + mova m5, [cq+64*0+r5] + lea r3, [cq+64*1+r5] + mova m1, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(iadst_4x4_internal_16bpc).main2 + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m2, m1, m4 + REPX {psrad x, 1}, m0, m2, m1, m4 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end + ; m7/5/2/4 = 
out4/-11,-5/10,6/-9,-7/8 + ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13 + mova m1, [o(pw_4x2048_4xm2048)] + REPX {pmulhrsw x, m1}, m7, m2, m0 + pshufd m6, m1, q1032 ; 4x-2048,4x2048 + pmulhrsw m1, [cq+16*7] + REPX {pmulhrsw x, m6}, m5, m4, m3 + pmulhrsw m6, [cq+16*6] + ; m7/5/2/4 = out4/11,5/10,6/9,7/8 + ; m0/3/6/1 = out0/15,3/12,1/14,2/13 + ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 + movhps [cq+0*8], m4 + movhps [cq+1*8], m2 + movhps [cq+2*8], m5 + movhps [cq+3*8], m7 + movhps [cq+4*8], m3 + movhps [cq+5*8], m1 + movhps [cq+6*8], m6 + movhps [cq+7*8], m0 + punpcklqdq m0, m6 + punpcklqdq m1, m3 + punpcklqdq m3, m2, m4 + punpcklqdq m2, m7, m5 + jmp m(idct_4x16_internal_16bpc).end + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity, v + +cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif +.loop_pass1: + mova m5, [cq+64*0+r5] + lea r3, [cq+64*1+r5] + mova m1, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(iadst_4x4_internal_16bpc).main2 + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m2, m1, m4 + REPX {psrad x, 1}, m0, m2, m1, m4 + packssdw m0, m2 ; out3 out2 + packssdw m1, m4 ; out1 out0 + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end + ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7 + ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2 + mova m1, [o(pw_4x2048_4xm2048)] + REPX {pmulhrsw x, m1}, m7, m2, m0 + pshufd m6, m1, q1032 ; 4x-2048,4x2048 + pmulhrsw m1, [cq+16*7] + REPX {pmulhrsw x, m6}, m5, m4, m3 + pmulhrsw m6, [cq+16*6] + ; m7/5/2/4 = out11/4,10/5,9/6,8/7 + ; m0/3/6/1 = out15/0,12/3,14/1,13/2 + ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 + movq [cq+0*8], m4 + movq [cq+1*8], m2 + movq [cq+2*8], m5 + movq [cq+3*8], m7 + movq [cq+4*8], m3 + movq [cq+5*8], m1 + movq [cq+6*8], m6 + movq [cq+7*8], m0 + punpckhqdq m0, m6 + punpckhqdq m1, m3 + punpckhqdq m3, m2, m4 + punpckhqdq m2, m7, m5 + jmp m(idct_4x16_internal_16bpc).end + +INV_TXFM_4X16_FN identity, dct, h +INV_TXFM_4X16_FN identity, adst, h +INV_TXFM_4X16_FN identity, flipadst, h +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif + mova m5, [o(pd_6144)] + mova m4, [o(pd_5793)] +.loop_pass1: + pmulld m0, m4, [cq+64*0+r5] + pmulld m1, m4, [cq+64*1+r5] + pmulld m2, m4, [cq+64*2+r5] + pmulld m3, m4, [cq+64*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: + mova [cq+16*4], m0 + mova 
[cq+16*5], m1 + mova [cq+16*6], m2 + mova [cq+16*7], m7 + mova m0, [o(pw_1697x16)] + mova m7, [o(pw_2048)] + pmulhrsw m1, m0, m4 + pmulhrsw m2, m0, m5 + REPX {paddsw x, x}, m4, m5 + paddsw m4, m1 + paddsw m5, m2 + REPX {pmulhrsw x, m7}, m4, m5 + mova [cq+16*0], m4 + mova [cq+16*1], m5 + mova m4, [cq+16*7] + pmulhrsw m1, m0, m6 + pmulhrsw m2, m0, m4 + REPX {paddsw x, x}, m6, m4 + paddsw m6, m1 + paddsw m4, m2 + REPX {pmulhrsw x, m7}, m6, m4 + mova [cq+16*2], m6 + mova [cq+16*3], m4 + mova m4, [cq+16*4] + mova m1, [cq+16*5] + mova m2, [cq+16*6] + pmulhrsw m5, m0, m2 + pmulhrsw m6, m0, m3 + REPX {paddsw x, x}, m2, m3 + paddsw m2, m5 + paddsw m3, m6 + pmulhrsw m6, m0, m1 + pmulhrsw m0, m4 + REPX {paddsw x, x}, m1, m4 + paddsw m1, m6 + paddsw m0, m4 + REPX {pmulhrsw x, m7}, m2, m3, m1, m0 + jmp m(idct_4x16_internal_16bpc).end + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, 0, 8x4, 14 +%else + INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 + lea r2, [strideq*3] + mova m1, [dstq+strideq*0] + mova m2, [dstq+strideq*1] + mova m3, [dstq+strideq*2] + mova m4, [dstq+r2] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + mova [dstq+strideq*2], m3 + mova [dstq+r2 ], m4 + RET +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, identity +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst + +cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .load +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif + call .main_pass1 + call .round +.pack_transpose: + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 +.transpose: + ; transpose + punpckhwd m5, m0, m4 + punpcklwd m0, m4 + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m7, m5, m4 + punpcklwd m5, m4 + + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + punpcklwd m2, m3, m7 + punpckhwd m3, m7 + ; m0-3 = packed & transposed output + jmp tx2q +.load: + mova m7, [o(pd_2896)] + pmulld m0, m7, [cq+0*16] + pmulld m1, m7, [cq+1*16] + pmulld m2, m7, [cq+2*16] + pmulld m3, m7, [cq+3*16] + pmulld m4, m7, [cq+4*16] + pmulld m5, m7, [cq+5*16] + pmulld m6, m7, [cq+6*16] + pmulld m7, [cq+7*16] +%if ARCH_X86_64 + mova m8, [o(pd_2048)] + REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [cq+0*16], m7 + mova m7, [o(pd_2048)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [cq+0*16] +%endif + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + ret +.main_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_min)] + mova m13, [o(clip_max)] + ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m9, m7, m3 ; t7 + psubd m7, m3 ; t6a + mova m3, [o(pd_2896)] + REPX {pmaxsd x, m12}, m1, m8, m7, m9 + REPX {pminsd x, m13}, m1, m8, m7, m9 + REPX {pmulld x, m3 }, m0, m4, m7, m1 + paddd m0, m11 + paddd m7, m11 + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, 
m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + REPX {pmaxsd x, m12}, m0, m6, m5, m3 + REPX {pminsd x, m13}, m0, m6, m5, m3 + ret +.round: + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 +%else + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m4 + mova [r3+3*16], m6 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a + paddd m2, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m4, m7, m3 ; t7 + psubd m7, m3 ; t6a + mova m6, [o(clip_min)] + REPX {pmaxsd x, m6 }, m1, m2, m7, m4 + mova m6, [o(clip_max)] + REPX {pminsd x, m6 }, m1, m2, m7, m4 + mova m6, [r3+3*16] + mova [r3+3*16], m2 + mova m2, [r3+1*16] + mova [r3+1*16], m4 + + ITX_MULSUB_2D 2, 6, 4, 3, 5, 0, 1567, 3784 ; t2 t3 + mova m3, [o(pd_2896)] + mova m5, [r3+0*16] + mova m4, [r3+2*16] + REPX {pmulld x, m3 }, m5, m4, m7, m1 + paddd m7, m0 + paddd m0, m5 + + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + + mova m1, [o(clip_min)] + REPX {pmaxsd x, m1 }, m0, m6, m5, m3 + mova m1, [o(clip_max)] + REPX {pminsd x, m1 }, m0, m6, m5, m3 + ret +.round: + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + mova [r3+0*16], m6 + mova m6, [r3+1*16] + psubd m7, m0, m6 ; out7 + paddd m0, m6 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + mova m6, [r3+3*16] + psubd m4, m3, m6 ; out4 + paddd m3, m6 ; out3 + mova m6, [r3+0*16] +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_8x4_internal_8bpc, _ssse3).main +.end: + lea r3, [strideq*3] +.end2: + ; output is in m0-3 + mova m4, [o(pw_2048)] +.end3: + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + mova m7, [o(pixel_10bpc_max)] + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3] + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + RET + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(idct_8x4_internal_16bpc).load +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif + call .main_pass1 + call .round + jmp m(idct_8x4_internal_16bpc).pack_transpose +.main_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_min)] + mova m13, [o(clip_max)] + + ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a + psubd m8, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + psubd m4, m5, m1 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7 + REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7 + ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a + psubd m9, m6, m8 ; t7 + paddd m6, m8 ; out6 + mova m8, [o(pd_2896)] + psubd 
m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m2 ; t2 + paddd m0, m2 ; out0 + psubd m2, m1, m4 ; t6 + paddd m1, m4 ; -out1 + REPX {pmaxsd x, m12}, m5, m3, m2, m9 + REPX {pminsd x, m13}, m5, m3, m2, m9 + REPX {pmulld x, m8 }, m5, m3, m2, m9 + psubd m4, m5, m3 ; (t2 - t3) * 2896 + paddd m3, m5 ; (t2 + t3) * 2896 + psubd m5, m2, m9 ; (t6 - t7) * 2896 + paddd m2, m9 ; (t6 + t7) * 2896 + ret +.round: + + ; m0=out0,m1=-out1,m6=out6,m7=-out7 + + pcmpeqd m8, m8 + REPX {pxor x, m8 }, m1, m7, m3, m5 + REPX {psubd x, m8 }, m1, m7 + REPX {paddd x, m11}, m2, m3, m4, m5 + REPX {psrad x, 12 }, m2, m3, m4, m5 +%else + mova [r3+0*16], m2 + mova [r3+1*16], m3 + mova [r3+2*16], m4 + mova [r3+3*16], m5 + mova m5, [o(pd_2048)] + + ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a + mova m2, [r3+0*16] + mova m3, [r3+1*16] + mova m4, [r3+2*16] + mova [r3+0*16], m0 + mova [r3+1*16], m1 + mova [r3+2*16], m6 + mova m1, [r3+3*16] + mova [r3+3*16], m7 + ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a + mova m0, [r3+0*16] + mova m6, [r3+2*16] + psubd m7, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + mova [r3+0*16], m7 + mova m5, [r3+1*16] + mova m7, [r3+3*16] + psubd m4, m1, m5 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + mova m3, [o(clip_min)] + REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7 + mova [r3+1*16], m7 + mova m7, [o(clip_max)] + pmaxsd m3, [r3+0*16] + REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5 + pminsd m7, [r3+1*16] + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m5 + mova [r3+3*16], m7 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a + mova m5, [r3+2*16] + mova m7, [r3+3*16] + psubd m2, m6, m3 ; t7 + paddd m6, m3 ; out6 + mova [r3+3*16], m6 + mova m0, [r3+0*16] + mova m6, [r3+1*16] + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m6 ; t2 + paddd m0, m6 ; out0 + psubd m6, m1, m4 ; t6 + paddd m1, m4 ; -out1 + mova m4, [o(clip_min)] + REPX {pmaxsd x, m4 }, m5, m3, m6, m2 + mova m4, [o(clip_max)] + REPX {pminsd x, m4 }, m5, m3, m6, m2 + mova m4, [o(pd_2896)] + REPX {pmulld x, m4 }, m5, m3, m6, m2 + psubd m4, m5, m3 ; (t2 - t3) * 2896 + paddd m3, m5 ; (t2 + t3) * 2896 + psubd m5, m6, m2 ; (t6 - t7) * 2896 + paddd m2, m6 ; (t6 + t7) * 2896 + ret +.round: + mova [r3+2*16], m0 + + pcmpeqd m0, m0 + mova m6, [o(pd_2048)] + REPX {pxor x, m0 }, m1, m7, m3, m5 + REPX {psubd x, m0 }, m1, m7 + REPX {paddd x, m6 }, m2, m3, m4, m5 + REPX {psrad x, 12 }, m2, m3, m4, m5 + + mova m6, [r3+3*16] + mova m0, [r3+2*16] +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main + jmp m(idct_8x4_internal_16bpc).end + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(idct_8x4_internal_16bpc).load +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif + call m(iadst_8x4_internal_16bpc).main_pass1 + call m(iadst_8x4_internal_16bpc).round + packssdw m7, m6 + packssdw m5, m4 + packssdw m3, m2 + packssdw m1, m0 + mova m0, m7 + mova m2, m5 + mova m4, m3 + mova m6, m1 + jmp m(idct_8x4_internal_16bpc).transpose +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x4_internal_8bpc, 
_ssse3).main + lea r3, [strideq*3] + add dstq, r3 + neg strideq + neg r3 + jmp m(idct_8x4_internal_16bpc).end2 + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(idct_8x4_internal_16bpc).load + REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp m(idct_8x4_internal_16bpc).pack_transpose +.pass2: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct_8x4_internal_16bpc).end + +%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, %3, 8x8, 14, 0-3*16 +%else + INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 2 +.end: + add r5d, 6144 + sar r5d, 13 + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 + lea r2, [strideq*3] +.loop: + mova m1, [dstq+strideq*0] + mova m2, [dstq+strideq*1] + mova m3, [dstq+strideq*2] + mova m4, [dstq+r2] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + mova [dstq+strideq*2], m3 + mova [dstq+r2 ], m4 + lea dstq, [dstq+strideq*4] + dec r3d + jg .loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity, 6 +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + DECLARE_REG_TMP 1 + mov [rsp+4*16+1*gprsize], r1 +%else + DECLARE_REG_TMP 6 +%endif + lea t0, [o(.pass1_main)] + +.pass1_full: +%undef cmp +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 10 + setge r5b +%else + mov r5d, 1 + cmp eobd, 10 + sbb r5d, 0 +%endif + shl r5d, 4 +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif +.loop_pass1: + mova m0, [cq+0*32+r5] + mova m1, [cq+1*32+r5] + mova m2, [cq+2*32+r5] + mova m3, [cq+3*32+r5] + mova m4, [cq+4*32+r5] + mova m5, [cq+5*32+r5] + mova m6, [cq+6*32+r5] + mova m7, [cq+7*32+r5] + call t0 + + test r5d, r5d + jz .end_pass1 + + mova [cq+0*32+16], m0 + mova [cq+1*32+16], m1 + mova [cq+2*32+16], m2 + mova [cq+3*32+16], m3 + + sub r5d, 16 + jmp .loop_pass1 +.end_pass1: + mova m4, [cq+0*32+16] + mova m5, [cq+1*32+16] + mova m6, [cq+2*32+16] + mova m7, [cq+3*32+16] +%if ARCH_X86_32 + mov r1, [rsp+4*16+1*gprsize] +%endif + jmp tx2q +.pass1_main: + call m(idct_8x4_internal_16bpc).main_pass1 + pcmpeqd m1, m1 + REPX {psubd x, m1}, m0, m6, m5, m3 + call m(idct_8x4_internal_16bpc).round + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 +.pack_and_transpose: + packssdw m2, m3 + packssdw m6, m7 + packssdw m0, m1 + packssdw m4, m5 +.transpose: + punpcklwd m7, m2, m6 + punpckhwd m2, m6 + punpckhwd m5, m0, m4 + punpcklwd m0, m4 + + punpckhwd m4, m5, m2 + punpcklwd m5, m2 + punpckhwd m2, m0, m7 + punpcklwd m0, m7 + + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize+0*16] +.end: + lea r3, [strideq*3] +%if ARCH_X86_64 +%define mzero m8 +%define mlim m11 +%else + mova [rsp+0*16+gprsize], m6 + mova 
[rsp+1*16+gprsize], m7 +%define mzero m6 +%define mlim m7 +%endif + pxor mzero, mzero + REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + mova mlim, [o(pixel_10bpc_max)] + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3] + REPX {pminsw x, mlim }, m0, m1, m2, m3 + REPX {pmaxsw x, mzero}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + lea dstq, [dstq+strideq*4] +%if ARCH_X86_32 + SWAP 2, 6 + SWAP 3, 7 + mova m6, [rsp+0*16+gprsize] + mova m7, [rsp+1*16+gprsize] +%define mzero m2 +%define mlim m3 +%endif + paddw m4, [dstq+strideq*0] + paddw m5, [dstq+strideq*1] + paddw m6, [dstq+strideq*2] + paddw m7, [dstq+r3] + REPX {pminsw x, mlim }, m4, m5, m6, m7 + REPX {pmaxsw x, mzero}, m4, m5, m6, m7 + mova [dstq+strideq*0], m4 + mova [dstq+strideq*1], m5 + mova [dstq+strideq*2], m6 + mova [dstq+r3 ], m7 +%undef mzero +%undef mlim + RET + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity, 6 + +cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + mov [rsp+4*16+1*gprsize], r1 +%endif + lea t0, [o(.pass1_main)] + jmp m(idct_8x8_internal_16bpc).pass1_full +.pass1_main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call .round + jmp m(idct_8x8_internal_16bpc).pack_and_transpose +.round: +%if ARCH_X86_64 + pcmpeqd m8, m8 ; -1 + mova m11, [o(pd_6144)] + REPX {psubd x, m8 }, m0, m6 + REPX {pxor x, m8 }, m1, m7, m3, m5 + REPX {psrad x, 1 }, m0, m1, m6, m7 + REPX {psubd x, m8 }, m1, m7 + REPX {paddd x, m11}, m2, m3, m4, m5 + REPX {psrad x, 13 }, m2, m3, m4, m5 +%else + mova [r3+2*16], m0 + + pcmpeqd m0, m0 ; -1 + mova m6, [o(pd_6144)] + REPX {pxor x, m0 }, m1, m7, m3, m5 + REPX {psrad x, 1 }, m1, m7 + REPX {psubd x, m0 }, m1, m7 + REPX {paddd x, m6 }, m2, m3, m4, m5 + REPX {psrad x, 13 }, m2, m3, m4, m5 + + mova m0, [r3+2*16] + psrld m6, 12 ; +1 + paddd m0, m6 + paddd m6, [r3+3*16] + REPX {psrad x, 1 }, m0, m6 +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova m7, [o(pw_m2048)] + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + jmp m(idct_8x8_internal_16bpc).end + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity, 6 + +cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + mov [rsp+4*16+1*gprsize], r1 +%endif + lea t0, [o(.pass1_main)] + jmp m(idct_8x8_internal_16bpc).pass1_full +.pass1_main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call m(iadst_8x8_internal_16bpc).round + ; invert registers + packssdw m7, m6 + packssdw m5, m4 + packssdw m3, m2 + packssdw m1, m0 + mova m0, m7 + mova m2, m5 + mova m4, m3 + mova m6, m1 + jmp m(idct_8x8_internal_16bpc).transpose + +.pass2: + lea dstq, [dstq+strideq*8] + sub dstq, strideq + neg strideq + jmp m(iadst_8x8_internal_16bpc).pass2 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m0, [cq+0*32] + mova m1, [cq+1*32] + mova m2, [cq+2*32] + mova m3, [cq+3*32] + mova m4, [cq+4*32] + 
mova m5, [cq+5*32] + mova m6, [cq+6*32] + mova m7, [cq+7*32] + packssdw m0, [cq+0*32+16] + packssdw m1, [cq+1*32+16] + packssdw m2, [cq+2*32+16] + packssdw m3, [cq+3*32+16] + packssdw m4, [cq+4*32+16] + packssdw m5, [cq+5*32+16] + packssdw m6, [cq+6*32+16] + packssdw m7, [cq+7*32+16] + mova [rsp+gprsize+16*1], m6 + jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3 + +.pass2: +%if ARCH_X86_64 + mova m8, [o(pw_4096)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [rsp+gprsize+0*16], m7 + mova m7, [o(pw_4096)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize+0*16] +%endif + jmp m(idct_8x8_internal_16bpc).end + +%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 14, 0-16*16 +%else + INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + mov r3d, 4 +%if stack_size_padded > 0 + ; adjust to caller's stack allocation + add rsp, (12+ARCH_X86_64)*16 +%endif + jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity, v +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst + +%if ARCH_X86_64 +DECLARE_REG_TMP 7 +%endif + +cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)] +.pass1_full: +%undef cmp + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, [rsp+16*16+2*gprsize] + ; setup stack pointer + lea r3, [rsp+gprsize] +%endif +.loop_pass1: + mova m7, [o(pd_2896)] + pmulld m0, m7, [cq+0*64+r5] + pmulld m1, m7, [cq+1*64+r5] + pmulld m2, m7, [cq+2*64+r5] + pmulld m3, m7, [cq+3*64+r5] + pmulld m4, m7, [cq+4*64+r5] + pmulld m5, m7, [cq+5*64+r5] + pmulld m6, m7, [cq+6*64+r5] + pmulld m7, [cq+7*64+r5] +%if ARCH_X86_64 + mova m8, [o(pd_2048)] + REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [rsp+gprsize+0*16], m7 + mova m7, [o(pd_2048)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [rsp+gprsize+0*16] +%endif + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + call t0 + + mova [cq+0*64+r5], m0 + mova [cq+1*64+r5], m1 + mova [cq+2*64+r5], m2 + mova [cq+3*64+r5], m3 + sub r5d, 16 + jge .loop_pass1 +%if WIN64 + POP r7 +%elif ARCH_X86_32 + mov r1, [rsp+16*16+1*gprsize] +%endif + jmp tx2q + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + + ; input is in cqN*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15 + ; some are still pre-loaded from the final loop iteration in pass=1 + + mova m1, m2 + mova m2, [cq+ 1*16] + mova m3, [cq+ 9*16] + mova m4, [cq+ 2*16] + mova m5, [cq+10*16] + mova m6, [cq+ 3*16] + mova m7, [cq+11*16] + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + mova [rsp+gprsize+3*16], m0 + mova [rsp+gprsize+4*16], m1 + mova [rsp+gprsize+5*16], m2 + mova [rsp+gprsize+6*16], m3 + mova [rsp+gprsize+7*16], m4 + mova [rsp+gprsize+8*16], m5 + mova [rsp+gprsize+9*16], m6 + ; m7 is already stored in [rsp+gprsize+0*16] + mova m0, [cq+ 4*16] + mova m1, [cq+12*16] + mova m2, [cq+ 5*16] + mova m3, [cq+13*16] + mova m4, [cq+ 6*16] + mova m5, [cq+14*16] + mova m6, [cq+ 7*16] + mova m7, [cq+15*16] + call m_suffix(idct_16x8_internal_8bpc, _ssse3).main + + ; out0-7 is in rsp+gprsize+3-10*mmsize + 
; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize + +%if ARCH_X86_64 +%define mzero m8 +%define mlim m9 +%define mula m10 +%define mulb m11 +%else +%define mzero m4 +%define mlim m5 +%define mula m6 +%define mulb m7 +%endif + mova m7, [rsp+gprsize+0*16] +%if ARCH_X86_32 + mova [rsp+gprsize+11*16], m4 + mova [rsp+gprsize+12*16], m5 + mova [rsp+gprsize+13*16], m6 + mova [rsp+gprsize+14*16], m7 +%endif + + mova mula, [o(pw_2048)] + mova mulb, mula +.end: + lea r3, [strideq*3] + lea r5, [dstq+strideq*8] + pxor mzero, mzero + REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + mova mlim, [o(pixel_10bpc_max)] + call .write_8x4 + lea r5, [dstq+r3*4] +%if ARCH_X86_64 + mova m0, m4 + mova m1, m5 + mova m2, m6 + mova m3, m7 +%else + mova m0, [rsp+gprsize+11*16] + mova m1, [rsp+gprsize+12*16] + mova m2, [rsp+gprsize+13*16] + mova m3, [rsp+gprsize+14*16] +%endif + call .write_8x4 + mov r5, dstq + mova m0, [rsp+gprsize+ 3*16] + mova m1, [rsp+gprsize+ 4*16] + mova m2, [rsp+gprsize+ 5*16] + mova m3, [rsp+gprsize+ 6*16] + call .write_8x4 + lea r5, [dstq+strideq*4] + mova m0, [rsp+gprsize+ 7*16] + mova m1, [rsp+gprsize+ 8*16] + mova m2, [rsp+gprsize+ 9*16] + mova m3, [rsp+gprsize+10*16] + call .write_8x4 + RET +.write_8x4: + REPX {pmulhrsw x, mula}, m0, m2 + REPX {pmulhrsw x, mulb}, m1, m3 + paddw m0, [r5+strideq*0] + paddw m1, [r5+strideq*1] + paddw m2, [r5+strideq*2] + paddw m3, [r5+r3] + REPX {pminsw x, mlim }, m0, m1, m2, m3 + REPX {pmaxsw x, mzero}, m0, m1, m2, m3 + mova [r5+strideq*0], m0 + mova [r5+strideq*1], m1 + mova [r5+strideq*2], m2 + mova [r5+r3 ], m3 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity, v + +cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + mova m4, [cq+ 9*16] + mova m5, [cq+13*16] + mova [rsp+gprsize+7*16], m0 + mova [rsp+gprsize+8*16], m1 + mova [rsp+gprsize+5*16], m4 + mova [rsp+gprsize+6*16], m5 + mova m0, m2 + mova m1, m3 + mova m2, [cq+ 1*16] + mova m3, [cq+ 5*16] + mova m4, [cq+ 2*16] + mova m5, [cq+ 6*16] + mova m6, [cq+11*16] + mova m7, [cq+15*16] + mova [rsp+gprsize+ 3*16], m4 + mova [rsp+gprsize+ 4*16], m5 + mova [rsp+gprsize+ 9*16], m6 + mova [rsp+gprsize+10*16], m7 + mova m4, [cq+10*16] + mova m5, [cq+14*16] + mova m6, [cq+ 3*16] + mova m7, [cq+ 7*16] + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end + mova m7, [rsp+gprsize+0*16] +%if ARCH_X86_32 + mova [rsp+gprsize+11*16], m4 + mova [rsp+gprsize+12*16], m5 + mova [rsp+gprsize+13*16], m6 + mova [rsp+gprsize+14*16], m7 +%endif + mova mula, [o(pw_2048)] + mova mulb, [o(pw_m2048)] + jmp m(idct_8x16_internal_16bpc).end + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity, v + +cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: + 
lea r3, [strideq*3] + lea r3, [r3*5] + add dstq, r3 + neg strideq + jmp m(iadst_8x16_internal_16bpc).pass2 + +INV_TXFM_8X16_FN identity, dct, h +INV_TXFM_8X16_FN identity, adst, h +INV_TXFM_8X16_FN identity, flipadst, h +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: +%if ARCH_X86_64 + mova m8, [o(pw_1697x16)] +%endif + call .main + mova [rsp+ 3*16+gprsize], m0 + mova [rsp+ 4*16+gprsize], m1 + mova [rsp+ 5*16+gprsize], m2 + mova [rsp+ 6*16+gprsize], m3 + mova m0, [cq+ 1*16] + mova m1, [cq+ 5*16] + mova m2, [cq+ 9*16] + mova m3, [cq+13*16] + call .main + mova [rsp+ 7*16+gprsize], m0 + mova [rsp+ 8*16+gprsize], m1 + mova [rsp+ 9*16+gprsize], m2 + mova [rsp+10*16+gprsize], m3 +%if ARCH_X86_32 + mova m0, [cq+ 3*16] + mova m1, [cq+ 7*16] + mova m2, [cq+11*16] + mova m3, [cq+15*16] + call .main + mova [rsp+11*16+gprsize], m0 + mova [rsp+12*16+gprsize], m1 + mova [rsp+13*16+gprsize], m2 + mova [rsp+14*16+gprsize], m3 +%endif + mova m0, [cq+ 2*16] + mova m1, [cq+ 6*16] + mova m2, [cq+10*16] + mova m3, [cq+14*16] + call .main +%if ARCH_X86_64 + mova m4, [cq+ 3*16] + mova m5, [cq+ 7*16] + mova m6, [cq+11*16] + mova m7, [cq+15*16] + pmulhrsw m9, m8, m4 + pmulhrsw m10, m8, m5 + pmulhrsw m11, m8, m6 + pmulhrsw m8, m7 + REPX {paddsw x, x}, m4, m5, m6, m7 + paddsw m4, m9 + paddsw m5, m10 + paddsw m6, m11 + paddsw m7, m8 +%endif + mova mula, [o(pw_2048)] + mova mulb, mula + jmp m(idct_8x16_internal_16bpc).end +.main: + ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y) +%if ARCH_X86_32 + mova m7, [o(pw_1697x16)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 +%else + pmulhrsw m4, m8, m0 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 +%endif + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + ret +%undef mula +%undef mulb +%undef mlim +%undef mzero diff -Nru dav1d-0.9.0/src/x86/itx_avx2.asm dav1d-0.9.1/src/x86/itx_avx2.asm --- dav1d-0.9.0/src/x86/itx_avx2.asm 2021-05-16 16:47:22.550950800 +0000 +++ dav1d-0.9.1/src/x86/itx_avx2.asm 2021-07-28 21:38:28.905852000 +0000 @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; @@ -344,7 +344,7 @@ %endmacro INIT_XMM avx2 -cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c +cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c mova m0, [cq+16*0] mova m1, [cq+16*1] pxor m2, m2 @@ -362,12 +362,12 @@ ITX4_END 3, 0, 2, 1, 0 %macro INV_TXFM_FN 3 ; type1, type2, size -cglobal inv_txfm_add_%1_%2_%3, 4, 5, 0, dst, stride, c, eob, tx2 - %define %%p1 m(i%1_%3_internal) +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%3_internal_8bpc) lea rax, [o_base] ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
- lea tx2q, [m(i%2_%3_internal).pass2] + lea tx2q, [m(i%2_%3_internal_8bpc).pass2] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 @@ -388,7 +388,7 @@ mov [cq], eobd ; 0 pmulhrsw m0, m1 mova m1, m0 - jmp m(iadst_4x4_internal).end2 + jmp m(iadst_4x4_internal_8bpc).end2 %endif %endmacro @@ -438,7 +438,7 @@ INV_TXFM_4X4_FN dct, flipadst INV_TXFM_4X4_FN dct, identity -cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 +cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] IDCT4_1D_PACKED @@ -460,7 +460,7 @@ INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity -cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 +cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] call .main @@ -487,17 +487,17 @@ INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity -cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 +cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 jmp tx2q .pass2: - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main .end: pxor m2, m2 mova [cq+16*0], m2 @@ -510,7 +510,7 @@ INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity -cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 +cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] vpbroadcastd m3, [o(pw_1697x8)] @@ -529,7 +529,7 @@ pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 - jmp m(iadst_4x4_internal).end + jmp m(iadst_4x4_internal_8bpc).end %macro WRITE_4X8 2 ; coefs[1-2] movd xm4, [dstq+strideq*0] @@ -568,7 +568,7 @@ pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mova m1, m0 - jmp m(iadst_4x8_internal).end3 + jmp m(iadst_4x8_internal_8bpc).end3 %endif %endmacro @@ -687,7 +687,7 @@ INV_TXFM_4X8_FN dct, flipadst INV_TXFM_4X8_FN dct, identity -cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] @@ -708,7 +708,7 @@ vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 pshufd m1, m1, q1032 - jmp m(iadst_4x8_internal).end2 + jmp m(iadst_4x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main WRAP_XMM IDCT8_1D_PACKED @@ -719,13 +719,13 @@ INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity -cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m3 @@ -770,13 +770,13 @@ INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity -cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main punpcklwd m3, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m3 @@ -787,7 +787,7 @@ vextracti128 xm3, m1, 1 pshufd xm4, xm0, q1032 pshufd xm5, 
xm1, q1032 - call m(iadst_4x8_internal).main_pass2 + call m(iadst_4x8_internal_8bpc).main_pass2 vpbroadcastd m5, [o(pw_2048)] vinserti128 m3, xm1, 1 vinserti128 m2, xm0, 1 @@ -795,14 +795,14 @@ psubw m4, m5 pshufd m0, m3, q1032 pshufd m1, m2, q1032 - jmp m(iadst_4x8_internal).end + jmp m(iadst_4x8_internal_8bpc).end INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity -cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m2, [cq+32*0], q3120 vpermq m0, [cq+32*1], q3120 vpbroadcastd m3, [o(pw_2896x8)] @@ -820,7 +820,7 @@ jmp tx2q .pass2: vpbroadcastd m4, [o(pw_4096)] - jmp m(iadst_4x8_internal).end2 + jmp m(iadst_4x8_internal_8bpc).end2 %macro INV_TXFM_4X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x16 @@ -837,7 +837,7 @@ mova m1, m0 mova m2, m0 mova m3, m0 - jmp m(iadst_4x16_internal).end3 + jmp m(iadst_4x16_internal_8bpc).end3 %endif %endmacro @@ -915,12 +915,12 @@ INV_TXFM_4X16_FN dct, flipadst INV_TXFM_4X16_FN dct, identity -cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] - call m(idct_16x4_internal).main + call m(idct_16x4_internal_8bpc).main vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m2, m3 punpcklwd m2, m3 @@ -945,7 +945,7 @@ vinserti128 m3, xm7, 1 pshufd m1, m1, q1032 pshufd m3, m3, q1032 - jmp m(iadst_4x16_internal).end2 + jmp m(iadst_4x16_internal_8bpc).end2 ALIGN function_align cglobal_label .main WRAP_XMM IDCT16_1D_PACKED @@ -956,12 +956,12 @@ INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity -cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] - call m(iadst_16x4_internal).main + call m(iadst_16x4_internal_8bpc).main vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m2, m3 punpcklwd m2, m3 @@ -1085,12 +1085,12 @@ INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity -cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] - call m(iadst_16x4_internal).main + call m(iadst_16x4_internal_8bpc).main vpbroadcastd m5, [o(pw_16384)] punpcklwd m4, m1, m0 punpckhwd m1, m0 @@ -1103,7 +1103,7 @@ punpckldq m0, m4 jmp tx2q .pass2: - call m(iadst_4x16_internal).main + call m(iadst_4x16_internal_8bpc).main vpbroadcastd m5, [o(pw_2896x8)] paddsw m1, m2, m4 psubsw m2, m4 @@ -1120,14 +1120,14 @@ vpermq m2, m2, q2031 vpermq m3, m4, q1302 psubw m5, m7, m6 - jmp m(iadst_4x16_internal).end + jmp m(iadst_4x16_internal_8bpc).end INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity -cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m3, [cq+32*0] mova m2, [cq+32*1] mova m4, [cq+32*2] @@ -1171,7 +1171,7 @@ paddsw m1, m6 paddsw m2, m7 paddsw m3, m8 - jmp m(iadst_4x16_internal).end2 + jmp m(iadst_4x16_internal_8bpc).end2 %macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] movq xm%3, [dstq ] @@ -1209,7 +1209,7 @@ pmulhrsw xm0, xm2 
vpbroadcastw m0, xm0 mova m1, m0 - jmp m(iadst_8x4_internal).end3 + jmp m(iadst_8x4_internal_8bpc).end3 %endif %endmacro @@ -1218,13 +1218,13 @@ INV_TXFM_8X4_FN dct, flipadst INV_TXFM_8X4_FN dct, identity -cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm3, [o(pw_2896x8)] pmulhrsw xm0, xm3, [cq+16*0] pmulhrsw xm1, xm3, [cq+16*1] pmulhrsw xm2, xm3, [cq+16*2] pmulhrsw xm3, [cq+16*3] - call m(idct_4x8_internal).main + call m(idct_4x8_internal_8bpc).main vbroadcasti128 m4, [o(deint_shuf)] vinserti128 m3, m1, xm3, 1 vinserti128 m1, m0, xm2, 1 @@ -1237,14 +1237,14 @@ IDCT4_1D_PACKED vpermq m0, m0, q3120 vpermq m1, m1, q2031 - jmp m(iadst_8x4_internal).end2 + jmp m(iadst_8x4_internal_8bpc).end2 INV_TXFM_8X4_FN adst, dct INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity -cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm0, [o(pw_2896x8)] pshufd xm4, [cq+16*0], q1032 pmulhrsw xm3, xm0, [cq+16*3] @@ -1252,7 +1252,7 @@ pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 - call m(iadst_4x8_internal).main_pass1 + call m(iadst_4x8_internal_8bpc).main_pass1 vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 punpckhwd m2, m0, m1 @@ -1289,7 +1289,7 @@ INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity -cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm0, [o(pw_2896x8)] pshufd xm4, [cq+16*0], q1032 pmulhrsw xm3, xm0, [cq+16*3] @@ -1297,7 +1297,7 @@ pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 - call m(iadst_4x8_internal).main_pass1 + call m(iadst_4x8_internal_8bpc).main_pass1 vinserti128 m3, xm1, 1 vinserti128 m2, xm0, 1 punpckhwd m1, m3, m2 @@ -1308,18 +1308,18 @@ punpcklwd m0, m3 jmp tx2q .pass2: - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main mova m2, m1 vpermq m1, m0, q2031 vpermq m0, m2, q2031 - jmp m(iadst_8x4_internal).end2 + jmp m(iadst_8x4_internal_8bpc).end2 INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity -cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 mova xm2, [cq+16*0] mova xm0, [cq+16*1] vinserti128 m2, [cq+16*2], 1 @@ -1340,7 +1340,7 @@ pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 - jmp m(iadst_8x4_internal).end + jmp m(iadst_8x4_internal_8bpc).end %macro INV_TXFM_8X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x8 @@ -1372,7 +1372,7 @@ INV_TXFM_8X8_FN dct, flipadst INV_TXFM_8X8_FN dct, identity -cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m2, [cq+32*2], q3120 ; 4 5 @@ -1398,7 +1398,7 @@ vpermq m1, m1, q2031 vpermq m2, m2, q3120 vpermq m3, m3, q2031 - jmp m(iadst_8x8_internal).end2 + jmp m(iadst_8x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main IDCT8_1D_PACKED @@ -1409,7 +1409,7 @@ INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity -cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m4, [cq+32*0], q1302 ; 1 0 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], 
q1302 ; 3 2 @@ -1476,12 +1476,12 @@ INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity -cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m4, [cq+32*0], q1302 ; 1 0 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 vpermq m2, [cq+32*2], q3120 ; 4 5 - call m(iadst_8x8_internal).main_pass1 + call m(iadst_8x8_internal_8bpc).main_pass1 vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m3, m2 punpcklwd m3, m2 @@ -1505,7 +1505,7 @@ .pass2: pshufd m4, m0, q1032 pshufd m5, m1, q1032 - call m(iadst_8x8_internal).main_pass2 + call m(iadst_8x8_internal_8bpc).main_pass2 vpbroadcastd m4, [o(pw_2048)] vpbroadcastd xm5, [o(pw_4096)] psubw m4, m5 ; lower half = -2048, upper half = 2048 @@ -1515,14 +1515,14 @@ vpermq m2, m1, q2031 pmulhrsw m1, m0, m4 pmulhrsw m0, m5, m4 - jmp m(iadst_8x8_internal).end3 + jmp m(iadst_8x8_internal_8bpc).end3 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity -cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 +cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 mova xm3, [cq+16*0] mova xm2, [cq+16*1] vinserti128 m3, [cq+16*4], 1 @@ -1542,7 +1542,7 @@ jmp tx2q .pass2: vpbroadcastd m4, [o(pw_4096)] - jmp m(iadst_8x8_internal).end + jmp m(iadst_8x8_internal_8bpc).end %macro INV_TXFM_8X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x16 @@ -1558,7 +1558,7 @@ pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mov r2d, 4 - jmp m(inv_txfm_add_dct_dct_8x8).end2 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 %endif %endmacro @@ -1580,9 +1580,9 @@ INV_TXFM_8X16_FN dct, flipadst INV_TXFM_8X16_FN dct, identity -cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main vpbroadcastd m10, [o(pw_16384)] .pass1_end: vperm2i128 m9, m3, m7, 0x31 @@ -1642,14 +1642,14 @@ INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity -cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end vpbroadcastd m10, [o(pw_16384)] pslld m9, m10, 17 psubw m10, m9 ; 16384, -16384 - jmp m(idct_8x16_internal).pass1_end + jmp m(idct_8x16_internal_8bpc).pass1_end ALIGN function_align .pass2: call .main @@ -1659,7 +1659,7 @@ psubw m8, m9 REPX {vpermq x, x, q2031}, m0, m1, m2, m3 REPX {vpermq x, x, q3120}, m4, m5, m6, m7 - jmp m(idct_8x16_internal).end2 + jmp m(idct_8x16_internal_8bpc).end2 ALIGN function_align cglobal_label .main REPX {pshufd x, x, q1032}, m7, m1, m5, m3 @@ -1783,10 +1783,10 @@ INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity -cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end vpbroadcastd m9, [o(pw_16384)] pslld m10, m9, 17 psubw m10, m9 ; -16384, 16384 @@ -1802,10 +1802,10 @@ punpckhwd m4, m0 punpcklwd m0, m3, m1 punpckhwd m3, m1 - jmp 
m(idct_8x16_internal).pass1_end2 + jmp m(idct_8x16_internal_8bpc).pass1_end2 .pass2: - call m(iadst_8x16_internal).main - call m(iadst_8x16_internal).main_pass2_end + call m(iadst_8x16_internal_8bpc).main + call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m8, [o(pw_2048)] vpbroadcastd xm9, [o(pw_4096)] psubw m8, m9 @@ -1825,7 +1825,7 @@ pmulhrsw m5, m6, m8 pmulhrsw m6, m7, m8 pmulhrsw m7, m9, m8 - jmp m(idct_8x16_internal).end3 + jmp m(idct_8x16_internal_8bpc).end3 INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst @@ -1842,7 +1842,7 @@ paddsw m%1, m%2 %endmacro -cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 mova xm3, [cq+16*0] mova xm2, [cq+16*2] add cq, 16*8 @@ -1883,7 +1883,7 @@ vpbroadcastd m8, [o(pw_1697x16)] REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 - jmp m(idct_8x16_internal).end + jmp m(idct_8x16_internal_8bpc).end %macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] pmovzxbw m%3, [dstq+%5] @@ -1941,7 +1941,7 @@ INV_TXFM_16X4_FN dct, flipadst INV_TXFM_16X4_FN dct, identity -cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova xm0, [cq+16*0] mova xm1, [cq+16*1] mova xm2, [cq+16*2] @@ -1950,7 +1950,7 @@ mova xm5, [cq+16*5] mova xm6, [cq+16*6] mova xm7, [cq+16*7] - call m(idct_4x16_internal).main + call m(idct_4x16_internal_8bpc).main vinserti128 m6, m2, xm6, 1 vinserti128 m2, m0, xm4, 1 vinserti128 m0, m1, xm5, 1 @@ -1961,10 +1961,10 @@ punpckhwd m4, m0, m1 punpcklwd m0, m1 mova m1, m6 - jmp m(iadst_16x4_internal).pass1_end + jmp m(iadst_16x4_internal_8bpc).pass1_end .pass2: call .main - jmp m(iadst_16x4_internal).end + jmp m(iadst_16x4_internal_8bpc).end ALIGN function_align cglobal_label .main vpbroadcastd m6, [o(pd_2048)] @@ -1976,13 +1976,13 @@ INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity -cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q1230 vpermq m3, [cq+32*3], q2103 vpermq m1, [cq+32*1], q1230 vpermq m2, [cq+32*2], q2103 - call m(iadst_4x16_internal).main2 - call m(iadst_4x16_internal).main_pass1_end + call m(iadst_4x16_internal_8bpc).main2 + call m(iadst_4x16_internal_8bpc).main_pass1_end punpcklwd m4, m3, m1 punpcklwd m5, m2, m0 punpckhwd m0, m1 @@ -2080,13 +2080,13 @@ INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity -cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q1230 vpermq m3, [cq+32*3], q2103 vpermq m1, [cq+32*1], q1230 vpermq m2, [cq+32*2], q2103 - call m(iadst_4x16_internal).main2 - call m(iadst_4x16_internal).main_pass1_end + call m(iadst_4x16_internal_8bpc).main2 + call m(iadst_4x16_internal_8bpc).main_pass1_end punpckhwd m4, m3, m2 punpckhwd m5, m1, m0 punpcklwd m0, m2 @@ -2097,10 +2097,10 @@ vinserti128 m0, m4, xm5, 1 vperm2i128 m4, m4, m5, 0x31 psubw m1, m7, m6 - jmp m(iadst_16x4_internal).pass1_end + jmp m(iadst_16x4_internal_8bpc).pass1_end ALIGN function_align .pass2: - call m(iadst_16x4_internal).main + call m(iadst_16x4_internal_8bpc).main vpbroadcastd m4, [o(pw_2048)] REPX {pmulhrsw x, m4}, m3, m2, m1, m0 pxor m4, m4 @@ -2118,7 +2118,7 @@ INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity -cglobal 
iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 +cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova xm2, [cq+16*0] mova xm4, [cq+16*1] vinserti128 m2, [cq+16*4], 1 @@ -2161,7 +2161,7 @@ paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 - jmp m(iadst_16x4_internal).end + jmp m(iadst_16x4_internal_8bpc).end %macro INV_TXFM_16X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x8 @@ -2172,7 +2172,7 @@ mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 4 - jmp m(inv_txfm_add_dct_dct_16x4).dconly + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro @@ -2195,9 +2195,9 @@ INV_TXFM_16X8_FN dct, flipadst INV_TXFM_16X8_FN dct, identity -cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 3120 - call m(idct_8x16_internal).main + call m(idct_8x16_internal_8bpc).main vpbroadcastd m10, [o(pw_16384)] punpckhwd m8, m0, m2 punpcklwd m0, m2 @@ -2265,10 +2265,10 @@ INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity -cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 1302 - call m(iadst_8x16_internal).main2 - call m(iadst_8x16_internal).main_pass1_end + call m(iadst_8x16_internal_8bpc).main2 + call m(iadst_8x16_internal_8bpc).main_pass1_end psubw m11, m9, m10 punpcklwd m8, m0, m2 punpckhwd m0, m2 @@ -2279,7 +2279,7 @@ punpckhwd m6, m5, m7 punpcklwd m5, m7 REPX {pmulhrsw x, m11}, m8, m1, m4, m6 - jmp m(idct_16x8_internal).pass1_end + jmp m(idct_16x8_internal_8bpc).pass1_end ALIGN function_align .pass2: call .main @@ -2287,7 +2287,7 @@ pxor m8, m8 psubw m8, m9 REPX {pmulhrsw x, m9}, m0, m2, m4, m6 - jmp m(idct_16x8_internal).end2 + jmp m(idct_16x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main vpbroadcastd m10, [o(pd_2048)] @@ -2358,10 +2358,10 @@ INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity -cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 1302 - call m(iadst_8x16_internal).main2 - call m(iadst_8x16_internal).main_pass1_end + call m(iadst_8x16_internal_8bpc).main2 + call m(iadst_8x16_internal_8bpc).main_pass1_end psubw m9, m10 punpcklwd m8, m6, m4 punpckhwd m6, m4 @@ -2399,8 +2399,8 @@ vperm2i128 m7, m8, 0x31 jmp tx2q .pass2: - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end pxor m8, m8 psubw m8, m9 pmulhrsw m10, m7, m8 @@ -2414,14 +2414,14 @@ lea r3, [strideq*3] WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1 WRITE_16X2 1, 2, 0, 1, strideq*2, r3 - jmp m(idct_16x8_internal).end3 + jmp m(idct_16x8_internal_8bpc).end3 INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity -cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 +cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 mova xm7, [cq+16*0] mova xm2, [cq+16*1] add cq, 16*8 @@ -2471,7 +2471,7 @@ jmp tx2q .pass2: vpbroadcastd m8, [o(pw_4096)] - jmp m(idct_16x8_internal).end + jmp m(idct_16x8_internal_8bpc).end %define o_base pw_5 + 128 @@ -2483,7 +2483,7 @@ movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 8 - jmp m(inv_txfm_add_dct_dct_16x4).dconly + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro @@ -2513,7 +2513,7 @@ 
INV_TXFM_16X16_FN dct, flipadst INV_TXFM_16X16_FN dct, identity -cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 +cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call .main .pass1_end: @@ -2658,7 +2658,7 @@ INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst -cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 +cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call .main call .main_pass1_end @@ -2671,7 +2671,7 @@ mova [rsp+16*1], xm8 pxor m8, m8 psubw m1, m8, m1 - jmp m(idct_16x16_internal).pass1_end2 + jmp m(idct_16x16_internal_8bpc).pass1_end2 ALIGN function_align .pass2: call .main @@ -2680,7 +2680,7 @@ mova [rsp+32*0], m6 pxor m6, m6 psubw m1, m6, m1 - jmp m(idct_16x16_internal).end2 + jmp m(idct_16x16_internal_8bpc).end2 ALIGN function_align cglobal_label .main vpbroadcastd m15, [o(pd_2048)] @@ -2833,10 +2833,10 @@ INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst -cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 +cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS - call m(iadst_16x16_internal).main - call m(iadst_16x16_internal).main_pass1_end + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass1_end pmulhrsw m6, m1 pmulhrsw m2, m1, m8 mova [rsp+32*2], m6 @@ -2869,10 +2869,10 @@ vperm2i128 m13, m1, m0, 0x31 vinserti128 m1, m8, [rsp+32*2], 1 vperm2i128 m8, m8, [rsp+32*2], 0x31 - jmp m(idct_16x16_internal).pass1_end3 + jmp m(idct_16x16_internal_8bpc).pass1_end3 .pass2: - call m(iadst_16x16_internal).main - call m(iadst_16x16_internal).main_pass2_end + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end pmulhrsw m0, m1 pmulhrsw m8, m1 mova [rsp+32*0], m0 @@ -2900,7 +2900,7 @@ lea dstq, [dstq+strideq*4] WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 - jmp m(idct_16x16_internal).end3 + jmp m(idct_16x16_internal_8bpc).end3 %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 pmulhrsw m%2, m%3, m%1 @@ -2911,7 +2911,7 @@ INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity -cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 +cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 vpbroadcastd m7, [o(pw_1697x16)] mova xm0, [cq+16* 0] vinserti128 m0, [cq+16*16], 1 @@ -2953,7 +2953,7 @@ pmulhrsw m7, m0 psraw m7, 1 pavgw m7, m0 - jmp m(idct_16x16_internal).pass1_end3 + jmp m(idct_16x16_internal_8bpc).pass1_end3 ALIGN function_align .pass2: vpbroadcastd m15, [o(pw_1697x16)] @@ -2967,7 +2967,7 @@ pmulhrsw m15, m1 paddsw m1, m1 paddsw m15, m1 - jmp m(idct_16x16_internal).end + jmp m(idct_16x16_internal_8bpc).end %define o_base deint_shuf + 128 @@ -3028,7 +3028,7 @@ pmulhrsw m%2, m%3 %endmacro -cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jz .dconly @@ -3037,7 +3037,7 @@ cmp eobd, 106 jle .fast LOAD_8ROWS cq+32*1, 32*2 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main vperm2i128 m11, m0, m4, 0x31 vinserti128 m0, xm4, 1 vperm2i128 m4, m1, m5, 0x31 @@ -3077,7 +3077,7 @@ mova [rsp+32*2], m2 .fast: LOAD_8ROWS cq+32*0, 32*2 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main vperm2i128 m8, m0, m4, 0x31 vinserti128 m0, xm4, 1 vperm2i128 
m4, m1, m5, 0x31 @@ -3135,7 +3135,7 @@ pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mov r2d, 8 - jmp m(inv_txfm_add_dct_dct_8x8).end2 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 .full: REPX {pmulhrsw x, m9}, m12, m13, m14, m15 pmulhrsw m6, m9, [rsp+32*2] @@ -3175,7 +3175,7 @@ RET ALIGN function_align cglobal_label .main_fast ; bottom half is zero - call m(idct_8x16_internal).main + call m(idct_8x16_internal_8bpc).main mova m8, [rsp+gprsize+0*32] mova [rsp+gprsize+0*32], m0 mova m9, [rsp+gprsize+1*32] @@ -3190,7 +3190,7 @@ jmp .main2 ALIGN function_align cglobal_label .main - call m(idct_8x16_internal).main + call m(idct_8x16_internal_8bpc).main mova m8, [rsp+gprsize+0*32] mova [rsp+gprsize+0*32], m0 mova m9, [rsp+gprsize+1*32] @@ -3291,7 +3291,7 @@ shufpd m%1, m%2, 0x0c %endmacro -cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -3341,7 +3341,7 @@ jg .full pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(inv_txfm_add_dct_dct_8x32).main_fast + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast jmp .pass2 .full: LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 @@ -3356,7 +3356,7 @@ LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 pxor m8, m8 REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 - call m(inv_txfm_add_dct_dct_8x32).main + call m(inv_txfm_add_dct_dct_8x32_8bpc).main .pass2: vpbroadcastd m12, [o(pw_8192)] REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 @@ -3397,7 +3397,7 @@ vinserti128 m2, xm9, 1 vperm2i128 m7, m3, m10, 0x31 vinserti128 m3, xm10, 1 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 lea r2, [strideq*3] @@ -3442,7 +3442,7 @@ vinserti128 m2, xm9, 1 vperm2i128 m7, m3, m12, 0x31 vinserti128 m3, xm12, 1 - call m(idct_16x8_internal).main2 + call m(idct_16x8_internal_8bpc).main2 vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 add r0, 16 @@ -3455,7 +3455,7 @@ WRITE_16X2 6, 7, 0, 1, strideq*2, r2 RET -cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob vpbroadcastd m9, [pw_5] lea r4, [strideq*3] sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) @@ -3525,7 +3525,7 @@ punpcklqdq m6, m8 ret -cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob add cq, 16*8 vpbroadcastd m9, [pw_4096] lea r4, [strideq*3] @@ -3550,7 +3550,7 @@ vinserti128 m7, [cq+16*7], 1 pxor m8, m8 REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r4 @@ -3627,7 +3627,7 @@ vextracti128 [r2+%7], m%3, 1 %endmacro -cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jz .dconly @@ -3635,7 +3635,7 @@ base, tmp3 %undef cmp LOAD_16ROWS cq, 64, 1 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main lea tmp1q, [rsp+32*7] lea tmp2q, [tmp1q+32*8] lea tmp3q, [tmp1q+32*16] @@ -3682,7 +3682,7 @@ mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 16 - jmp m(inv_txfm_add_dct_dct_16x4).dconly + jmp 
m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .full: mova [tmp1q-32*4], m1 mova [tmp1q-32*3], m3 @@ -3693,7 +3693,7 @@ mova [tmp1q+32*2], m13 mova [tmp1q+32*3], m15 LOAD_16ROWS cq+32, 64, 1 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main lea r2, [tmp3q+32*8] mova m1, [rsp+32*1] mova [rsp+32*0], m6 @@ -3746,7 +3746,7 @@ .idct16: LOAD_8ROWS tmp3q-32*4, 32 mova [rsp], m15 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq @@ -3991,7 +3991,7 @@ vinserti128 m%1, xm%4, 1 %endmacro -cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -4001,7 +4001,7 @@ mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 16 - jmp m(inv_txfm_add_dct_dct_32x8).dconly + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 vpbroadcastd m15, [o(pw_2896x8)] @@ -4023,7 +4023,7 @@ pmulhrsw m15, [cq+32*31] lea tmp1q, [rsp+32*7] lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf LOAD_16ROWS cq+32*0, 32*2, 1, 0 pxor m15, m15 mov r3d, 8 @@ -4035,14 +4035,14 @@ add cq, 32*4 dec r3d jg .zero_loop - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main call .pass1_end lea r2, [strideq*3] mov r3, dstq .pass2: vpbroadcastd m7, [o(pw_16384)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round - call m(idct_16x16_internal).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + call m(idct_16x16_internal_8bpc).main mova [rsp+32*2], m15 vpbroadcastd m15, [o(pw_2048)] REPX {pmulhrsw x, m15}, m2, m3, m0 @@ -4090,7 +4090,7 @@ IDCT32_PASS1_END 1, 9, 6, 7 ret -cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob %undef cmp lea rax, [o_base] vpbroadcastd m9, [o(pw_2896x8)] @@ -4124,7 +4124,7 @@ vinserti128 m7, [cq+64*15], 1 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 @@ -4159,7 +4159,7 @@ jg .zero_loop RET -cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob %undef cmp lea rax, [o_base] vpbroadcastd m9, [o(pw_2896x8)] @@ -4192,7 +4192,7 @@ vinserti128 m7, [cq+32*15], 1 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 @@ -4223,7 +4223,7 @@ jge .zero_loop RET -cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -4232,7 +4232,7 @@ movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 32 - jmp m(inv_txfm_add_dct_dct_32x8).dconly + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: 
PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ base, tmp3, tmp4 @@ -4248,7 +4248,7 @@ test tmp4d, tmp4d jl .fast LOAD_8ROWS_H cq+64*17, 64*2 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf LOAD_8ROWS_H cq+64*16, 64*2 pxor m0, m0 REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ @@ -4256,7 +4256,7 @@ mova [rsp], m15 jmp .idct16 .fast: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 @@ -4264,10 +4264,10 @@ LOAD_8ROWS cq+64*0, 64*2 pxor m15, m15 REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_32x16).pass1_end + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end vpbroadcastd m7, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round lea tmp3q, [tmp1q+32*32] mova m15, [rsp] mova [tmp3q-32*4], m0 @@ -4296,7 +4296,7 @@ pmulhrsw m5, m9, [tmp1q+32*1] pmulhrsw m6, m9, [tmp1q+32*2] pmulhrsw m7, m9, [tmp1q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q-32*4], m0 pmulhrsw m0, m9, [tmp2q-32*4] mova [tmp2q-32*4], m1 @@ -4313,7 +4313,7 @@ pmulhrsw m6, m9, [tmp2q+32*2] mova [tmp2q-32*1], m7 pmulhrsw m7, m9, [tmp2q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 @@ -4341,21 +4341,21 @@ test tmp4d, tmp4d jl .fast2 LOAD_8ROWS_H tmp3q-32*4, 32 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf sub tmp3q, 32*8 LOAD_8ROWS_H tmp3q-32*4, 32 sub tmp3q, 32*16 jmp .pass2_loop_end .fast2: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast sub tmp3q, 32*24 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 .pass2_loop_end: LOAD_8ROWS tmp3q-32*4, 32 mova [rsp], m15 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_16x32).pass2_end + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end lea tmp3q, [tmp1q-32*32] cmp tmp2q, tmp3q jb .ret @@ -4367,7 +4367,7 @@ .ret: RET -cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob %undef cmp vpbroadcastd m9, [pw_8192] sub eobd, 136 ; if (eob < 136) @@ -4393,7 +4393,7 @@ mova xm7, [cq+64* 7] vinserti128 m6, [cq+64*14], 1 vinserti128 m7, [cq+64*15], 1 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r4 @@ -4487,7 +4487,7 @@ %endif %endmacro -cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -4496,7 +4496,7 @@ movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 32 - jmp m(inv_txfm_add_dct_dct_16x4).dconly + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .normal: PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 %undef cmp @@ 
-4506,12 +4506,12 @@ mov r7d, eobd .pass1_loop: LOAD_16ROWS cq, 64 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round mova m15, [rsp+32*0] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m2 @@ -4559,7 +4559,7 @@ .fast: mova [rsp], m8 lea tmp1q, [rsp+32*7] - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -4601,7 +4601,7 @@ .fast2: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast add r2, 32*24 vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 @@ -4629,7 +4629,7 @@ vinserti128 m6, [r3+32*0+16], 1 .fast3: add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 @@ -4654,8 +4654,8 @@ vinserti128 m5, [r3+32*1+ 0], 1 vinserti128 m6, [r3+32*2+16], 1 .fast4: - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 RET ALIGN function_align %define o_base idct64_mul - 8 @@ -4833,7 +4833,7 @@ jne .main_part2_pass2_loop ret -cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -4876,7 +4876,7 @@ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 lea tmp1q, [rsp+32*7] - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -4900,7 +4900,7 @@ REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 @@ -4915,7 +4915,7 @@ pxor m8, m8 REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 @@ -4929,8 +4929,8 @@ mova m7, [cq+32* 3] pxor m8, m8 REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 sub tmp1q, 32*36 lea r2, [strideq*3] mov tmp2d, 4 @@ -4971,8 +4971,8 @@ mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round - call m(idct_16x16_internal).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + call m(idct_16x16_internal_8bpc).main mova [rsp+32*0], m15 vpbroadcastd m15, [o(pw_2048)] REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 @@ -4997,7 +4997,7 @@ jg .pass2_loop RET -cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -5007,7 +5007,7 @@ mov [cq], eobd pmulhrsw xm0, xm1 mov 
r2d, 64 - jmp m(inv_txfm_add_dct_dct_32x8).dconly + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 lea tmp1q, [rsp+32*7] @@ -5021,7 +5021,7 @@ test r10b, r10b jnz .fast LOAD_8ROWS_H cq+64*17, 64*2, 2 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf LOAD_8ROWS_H cq+64*16, 64*2, 1 mova [rsp], m15 pxor m15, m15 @@ -5029,7 +5029,7 @@ 24, 25, 26, 27, 28, 29, 30, 31 jmp .idct16 .fast: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 @@ -5037,10 +5037,10 @@ LOAD_8ROWS cq+64*0, 64*2, 1 pxor m15, m15 REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_32x16).pass1_end + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end vpbroadcastd m7, [o(pw_16384)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round lea r3, [tmp1q+32*48] mova m15, [rsp] mova [r3-32*4], m0 @@ -5069,7 +5069,7 @@ pmulhrsw m5, m9, [tmp1q+32*1] pmulhrsw m6, m9, [tmp1q+32*2] pmulhrsw m7, m9, [tmp1q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q-32*4], m0 pmulhrsw m0, m9, [tmp2q-32*4] mova [tmp2q-32*4], m1 @@ -5086,7 +5086,7 @@ pmulhrsw m6, m9, [tmp2q+32*2] mova [tmp2q-32*1], m7 pmulhrsw m7, m9, [tmp2q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 @@ -5119,7 +5119,7 @@ .fast2: mova [rsp], m8 lea tmp1q, [rsp+32*39] - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -5153,7 +5153,7 @@ .fast3: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 @@ -5171,7 +5171,7 @@ mova m6, [r8+32*0] .fast4: add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 @@ -5188,8 +5188,8 @@ mova m5, [r8-32*3] mova m6, [r8+32*2] .fast5: - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 add r10d, 0x80000000 jc .ret lea r2, [rsp+32*7] @@ -5200,7 +5200,7 @@ .ret: RET -cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -5210,7 +5210,7 @@ mov [cq], eobd pmulhrsw xm0, xm1 mov r2d, 32 - jmp m(inv_txfm_add_dct_dct_64x16).dconly + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ base, tmp3, tmp4 @@ -5222,7 +5222,7 @@ REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -5246,7 
+5246,7 @@ REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 @@ -5262,7 +5262,7 @@ pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] add rax, 8 add tmp1q, 32*8 @@ -5277,11 +5277,11 @@ pmulhrsw m7, [cq+64* 3] pxor m8, m8 REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 sub tmp1q, 32*44 vpbroadcastd m10, [o(pw_16384)] - call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave add cq, 32 add tmp4d, 0x80000000 jnc .pass1_loop @@ -5296,21 +5296,21 @@ test tmp4d, 0x40000000 jnz .fast LOAD_8ROWS_H tmp2q-32*4, 32 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf lea tmp3q, [tmp2q-32*8] LOAD_8ROWS_H tmp3q-32*4, 32 mova [rsp], m15 jmp .idct16 .fast: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 .idct16: lea tmp3q, [tmp1q-32*8] LOAD_8ROWS tmp3q-32*4, 32 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_16x32).pass2_end + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end add tmp1q, 32*16 sub dstq, r3 lea r2, [r2+r3+16] @@ -5340,7 +5340,7 @@ vinserti128 m6, [tmp2q+32*2], 1 vinserti128 m7, [tmp2q+32*3], 1 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova xm8, [tmp1q-32*4+16] mova xm9, [tmp1q-32*3+16] vinserti128 m8, [tmp2q-32*4+16], 1 @@ -5368,7 +5368,7 @@ pmulhrsw m0, m8, m10 pmulhrsw m1, m9, m10 REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 @@ -5382,7 +5382,7 @@ jg .loop ret -cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob lea rax, [o_base] test eobd, eobd jnz .normal @@ -5391,7 +5391,7 @@ movd xm2, [o(pw_8192)] mov [cq], eobd mov r2d, 64 - jmp m(inv_txfm_add_dct_dct_64x16).dconly + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 lea tmp1q, [rsp+32*71] @@ -5402,7 +5402,7 @@ REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -5426,7 +5426,7 @@ REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] 
add tmp1q, 32*16 add tmp2q, 32*32 @@ -5441,7 +5441,7 @@ pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 @@ -5455,11 +5455,11 @@ mova m7, [cq+64* 3] pxor m8, m8 REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 sub tmp1q, 32*44 vpbroadcastd m10, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave add cq, 32 add r10d, 0x80000000 jnc .pass1_loop @@ -5482,7 +5482,7 @@ mova m6, [r3+32*0] mova m7, [r3+32*2] .fast: - call m(idct_16x16_internal).main + call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 @@ -5516,7 +5516,7 @@ .fast2: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add r2, 32*8 add r3, 32*8 @@ -5536,7 +5536,7 @@ mova m6, [r3+32*0] ; 25 .fast3: add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 @@ -5553,8 +5553,8 @@ mova m5, [r3-32*3] ; 19 mova m6, [r3+32*2] ; 29 .fast4: - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 sub tmp1q, 32*28 sub dstq, r8 lea dstq, [dstq+strideq*4+16] diff -Nru dav1d-0.9.0/src/x86/itx_init_tmpl.c dav1d-0.9.1/src/x86/itx_init_tmpl.c --- dav1d-0.9.0/src/x86/itx_init_tmpl.c 2021-05-16 16:47:22.550950800 +0000 +++ dav1d-0.9.1/src/x86/itx_init_tmpl.c 2021-07-28 21:38:28.905852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* @@ -29,84 +29,65 @@ #include "src/itx.h" #define decl_itx2_fns(w, h, opt) \ -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) #define decl_itx12_fns(w, h, opt) \ decl_itx2_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) #define decl_itx16_fns(w, h, opt) \ decl_itx12_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) #define decl_itx17_fns(w, h, opt) \ decl_itx16_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) -#define avx2_fns(avx2) \ -decl_itx17_fns( 4, 4, avx2); \ -decl_itx16_fns( 4, 8, avx2); \ -decl_itx16_fns( 4, 16, avx2); \ -decl_itx16_fns( 8, 4, avx2); \ -decl_itx16_fns( 8, 8, avx2); \ -decl_itx16_fns( 8, 16, avx2); \ -decl_itx2_fns ( 8, 32, avx2); \ -decl_itx16_fns(16, 4, avx2); \ -decl_itx16_fns(16, 8, avx2); \ -decl_itx12_fns(16, 16, avx2); \ -decl_itx2_fns (16, 32, avx2); \ -decl_itx2_fns (32, 8, avx2); \ -decl_itx2_fns (32, 16, avx2); \ -decl_itx2_fns (32, 32, avx2); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_##avx2); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_##avx2); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_##avx2); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_##avx2); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_##avx2) - -avx2_fns(avx2); -avx2_fns(16bpc_avx2); - -decl_itx17_fns( 4, 4, ssse3); -decl_itx16_fns( 4, 8, ssse3); -decl_itx16_fns( 8, 4, ssse3); -decl_itx16_fns( 8, 8, ssse3); -decl_itx16_fns( 4, 16, ssse3); -decl_itx16_fns(16, 4, ssse3); 
-decl_itx16_fns( 8, 16, ssse3); -decl_itx16_fns(16, 8, ssse3); -decl_itx12_fns(16, 16, ssse3); -decl_itx2_fns ( 8, 32, ssse3); -decl_itx2_fns (32, 8, ssse3); -decl_itx2_fns (16, 32, ssse3); -decl_itx2_fns (32, 16, ssse3); -decl_itx2_fns (32, 32, ssse3); - -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3); +#define decl_itx_fns(ext) \ +decl_itx17_fns( 4, 4, ext); \ +decl_itx16_fns( 4, 8, ext); \ +decl_itx16_fns( 4, 16, ext); \ +decl_itx16_fns( 8, 4, ext); \ +decl_itx16_fns( 8, 8, ext); \ +decl_itx16_fns( 8, 16, ext); \ +decl_itx2_fns ( 8, 32, ext); \ +decl_itx16_fns(16, 4, ext); \ +decl_itx16_fns(16, 8, ext); \ +decl_itx12_fns(16, 16, ext); \ +decl_itx2_fns (16, 32, ext); \ +decl_itx2_fns (32, 8, ext); \ +decl_itx2_fns (32, 16, ext); \ +decl_itx2_fns (32, 32, ext); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext)) + +decl_itx_fns(avx2); +decl_itx_fns(sse4); +decl_itx_fns(ssse3); +decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_sse2); COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c, const int bpc) { #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ - dav1d_inv_txfm_add_##type##_##w##x##h##_##ext + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) #define assign_itx1_fn(pfx, w, h, ext) \ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) @@ -141,6 +122,12 @@ const unsigned flags = dav1d_get_cpu_flags(); + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; + +#if BITDEPTH == 16 + assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2); +#endif + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; #if BITDEPTH == 8 @@ -165,38 +152,46 @@ assign_itx1_fn ( , 64, 64, ssse3); #endif + if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; + +#if BITDEPTH == 16 + if (bpc <= 10) { + assign_itx16_fn(, 4, 4, sse4); + assign_itx16_fn(R, 4, 8, sse4); + assign_itx16_fn(R, 4, 16, sse4); + assign_itx16_fn(R, 8, 4, sse4); + assign_itx16_fn(, 8, 8, sse4); + assign_itx16_fn(R, 8, 16, sse4); + } +#endif + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; -#if ARCH_X86_64 && BITDEPTH == 16 - assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, 16bpc_avx2); +#if ARCH_X86_64 + assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2); #endif if (bpc > 10) return; #if ARCH_X86_64 -#if BITDEPTH == 8 -#define SUFFIX avx2 -#else -#define SUFFIX 16bpc_avx2 -#endif - assign_itx17_fn( , 4, 4, SUFFIX); - assign_itx16_fn(R, 4, 8, SUFFIX); - assign_itx16_fn(R, 4, 16, SUFFIX); - assign_itx16_fn(R, 8, 4, SUFFIX); - assign_itx16_fn( , 8, 8, SUFFIX); - assign_itx16_fn(R, 8, 16, SUFFIX); - assign_itx2_fn (R, 8, 32, SUFFIX); - assign_itx16_fn(R, 16, 4, SUFFIX); - assign_itx16_fn(R, 16, 8, SUFFIX); - assign_itx12_fn( , 16, 16, SUFFIX); - assign_itx2_fn (R, 16, 32, SUFFIX); - assign_itx1_fn (R, 16, 64, SUFFIX); - assign_itx2_fn (R, 32, 8, SUFFIX); - assign_itx2_fn (R, 32, 16, SUFFIX); - assign_itx2_fn ( , 32, 32, SUFFIX); - assign_itx1_fn (R, 32, 64, SUFFIX); - assign_itx1_fn (R, 64, 16, SUFFIX); - assign_itx1_fn (R, 64, 32, SUFFIX); - assign_itx1_fn ( , 64, 64, SUFFIX); + assign_itx17_fn( , 4, 4, avx2); + assign_itx16_fn(R, 4, 8, avx2); + 
assign_itx16_fn(R, 4, 16, avx2); + assign_itx16_fn(R, 8, 4, avx2); + assign_itx16_fn( , 8, 8, avx2); + assign_itx16_fn(R, 8, 16, avx2); + assign_itx2_fn (R, 8, 32, avx2); + assign_itx16_fn(R, 16, 4, avx2); + assign_itx16_fn(R, 16, 8, avx2); + assign_itx12_fn( , 16, 16, avx2); + assign_itx2_fn (R, 16, 32, avx2); + assign_itx1_fn (R, 16, 64, avx2); + assign_itx2_fn (R, 32, 8, avx2); + assign_itx2_fn (R, 32, 16, avx2); + assign_itx2_fn ( , 32, 32, avx2); + assign_itx1_fn (R, 32, 64, avx2); + assign_itx1_fn (R, 64, 16, avx2); + assign_itx1_fn (R, 64, 32, avx2); + assign_itx1_fn ( , 64, 64, avx2); #endif } diff -Nru dav1d-0.9.0/src/x86/itx_sse.asm dav1d-0.9.1/src/x86/itx_sse.asm --- dav1d-0.9.0/src/x86/itx_sse.asm 2021-05-16 16:47:22.550950800 +0000 +++ dav1d-0.9.1/src/x86/itx_sse.asm 2021-07-28 21:38:28.905852000 +0000 @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; @@ -240,8 +240,8 @@ %endmacro %macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack -cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2 - %define %%p1 m(i%1_%3_internal) +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2 + %define %%p1 m(i%1_%3_internal_8bpc) %if ARCH_X86_32 LEA r5, $$ %endif @@ -250,12 +250,12 @@ test eobd, eobd jz %%end %endif - lea tx2q, [o(m(i%2_%3_internal).pass2)] + lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] call %%p1 RET %%end: %else - lea tx2q, [o(m(i%2_%3_internal).pass2)] + lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 @@ -277,18 +277,19 @@ mov [coeffq], eobd ;0 pmulhrsw m0, m1 mova m1, m0 - TAIL_CALL m(iadst_4x4_internal).end2 + TAIL_CALL m(iadst_4x4_internal_8bpc).end2 %endif %endmacro INIT_XMM ssse3 +; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16. 
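To make the itx_init_tmpl.c rework above easier to follow: the hand-written "_##opt" and "16bpc_avx2" suffixes are replaced by the BF() name-composition macro, which matches the "_8bpc" suffix now baked into the assembly labels. Below is a minimal standalone C sketch, assuming a BF() definition along the lines of name##_8bpc_##suffix for the 8-bit template; the real definition lives elsewhere in the dav1d tree and is not part of this patch.

/* Illustrative sketch only -- the BF() definition here is an assumption
 * mirroring how the init code above appears to use it. */
#include <stdio.h>

#define BITDEPTH 8
#if BITDEPTH == 8
#define BF(name, suffix) name##_8bpc_##suffix
#else
#define BF(name, suffix) name##_16bpc_##suffix
#endif

/* Stand-in for the symbol exported by the renamed asm label
 * inv_txfm_add_dct_dct_64x64_8bpc (AVX2 build). */
static void BF(dav1d_inv_txfm_add_dct_dct_64x64, avx2)(void) {
    puts("dav1d_inv_txfm_add_dct_dct_64x64_8bpc_avx2");
}

int main(void) {
    /* assign_itx_fn() in the hunk above reduces to this kind of token
     * pasting, keeping the C table entry and the asm label in sync. */
    void (*fn)(void) = BF(dav1d_inv_txfm_add_dct_dct_64x64, avx2);
    fn();
    return 0;
}

The same hunk also introduces a strict SSE2 -> SSSE3 -> SSE4.1 -> AVX2 cascade of early returns in dav1d_itx_dsp_init_x86(), so each instruction-set tier only installs its table entries once the corresponding CPU flag is present.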
INV_TXFM_4X4_FN dct, dct INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst INV_TXFM_4X4_FN dct, identity -cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] ;high: in1 ;low: in0 mova m1, [coeffq+16*1] ;high: in3 ;low in2 @@ -315,7 +316,7 @@ INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity -cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] call .main @@ -337,7 +338,7 @@ ITX4_END 0, 1, 2, 3 ALIGN function_align -.main: +cglobal_label .main punpcklwd m2, m0, m1 ;unpacked in0 in2 punpckhwd m0, m1 ;unpacked in1 in3 mova m3, m0 @@ -367,10 +368,10 @@ INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity -cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 ;high: in3 ;low :in2 @@ -378,7 +379,7 @@ jmp tx2q .pass2: - call m(iadst_4x4_internal).main + call m(iadst_4x4_internal_8bpc).main .end: pxor m2, m2 @@ -393,7 +394,7 @@ INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity -cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] mova m3, [o(pw_1697x8)] @@ -413,7 +414,7 @@ pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 - jmp m(iadst_4x4_internal).end + jmp m(iadst_4x4_internal_8bpc).end %macro IWHT4_1D_PACKED 0 punpckhqdq m3, m0, m1 ;low: in1 high: in3 @@ -429,7 +430,7 @@ paddw m2, m1 ;low: out3 %endmacro -cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff +cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] pxor m2, m2 @@ -561,7 +562,7 @@ mova m1, m0 mova m2, m0 mova m3, m0 - TAIL_CALL m(iadst_4x8_internal).end3 + TAIL_CALL m(iadst_4x8_internal_8bpc).end3 %endif %endmacro @@ -570,7 +571,7 @@ INV_TXFM_4X8_FN dct, flipadst INV_TXFM_4X8_FN dct, identity -cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -578,18 +579,18 @@ pmulhrsw m3, [coeffq+16*3] .pass1: - call m(idct_8x4_internal).main - jmp m(iadst_4x8_internal).pass1_end + call m(idct_8x4_internal_8bpc).main + jmp m(iadst_4x8_internal_8bpc).pass1_end .pass2: call .main shufps m1, m1, q1032 shufps m3, m3, q1032 mova m4, [o(pw_2048)] - jmp m(iadst_4x8_internal).end2 + jmp m(iadst_4x8_internal_8bpc).end2 ALIGN function_align -.main: +cglobal_label .main IDCT8_1D_PACKED ret @@ -599,7 +600,7 @@ INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity -cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -607,7 +608,7 @@ pmulhrsw m3, [coeffq+16*3] .pass1: - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main .pass1_end: INV_4X8 @@ -640,7 +641,7 @@ RET ALIGN function_align -.main: +cglobal_label .main mova m6, [o(pd_2048)] 
punpckhwd m4, m3, m0 ;unpacked in7 in0 punpckhwd m5, m2, m1 ;unpacked in5 in2 @@ -690,7 +691,7 @@ INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity -cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -698,7 +699,7 @@ pmulhrsw m3, [coeffq+16*3] .pass1: - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main punpcklwd m4, m3, m2 punpckhwd m3, m2 @@ -713,7 +714,7 @@ .pass2: shufps m0, m0, q1032 shufps m1, m1, q1032 - call m(iadst_4x8_internal).main + call m(iadst_4x8_internal_8bpc).main mova m4, m0 mova m5, m1 @@ -724,14 +725,14 @@ mova m5, [o(pw_2048)] pxor m4, m4 psubw m4, m5 - jmp m(iadst_4x8_internal).end + jmp m(iadst_4x8_internal_8bpc).end INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity -cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -748,11 +749,11 @@ paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 - jmp m(iadst_4x8_internal).pass1_end + jmp m(iadst_4x8_internal_8bpc).pass1_end .pass2: mova m4, [o(pw_4096)] - jmp m(iadst_4x8_internal).end2 + jmp m(iadst_4x8_internal_8bpc).end2 %macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] @@ -797,7 +798,7 @@ mova m1, m0 mova m2, m0 mova m3, m0 - TAIL_CALL m(iadst_8x4_internal).end2 + TAIL_CALL m(iadst_8x4_internal_8bpc).end2 %endif %endmacro @@ -806,14 +807,14 @@ INV_TXFM_8X4_FN dct, flipadst INV_TXFM_8X4_FN dct, identity -cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] - call m(idct_4x8_internal).main + call m(idct_4x8_internal_8bpc).main mova m4, [o(deint_shuf1)] mova m5, [o(deint_shuf2)] @@ -833,10 +834,10 @@ .pass2: call .main - jmp m(iadst_8x4_internal).end + jmp m(iadst_8x4_internal_8bpc).end ALIGN function_align -.main: +cglobal_label .main mova m6, [o(pd_2048)] IDCT4_1D 0, 1, 2, 3, 4, 5, 6 ret @@ -846,7 +847,7 @@ INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity -cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -855,7 +856,7 @@ shufps m0, m0, q1032 shufps m1, m1, q1032 - call m(iadst_4x8_internal).main + call m(iadst_4x8_internal_8bpc).main punpckhwd m4, m0, m1 punpcklwd m0, m1 @@ -895,7 +896,7 @@ RET ALIGN function_align -.main: +cglobal_label .main punpckhwd m6, m0, m2 ;unpacked in0 in2 punpcklwd m0, m2 ;unpacked in0 in2 punpckhwd m7, m1, m3 ;unpacked in1 in3 @@ -964,7 +965,7 @@ INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity -cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -973,7 +974,7 @@ shufps m0, m0, q1032 shufps m1, m1, q1032 - call m(iadst_4x8_internal).main + call m(iadst_4x8_internal_8bpc).main punpckhwd m5, m3, m2 punpcklwd m3, m2 @@ -994,21 
+995,21 @@ jmp tx2q .pass2: - call m(iadst_8x4_internal).main + call m(iadst_8x4_internal_8bpc).main mova m4, m0 mova m5, m1 mova m0, m3 mova m1, m2 mova m2, m5 mova m3, m4 - jmp m(iadst_8x4_internal).end + jmp m(iadst_8x4_internal_8bpc).end INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity -cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] @@ -1043,7 +1044,7 @@ paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 - jmp m(iadst_8x4_internal).end + jmp m(iadst_8x4_internal_8bpc).end %macro INV_TXFM_8X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x8, 8, 16*4 @@ -1060,7 +1061,7 @@ pmulhrsw m0, m2 .end: mov r3d, 2 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8).end3)] + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)] .loop: WRITE_8X4 0, 0, 0, 0, 1, 2, 3 lea dstq, [dstq+strideq*2] @@ -1110,7 +1111,7 @@ INV_TXFM_8X8_FN dct, flipadst INV_TXFM_8X8_FN dct, identity -cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: @@ -1127,7 +1128,7 @@ REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, [rsp+gprsize+16*0] -.pass1_end3: +cglobal_label .pass1_end3 punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 @@ -1161,7 +1162,7 @@ jmp tx2q .pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: call .main @@ -1189,7 +1190,7 @@ ret ALIGN function_align -.main: +cglobal_label .main mova [rsp+gprsize*2+16*0], m7 mova [rsp+gprsize*2+16*1], m3 mova [rsp+gprsize*2+16*2], m1 @@ -1222,7 +1223,7 @@ INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity -cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: @@ -1238,11 +1239,11 @@ pxor m6, m6 psubw m6, m7 mova m7, m6 - jmp m(idct_8x8_internal).pass1_end2 + jmp m(idct_8x8_internal_8bpc).pass1_end2 ALIGN function_align .pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: call .main @@ -1255,10 +1256,10 @@ pxor m6, m6 psubw m6, m7 mova m7, m6 - jmp m(idct_8x8_internal).end2 + jmp m(idct_8x8_internal_8bpc).end2 ALIGN function_align -.main: +cglobal_label .main mova [rsp+gprsize*2+16*0], m7 mova [rsp+gprsize*2+16*1], m3 mova [rsp+gprsize*2+16*2], m4 @@ -1343,7 +1344,7 @@ mova m6, [rsp+gprsize*2+16*2] ret ALIGN function_align -.main_pass2_end: +cglobal_label .main_pass2_end paddsw m7, m4, m3 ;t2 + t3 psubsw m4, m3 ;t2 - t3 paddsw m3, m5, m2 ;t6 + t7 @@ -1361,12 +1362,12 @@ INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity -cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: - call m(iadst_8x8_internal).main - call m(iadst_8x8_internal).main_pass1_end + call m(iadst_8x8_internal_8bpc).main + call m(iadst_8x8_internal_8bpc).main_pass1_end .pass1_end: mova m7, [o(pw_m16384)] @@ -1388,15 +1389,15 @@ pmulhrsw m0, [rsp+gprsize+16*0] REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, m6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 ALIGN 
function_align .pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: - call m(iadst_8x8_internal).main - call m(iadst_8x8_internal).main_pass2_end + call m(iadst_8x8_internal_8bpc).main + call m(iadst_8x8_internal_8bpc).main_pass2_end .end: mova m7, [o(pw_2048)] @@ -1415,21 +1416,21 @@ pmulhrsw m0, [rsp+gprsize+16*0] mova m3, m5 mova [rsp+gprsize+16*0], m7 - jmp m(idct_8x8_internal).end3 + jmp m(idct_8x8_internal_8bpc).end3 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity -cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 ALIGN function_align .pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .end: pmulhrsw m7, [o(pw_4096)] @@ -1438,7 +1439,7 @@ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 mova [rsp+gprsize+16*2], m5 mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).end3 + jmp m(idct_8x8_internal_8bpc).end3 %macro INV_TXFM_4X16_FN 2 ; type1, type2 @@ -1469,8 +1470,8 @@ INV_TXFM_4X16_FN dct, flipadst INV_TXFM_4X16_FN dct, identity -cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(idct_4x8_internal).pass1)] +cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_4x8_internal_8bpc).pass1)] .pass1: mova m0, [coeffq+16*1] @@ -1478,7 +1479,7 @@ mova m2, [coeffq+16*5] mova m3, [coeffq+16*7] push tx2q - lea tx2q, [o(m(idct_4x16_internal).pass1_2)] + lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)] jmp r3 .pass1_2: @@ -1490,7 +1491,7 @@ mova m1, [coeffq+16*2] mova m2, [coeffq+16*4] mova m3, [coeffq+16*6] - lea tx2q, [o(m(idct_4x16_internal).pass1_end)] + lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)] jmp r3 .pass1_end: @@ -1507,7 +1508,7 @@ jmp tx2q .pass2: - call m(idct_16x4_internal).main + call m(idct_16x4_internal_8bpc).main .end: mova m7, [o(pw_2048)] @@ -1538,13 +1539,13 @@ INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity -cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iadst_4x8_internal).pass1)] - jmp m(idct_4x16_internal).pass1 +cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)] + jmp m(idct_4x16_internal_8bpc).pass1 .pass2: - call m(iadst_16x4_internal).main - call m(iadst_16x4_internal).main_pass2_end + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass2_end punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 punpckhqdq m4, m5 ;low: out8 high: out10 @@ -1606,13 +1607,13 @@ INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity -cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iflipadst_4x8_internal).pass1)] - jmp m(idct_4x16_internal).pass1 +cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)] + jmp m(idct_4x16_internal_8bpc).pass1 .pass2: - call m(iadst_16x4_internal).main - call m(iadst_16x4_internal).main_pass2_end + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass2_end punpckhqdq m6, m5, m4 ;low: out5 high: out7 punpcklqdq m4, m5 ;low: -out8 high: -out10 @@ -1628,7 +1629,7 @@ punpckhqdq m2, m3 
;low: out1 high: out3 mova m7, [o(pw_m2048)] - jmp m(iadst_4x16_internal).end1 + jmp m(iadst_4x16_internal_8bpc).end1 INV_TXFM_4X16_FN identity, dct @@ -1646,7 +1647,7 @@ paddsw m%1, m%2 %endmacro -cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*1] mova m6, [o(pw_1697x8)] mova m1, [coeffq+16*3] @@ -1672,7 +1673,7 @@ pcmpeqw m3, m7 pandn m2, m4 pandn m3, m5 - jmp m(iadst_4x8_internal).pass1_end + jmp m(iadst_4x8_internal_8bpc).pass1_end .pass1_2: mova [coeffq+16*1], m0 mova [coeffq+16*3], m1 @@ -1704,7 +1705,7 @@ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [coeffq+16*7] mova [coeffq+16*4], m4 - jmp m(iadst_4x16_internal).end2 + jmp m(iadst_4x16_internal_8bpc).end2 %macro INV_TXFM_16X4_FN 2 ; type1, type2 @@ -1715,7 +1716,7 @@ movd m2, [o(pw_16384)] mov [coeffq], eobd mov r2d, 2 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)] + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)] .dconly: pmulhrsw m0, m2 movd m2, [o(pw_2048)] ;intentionally rip-relative @@ -1806,7 +1807,7 @@ INV_TXFM_16X4_FN dct, flipadst INV_TXFM_16X4_FN dct, identity -cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call .main @@ -1850,7 +1851,7 @@ jmp tx2q .pass2: - lea tx2q, [o(m(idct_8x4_internal).pass2)] + lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)] .pass2_end: mova [coeffq+16*4], m4 @@ -1868,7 +1869,7 @@ jmp tx2q ALIGN function_align -.main: +cglobal_label .main punpckhqdq m7, m0, m1 ;low:in1 high:in3 punpcklqdq m0, m1 punpcklqdq m1, m2, m3 @@ -1911,7 +1912,7 @@ INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity -cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call .main call .main_pass1_end @@ -1939,14 +1940,14 @@ REPX {pmulhrsw x, m2}, m7, m3, m6 pmulhrsw m2, [coeffq+16*7] mova [coeffq+16*6], m7 - jmp m(idct_16x4_internal).pass1_end3 + jmp m(idct_16x4_internal_8bpc).pass1_end3 .pass2: - lea tx2q, [o(m(iadst_8x4_internal).pass2)] - jmp m(idct_16x4_internal).pass2_end + lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end ALIGN function_align -.main: +cglobal_label .main mova [coeffq+16*6], m0 pshufd m0, m1, q1032 pshufd m2, m2, q1032 @@ -2069,7 +2070,7 @@ mova m3, [coeffq+16*5] ret ALIGN function_align -.main_pass2_end: +cglobal_label .main_pass2_end mova m7, [o(pw_2896x8)] punpckhqdq m6, m2, m1 ;low:t11 high:t15a punpcklqdq m2, m1 ;low:t10 high:t14a @@ -2095,10 +2096,10 @@ INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity -cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 - call m(iadst_16x4_internal).main - call m(iadst_16x4_internal).main_pass1_end + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass1_end punpcklwd m6, m7, m0 ;packed out11, out15 punpckhwd m0, m7 ;packed -out0, -out4 @@ -2114,11 +2115,11 @@ punpckhwd m1, m2 ;packed -out2, -out6 mova m7, [o(pw_m16384)] - jmp m(iadst_16x4_internal).pass1_end + jmp m(iadst_16x4_internal_8bpc).pass1_end .pass2: - lea tx2q, [o(m(iflipadst_8x4_internal).pass2)] - jmp m(idct_16x4_internal).pass2_end + lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)] + jmp 
m(idct_16x4_internal_8bpc).pass2_end INV_TXFM_16X4_FN identity, dct @@ -2126,7 +2127,7 @@ INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity -cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m1, [coeffq+16*6] mova m0, [coeffq+16*5] mova m2, [coeffq+16*7] @@ -2176,11 +2177,11 @@ punpcklwd m4, m6 ;packed out8, out12 punpckhwd m6, m5, m7 ;packed out11, out15 punpcklwd m5, m7 ;packed out10, out14 - jmp m(idct_16x4_internal).pass1_end3 + jmp m(idct_16x4_internal_8bpc).pass1_end3 .pass2: - lea tx2q, [o(m(iidentity_8x4_internal).pass2)] - jmp m(idct_16x4_internal).pass2_end + lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end %macro SAVE_8ROWS 2 ;src, stride @@ -2209,8 +2210,8 @@ pmulhrsw m0, m1 pmulhrsw m0, m2 mov r3d, 4 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16).end)] - jmp m(inv_txfm_add_dct_dct_8x8).loop + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop .end: RET %endif @@ -2221,13 +2222,13 @@ INV_TXFM_8X16_FN dct, flipadst INV_TXFM_8X16_FN dct, identity -cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(idct_8x8_internal).pass1)] +cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_8x8_internal_8bpc).pass1)] .pass1: LOAD_8ROWS coeffq+16*1, 32, 1 mov [rsp+gprsize+16*11], tx2q - lea tx2q, [o(m(idct_8x16_internal).pass1_end)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)] jmp r3 .pass1_end: @@ -2237,7 +2238,7 @@ jmp r3 .pass2: - lea tx2q, [o(m(idct_8x16_internal).end)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] .pass2_pre: mova [coeffq+16*2 ], m1 @@ -2253,7 +2254,7 @@ mova m7, [coeffq+16*13] .pass2_main: - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*2 ] @@ -2264,18 +2265,18 @@ mova m5, [coeffq+16*7 ] mova m6, [coeffq+16*11] mova m7, [coeffq+16*15] - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mov r3, dstq lea dstq, [dstq+strideq*8] - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 @@ -2287,12 +2288,12 @@ INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity -cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iadst_8x8_internal).pass1)] - jmp m(idct_8x16_internal).pass1 +cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)] + jmp m(idct_8x16_internal_8bpc).pass1 .pass2: - lea tx2q, [o(m(iadst_8x16_internal).end)] + lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] .pass2_pre: mova [rsp+gprsize+16*7], m0 @@ -2318,19 +2319,19 @@ mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*11] - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end mov r3, dstq lea dstq, [dstq+strideq*8] - jmp m(iadst_8x8_internal).end + jmp m(iadst_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp 
m(iadst_8x8_internal).end + jmp m(iadst_8x8_internal_8bpc).end INV_TXFM_8X16_FN flipadst, dct @@ -2338,12 +2339,12 @@ INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity -cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iflipadst_8x8_internal).pass1)] - jmp m(idct_8x16_internal).pass1 +cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)] + jmp m(idct_8x16_internal_8bpc).pass1 .pass2: - lea tx2q, [o(m(iflipadst_8x16_internal).end)] + lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)] lea r3, [dstq+strideq*8] .pass2_pre: @@ -2370,16 +2371,16 @@ mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*11] - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end - jmp m(iflipadst_8x8_internal).end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + jmp m(iflipadst_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(iflipadst_8x8_internal).end + jmp m(iflipadst_8x8_internal_8bpc).end INV_TXFM_8X16_FN identity, dct @@ -2387,22 +2388,22 @@ INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity -cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 32, 1 mov r3, tx2q - lea tx2q, [o(m(iidentity_8x16_internal).pass1_end)] + lea tx2q, [o(m(iidentity_8x16_internal_8bpc).pass1_end)] mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 32, 1 mov tx2q, r3 mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass2: - lea tx2q, [o(m(iidentity_8x16_internal).end1)] + lea tx2q, [o(m(iidentity_8x16_internal_8bpc).end1)] .end: mova [rsp+gprsize+16*0], m7 @@ -2420,11 +2421,11 @@ mova [rsp+gprsize+16*0], m5 mova [rsp+gprsize+16*1], m6 mova [rsp+gprsize+16*2], m7 - jmp m(idct_8x8_internal).end3 + jmp m(idct_8x8_internal_8bpc).end3 .end1: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] jmp .end @@ -2438,8 +2439,8 @@ mov [coeffq], eobd pmulhrsw m0, m1 mov r2d, 4 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET %endif @@ -2450,38 +2451,38 @@ INV_TXFM_16X8_FN dct, flipadst INV_TXFM_16X8_FN dct, identity -cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*0, 32, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*1, 32, 1 call .main mov r3, tx2q - lea tx2q, [o(m(idct_16x8_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_16x8_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 - jmp m(idct_8x8_internal).pass1_end + jmp m(idct_8x8_internal_8bpc).pass1_end .pass2: - lea tx2q, 
[o(m(idct_16x8_internal).end)] + lea tx2q, [o(m(idct_16x8_internal_8bpc).end)] lea r3, [dstq+8] - jmp m(idct_8x8_internal).pass2_main + jmp m(idct_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(idct_8x8_internal).pass2_main + jmp m(idct_8x8_internal_8bpc).pass2_main ALIGN function_align -.main: +cglobal_label .main mova [rsp+gprsize*2+16*1], m2 mova [rsp+gprsize*2+16*2], m6 mova [rsp+gprsize*2+32*5], m5 @@ -2567,7 +2568,7 @@ INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity -cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [o(pw_2896x8)] pmulhrsw m0, m7, [coeffq+16*0 ] pmulhrsw m1, m7, [coeffq+16*1 ] @@ -2597,29 +2598,29 @@ call .main call .main_pass1_end mov r3, tx2q - lea tx2q, [o(m(iadst_16x8_internal).pass1_end)] - jmp m(iadst_8x8_internal).pass1_end + lea tx2q, [o(m(iadst_16x8_internal_8bpc).pass1_end)] + jmp m(iadst_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 - jmp m(iadst_8x8_internal).pass1_end + jmp m(iadst_8x8_internal_8bpc).pass1_end .pass2: - lea tx2q, [o(m(iadst_16x8_internal).end)] + lea tx2q, [o(m(iadst_16x8_internal_8bpc).end)] lea r3, [dstq+8] - jmp m(iadst_8x8_internal).pass2_main + jmp m(iadst_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(iadst_8x8_internal).pass2_main + jmp m(iadst_8x8_internal_8bpc).pass2_main ALIGN function_align -.main: +cglobal_label .main mova [rsp+gprsize*2+16*0], m1 mova [rsp+gprsize*2+16*1], m2 mova [rsp+gprsize*2+16*2], m6 @@ -2812,7 +2813,7 @@ mova m6, [rsp+gprsize*2+16*15] ;out14 ret ALIGN function_align -.main_pass2_end: +cglobal_label .main_pass2_end mova m7, [o(pw_2896x8)] mova m1, [rsp+gprsize*2+16* 9] mova m2, [rsp+gprsize*2+16*14] @@ -2847,7 +2848,7 @@ INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity -cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [o(pw_2896x8)] pmulhrsw m0, m7, [coeffq+16*0 ] pmulhrsw m1, m7, [coeffq+16*1 ] @@ -2874,34 +2875,34 @@ pmulhrsw m6, m7, [coeffq+16*12] pmulhrsw m7, [coeffq+16*13] - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end mova m7, [rsp+gprsize+16*0] SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov r3, tx2q - lea tx2q, [o(m(iflipadst_16x8_internal).pass1_end)] - jmp m(iflipadst_8x8_internal).pass1_end + lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).pass1_end)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 32 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 - jmp m(iflipadst_8x8_internal).pass1_end + jmp m(iflipadst_8x8_internal_8bpc).pass1_end .pass2: - lea tx2q, [o(m(iflipadst_16x8_internal).end)] + lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).end)] lea r3, [dstq+8] - jmp m(iflipadst_8x8_internal).pass2_main + jmp m(iflipadst_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - 
jmp m(iflipadst_8x8_internal).pass2_main + jmp m(iflipadst_8x8_internal_8bpc).pass2_main INV_TXFM_16X8_FN identity, dct @@ -2909,14 +2910,14 @@ INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity -cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16*16 mova m4, [coeffq-16*7] mova m5, [coeffq-16*5] mova m6, [coeffq-16*3] mova m7, [coeffq-16*1] mov r3, tx2q - lea tx2q, [o(m(iidentity_16x8_internal).pass1_end)] + lea tx2q, [o(m(iidentity_16x8_internal_8bpc).pass1_end)] .pass1: mova m0, [o(pw_2896x8)] @@ -2955,7 +2956,7 @@ pmulhrsw m6, m3 mova m3, [rsp+gprsize+16*0] paddsw m0, m6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: mova [coeffq+16*1], m4 @@ -2974,15 +2975,15 @@ jmp .pass1 .pass2: - lea tx2q, [o(m(iidentity_16x8_internal).end)] + lea tx2q, [o(m(iidentity_16x8_internal_8bpc).end)] lea r3, [dstq+8] - jmp m(iidentity_8x8_internal).end + jmp m(iidentity_8x8_internal_8bpc).end .end: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(iidentity_8x8_internal).end + jmp m(iidentity_8x8_internal_8bpc).end %macro INV_TXFM_16X16_FN 2 ; type1, type2 @@ -2993,8 +2994,8 @@ movd m2, [o(pw_8192)] mov [coeffq], eobd mov r2d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET %endif @@ -3005,35 +3006,35 @@ INV_TXFM_16X16_FN dct, flipadst INV_TXFM_16X16_FN dct, identity -cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*3, 64 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mov r3, tx2q - lea tx2q, [o(m(idct_16x16_internal).pass1_end)] + lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end)] mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x16_internal).pass1_end1)] + lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end1)] mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*2, 64 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x16_internal).pass1_end2)] + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end2)] mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 @@ -3041,19 +3042,19 @@ mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass2: - lea tx2q, [o(m(idct_16x16_internal).end)] - jmp m(idct_8x16_internal).pass2_pre + lea tx2q, [o(m(idct_16x16_internal_8bpc).end)] + jmp m(idct_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, 
[o(m(idct_16x16_internal).end1)] + lea tx2q, [o(m(idct_16x16_internal_8bpc).end1)] mov dstq, r3 lea r3, [dstq+8] - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 @@ -3070,8 +3071,8 @@ mova m5, [coeffq+16*5 ] mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*13] - lea tx2q, [o(m(idct_8x16_internal).end)] - jmp m(idct_8x16_internal).pass2_main + lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] + jmp m(idct_8x16_internal_8bpc).pass2_main %macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 @@ -3132,33 +3133,33 @@ INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst -cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX_16X16_ADST_LOAD_ODD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end mov r3, tx2q - lea tx2q, [o(m(iadst_16x16_internal).pass1_end)] + lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end)] mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 + jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iadst_16x16_internal).pass1_end1)] + lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end1)] mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 + jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*1, 32 ITX_16X16_ADST_LOAD_EVEN_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end - lea tx2q, [o(m(iadst_16x16_internal).pass1_end2)] + lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end2)] mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 + jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 @@ -3166,19 +3167,19 @@ mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 + jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass2: - lea tx2q, [o(m(iadst_16x16_internal).end)] - jmp m(iadst_8x16_internal).pass2_pre + lea tx2q, [o(m(iadst_16x16_internal_8bpc).end)] + jmp m(iadst_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iadst_16x16_internal).end1)] + lea tx2q, [o(m(iadst_16x16_internal_8bpc).end1)] mov dstq, r3 lea r3, [dstq+8] - jmp m(iadst_8x8_internal).end + jmp m(iadst_8x8_internal_8bpc).end .end1: pxor m7, m7 @@ -3199,45 +3200,45 @@ mova [rsp+gprsize+16*8], m5 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 - lea tx2q, [o(m(iadst_8x16_internal).end)] - jmp m(iadst_8x16_internal).pass2_main + lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] + jmp m(iadst_8x16_internal_8bpc).pass2_main INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst -cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX_16X16_ADST_LOAD_ODD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end mov r3, tx2q - lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end)] mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 + jmp 
m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end1)] mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*17, 32 ITX_16X16_ADST_LOAD_EVEN_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end mova m7, [rsp+gprsize+16*0] SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end2)] mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 @@ -3245,19 +3246,19 @@ mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass2: - lea tx2q, [o(m(iflipadst_16x16_internal).end)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end)] lea r3, [dstq+8] - jmp m(iflipadst_8x16_internal).pass2_pre + jmp m(iflipadst_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).end1)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] - jmp m(iflipadst_8x8_internal).end + jmp m(iflipadst_8x8_internal_8bpc).end .end1: pxor m7, m7 @@ -3278,16 +3279,16 @@ mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).end2)] + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end2)] mov dstq, r3 - jmp m(iflipadst_8x16_internal).pass2_main + jmp m(iflipadst_8x16_internal_8bpc).pass2_main .end2: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] - jmp m(iflipadst_8x8_internal).end + jmp m(iflipadst_8x8_internal_8bpc).end %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 @@ -3299,10 +3300,10 @@ INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity -cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16*17 mov r3, tx2q - lea tx2q, [o(m(iidentity_16x16_internal).pass1_end)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end)] .pass1: mova m6, [o(pw_1697x16)] @@ -3318,18 +3319,18 @@ IDTX16B 5, 7, 6 mova m7, [coeffq+32*7] IDTX16B 7, 6, 6 - jmp m(idct_8x8_internal).pass1_end3 + jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: SAVE_8ROWS coeffq, 32 sub coeffq, 16 - lea tx2q, [o(m(iidentity_16x16_internal).pass1_end1)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end1)] jmp .pass1 .pass1_end1: SAVE_8ROWS coeffq, 32 sub coeffq, 15*16 - lea tx2q, [o(m(iidentity_16x16_internal).pass1_end2)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end2)] jmp .pass1 .pass1_end2: @@ -3340,7 +3341,7 @@ .pass2: lea r3, [dstq+8] - lea tx2q, [o(m(iidentity_16x16_internal).end1)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end1)] .end: mova [rsp+gprsize+16*0], m7 @@ -3359,11 +3360,11 @@ REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6 pmulhrsw m4, m5 mova [rsp+gprsize+16*0], m6 - jmp 
m(idct_8x8_internal).end3 + jmp m(idct_8x8_internal_8bpc).end3 .end1: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(iidentity_16x16_internal).end2)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end2)] lea dstq, [dstq+strideq*2] jmp .end @@ -3373,24 +3374,24 @@ add coeffq, 32*8 LOAD_8ROWS coeffq, 32 - lea tx2q, [o(m(iidentity_16x16_internal).end3)] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end3)] mov dstq, r3 jmp .end .end3: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] jmp .end -cglobal inv_txfm_add_dct_dct_8x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_8x32_internal) + call m(idct_8x32_internal_8bpc) RET .dconly: @@ -3405,24 +3406,24 @@ pshuflw m0, m0, q0000 punpcklwd m0, m0 mov r3d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x32).end)] - jmp m(inv_txfm_add_dct_dct_8x8).loop + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop .end: RET -cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp cmp eobd, 106 jle .fast LOAD_8ROWS coeffq+16*3, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1: mova [rsp+gprsize+16*9 ], m0 ;in24 @@ -3434,10 +3435,10 @@ mova [rsp+gprsize+16*27], m5 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 LOAD_8ROWS coeffq+16*2, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1_1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_1: mova [rsp+gprsize+16*7 ], m0 ;in16 @@ -3451,10 +3452,10 @@ .fast: LOAD_8ROWS coeffq+16*1, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: mova [rsp+gprsize+16*5 ], m0 ;in8 @@ -3466,10 +3467,10 @@ mova [rsp+gprsize+16*25], m5 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 LOAD_8ROWS coeffq+16*0, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: mova [rsp+gprsize+16*11], m2 ;in2 @@ -3487,7 +3488,7 @@ pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 mova m0, [rsp+gprsize+16*11] mova m1, [rsp+gprsize+16*12] @@ -3495,7 +3496,7 @@ mova m3, [rsp+gprsize+16*14] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -3507,20 +3508,20 @@ mova m5, [rsp+gprsize+16*8 ] ;in20 mova m6, [rsp+gprsize+16*9 ] ;in24 mova m7, [rsp+gprsize+16*10] ;in28 - call 
m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 LOAD_8ROWS rsp+gprsize+16*11, 16 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call .main .pass2: - lea r3, [o(m(idct_8x32_internal).end6)] + lea r3, [o(m(idct_8x32_internal_8bpc).end6)] .end: mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_8x32_internal).end2)] + lea tx2q, [o(m(idct_8x32_internal_8bpc).end2)] .end1: pxor m7, m7 @@ -3532,29 +3533,29 @@ jmp tx2q .end2: - lea tx2q, [o(m(idct_8x32_internal).end3)] - jmp m(idct_8x8_internal).end + lea tx2q, [o(m(idct_8x32_internal_8bpc).end3)] + jmp m(idct_8x8_internal_8bpc).end .end3: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] - lea tx2q, [o(m(idct_8x32_internal).end4)] - jmp m(idct_8x8_internal).end + lea tx2q, [o(m(idct_8x32_internal_8bpc).end4)] + jmp m(idct_8x8_internal_8bpc).end .end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] - lea tx2q, [o(m(idct_8x32_internal).end5)] - jmp m(idct_8x8_internal).end + lea tx2q, [o(m(idct_8x32_internal_8bpc).end5)] + jmp m(idct_8x8_internal_8bpc).end .end5: LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] mov tx2q, r3 - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end6: ret @@ -3870,13 +3871,13 @@ ret -cglobal inv_txfm_add_dct_dct_32x8, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_32x8_internal) + call m(idct_32x8_internal_8bpc) RET .dconly: @@ -3885,7 +3886,7 @@ movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] .body: pmulhrsw m0, m2 @@ -3920,14 +3921,14 @@ RET -cglobal idct_32x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp LOAD_8ROWS coeffq+16*0, 64 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*2, 64 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -3943,7 +3944,7 @@ cmp eobd, 106 jg .full - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast jmp .pass2 .full: @@ -3956,66 +3957,66 @@ mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main .pass2: mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x8_internal).end)] - jmp m(idct_8x32_internal).end1 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end)] + jmp m(idct_8x32_internal_8bpc).end1 .end: mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .end1: lea r3, [dstq+8] - lea tx2q, [o(m(idct_32x8_internal).end2)] - jmp m(idct_8x8_internal).pass2_main + lea tx2q, [o(m(idct_32x8_internal_8bpc).end2)] + jmp m(idct_8x8_internal_8bpc).pass2_main .end2: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end3)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, 
[o(m(idct_32x8_internal_8bpc).end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .end3: mov dstq, r3 add r3, 8 - lea tx2q, [o(m(idct_32x8_internal).end4)] - jmp m(idct_8x8_internal).pass2_main + lea tx2q, [o(m(idct_32x8_internal_8bpc).end4)] + jmp m(idct_8x8_internal_8bpc).pass2_main .end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end5)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .end5: mov dstq, r3 add r3, 8 - lea tx2q, [o(m(idct_32x8_internal).end6)] - jmp m(idct_8x8_internal).pass2_main + lea tx2q, [o(m(idct_32x8_internal_8bpc).end6)] + jmp m(idct_8x8_internal_8bpc).pass2_main .end6: LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end7)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .end7: mov dstq, r3 - lea tx2q, [o(m(idct_32x8_internal).end8)] - jmp m(idct_8x8_internal).pass2_main + lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] + jmp m(idct_8x8_internal_8bpc).pass2_main .end8: ret -cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 mov r5d, 4 mov tx2d, 2 cmp eobd, 107 @@ -4024,19 +4025,19 @@ %if ARCH_X86_32 LEA r5, $$ %endif - lea tx2q, [o(m(idct_32x8_internal).end8)] + lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] .loop: LOAD_8ROWS coeffq+16*0, 64 paddsw m6, [o(pw_5)] mova [rsp+16*1], m6 mova m6, [o(pw_5)] REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7 - call m(idct_8x8_internal).pass1_end3 + call m(idct_8x8_internal_8bpc).pass1_end3 REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 mova [rsp+16*2], m5 mova [rsp+16*1], m6 mova [rsp+16*0], m7 - call m(idct_8x8_internal).end3 + call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] pxor m7, m7 REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 @@ -4045,7 +4046,7 @@ jg .loop RET -cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 mov r5d, 4 mov tx2d, 2 cmp eobd, 107 @@ -4061,15 +4062,15 @@ mova [rsp+16*1], m6 mova m6, [o(pw_4096)] REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 - lea tx2q, [o(m(idct_32x8_internal).end8)] - call m(idct_8x8_internal).pass1_end3 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] + call m(idct_8x8_internal_8bpc).pass1_end3 mov [rsp+16*3], dstq mova [rsp+16*2], m5 mova [rsp+16*1], m6 mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_8x8_internal).end4)] - call m(idct_8x8_internal).end3 + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + call m(idct_8x8_internal_8bpc).end3 add coeffq, 16*8 mov dstq, [rsp+16*3] @@ -4080,13 +4081,13 @@ RET -cglobal inv_txfm_add_dct_dct_16x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_16x32_internal) + call m(idct_16x32_internal_8bpc) RET .dconly: @@ -4096,29 +4097,29 @@ mov [coeffq], eobd pmulhrsw m0, m1 mov r2d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x32).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET -cglobal 
idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp LOAD_8ROWS coeffq+16*1, 128, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*5, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*33, 64 ;in8~in15 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: mova [coeffq+16*1 ], m0 ;in8 @@ -4130,19 +4131,19 @@ mova [rsp+gprsize+16*25], m5 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 LOAD_8ROWS coeffq+16*0, 128, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*4, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: mova [rsp+gprsize+16*11], m2 ;in2 @@ -4160,7 +4161,7 @@ mova m3, [coeffq+16*5 ] ;in12 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [rsp+gprsize+16*11] ;in2 mova m1, [rsp+gprsize+16*12] ;in6 @@ -4168,11 +4169,11 @@ mova m3, [rsp+gprsize+16*14] ;in14 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast jmp .pass2 .full: @@ -4180,19 +4181,19 @@ mova [coeffq+16*4 ], m4 ;in4 LOAD_8ROWS coeffq+16*2, 128, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*6, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end5: mova [coeffq+16*2 ], m0 ;in16 @@ -4205,19 +4206,19 @@ mova [rsp+gprsize+16*32], m7 ;in23 LOAD_8ROWS coeffq+16*3, 128, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*7, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end + call 
m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end6: SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end7: mova [rsp+gprsize+16*17], m2 ;in26 @@ -4235,21 +4236,21 @@ mova m3, [coeffq+16*5 ] ;in12 mova m4, [coeffq+16*2 ] ;in16 mova m5, [coeffq+16*6 ] ;in20 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 LOAD_8ROWS rsp+gprsize+16*11, 16 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main .pass2: mov [rsp+gprsize*1+16*35], eobd lea r3, [dstq+8] mov [rsp+gprsize*2+16*35], r3 - lea r3, [o(m(idct_16x32_internal).end)] - jmp m(idct_8x32_internal).end + lea r3, [o(m(idct_16x32_internal_8bpc).end)] + jmp m(idct_8x32_internal_8bpc).end .end: mov dstq, [rsp+gprsize*2+16*35] @@ -4284,7 +4285,7 @@ pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] ;in2 @@ -4293,11 +4294,11 @@ mova m3, [coeffq+16*25] ;in14 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast jmp .end1 .full1: @@ -4305,7 +4306,7 @@ mova m5, [coeffq+16*18] ;in20 mova m6, [coeffq+16*3 ] ;in24 mova m7, [coeffq+16*19] ;in26 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] ;in2 @@ -4316,7 +4317,7 @@ mova m5, [coeffq+16*26] ;in22 mova m6, [coeffq+16*11] ;in26 mova m7, [coeffq+16*27] ;in30 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -4338,46 +4339,46 @@ mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main .end1: - jmp m(idct_8x32_internal).pass2 + jmp m(idct_8x32_internal_8bpc).pass2 -cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_32x16_internal) - call m(idct_8x16_internal).pass2 + call m(idct_32x16_internal_8bpc) + call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*11, 16 mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end - call m(idct_8x16_internal).pass2 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*19, 16 mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end - call m(idct_8x16_internal).pass2 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS 
rsp+16*27, 16 mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end - call m(idct_8x16_internal).pass2 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 RET .dconly: @@ -4387,22 +4388,22 @@ mov [coeffq], eobd pmulhrsw m0, m1 mov r3d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] - jmp m(inv_txfm_add_dct_dct_32x8).body + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body -cglobal idct_32x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp add coeffq, 16 - lea r3, [o(m(idct_32x16_internal).pass1_end1)] + lea r3, [o(m(idct_32x16_internal_8bpc).pass1_end1)] .pass1: LOAD_8ROWS coeffq+16*0, 128, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*4, 128, 1 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -4425,46 +4426,46 @@ mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main .pass1_end: mova [rsp+gprsize+16*0 ], m7 mov tx2q, r3 - jmp m(idct_8x8_internal).pass1_end + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+16*32, 32 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+16*48, 32 sub coeffq, 16 - lea r3, [o(m(idct_32x16_internal).end)] + lea r3, [o(m(idct_32x16_internal_8bpc).end)] jmp .pass1 .end: ret -cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, eobd @@ -4488,8 +4489,8 @@ mova [rsp+16*1], m6 pxor m6, m6 REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end3 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 mova [rsp+16*0], m2 mova [rsp+16*1], m3 mova [rsp+16*2], m4 @@ -4513,7 +4514,7 @@ pmulhrsw m2, m4 pmulhrsw m3, m4 pmulhrsw m4, m7 - call m(idct_8x8_internal).end3 + call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] add coeffq, 16 dec r3d @@ -4529,7 +4530,7 @@ RET -cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 12 ;0100b @@ -4550,8 +4551,8 @@ LOAD_8ROWS coeffq, 32, 1 REPX {paddsw x, 
x}, m0, m1, m2, m3, m4, m5, m6, m7 mova [rsp+16*1], m6 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end3 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 mova [rsp+16*1], m5 mova [rsp+16*2], m6 mova m6, [o(pw_1697x16)] @@ -4566,7 +4567,7 @@ REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 mova [rsp+16*2], m5 mova [rsp+16*1], m7 - call m(idct_8x8_internal).end3 + call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] pxor m7, m7 REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 @@ -4589,14 +4590,14 @@ RET -cglobal inv_txfm_add_dct_dct_32x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_32x32_internal) + call m(idct_32x32_internal_8bpc) RET .dconly: @@ -4605,11 +4606,11 @@ movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] - jmp m(inv_txfm_add_dct_dct_32x8).body + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body -cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 2 @@ -4641,10 +4642,10 @@ .full: LOAD_8ROWS coeffq+64*0, 64*4 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*2, 64*4 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -4658,7 +4659,7 @@ mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main jmp .pass1_end .fast: @@ -4668,7 +4669,7 @@ mova m3, [coeffq+256*3] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+128*1] @@ -4677,41 +4678,41 @@ mova m3, [coeffq+128*7] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast .pass1_end: mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end4)] + jmp 
m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS coeffq+64*24, 64 @@ -4724,7 +4725,7 @@ .pass2: mov coeffq, [rsp+gprsize*2+16*35] mov r3d, 4 - lea tx2q, [o(m(idct_32x32_internal).pass2_end)] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] .pass2_loop: mov [rsp+gprsize*3+16*35], r3d @@ -4761,7 +4762,7 @@ mova m5, [coeffq+16*18] mova m6, [coeffq+16*3 ] mova m7, [coeffq+16*19] - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] @@ -4772,7 +4773,7 @@ mova m5, [coeffq+16*26] mova m6, [coeffq+16*11] mova m7, [coeffq+16*27] - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -4793,7 +4794,7 @@ mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main jmp tx2q .fast1: @@ -4803,7 +4804,7 @@ mova m3, [coeffq+16*17] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] @@ -4812,19 +4813,19 @@ mova m3, [coeffq+16*25] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast jmp tx2q .pass2_end: - lea r3, [o(m(idct_32x32_internal).pass2_end1)] - jmp m(idct_8x32_internal).end + lea r3, [o(m(idct_32x32_internal_8bpc).pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end .pass2_end1: - lea tx2q, [o(m(idct_32x32_internal).pass2_end)] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] @@ -4834,7 +4835,7 @@ ret -cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 2 @@ -4855,15 +4856,15 @@ .loop: LOAD_8ROWS coeffq, 64 mova [rsp+16*1], m6 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end3 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 pmulhrsw m7, [o(pw_8192)] mova [rsp+16*0], m7 mova m7, [o(pw_8192)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 mova [rsp+16*1], m6 mova [rsp+16*2], m5 - call m(idct_8x8_internal).end3 + call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] pxor m7, m7 @@ -4891,14 +4892,14 @@ RET -cglobal inv_txfm_add_dct_dct_16x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_16x64_internal) + call m(idct_16x64_internal_8bpc) RET .dconly: @@ -4907,14 +4908,14 @@ movd m2, [o(pw_8192)] mov [coeffq], eobd mov r2d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET -cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 2 @@ -4931,21 +4932,21 @@ .pass1_loop: LOAD_8ROWS coeffq+64*0, 64*2 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 
LOAD_8ROWS coeffq+64*1, 64*2 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_16x64_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_16x64_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 @@ -4958,7 +4959,7 @@ mov r3d, 2 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] .pass2_loop: mov [rsp+gprsize*3+16*67], r3d @@ -4993,7 +4994,7 @@ mova m3, [coeffq+16*3] REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 @@ -5003,7 +5004,7 @@ mova m3, [coeffq+16*19] REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -5024,7 +5025,7 @@ mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*6 ] ;in17 @@ -5049,7 +5050,7 @@ .fast: REPX {mova x, m4}, m2, m3, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 @@ -5057,7 +5058,7 @@ mova m1, [coeffq+16*17] REPX {mova x, m4}, m2, m3, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -5070,7 +5071,7 @@ mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 - call m(idct_8x32_internal).main_veryfast + call m(idct_8x32_internal_8bpc).main_veryfast SAVE_8ROWS rsp+gprsize+16*3, 16 call .main_fast @@ -5079,14 +5080,14 @@ LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov r3, r4 - jmp m(idct_8x32_internal).end2 + jmp m(idct_8x32_internal_8bpc).end2 .end1: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] add rsp, 16*32 - lea r3, [o(m(idct_16x64_internal).end2)] - jmp m(idct_8x32_internal).end + lea r3, [o(m(idct_16x64_internal_8bpc).end2)] + jmp m(idct_8x32_internal_8bpc).end .end2: add coeffq, 16*32 @@ -5096,7 +5097,7 @@ mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] dec r3d jg .pass2_loop @@ -5751,14 +5752,14 @@ ret -cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_64x16_internal) + call m(idct_64x16_internal_8bpc) RET .dconly: @@ -5767,7 +5768,7 @@ movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x16).end)] + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x16_8bpc).end)] .body: pmulhrsw m0, m2 @@ -5839,7 +5840,7 @@ mova m7, [%1+%2*3] %endmacro -cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, 2 mov [rsp+gprsize*2+16*67], dstq 
lea dstq, [rsp+gprsize+16*68] @@ -5848,14 +5849,14 @@ LOAD_4ROWS coeffq+32*0, 32*8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+32*4, 32*8 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -5869,7 +5870,7 @@ mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+32*1, 32*2 @@ -5892,69 +5893,69 @@ mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 - call m(idct_16x64_internal).main + call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+32*0, 32 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+32*8, 32 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+32*16, 32 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+32*24, 32 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS dstq+32*0, 32 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end5: SAVE_8ROWS dstq+32*8, 32 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end6: SAVE_8ROWS dstq+32*16, 32 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end7: SAVE_8ROWS dstq+32*24, 32 @@ -5974,23 +5975,23 @@ LOAD_4ROWS coeffq+16*0, 32*2 LOAD_4ROWS_H coeffq+16*1, 32*2 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS coeffq+16*2, 32*2 LOAD_4ROWS_H coeffq+16*3, 32*2 - 
call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mov r3, dstq - lea tx2q, [o(m(idct_64x16_internal).end)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).end)] lea dstq, [dstq+strideq*8] - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x16_internal).end1)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).end1)] mov dstq, r3 - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 @@ -6011,23 +6012,23 @@ LOAD_4ROWS coeffq+16*0, 32*2 LOAD_4ROWS_H coeffq+16*1, 32*2 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS coeffq+16*2, 32*2 LOAD_4ROWS_H coeffq+16*3, 32*2 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mov r3, dstq - lea tx2q, [o(m(idct_64x16_internal).end2)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).end2)] lea dstq, [dstq+strideq*8] - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end2: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x16_internal).end3)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).end3)] mov dstq, r3 - jmp m(idct_8x8_internal).end + jmp m(idct_8x8_internal_8bpc).end .end3: @@ -6041,14 +6042,14 @@ ret -cglobal inv_txfm_add_dct_dct_32x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_32x64_internal) + call m(idct_32x64_internal_8bpc) RET .dconly: @@ -6058,14 +6059,14 @@ mov [coeffq], eobd pmulhrsw m0, m1 mov r3d, 64 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x64).end)] - jmp m(inv_txfm_add_dct_dct_32x8).body + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x64_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body .end: RET -cglobal idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 2 @@ -6097,10 +6098,10 @@ .full: LOAD_8ROWS coeffq+64*0, 64*4, 1 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*2, 64*4, 1 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -6114,50 +6115,50 @@ mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main + call m(idct_8x32_internal_8bpc).main jmp .pass1_end .fast: LOAD_4ROWS coeffq, 256, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS coeffq+128*1, 256, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast .pass1_end: mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end2)] + jmp 
m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+64*24, 64 @@ -6171,18 +6172,18 @@ mov r3d, 4 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] - jmp m(idct_16x64_internal).pass2_loop + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + jmp m(idct_16x64_internal_8bpc).pass2_loop -cglobal inv_txfm_add_dct_dct_64x32, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_64x32_internal) + call m(idct_64x32_internal_8bpc) RET .dconly: @@ -6192,13 +6193,13 @@ pmulhrsw m0, m1 mov [coeffq], eobd mov r3d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)] - jmp m(inv_txfm_add_dct_dct_64x16).body + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body .end: RET -cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp mov r4d, 2 @@ -6220,14 +6221,14 @@ LOAD_4ROWS coeffq+64*0, 64*8, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+64*4, 64*8, 1 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -6241,7 +6242,7 @@ mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 - call m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*1, 64*2, 1 @@ -6264,61 +6265,61 @@ mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 - call m(idct_16x64_internal).main + call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end3)] + jmp 
m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+64*24, 64 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS dstq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end5: SAVE_8ROWS dstq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end6: SAVE_8ROWS dstq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end7: SAVE_8ROWS dstq+64*24, 64 @@ -6334,39 +6335,39 @@ mov eobd, [rsp+gprsize*1+16*67] lea dstq, [dstq+32] mov [rsp+gprsize*1+16*35], eobd - lea tx2q, [o(m(idct_64x32_internal).pass2_end)] + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)] mov r3d, 4 - jmp m(idct_32x32_internal).pass2_loop + jmp m(idct_32x32_internal_8bpc).pass2_loop .pass2_end: mova [rsp+gprsize+16*0], m7 - lea r3, [o(m(idct_64x32_internal).pass2_end1)] - jmp m(idct_8x32_internal).end2 + lea r3, [o(m(idct_64x32_internal_8bpc).pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end2 .pass2_end1: - lea tx2q, [o(m(idct_64x32_internal).pass2_end)] + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] dec r3d - jg m(idct_32x32_internal).pass2_loop + jg m(idct_32x32_internal_8bpc).pass2_loop .pass2_end2: mov dstq, [rsp+gprsize*3+16*67] mov coeffq, [rsp+gprsize*2+16*67] - lea tx2q, [o(m(idct_32x32_internal).pass2_end)] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] mov r3d, 4 - jmp m(idct_32x32_internal).pass2_loop + jmp m(idct_32x32_internal_8bpc).pass2_loop -cglobal inv_txfm_add_dct_dct_64x64, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly - call m(idct_64x64_internal) + call m(idct_64x64_internal_8bpc) RET .dconly: @@ -6375,10 +6376,10 @@ movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 64 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)] - jmp m(inv_txfm_add_dct_dct_64x16).body + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body -cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 %undef cmp mov r5d, 4 @@ -6401,14 +6402,14 @@ LOAD_4ROWS coeffq+64*0, 64*8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main + call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+64*4, 64*8 REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main + call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 @@ -6422,7 +6423,7 @@ mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 - call 
m(idct_8x32_internal).main_fast + call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*1, 64*2 @@ -6445,69 +6446,69 @@ mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 - call m(idct_16x64_internal).main + call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+64*24, 64 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS dstq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end5: SAVE_8ROWS dstq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end6: SAVE_8ROWS dstq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end1 + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end7: SAVE_8ROWS dstq+64*24, 64 @@ -6524,16 +6525,16 @@ mov r3d, 4 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_64x64_internal).pass2_end)] - jmp m(idct_16x64_internal).pass2_loop + lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] + jmp m(idct_16x64_internal_8bpc).pass2_loop .pass2_end: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] add rsp, 16*32 mova [rsp+gprsize+16*0], m7 - lea r3, [o(m(idct_64x64_internal).pass2_end1)] - jmp m(idct_8x32_internal).end2 + lea r3, [o(m(idct_64x64_internal_8bpc).pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end2 .pass2_end1: add coeffq, 16*32 @@ -6543,10 +6544,10 @@ mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_64x64_internal).pass2_end)] + lea r4, 
[o(m(idct_64x64_internal_8bpc).pass2_end)] dec r3d - jg m(idct_16x64_internal).pass2_loop + jg m(idct_16x64_internal_8bpc).pass2_loop .pass2_end2: mov coeffq, [rsp+gprsize*4+16*67] @@ -6555,5 +6556,5 @@ sub dstq, 72 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] - jmp m(idct_16x64_internal).pass2_loop + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + jmp m(idct_16x64_internal_8bpc).pass2_loop diff -Nru dav1d-0.9.0/src/x86/loopfilter16_avx2.asm dav1d-0.9.1/src/x86/loopfilter16_avx2.asm --- dav1d-0.9.0/src/x86/loopfilter16_avx2.asm 2021-05-16 16:47:22.550950800 +0000 +++ dav1d-0.9.1/src/x86/loopfilter16_avx2.asm 2021-07-28 21:38:28.905852000 +0000 @@ -623,9 +623,7 @@ paddw m8, m5 ; p6*7+p3+p1+q0 paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m2 - por m10, m9 + vpblendvb m10, m2, m10, m1 %ifidn %2, v mova [tmpq+strideq*2], m10 ; p5 %else @@ -638,9 +636,7 @@ paddw m8, m6 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m7 - por m10, m9 + vpblendvb m10, m7, m10, m1 %ifidn %2, v mova [tmpq+stride3q], m10 ; p4 %else @@ -653,9 +649,7 @@ psubw m8, m2 paddw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m11 - por m10, m9 + vpblendvb m10, m11, m10, m1 %ifidn %2, v mova [tmpq+strideq*4], m10 ; p3 lea tmpq, [dstq+strideq*4] @@ -669,9 +663,7 @@ paddw m8, m15 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m13 - por m10, m9 + vpblendvb m10, m13, m10, m1 mova [rsp+1*32], m10 ; don't clobber p2/m13 ; sub p6/p3, add p0/q4 @@ -684,9 +676,7 @@ %endif psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m3 - por m10, m9 + vpblendvb m10, m3, m10, m1 mova [rsp+2*32], m10 ; don't clobber p1/m3 ; sub p6/p2, add q0/q5 @@ -699,9 +689,7 @@ %endif psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m4 - por m10, m9 + vpblendvb m10, m4, m10, m1 mova [rsp+3*32], m10 ; don't clobber p0/m4 ; sub p6/p1, add q1/q6 @@ -715,9 +703,7 @@ paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m5 - por m10, m9 + vpblendvb m10, m5, m10, m1 mova [rsp+4*32], m10 ; don't clobber q0/m5 ; sub p5/p0, add q2/q6 @@ -726,9 +712,7 @@ paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m6 - por m2, m10, m9 ; don't clobber q1/m6 + vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6 ; sub p4/q0, add q3/q6 paddw m8, m15 @@ -736,9 +720,7 @@ paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m14 - por m7, m10, m9 ; don't clobber q2/m14 + vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14 ; sub p3/q1, add q4/q6 %ifidn %2, v @@ -750,9 +732,7 @@ paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 - pandn m9, m1, m15 - por m10, m9 + vpblendvb m10, m15, m10, m1 %ifidn %2, v mova [tmpq+mstrideq], m10 ; q3 %else @@ -769,13 +749,12 @@ paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 - pand m10, m1 %ifidn %2, v - pandn m9, m1, [tmpq+strideq*0] + mova m9, [tmpq+strideq*0] %else - pandn m9, m1, [rsp+10*32] + mova m9, [rsp+10*32] %endif - por m10, m9 + vpblendvb m10, m9, m10, m1 %ifidn %2, v mova [tmpq+strideq*0], m10 ; q4 %else @@ -790,11 +769,11 @@ psrlw m10, m8, 4 pand m10, m1 %ifidn %2, v - pandn m9, m1, [tmpq+strideq*1] + mova m9, [tmpq+strideq*1] %else - pandn m9, m1, [rsp+11*32] + mova m9, [rsp+11*32] %endif - por m10, m9 + vpblendvb m10, m9, m10, m1 %ifidn %2, v mova [tmpq+strideq*1], m10 ; q5 %else @@ -859,14 +838,12 @@ paddw m2, m0 pmulhrsw m2, [pw_4096] - REPX {pand x, m9}, m7, m8, m10, m11, m1, m2 - REPX {pandn x, m9, x}, m13, m3, m4, m5, 
m6, m14 - por m13, m7 - por m3, m8 - por m4, m10 - por m5, m11 - por m6, m1 - por m14, m2 + vpblendvb m13, m13, m7, m9 + vpblendvb m3, m3, m8, m9 + vpblendvb m4, m4, m10, m9 + vpblendvb m5, m5, m11, m9 + vpblendvb m6, m6, m1, m9 + vpblendvb m14, m14, m2, m9 %ifidn %2, v mova [tmpq+strideq*1], m13 ; p2 @@ -984,12 +961,10 @@ paddw m8, m14 pmulhrsw m8, [pw_4096] - REPX {pand x, m9}, m2, m10, m11, m8 - REPX {pandn x, m9, x}, m3, m4, m5, m6 - por m3, m2 - por m4, m10 - por m5, m11 - por m6, m8 + vpblendvb m3, m3, m2, m9 + vpblendvb m4, m4, m10, m9 + vpblendvb m5, m5, m11, m9 + vpblendvb m6, m6, m8, m9 %ifidn %2, v mova [tmpq+strideq*2], m3 ; p1 diff -Nru dav1d-0.9.0/src/x86/loopfilter16_sse.asm dav1d-0.9.1/src/x86/loopfilter16_sse.asm --- dav1d-0.9.0/src/x86/loopfilter16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/loopfilter16_sse.asm 2021-07-28 21:38:28.905852000 +0000 @@ -0,0 +1,1801 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%if ARCH_X86_64 +%define PIC_sym(a) a +%else +%define PIC_base $$ +%define PIC_sym(a) pic_regq+a-PIC_base +%endif + +pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 + times 4 db 8, 9 + +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 +pw_3: times 8 dw 3 +; 4 and 16 need to be next to each other since they are used as alternates +; depending on whether bitdepth is 10 or 12 +pw_4: times 8 dw 4 +pw_16: times 8 dw 16 +pw_8: times 8 dw 8 +pw_4096: times 8 dw 4096 + +pb_mask: dd 1, 1, 2, 2 + +SECTION .text + +%if ARCH_X86_32 +%if STACK_ALIGNMENT < 16 +%define extra_stack 2 +%else +%define extra_stack 0 +%endif +%endif + +%macro RELOC_ARGS 2 ; h/v, off +ASSERT ARCH_X86_32 +%if STACK_ALIGNMENT < 16 + mov r5d, [rstk + stack_offset + 4*4 + 4] +%define lstridem [esp+%2+0*gprsize] + mov lstridem, r5d + mov r5d, [rstk + stack_offset + 4*5 + 4] +%define lutm [esp+%2+1*gprsize] + mov lutm, r5d + mov r5d, [rstk + stack_offset + 4*6 + 4] +%ifidn %1, v +%define wm [esp+%2+2*gprsize] + mov wm, r5d + mov r5d, [rstk + stack_offset + 4*3 + 4] +%define lm [esp+%2+3*gprsize] + mov lm, r5d +%else ; %1 == h +%define hm [esp+%2+2*gprsize] + mov hm, r5d +%endif ; %1==v + mov r5d, r7m +%define bdmulm [esp+%2+4*gprsize] + mov bdmulm, r5d +%else +%define lstridem r4m +%define lutm r5m +%ifidn %1, v +%define wm r6m +%define lm r3m +%else +%define hm r6m +%endif +%define bdmulm r7m +%endif ; STACK_ALIGNMENT +%endmacro + +%macro UNRELOC_ARGS 0 +%if ARCH_X86_32 +%undef lm +%undef lstridem +%undef wm +%undef hm +%undef lutm +%endif +%endmacro + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%macro SPLATD 2 + movd %1, %2 + pshufd %1, %1, q0000 +%endmacro + +%macro SPLATW 2 + movd %1, %2 + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 +%endmacro + +; in: out: +; mm%1 a b c d a e i m +; mm%2 e f g h b f j n +; mm%3 i j k l -> c g k o +; mm%4 m n o p d h l p +%macro TRANSPOSE4X4W 5 + punpcklwd m%5, m%1, m%2 + punpckhwd m%1, m%2 + punpcklwd m%2, m%3, m%4 + punpckhwd m%3, m%4 + punpckldq m%4, m%5, m%2 + punpckhdq m%5, m%2 + punpckldq m%2, m%1, m%3 + punpckhdq m%1, m%3 + + SWAP %1, %4 + SWAP %2, %5, %3 +%endmacro + +; in: out: +; m%1 a b c d e f g h a i q y 6 E M U +; m%2 i j k l m n o p b j r z 7 F N V +; m%3 q r s t u v w x c k s 0 8 G O W +; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X +; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y +; m%6 E F G H I J K L f n v 3 B J R Z +; m%7 M N O P Q R S T g o w 4 C K S + +; m%8 U V W X Y Z + = h p x 5 D L T = +%if ARCH_X86_64 +%macro TRANSPOSE8X8W 9 + ; m%1 a b c d e f g h a i q y b j r z + ; m%2 i j k l m n o p c k s 0 d l t 1 + ; m%3 q r s t u v w x -> e m u 2 f n v 3 + ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 + TRANSPOSE4X4W %1, %2, %3, %4, %9 + + ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V + ; m%6 E F G H I J K L 8 G O W 9 H P X + ; m%7 M N O P Q R S T -> A I Q Y B J R Z + ; m%8 U V W X Y Z + = C K S + D L T = + TRANSPOSE4X4W %5, %6, %7, %8, %9 + + ; m%1 a i q y b j r z a i q y 6 E M U + ; m%2 c k s 0 d l t 1 b j r z 7 F N V + ; m%3 e m u 2 f n v 3 c k s 0 8 G O W + ; m%4 g o w 4 h p x 5 d l t 1 9 H P X + ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y + ; m%6 8 G O W 9 H P X f n v 3 B J R Z + ; m%7 A I Q Y B J R Z g o w 4 C K S + + ; m%8 C K S + D L T = h p x 5 D L T = + punpckhqdq m%9, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + punpckhqdq m%7, m%4, m%8 + punpcklqdq m%4, m%8 + + SWAP %8, %7, %4, %5, %3, %2, %9 +%endmacro 
+%else ; x86-32 +; input: 1-7 in registers, 8 in first memory [read-only] +; second memory is scratch, and may overlap with first or third memory +; output: 1-5,7-8 in registers, 6 in third memory [write-only] +%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u [in/out alignment [2x] + TRANSPOSE4X4W %1, %2, %3, %4, %8 +%ifnidn %9, "" + mov%12 m%8, %9 +%else + mova m%8, %10 +%endif + mova %10, m%4 + TRANSPOSE4X4W %5, %6, %7, %8, %4 + punpckhqdq m%4, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + mova m%7, %10 +%ifnidn %11, "" + mov%13 %11, m%6 +%else + mova %10, m%6 +%endif + punpckhqdq m%6, m%7, m%8 + punpcklqdq m%7, m%8 + + ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8 + SWAP %2, %4, %5, %3 + SWAP %6, %8 +%endmacro +%endif ; x86-32/64 + +; transpose and write m8-11, everything else is scratch +%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp + ; transpose 8x4 + punpcklwd %5, %1, %2 + punpckhwd %1, %2 + punpcklwd %2, %3, %4 + punpckhwd %3, %4 + punpckldq %4, %5, %2 + punpckhdq %5, %2 + punpckldq %2, %1, %3 + punpckhdq %1, %3 + + ; write out + movq [dstq+strideq*0-4], %4 + movhps [dstq+strideq*1-4], %4 + movq [dstq+strideq*2-4], %5 + movhps [dstq+stride3q -4], %5 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], %2 + movhps [dstq+strideq*1-4], %2 + movq [dstq+strideq*2-4], %1 + movhps [dstq+stride3q -4], %1 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 +%if ARCH_X86_64 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 + mova P1, [dstq+mstrideq*2] ; p1 + mova P0, [dstq+mstrideq*1] ; p0 + mova Q0, [dstq+strideq*0] ; q0 + mova Q1, [dstq+strideq*1] ; q1 +%else ; x86-32 +%define P1 [dstq+mstrideq*2] +%define P0 [dstq+mstrideq*1] +%define Q0 [dstq+strideq*0] +%define Q1 [dstq+strideq*1] +%endif ; x86-32/64 +%else ; %1 != 4 + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] +%if ARCH_X86_64 + ; we load p3 later +%define P2 m13 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 + mova P2, [tmpq+strideq*1] + mova P1, [tmpq+strideq*2] + mova P0, [tmpq+stride3q] + mova Q0, [dstq+strideq*0] + mova Q1, [dstq+strideq*1] + mova Q2, [dstq+strideq*2] +%if %1 != 6 +%define P3 [tmpq+strideq*0] +%define Q3 m15 + mova Q3, [dstq+stride3q] +%endif ; %1 != 6 +%else ; x86-32 +%define P2 [tmpq+strideq*1] +%define P1 [dstq+mstrideq*2] +%define P0 [dstq+mstrideq*1] +%define Q0 [dstq+strideq*0] +%define Q1 [dstq+strideq*1] +%define Q2 [dstq+strideq*2] +%if %1 != 6 +%define P3 [dstq+mstrideq*4] +%define Q3 [dstq+stride3q] +%endif ; %1 != 6 +%endif ; x86-32/64 +%endif ; %1 ==/!= 4 +%else ; %2 != v + ; load lines +%if %1 == 4 + movq m0, [dstq+strideq*0-4] + movq m2, [dstq+strideq*1-4] + movq m4, [dstq+strideq*2-4] + movq m5, [dstq+stride3q -4] + lea tmpq, [dstq+strideq*4] + movq m3, [tmpq+strideq*0-4] + movq m6, [tmpq+strideq*1-4] + movq m1, [tmpq+strideq*2-4] + movq m7, [tmpq+stride3q -4] + + ; transpose 4x8 + ; m0: A-D0 + ; m2: A-D1 + ; m4: A-D2 + ; m5: A-D3 + ; m3: A-D4 + ; m6: A-D5 + ; m1: A-D6 + ; m7: A-D7 + punpcklwd m0, m2 + punpcklwd m4, m5 + punpcklwd m3, m6 + punpcklwd m1, m7 + ; m0: A0-1,B0-1,C0-1,D0-1 + ; m4: A2-3,B2-3,C2-3,D2-3 + ; m3: A4-5,B4-5,C4-5,D4-5 + ; m1: A6-7,B6-7,C6-7,D6-7 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m4, m3, m1 + punpckldq m3, m1 + ; m0: A0-3,B0-3 + ; m2: C0-3,D0-3 + ; m3: A4-7,B4-7 + ; m4: C4-7,D4-7 + punpckhqdq m1, m0, 
m3 + punpcklqdq m0, m3 + punpckhqdq m3, m2, m4 + punpcklqdq m2, m4 + ; m0: A0-7 + ; m1: B0-7 + ; m2: C0-7 + ; m3: D0-7 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%else +%define P1 [esp+3*mmsize] +%define P0 [esp+4*mmsize] +%define Q0 [esp+5*mmsize] +%define Q1 [esp+6*mmsize] + mova P1, m0 + mova P0, m1 + mova Q0, m2 + mova Q1, m3 +%endif +%elif %1 == 6 || %1 == 8 + movu m0, [dstq+strideq*0-8] + movu m1, [dstq+strideq*1-8] + movu m2, [dstq+strideq*2-8] + movu m3, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu m4, [tmpq+strideq*0-8] + movu m5, [tmpq+strideq*1-8] + movu m6, [tmpq+strideq*2-8] +%if ARCH_X86_64 + movu m7, [tmpq+stride3q -8] +%endif + + ; transpose 8x16 + ; m0: A-H0,A-H8 + ; m1: A-H1,A-H9 + ; m2: A-H2,A-H10 + ; m3: A-H3,A-H11 + ; m4: A-H4,A-H12 + ; m5: A-H5,A-H13 + ; m6: A-H6,A-H14 + ; m7: A-H7,A-H15 +%if ARCH_X86_64 + punpcklwd m8, m0, m1 +%else + punpcklwd m7, m0, m1 +%endif + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpcklwd m3, m4, m5 + punpckhwd m4, m5 +%if ARCH_X86_64 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 +%else + mova [rsp+3*16], m4 + movu m4, [tmpq+stride3q -8] + punpcklwd m5, m6, m4 + punpckhwd m6, m4 +%endif + ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32] + ; m0: E0-1,F0-1,G0-1,H0-1 + ; m1: A2-3,B2-3,C2-3,D2-3 + ; m2: E2-3,F2-3,G2-3,H2-3 + ; m3: A4-5,B4-5,C4-5,D4-5 + ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32] + ; m5: A6-7,B6-7,C6-7,D6-7 + ; m6: E6-7,F6-7,G6-7,H6-7 +%if ARCH_X86_64 + punpckldq m7, m8, m1 + punpckhdq m8, m1 +%else + punpckldq m4, m7, m1 + punpckhdq m7, m1 +%endif + punpckldq m1, m0, m2 + punpckhdq m0, m2 + punpckldq m2, m3, m5 + punpckhdq m3, m5 +%if ARCH_X86_64 + punpckldq m5, m4, m6 + punpckhdq m4, m6 +%else + mova [rsp+4*16], m3 + mova m3, [rsp+3*16] + punpckldq m5, m3, m6 + punpckhdq m3, m6 +%endif + ; m7: A0-3,B0-3 [m4 on x86-32] + ; m8: C0-3,D0-3 [m7 on x86-32] + ; m1: E0-3,F0-3 + ; m0: G0-3,H0-3 + ; m2: A4-7,B4-7 + ; m3: C4-7,D4-7 [r4 on x86-32] + ; m5: E4-7,F4-7 + ; m4: G4-7,H4-7 [m3 on x86-32] +%if ARCH_X86_64 +%if %1 != 6 + punpcklqdq m6, m7, m2 +%endif + punpckhqdq m7, m2 + punpcklqdq m2, m8, m3 + punpckhqdq m8, m3 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 +%if %1 != 6 + punpckhqdq m5, m0, m4 +%endif + punpcklqdq m0, m4 +%if %1 == 8 + mova [rsp+1*16], m6 +%define P3 [rsp+1*16] +%endif + ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15 + SWAP 7, 13 + SWAP 8, 2, 9 + SWAP 3, 10 + SWAP 1, 11 + SWAP 0, 14 + SWAP 5, 15 +%define P2 m13 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 +%if %1 == 8 +%define Q3 m15 +%endif +%else ; x86-32 +%if %1 == 8 +%define P3 [rsp+ 6*16] + punpcklqdq m6, m4, m2 + mova P3, m6 +%endif + mova m6, [rsp+4*16] + punpckhqdq m4, m2 + punpcklqdq m2, m7, m6 + punpckhqdq m7, m6 + punpcklqdq m6, m1, m5 + punpckhqdq m1, m5 +%if %1 == 8 +%define Q3 [rsp+24*16] + punpckhqdq m5, m0, m3 + mova Q3, m5 +%endif + punpcklqdq m0, m3 +%if %1 == 8 +%define P2 [rsp+18*16] +%define P1 [rsp+19*16] +%define P0 [rsp+20*16] +%define Q0 [rsp+21*16] +%define Q1 [rsp+22*16] +%define Q2 [rsp+23*16] +%else +%define P2 [rsp+3*16] +%define P1 [rsp+4*16] +%define P0 [rsp+5*16] +%define Q0 [rsp+6*16] +%define Q1 [rsp+7*16] +%define Q2 [rsp+8*16] +%endif + mova P2, m4 + mova P1, m2 + mova P0, m7 + mova Q0, m6 + mova Q1, m1 + mova Q2, m0 +%endif ; x86-32/64 +%else ; %1 == 16 + ; We only use 14 pixels but we'll need the remainder at the end for + ; the second transpose + mova m0, [dstq+strideq*0-16] + mova m1, 
[dstq+strideq*1-16] + mova m2, [dstq+strideq*2-16] + mova m3, [dstq+stride3q -16] + lea tmpq, [dstq+strideq*4] + mova m4, [tmpq+strideq*0-16] + mova m5, [tmpq+strideq*1-16] + mova m6, [tmpq+strideq*2-16] +%if ARCH_X86_64 + mova m7, [tmpq+stride3q -16] + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + SWAP 5, 13 + SWAP 6, 8 + SWAP 7, 9 +%define P2 m13 +%define P1 m8 +%define P0 m9 +%else ; x86-32 +%define P2 [esp+18*16] +%define P1 [esp+19*16] +%define P0 [esp+20*16] + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ + [tmpq+stride3q -16], P2, "", a, a + mova P1, m6 + mova P0, m7 +%endif ; x86-32/64 + mova [rsp+ 7*16], m0 + mova [rsp+ 8*16], m1 + mova [rsp+ 9*16], m2 + mova [rsp+10*16], m3 +%define P3 [rsp+6*16] + mova P3, m4 + + mova m0, [dstq+strideq*0] + mova m1, [dstq+strideq*1] + mova m2, [dstq+strideq*2] + mova m3, [dstq+stride3q ] + lea tmpq, [dstq+strideq*4] + mova m4, [tmpq+strideq*0] + mova m5, [tmpq+strideq*1] + mova m6, [tmpq+strideq*2] +%if ARCH_X86_64 + mova m7, [tmpq+stride3q ] + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10 + SWAP 0, 10 + SWAP 1, 11 + SWAP 2, 14 + SWAP 3, 15 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 +%define Q3 m15 +%else ; x86-32 + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ + [tmpq+stride3q ], [rsp+12*16], "", a, a +%define Q0 [esp+21*16] +%define Q1 [esp+22*16] +%define Q2 [esp+23*16] +%define Q3 [esp+24*16] + mova Q0, m0 + mova Q1, m1 + mova Q2, m2 + mova Q3, m3 +%endif ; x86-32/64 + + mova [rsp+11*16], m4 +%if ARCH_X86_64 + mova [rsp+12*16], m5 +%endif + mova [rsp+13*16], m6 + mova [rsp+14*16], m7 +%endif ; %1 == 4/6/8/16 +%endif ; %2 ==/!= v + + ; load L/E/I/H +%if ARCH_X86_32 +%define l_strideq r5 + mov l_strideq, dword lstridem +%ifidn %2, v +%define lq r3 + mov lq, dword lm +%endif +%endif +%ifidn %2, v +%if cpuflag(sse4) + pmovzxbw m1, [lq] + pmovzxbw m0, [lq+l_strideq] + pxor m2, m2 +%else ; ssse3 + movq m1, [lq] + movq m0, [lq+l_strideq] + pxor m2, m2 + REPX {punpcklbw x, m2}, m1, m0 +%endif ; ssse3/sse4 +%else ; %2 != v + movq m0, [lq] ; l0, l1 + movq m1, [lq+l_strideq] ; l2, l3 + punpckldq m0, m1 ; l0, l2, l1, l3 + pxor m2, m2 + punpcklbw m1, m0, m2 ; l0, l2 + punpckhbw m0, m2 ; l1, l3 +%endif ; %2==/!=v +%if ARCH_X86_32 +%ifidn %2, v +%undef lq + mov mstrideq, mstridem +%endif +%endif + pcmpeqw m5, m2, m0 + pand m1, m5 + por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] + pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1] + pcmpeqw m5, m2, m0 ; !L + psrlw m5, 1 +%if ARCH_X86_64 + psrlw m2, m0, [lutq+128] + SPLATW m1, [lutq+136] +%else ; x86-32 + mov r5, lutm + psrlw m2, m0, [r5+128] + SPLATW m1, [r5+136] +%endif ; x86-32/64 + pminsw m2, m1 + pmaxsw m2, [PIC_sym(pw_1)] ; I + psrlw m1, m0, 4 ; H + paddw m0, [PIC_sym(pw_2)] + paddw m0, m0 + paddw m0, m2 ; E + REPX {pmullw x, [bdmulq]}, m0, m1, m2 +%if ARCH_X86_32 +%undef l_strideq + lea stride3q, [strideq*3] +%endif + + psubw m3, P1, P0 ; p1-p0 + psubw m4, Q0, Q1 ; q0-q1 + REPX {pabsw x, x}, m3, m4 + pmaxsw m3, m5 + pmaxsw m3, m4 + pcmpgtw m7, m3, m1 ; hev +%if %1 != 4 + psubw m4, P2, P0 ; p2-p0 + pabsw m4, m4 + pmaxsw m4, m3 +%if %1 != 6 + mova m6, P3 ; p3 + psubw m5, m6, P0 ; p3-p0 + pabsw m5, m5 + pmaxsw m4, m5 +%endif ; %1 != 6 + psubw m5, Q0, Q2 ; q0-q2 + pabsw m5, m5 + pmaxsw m4, m5 +%if %1 != 6 + psubw m5, Q0, Q3 ; q0-q3 + pabsw m5, m5 + pmaxsw m4, m5 +%endif ; %1 != 6 + pcmpgtw m4, [bdmulq] ; !flat8in + + psubw m5, P2, P1 ; p2-p1 + pabsw m5, m5 +%if %1 != 6 + psubw m6, P2 ; p3-p2 + pabsw m6, m6 + pmaxsw m5, m6 + psubw m6, Q2, Q3 ; q2-q3 + pabsw m6, m6 + pmaxsw m5, m6 +%endif ; %1 != 6 + psubw m6, Q2, Q1 ; q2-q1 + pabsw m6, m6 + pmaxsw m5, m6 + +%if %1 == 16 + SPLATD m6, [maskq+8] + SPLATD m1, [maskq+4] + por m6, m1 + pand m6, m12 + pcmpeqd m6, m12 + pand m5, m6 +%else ; %1 != 16 + SPLATD m6, [maskq+4] + pand m6, m12 + pcmpeqd m6, m12 + pand m5, m6 ; only apply fm-wide to wd>4 blocks +%endif ; %1==/!=16 + pmaxsw m3, m5 +%endif ; %1 != 4 + pcmpgtw m3, m2 + + psubw m5, P1, Q1 ; p1-q1 + psubw m6, P0, Q0 ; p0-q0 + REPX {pabsw x, x}, m5, m6 + paddw m6, m6 + psrlw m5, 1 + paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m3, m5 + +%if %1 == 16 + +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] + mova m1, [tmpq+strideq*2] + mova m2, [tmpq+stride3q] +%else ; %2 != v + mova m0, [rsp+ 8*16] + mova m1, [rsp+ 9*16] + mova m2, [rsp+10*16] +%endif ; %2==/!=v + REPX {psubw x, P0}, m0, m1, m2 + REPX {pabsw x, x}, m0, m1, m2 + pmaxsw m1, m0 + pmaxsw m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] + mova m2, [tmpq+strideq*1] + mova m5, [tmpq+strideq*2] +%else ; %2 != v + mova m0, [rsp+11*16] + mova m2, [rsp+12*16] + mova m5, [rsp+13*16] +%endif ; %2==/!=v + REPX {psubw x, Q0}, m0, m2, m5 + REPX {pabsw x, x}, m0, m2, m5 + pmaxsw m0, m2 + pmaxsw m1, m5 + pmaxsw m1, m0 + pcmpgtw m1, [bdmulq] ; !flat8out + por m1, m4 ; !flat8in | !flat8out + SPLATD m2, [maskq+8] + pand m5, m2, m12 + pcmpeqd m5, m12 + pandn m1, m5 ; flat16 + pandn m5, m3, m1 ; flat16 & fm + SWAP 1, 5 + + SPLATD m5, [maskq+4] + por m5, m2 + pand m2, m5, m12 + pcmpeqd m2, m12 + pandn m4, m2 ; flat8in + pandn m2, m3, m4 + SWAP 2, 4 + SPLATD m2, [maskq+0] + por m2, m5 + pand m2, m12 + pcmpeqd m2, m12 + pandn m3, m2 + pandn m0, m4, m3 ; fm & !flat8 & !flat16 + SWAP 0, 3 + pandn m0, m1, m4 ; flat8 & !flat16 + SWAP 0, 4 +%elif %1 != 4 + SPLATD m0, [maskq+4] + pand m2, m0, m12 + pcmpeqd m2, m12 + pandn m4, m2 + pandn m2, m3, m4 ; flat8 & fm + SWAP 2, 4 + SPLATD m2, [maskq+0] + por m0, m2 + pand m0, m12 + pcmpeqd m0, m12 + pandn m3, m0 + pandn m0, m4, m3 ; fm & !flat8 + SWAP 0, 3 +%else ; %1 == 4 + SPLATD m0, [maskq+0] + pand m0, m12 + pcmpeqd m0, m12 + pandn m3, m0 ; fm +%endif ; %1==/!=4 + + ; short filter +%if ARCH_X86_64 + SPLATW m0, r7m +%else + SPLATW m0, bdmulm +%endif + pcmpeqw m2, m2 + psrlw m0, 1 ; 511 or 2047 + pxor m2, m0 ; 
-512 or -2048 + + psubw m5, Q0, P0 ; q0-p0 + paddw m6, m5, m5 + paddw m6, m5 ; 3*(q0-p0) + psubw m5, P1, Q1 ; iclip_diff(p1-q1) + pminsw m5, m0 + pmaxsw m5, m2 + pand m5, m7 ; f=iclip_diff(p1-q1)&hev + paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f) + pminsw m5, m0 + pmaxsw m5, m2 + pand m3, m5 ; f&=fm + paddw m5, m3, [PIC_sym(pw_3)] + paddw m3, [PIC_sym(pw_4)] + REPX {pminsw x, m0}, m5, m3 + psraw m5, 3 ; f2 + psraw m3, 3 ; f1 + psubw m0, m2 ; 1023 or 4095 + pxor m2, m2 +%if ARCH_X86_64 + paddw P0, m5 + psubw Q0, m3 +%else + paddw m5, P0 + psubw m6, Q0, m3 + REPX {pminsw x, m0}, m5, m6 + REPX {pmaxsw x, m2}, m5, m6 +%endif + + paddw m3, [PIC_sym(pw_1)] + psraw m3, 1 ; f=(f1+1)>>1 + pandn m7, m3 ; f&=!hev + SWAP 7, 3 +%if ARCH_X86_64 + paddw P1, m3 + psubw Q1, m3 + REPX {pminsw x, m0}, P1, P0, Q0, Q1 + REPX {pmaxsw x, m2}, P1, P0, Q0, Q1 +%else + psubw m7, Q1, m3 + paddw m3, P1 + REPX {pminsw x, m0}, m7, m3 + REPX {pmaxsw x, m2}, m7, m3 +%if %1 > 4 + mova P1, m3 + mova P0, m5 + mova Q0, m6 + mova Q1, m7 +%endif +%endif + +%if %1 == 16 + +; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16 +; m12=filter bits mask +; m13-15=p2/q2/q3 +; m0,2-3,5-7 = free + + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 + mova m6, [tmpq+strideq*4] ; p3 + lea tmpq, [dstq+mstrideq*4] +%else ; %2 != v + mova m0, [rsp+ 8*16] + mova m2, [rsp+ 9*16] + mova m7, [rsp+10*16] + mova m6, [rsp+ 6*16] +%endif ; %2==/!=v + + mova [rsp+ 0*16], m4 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psllw m3, m0, 3 ; p6*8 + paddw m3, [PIC_sym(pw_8)] + paddw m5, m2, m7 ; p5+p4 + psubw m3, m0 + paddw m5, m5 ; (p5+p4)*2 + paddw m3, m6 ; p6*7+p3 + paddw m5, P2 ; (p5+p4)*2+p2 + paddw m3, P1 ; p6*7+p3+p1 + paddw m5, P0 ; (p5+p4)*2+p2+p0 + paddw m3, Q0 ; p6*7+p3+p1+q0 + paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m2 + por m5, m4 +%ifidn %2, v + mova [tmpq+mstrideq*2], m5 ; p5 +%else ; %2 != v + mova [rsp+9*16], m5 +%endif ; %2==/!=v + + ; sub p6*2, add p3/q1 + paddw m3, m6 + paddw m5, m0, m0 + paddw m3, Q1 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m7 + por m5, m4 +%ifidn %2, v + mova [tmpq+mstrideq*1], m5 ; p4 +%else ; %2 != v + mova [rsp+10*16], m5 +%endif ; %2==/!=v + + ; sub p6/p5, add p2/q2 + psubw m3, m0 + paddw m5, P2, Q2 + psubw m3, m2 + paddw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m6 + por m5, m4 +%ifidn %2, v + mova [tmpq+strideq*0], m5 ; p3 +%else ; %2 != v + mova [rsp+6*16], m5 +%endif ; %2==/!=v + +%define WRITE_IN_PLACE 0 +%ifidn %2, v +%if ARCH_X86_64 +%define WRITE_IN_PLACE 1 +%endif +%endif + + ; sub p6/p4, add p1/q3 + paddw m3, P1 + paddw m5, m0, m7 + paddw m3, Q3 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P2 + por m5, m4 +%if WRITE_IN_PLACE + mova [tmpq+strideq*1], m5 +%else + mova [rsp+1*16], m5 ; don't clobber p2/m13 +%endif + + ; sub p6/p3, add p0/q4 + paddw m3, P0 + paddw m5, m0, m6 +%ifidn %2, v + paddw m3, [dstq+strideq*4] +%else ; %2 != v + paddw m3, [rsp+11*16] +%endif ; %2==/!=v + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P1 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq+mstrideq*2], m5 +%else + mova [rsp+2*16], m5 ; don't clobber p1/m3 +%endif + + ; sub p6/p2, add q0/q5 + paddw m3, Q0 + paddw m5, m0, P2 +%ifidn %2, v +%if ARCH_X86_32 + lea r4, P2 +%endif + lea tmpq, [dstq+strideq*4] + paddw m3, [tmpq+strideq*1] +%else ; %2 != v + paddw m3, [rsp+12*16] +%endif ; %2==/!=v + psubw m3, m5 + psrlw m5, m3, 4 
+ pand m5, m1 + pandn m4, m1, P0 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq+mstrideq*1], m5 +%else + mova [rsp+3*16], m5 ; don't clobber p0/m4 +%endif + + ; sub p6/p1, add q1/q6 + paddw m3, Q1 + paddw m5, m0, P1 +%ifidn %2, v + mova m0, [tmpq+strideq*2] ; q6 +%else ; %2 != v + mova m0, [rsp+13*16] ; q6 +%endif ; %2==/!=v + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, Q0 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq], m5 +%else + mova [rsp+4*16], m5 ; don't clobber q0/m5 +%endif + + ; sub p5/p0, add q2/q6 + paddw m3, Q2 + paddw m5, m2, P0 + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, Q1 + por m2, m5, m4 ; don't clobber q1/m6 + + ; sub p4/q0, add q3/q6 + paddw m3, Q3 + paddw m7, Q0 + paddw m3, m0 + psubw m3, m7 + psrlw m7, m3, 4 + pand m7, m1 + pandn m4, m1, Q2 + por m7, m4 ; don't clobber q2/m14 + + ; sub p3/q1, add q4/q6 +%ifidn %2, v + paddw m3, [tmpq+strideq*0] +%else ; %2 != v + paddw m3, [rsp+11*16] +%endif ; %2==/!=v + paddw m6, Q1 + paddw m3, m0 + psubw m3, m6 + psrlw m6, m3, 4 + pand m6, m1 + pandn m4, m1, Q3 + por m6, m4 +%if WRITE_IN_PLACE + mova [tmpq+mstrideq], m6 ; q3 +%else ; %2 != v + mova [rsp+5*16], m6 +%endif ; %2==/!=v + + ; sub p2/q2, add q5/q6 +%ifidn %2, v + paddw m3, [tmpq+strideq*1] +%if ARCH_X86_64 + paddw m5, P2, Q2 +%else + ; because tmpq is clobbered, so we use a backup pointer for P2 instead + paddw m5, [r4], Q2 + mov pic_regq, pic_regm +%endif +%else ; %2 != v + paddw m3, [rsp+12*16] + paddw m5, P2, Q2 +%endif ; %2==/!=v + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 +%ifidn %2, v + pandn m4, m1, [tmpq+strideq*0] +%else ; %2 != v + pandn m4, m1, [rsp+11*16] +%endif ; %2==/!=v + por m5, m4 +%ifidn %2, v + mova [tmpq+strideq*0], m5 ; q4 +%else ; %2 != v + mova [rsp+11*16], m5 +%endif ; %2==/!=v + + ; sub p1/q3, add q6*2 + psubw m3, P1 + paddw m0, m0 + psubw m3, Q3 + paddw m3, m0 + psrlw m5, m3, 4 + pand m5, m1 +%ifidn %2, v + pandn m4, m1, [tmpq+strideq*1] +%else ; %2 != v + pandn m4, m1, [rsp+12*16] +%endif ; %2==/!=v + por m5, m4 +%ifidn %2, v + mova [tmpq+strideq*1], m5 ; q5 +%else ; %2 != v + mova [rsp+12*16], m5 +%endif ; %2==/!=v + + mova m4, [rsp+0*16] +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%endif +%if ARCH_X86_64 + SWAP 2, 11 + SWAP 7, 14 + SWAP 6, 15 +%else ; x86-32 + mova Q1, m2 + mova Q2, m7 +%endif ; x86-32/64 +%if WRITE_IN_PLACE + mova P2, [tmpq+strideq*1] + mova P1, [tmpq+strideq*2] + mova P0, [tmpq+stride3q] + mova Q0, [dstq] +%elif ARCH_X86_64 + mova P2, [rsp+1*16] + mova P1, [rsp+2*16] + mova P0, [rsp+3*16] + mova Q0, [rsp+4*16] +%else ; !WRITE_IN_PLACE & x86-32 + mova m0, [rsp+1*16] + mova m1, [rsp+2*16] + mova m2, [rsp+3*16] + mova m3, [rsp+4*16] + mova m7, [rsp+5*16] + mova P2, m0 + mova P1, m1 + mova P0, m2 + mova Q0, m3 + mova Q3, m7 +%endif ; WRITE_IN_PLACE / x86-32/64 +%undef WRITE_IN_PLACE +%endif ; %1 == 16 + +%if %1 >= 8 + + ; flat8 filter + mova m0, P3 ; p3 + paddw m1, m0, P2 ; p3+p2 + paddw m2, P1, P0 ; p1+p0 + paddw m3, m1, m1 ; 2*(p3+p2) + paddw m2, m0 ; p1+p0+p3 + paddw m3, Q0 ; 2*(p3+p2)+q0 + paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0 + pmulhrsw m7, m2, [PIC_sym(pw_4096)] + psubw m7, P2 + pand m7, m4 + + paddw m3, P1, Q1 ; p1+q1 + psubw m2, m1 ; 2*p3+p2+p1+p0+q0 + paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1 + pmulhrsw m3, m2, [PIC_sym(pw_4096)] + psubw m3, P1 + pand m3, m4 + + paddw m5, m0, P1 ; p3+p1 + paddw m6, P0, Q2 ; p0+q2 + psubw m2, m5 ; p3+p2+p1+p0+q0+q1 + paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2 + pmulhrsw m5, m2, [PIC_sym(pw_4096)] + psubw m5, P0 + pand 
m5, m4 + + paddw m6, m0, P0 ; p3+p0 + paddw m1, Q0, Q3 ; q0+q3 + psubw m2, m6 ; p2+p1+p0+q0+q1+q2 + paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3 + pmulhrsw m6, m2, [PIC_sym(pw_4096)] + psubw m6, Q0 + pand m6, m4 + + paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3 + paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3 + paddw m1, P2, Q0 ; p2+q0 + psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3 + pmulhrsw m1, m2, [PIC_sym(pw_4096)] + psubw m1, Q1 + pand m1, m4 + + psubw m2, P1 ; p0+q0+2*q1+q2+2*q3 + psubw m2, Q1 ; p0+q0+q1+q2+2*q3 + paddw m0, Q3, Q2 ; q3+q2 + paddw m2, m0 ; p0+q0+q1+2*q2+3*q3 + pmulhrsw m2, [PIC_sym(pw_4096)] + psubw m2, Q2 + pand m2, m4 + + paddw m7, P2 + paddw m3, P1 + paddw m5, P0 + paddw m6, Q0 + paddw m1, Q1 + paddw m2, Q2 + +%ifidn %2, v + mova [tmpq+strideq*1], m7 ; p2 + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m5 ; p0 + mova [dstq+strideq*0], m6 ; q0 + mova [dstq+strideq*1], m1 ; q1 + mova [dstq+strideq*2], m2 ; q2 +%else ; %2 != v + mova m0, P3 + +%if %1 == 8 + lea tmpq, [dstq+strideq*4] +%if ARCH_X86_64 + SWAP 4, 15 + TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8 +%else + TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \ + Q3, [tmpq+strideq*1-8], a, u +%endif + + ; write 8x8 + movu [dstq+strideq*0-8], m0 + movu [dstq+strideq*1-8], m7 + movu [dstq+strideq*2-8], m3 + movu [dstq+stride3q -8], m5 + movu [tmpq+strideq*0-8], m6 +%if ARCH_X86_64 + movu [tmpq+strideq*1-8], m1 +%endif + movu [tmpq+strideq*2-8], m2 + movu [tmpq+stride3q -8], m4 + lea dstq, [dstq+strideq*8] +%else ; %1 != 8 +%if ARCH_X86_64 + SWAP 6, 8 + SWAP 1, 9 + SWAP 2, 10 +%else + mova [rsp+1*16], m6 + mova [rsp+2*16], m1 + mova [rsp+3*16], m2 +%endif + + mova m1, [rsp+ 7*16] + mova m2, [rsp+ 8*16] + mova m4, [rsp+ 9*16] + mova m6, [rsp+10*16] + lea tmpq, [dstq+strideq*4] +%if ARCH_X86_64 + TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11 +%else + mova [rsp+7*16], m5 + TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \ + [rsp+7*16], [tmpq+strideq*1-16], a, a +%endif + + mova [dstq+strideq*0-16], m1 + mova [dstq+strideq*1-16], m2 + mova [dstq+strideq*2-16], m4 + mova [dstq+stride3q -16], m6 + mova [tmpq+strideq*0-16], m0 +%if ARCH_X86_64 + mova [tmpq+strideq*1-16], m7 +%endif + mova [tmpq+strideq*2-16], m3 + mova [tmpq+stride3q -16], m5 + +%if ARCH_X86_64 + SWAP 6, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 4, 15 +%else + mova m6, [rsp+1*16] + mova m1, [rsp+2*16] + mova m2, [rsp+3*16] + mova m4, Q3 +%endif + mova m0, [rsp+11*16] + mova m3, [rsp+12*16] + mova m5, [rsp+13*16] +%if ARCH_X86_64 + mova m7, [rsp+14*16] + TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8 +%else + TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \ + [rsp+14*16], [tmpq+strideq*1], a, a +%endif + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + mova [tmpq+strideq*0], m0 +%if ARCH_X86_64 + mova [tmpq+strideq*1], m3 +%endif + mova [tmpq+strideq*2], m5 + mova [tmpq+stride3q ], m7 + lea dstq, [dstq+strideq*8] +%endif ; %1==/!=8 +%endif ; %2==/!=v +%elif %1 == 6 + ; flat6 filter + paddw m3, P1, P0 ; p1+p0 + paddw m3, P2 ; p2+p1+p0 + paddw m6, P2, Q0 ; p2+q0 + paddw m3, m3 ; 2*(p2+p1+p0) + paddw m3, m6 ; p2+2*(p2+p1+p0)+q0 + pmulhrsw m2, m3, [PIC_sym(pw_4096)] + psubw m2, P1 + pand m2, m4 + + paddw m3, Q0 ; p2+2*(p2+p1+p0+q0) + paddw m6, P2, P2 ; 2*p2 + paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1 + psubw m3, m6 ; p2+2*(p1+p0+q0)+q1 + pmulhrsw m5, m3, [PIC_sym(pw_4096)] + psubw m5, P0 + pand m5, m4 + + paddw m3, Q1 ; p2+2*(p1+p0+q0+q1) + paddw m6, P2, P1 ; p2+p1 + paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2 + psubw m3, m6 ; p1+2*(p0+q0+q1)+q2 + pmulhrsw m6, m3, 
[PIC_sym(pw_4096)] + psubw m6, Q0 + pand m6, m4 + + psubw m3, P1 ; 2*(p0+q0+q1)+q2 +%if ARCH_X86_64 + paddw Q2, Q2 ; q2*2 +%else + mova m0, Q2 + paddw m0, m0 +%endif + psubw m3, P0 ; p0+2*(q0+q1)+q2 +%if ARCH_X86_64 + paddw m3, Q2 ; p0+q*(q0+q1+q2)+q2 +%else + paddw m3, m0 +%endif + pmulhrsw m3, [PIC_sym(pw_4096)] + psubw m3, Q1 + pand m3, m4 + + paddw m2, P1 + paddw m5, P0 + paddw m6, Q0 + paddw m3, Q1 + +%ifidn %2, v + mova [dstq+mstrideq*2], m2 ; p1 + mova [dstq+mstrideq*1], m5 ; p0 + mova [dstq+strideq*0], m6 ; q0 + mova [dstq+strideq*1], m3 ; q1 +%else ; %2 != v + TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0 +%endif ; %2==/!=v +%else ; %1 == 4 +%if ARCH_X86_64 +%ifidn %2, v + mova [dstq+mstrideq*2], P1 ; p1 + mova [dstq+mstrideq*1], P0 ; p0 + mova [dstq+strideq*0], Q0 ; q0 + mova [dstq+strideq*1], Q1 ; q1 +%else ; %2 != v + TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0 +%endif ; %2==/!=v +%else ; x86-32 +%ifidn %2, v + mova [dstq+mstrideq*2], m3 + mova [dstq+mstrideq*1], m5 + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m7 +%else ; %2 != v + TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0 +%endif ; %2==/!=v +%endif ; x86-32/64 +%endif ; %1 +%undef P3 +%undef P2 +%undef P1 +%undef P0 +%undef Q0 +%undef Q1 +%undef Q2 +%undef Q3 +%endmacro + +INIT_XMM ssse3 +; stack layout: +; r0 - flat8 backup inside flat16 code +%if ARCH_X86_64 +cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq +%else +; stack layout [32bit only]: +; r1-4 - p2-q0 post-filter16 +; r5 - p3 +; r6 - q3 post-filter16 +; r7 - GPRs [mask_bitsm, mstridem] +; r8 - m12/pb_mask +; r9 - bdmulq +cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \ + dst, stride, mask, mstride, pic_reg, stride3, tmp + RELOC_ARGS v, 10*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base +%define pic_regm dword [esp+7*16+2*gprsize] + mov pic_regm, pic_regq + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+9*16 + mova [bdmulq], m0 + shl dword lstridem, 2 + sub r3, dword lstridem + mov dword lm, r3 +%endif + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mstridem dword [esp+7*16+1*gprsize] + mov mstridem, mstrideq +%define mask_bitsm dword [esp+7*16+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+8*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+8], mask_bitsd ; vmask[2] +%else + mov r6d, mask_bitsm + test [maskq+8], r6d +%endif + jz .no_flat16 + + FILTER 16, v + jmp .end + +.no_flat16: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 8, v + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .end + + FILTER 4, v + +.end: +%if ARCH_X86_64 + pslld m12, 2 + add lq, 8 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add dword lm, 8 +%endif + add dstq, 16 +%if ARCH_X86_64 + shl mask_bitsd, 2 + sub wd, 2 +%else + shl mask_bitsm, 2 + sub dword wm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET + +INIT_XMM ssse3 +; stack layout: +; r0 - flat8 backup inside flat16 +; r1-4 - p2-q0 post-filter16 backup +; r5 - q3 
post-filter16 backup +; r6 - p3 +; r7-10 - p7-4 +; r11-14 - q4-7 +%if ARCH_X86_64 +cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov hd, hm + shl l_strideq, 2 +%else +; stack layout [32bit only]: +; r15 - GPRs [mask_bitsm] +; r16 - m12/pb_mask +; r17 - bdmulq +; r18-24 - p2-q3 +cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \ + dst, stride, mask, l, pic_reg, stride3, tmp + RELOC_ARGS h, 25*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+17*16 + mova [bdmulq], m0 + shl dword lstridem, 2 +%endif + sub lq, 4 + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+15*16+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+16*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+8], mask_bitsd ; vmask[2] +%else + mov r6d, mask_bitsm + test [maskq+8], r6d +%endif + jz .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 8, h + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] +.end: +%if ARCH_X86_64 + pslld m12, 2 + lea lq, [lq+l_strideq*2] + shl mask_bitsd, 2 + sub hd, 2 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add lq, dword lstridem + add lq, dword lstridem + shl mask_bitsm, 2 + sub dword hm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq +%else +; stack layout [32bit only]: +; r0 - GPRs [mask_bitsm, mstridem] +; r1 - m12/pb_mask +; r2 - bdmulq +cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \ + dst, stride, mask, mstride, pic_reg, stride3, tmp + RELOC_ARGS v, 3*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+2*16 + mova [bdmulq], m0 + shl dword lstridem, 2 + sub r3, dword lstridem + mov dword lm, r3 +%endif + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+0*gprsize] +%define mstridem dword [esp+1*gprsize] + mov mask_bitsm, 0x3 + mov mstridem, mstrideq + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+1*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + mov r6d, mask_bitsm + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 6, v + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .end + + FILTER 4, v + +.end: +%if ARCH_X86_64 + pslld m12, 2 + add lq, 8 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add dword lm, 8 +%endif + add dstq, 16 
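
For orientation, all four entry points in the new loopfilter16_sse.asm (the lpf_v_sb_y, lpf_h_sb_y, lpf_v_sb_uv and lpf_h_sb_uv variants) share the .loop/.end control flow seen here: vmask[2], vmask[1] and vmask[0] carry one bit per 4-pixel unit, mask_bits starts at 0x3 so each pass decides the filter width for two such units, and the bit window is then shifted left by 2 while w (or h) drops by 2. The C sketch below models only that dispatch, under the assumption that vmask is the three-element mask array the functions receive; the names are illustrative and the per-unit masking that FILTER performs internally is simplified away.

    #include <stdint.h>
    #include <stdio.h>

    /* Rough model of the .loop dispatch in the lpf_*_sb_*_16bpc functions
     * above.  vmask[0..2] hold one bit per 4-pixel unit; each iteration
     * looks at two units (mask_bits seeded with 0x3, shifted left by 2) and
     * jumps to the widest applicable FILTER variant, which then masks per
     * unit internally.  Names and the printf are illustrative only. */
    static void lpf_sb_dispatch_sketch(const uint32_t vmask[3], int units,
                                       int luma)
    {
        uint32_t bits = 0x3;                 /* two units per iteration */
        for (; units > 0; units -= 2, bits <<= 2) {
            int wd = 0;                      /* 0 = leave these pixels alone */
            if (luma && (vmask[2] & bits)) wd = 16;
            else if (vmask[1] & bits)      wd = luma ? 8 : 6;
            else if (vmask[0] & bits)      wd = 4;
            printf("next two units: wd=%d\n", wd);
            /* the asm then advances dst by 8 pixels and l by two entries */
        }
    }
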
+%if ARCH_X86_64 + shl mask_bitsd, 2 + sub wd, 2 +%else + shl mask_bitsm, 2 + sub dword wm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov hd, hm + shl l_strideq, 2 +%else +; stack layout [32bit only]: +; r0 - GPRs [mask_bitsm] +; r1 - m12/pb_mask +; r2 - bdmulq +; r3-8 - p2-q2 +cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \ + dst, stride, mask, l, pic_reg, stride3, tmp + RELOC_ARGS h, 9*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+2*16 + mova [bdmulq], m0 + shl dword lstridem, 2 +%endif + sub lq, 4 + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+1*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + mov r6d, mask_bitsm + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 6, h + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] +.end: +%if ARCH_X86_64 + pslld m12, 2 + lea lq, [lq+l_strideq*2] + shl mask_bitsd, 2 + sub hd, 2 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add lq, dword lstridem + add lq, dword lstridem + shl mask_bitsm, 2 + sub dword hm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET diff -Nru dav1d-0.9.0/src/x86/loopfilter_avx2.asm dav1d-0.9.1/src/x86/loopfilter_avx2.asm --- dav1d-0.9.0/src/x86/loopfilter_avx2.asm 2021-05-16 16:47:22.550950800 +0000 +++ dav1d-0.9.1/src/x86/loopfilter_avx2.asm 2021-07-28 21:38:28.905852000 +0000 @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. 
; @@ -822,9 +822,7 @@ pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m7 - por m8, m9 + vpblendvb m8, m7, m8, m1 %ifidn %2, v mova [tmpq+stride3q], m8 ; p4 %else @@ -850,9 +848,7 @@ pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m12 - por m8, m9 + vpblendvb m8, m12, m8, m1 %ifidn %2, v mova [tmpq+strideq*4], m8 ; p3 %else @@ -878,9 +874,7 @@ pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m13 - por m8, m9 + vpblendvb m8, m13, m8, m1 mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E @@ -910,9 +904,7 @@ pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m3 - por m8, m9 + vpblendvb m8, m3, m8, m1 mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F @@ -940,9 +932,7 @@ pmulhrsw m0, m10, [pw_2048] pmulhrsw m8, m11, [pw_2048] packuswb m0, m8 - pand m0, m1 - pandn m8, m1, m4 - por m0, m8 + vpblendvb m0, m4, m0, m1 mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G @@ -966,9 +956,7 @@ pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m5 - por m8, m9 + vpblendvb m8, m5, m8, m1 mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H @@ -985,9 +973,7 @@ pmulhrsw m2, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m2, m9 - pand m2, m1 - pandn m9, m1, m6 - por m2, m9 ; don't clobber q1/m6 since we need it in K + vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I ; write +2 @@ -1003,9 +989,7 @@ pmulhrsw m7, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m7, m9 - pand m7, m1 - pandn m9, m1, m14 - por m7, m9 ; don't clobber q2/m14 since we need it in K + vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J ; write +3 @@ -1021,9 +1005,7 @@ pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m15 - por m8, m9 + vpblendvb m8, m15, m8, m1 %ifidn %2, v mova [tmpq+mstrideq], m8 ; q3 %else @@ -1044,13 +1026,12 @@ pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 - pand m8, m1 %ifidn %2, v - pandn m9, m1, [tmpq+strideq*0] + mova m9, [tmpq+strideq*0] %else - pandn m9, m1, [rsp+15*32] + mova m9, [rsp+15*32] %endif - por m8, m9 + vpblendvb m8, m9, m8, m1 %ifidn %2, v mova [tmpq+strideq*0], m8 ; q4 %else @@ -1070,13 +1051,12 @@ pmulhrsw m10, [pw_2048] pmulhrsw m11, [pw_2048] packuswb m10, m11 - pand m10, m1 %ifidn %2, v - pandn m11, m1, [tmpq+strideq*1] + mova m11, [tmpq+strideq*1] %else - pandn m11, m1, [rsp+16*32] + mova m11, [rsp+16*32] %endif - por m10, m11 + vpblendvb m10, m11, m10, m1 %ifidn %2, v mova [tmpq+strideq*1], m10 ; q5 %else @@ -1109,9 +1089,7 @@ psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m13 - por m10, m8, m11 ; p2 + vpblendvb m10, m13, m8, m9 ; p2 %ifidn %2, v mova [tmpq+strideq*1], m10 ; p2 %endif @@ -1129,9 +1107,7 @@ psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m3 - por m8, m11 ; p1 + vpblendvb m8, m3, m8, m9 ; p1 %ifidn %2, v mova [tmpq+strideq*2], m8 ; p1 %else @@ -1151,9 
+1127,7 @@ psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m4 - por m8, m11 ; p0 + vpblendvb m8, m4, m8, m9 ; p0 %ifidn %2, v mova [tmpq+stride3q ], m8 ; p0 %else @@ -1175,9 +1149,7 @@ psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m5 - por m11, m8, m11 ; q0 + vpblendvb m11, m5, m8, m9 ; q0 %ifidn %2, v mova [dstq+strideq*0], m11 ; q0 %endif @@ -1195,9 +1167,7 @@ psrlw m8, m2, 3 psrlw m13, m7, 3 packuswb m8, m13 - pand m8, m9 - pandn m13, m9, m6 - por m13, m8, m13 ; q1 + vpblendvb m13, m6, m8, m9 ; q1 %ifidn %2, v mova [dstq+strideq*1], m13 ; q1 %endif @@ -1217,9 +1187,7 @@ psrlw m2, 3 psrlw m7, 3 packuswb m2, m7 - pand m2, m9 - pandn m7, m9, m14 - por m2, m7 ; q2 + vpblendvb m2, m14, m2, m9 ; q2 %ifidn %2, v mova [dstq+strideq*2], m2 ; q2 %else @@ -1380,9 +1348,7 @@ pmulhrsw m2, m0, [pw_4096] pmulhrsw m12, m1, [pw_4096] packuswb m2, m12 - pand m2, m9 - pandn m12, m9, m3 - por m2, m12 + vpblendvb m2, m3, m2, m9 %ifidn %2, v mova [tmpq+strideq*2], m2 ; p1 %endif @@ -1400,9 +1366,7 @@ pmulhrsw m12, m0, [pw_4096] pmulhrsw m13, m1, [pw_4096] packuswb m12, m13 - pand m12, m9 - pandn m13, m9, m4 - por m12, m13 + vpblendvb m12, m4, m12, m9 %ifidn %2, v mova [tmpq+stride3q], m12 ; p0 %endif @@ -1418,9 +1382,7 @@ pmulhrsw m14, m0, [pw_4096] pmulhrsw m13, m1, [pw_4096] packuswb m14, m13 - pand m14, m9 - pandn m13, m9, m5 - por m14, m13 + vpblendvb m14, m5, m14, m9 %ifidn %2, v mova [dstq+strideq*0], m14 ; q0 %endif @@ -1436,9 +1398,7 @@ pmulhrsw m0, [pw_4096] pmulhrsw m1, [pw_4096] packuswb m0, m1 - pand m0, m9 - pandn m9, m6 - por m0, m9 + vpblendvb m0, m6, m0, m9 %ifidn %2, v mova [dstq+strideq*1], m0 ; q1 %else @@ -1457,7 +1417,7 @@ %endmacro INIT_YMM avx2 -cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \ +cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp shl l_strideq, 2 @@ -1495,7 +1455,7 @@ RET INIT_YMM avx2 -cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \ +cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp shl l_strideq, 2 @@ -1535,7 +1495,7 @@ RET INIT_YMM avx2 -cglobal lpf_v_sb_uv, 7, 10, 16, \ +cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp shl l_strideq, 2 @@ -1566,7 +1526,7 @@ RET INIT_YMM avx2 -cglobal lpf_h_sb_uv, 7, 10, 16, \ +cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp shl l_strideq, 2 diff -Nru dav1d-0.9.0/src/x86/loopfilter_init_tmpl.c dav1d-0.9.1/src/x86/loopfilter_init_tmpl.c --- dav1d-0.9.0/src/x86/loopfilter_init_tmpl.c 2021-05-16 16:47:22.550950800 +0000 +++ dav1d-0.9.1/src/x86/loopfilter_init_tmpl.c 2021-07-28 21:38:28.905852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* @@ -29,14 +29,13 @@ #include "src/loopfilter.h" #define decl_loopfilter_sb_fns(ext) \ -decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_##ext); \ -decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_##ext); \ -decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_##ext); \ -decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext) +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext)) decl_loopfilter_sb_fns(ssse3); decl_loopfilter_sb_fns(avx2); -decl_loopfilter_sb_fns(16bpc_avx2); COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); @@ -44,25 +43,23 @@ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; #if BITDEPTH == 8 - c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_ssse3; - c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3; - c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3; - c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3; + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3); +#else + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3); #endif if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; #if ARCH_X86_64 -#if BITDEPTH == 8 - c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_avx2; - c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_avx2; - c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_avx2; - c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_avx2; -#else - c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_avx2; - c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_avx2; - c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_avx2; - c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_avx2; -#endif + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2); #endif } diff -Nru dav1d-0.9.0/src/x86/loopfilter_sse.asm dav1d-0.9.1/src/x86/loopfilter_sse.asm --- dav1d-0.9.0/src/x86/loopfilter_sse.asm 2021-05-16 16:47:22.550950800 +0000 +++ dav1d-0.9.1/src/x86/loopfilter_sse.asm 2021-07-28 21:38:28.909852300 +0000 @@ -1,4 +1,4 @@ -; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. 
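
The loopfilter_init_tmpl.c hunk above stops spelling out per-bitdepth symbol names such as dav1d_lpf_h_sb_y_ssse3 and dav1d_lpf_h_sb_y_16bpc_avx2 and instead routes every declaration and assignment through the BF() helper, so the bitdepth suffix is attached by the preprocessor and the same template body can be compiled once for 8 bpc and once for 16 bpc. BF() is defined elsewhere in the dav1d tree; the stand-alone sketch below only approximates how such a bitdepth-suffix macro can be built (assumed shape, hypothetical MY_ prefix, not dav1d's actual definition). The asm hunks that follow rename the cglobal entry points to lpf_*_sb_*_8bpc so the suffixed names resolve.

    #include <stdio.h>

    #define BITDEPTH 16                 /* the template is built once per bitdepth */
    #define MY_CAT(a, b)  a##b
    #define MY_XCAT(a, b) MY_CAT(a, b)
    #if BITDEPTH == 8
    #define MY_BF(name, ext) MY_XCAT(name, _8bpc_##ext)
    #else
    #define MY_BF(name, ext) MY_XCAT(name, _16bpc_##ext)
    #endif

    /* MY_BF(lpf_h_sb_y, ssse3) expands to lpf_h_sb_y_16bpc_ssse3 here. */
    static void MY_BF(lpf_h_sb_y, ssse3)(void) { puts("16bpc ssse3 stub"); }

    int main(void) {
        MY_BF(lpf_h_sb_y, ssse3)();
        return 0;
    }
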
; @@ -1977,11 +1977,11 @@ INIT_XMM ssse3 %if ARCH_X86_64 -cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \ +cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits %else -cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \ +cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS w SETUP_PIC @@ -2075,11 +2075,11 @@ INIT_XMM ssse3 %if ARCH_X86_64 -cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \ +cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits %else -cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \ +cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS h SETUP_PIC @@ -2179,11 +2179,11 @@ INIT_XMM ssse3 %if ARCH_X86_64 -cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \ +cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits %else -cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \ +cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS w SETUP_PIC @@ -2261,11 +2261,11 @@ INIT_XMM ssse3 %if ARCH_X86_64 -cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \ +cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits %else -cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \ +cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS h SETUP_PIC diff -Nru dav1d-0.9.0/src/x86/looprestoration16_avx2.asm dav1d-0.9.1/src/x86/looprestoration16_avx2.asm --- dav1d-0.9.0/src/x86/looprestoration16_avx2.asm 2021-05-16 16:47:22.550950800 +0000 +++ dav1d-0.9.1/src/x86/looprestoration16_avx2.asm 2021-07-28 21:38:28.909852300 +0000 @@ -56,7 +56,7 @@ pd_25: dd 25 pd_4096: dd 4096 pd_34816: dd 34816 -pd_m262128 dd -262128 +pd_m262128: dd -262128 pd_0xf00800a4: dd 0xf00800a4 pd_0xf00801c7: dd 0xf00801c7 diff -Nru dav1d-0.9.0/src/x86/looprestoration16_sse.asm dav1d-0.9.1/src/x86/looprestoration16_sse.asm --- dav1d-0.9.0/src/x86/looprestoration16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/looprestoration16_sse.asm 2021-07-28 21:38:28.909852300 +0000 @@ -0,0 +1,3778 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 +wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 +wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 +sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +pb_m14_m13: times 8 db -14,-13 +pb_m10_m9: times 8 db -10, -9 +pb_m6_m5: times 8 db -6, -5 +pb_m2_m1: times 8 db -2, -1 +pb_2_3: times 8 db 2, 3 +pb_6_7: times 8 db 6, 7 +pw_25: times 8 dw 25 +pw_256: times 8 dw 256 +pw_1023: times 8 dw 1023 +pd_8: times 4 dd 8 +pd_4096: times 4 dd 4096 +pd_34816: times 4 dd 34816 +pd_m262128: times 4 dd -262128 +pd_0xffff: times 4 dd 0xffff +pd_0xf00800a4: times 4 dd 0xf00800a4 +pd_0xf00801c7: times 4 dd 0xf00801c7 + +wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192 +wiener_round: dd 1049600, 1048832 + +cextern sgr_x_by_x + +SECTION .text + +%macro movif64 2 ; dst, src + %if ARCH_X86_64 + mov %1, %2 + %endif +%endmacro + +%macro movif32 2 ; dst, src + %if ARCH_X86_32 + mov %1, %2 + %endif +%endmacro + +INIT_XMM ssse3 +%if ARCH_X86_32 +DECLARE_REG_TMP 4, 6 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 14*16 + %else + %assign extra_stack 12*16 + %endif +cglobal wiener_filter7_16bpc, 5, 7, 8, -384*12-16-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, flt + %if STACK_ALIGNMENT < 16 + %define lpfm dword [esp+calloff+16*12+ 0] + %define lpf_stridem dword [esp+calloff+16*12+ 4] + %define wm dword [esp+calloff+16*12+ 8] + %define hd dword [esp+calloff+16*12+12] + %define edgeb byte [esp+calloff+16*12+16] + %define edged dword [esp+calloff+16*12+16] + %else + %define hd dword r6m + %define edgeb byte r8m + %endif + %define PICmem dword [esp+calloff+4*0] + %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers + %define t1m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define t5m dword [esp+calloff+4*6] + %define t6m dword [esp+calloff+4*7] + %define t2 t2m + %define t3 t3m + %define t4 t4m + %define t5 t5m + %define t6 t6m + %define m8 [esp+calloff+16*2] + %define m9 [esp+calloff+16*3] + %define m10 [esp+calloff+16*4] + %define m11 [esp+calloff+16*5] + %define m12 [esp+calloff+16*6] + %define m13 [esp+calloff+16*7] + %define m14 [esp+calloff+16*8] + %define m15 [esp+calloff+16*9] + %define r10 r5 + %define base t0-wiener_shifts + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov wd, [rstk+stack_offset+24] + mov lpf_stridem, lpf_strideq + mov wm, wd + mov r4, 
[rstk+stack_offset+28] + mov hd, r4 + mov r4, [rstk+stack_offset+36] + mov edged, r4 ; edge + %endif +%else +DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers +cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h + %define base +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov fltq, fltmp + mov edged, r8m + mov hd, r6m + mov t3d, r9m ; pixel_max + movq m13, [fltq] + movq m15, [fltq+16] +%else + %if STACK_ALIGNMENT < 16 + mov t0, [rstk+stack_offset+32] + mov t1, [rstk+stack_offset+40] ; pixel_max + movq m1, [t0] ; fx + movq m3, [t0+16] ; fy + LEA t0, wiener_shifts + %else + LEA t0, wiener_shifts + mov fltq, r7m + movq m1, [fltq] + movq m3, [fltq+16] + mov t1, r9m ; pixel_max + %endif + mov PICmem, t0 +%endif + mova m6, [base+wiener_shufA] + mova m7, [base+wiener_shufB] +%if ARCH_X86_64 + lea t4, [wiener_shifts] + add wd, wd + pshufd m12, m13, q0000 ; x0 x1 + pshufd m13, m13, q1111 ; x2 x3 + pshufd m14, m15, q0000 ; y0 y1 + pshufd m15, m15, q1111 ; y2 y3 + mova m8, [wiener_shufC] + mova m9, [wiener_shufD] + add lpfq, wq + lea t1, [rsp+wq+16] + add dstq, wq + neg wq + shr t3d, 11 + %define base t4-wiener_shifts + movd m10, [base+wiener_round+t3*4] + movq m11, [base+wiener_shifts+t3*8] + pshufd m10, m10, q0000 + pshufd m0, m11, q0000 + pshufd m11, m11, q1111 + pmullw m12, m0 ; upshift filter coefs to make the + pmullw m13, m0 ; horizontal downshift constant + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp+0] + %define lpf_stridem [rsp+8] + %define base + %define wiener_lshuf7_mem [wiener_lshuf7] + %define pd_m262128_mem [pd_m262128] +%else + add wd, wd + mova m4, [base+wiener_shufC] + mova m5, [base+wiener_shufD] + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + pshufd m2, m3, q0000 + pshufd m3, m3, q1111 + mova m8, m4 + mova m9, m5 + mova m14, m2 + mova m15, m3 + shr t1, 11 + add lpfq, wq + mova m3, [base+pd_m262128] + movd m4, [base+wiener_round+t1*4] + movq m5, [base+wiener_shifts+t1*8] + lea t1, [esp+extra_stack+wq+16] + add dstq, wq + neg wq + pshufd m4, m4, q0000 + pshufd m2, m5, q0000 + pshufd m5, m5, q1111 + mov wm, wq + pmullw m0, m2 + pmullw m1, m2 + mova m2, [base+wiener_lshuf7] + %define pd_m262128_mem [esp+calloff+16*10] + mova pd_m262128_mem, m3 + mova m10, m4 + mova m11, m5 + mova m12, m0 + mova m13, m1 + %define wiener_lshuf7_mem [esp+calloff+16*11] + mova wiener_lshuf7_mem, m2 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + add lpfq, lpf_stridem +%endif + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + movif64 lpf_stridem, lpf_strideq + add r10, lpf_strideq + mov lpfm, r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, lpfm + call .hv_bottom + add lpfq, lpf_stridem + call .hv_bottom +.v1: + call .v + RET +.no_top: + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + movif64 lpf_stridem, lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov 
t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v + movif32 wq, wm +.v2: + call .v + movif32 wq, wm + jmp .v1 +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movif32 t0, PICmem + pxor m0, m0 + movd m1, wd + mova m2, [base+pb_0to15] + pshufb m1, m0 + mova m0, [base+pb_6_7] + psubb m0, m1 + pminub m0, m2 + pshufb m3, m0 + mova m0, [base+pb_m2_m1] + psubb m0, m1 + pminub m0, m2 + pshufb m4, m0 + mova m0, [base+pb_m10_m9] + psubb m0, m1 + pminub m0, m2 + pshufb m5, m0 + movif32 t0, t0m + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: + movif64 wq, r5 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq m3, [leftq] + movhps m3, [lpfq+wq] + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, wiener_lshuf7_mem ; before the start of the buffer + jmp .h_main +.h_top: + movif64 wq, r5 + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+wq-8] +.h_main: + mova m4, [lpfq+wq+0] + movu m5, [lpfq+wq+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18) + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+wq], m0 + add wq, 16 + jl .h_loop + movif32 wq, wm + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq m3, [leftq] + movhps m3, [lpfq+wq] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m3, [lpfq+wq] + pshufb m3, wiener_lshuf7_mem + jmp .hv_main +.hv_bottom: + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+wq-8] +.hv_main: + mova m4, [lpfq+wq+0] + movu m5, [lpfq+wq+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t1, t4m + movif32 t0, t2m + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + mova m2, pd_m262128_mem + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 +%if ARCH_X86_64 + mova m2, [t4+wq] + paddw m2, [t2+wq] + mova m5, [t3+wq] +%else + mova m2, [t1+wq] + paddw m2, [t0+wq] + mov t1, t3m + mov t0, t5m + mova m5, [t1+wq] + mov t1, t1m +%endif + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 +%if ARCH_X86_64 + mova m4, [t5+wq] + paddw m4, [t1+wq] + psraw m0, 1 + paddw m3, m0, [t6+wq] +%else + mova m4, [t0+wq] + paddw m4, [t1+wq] + mov t0, t0m + mov t1, t6m + psraw m0, 1 + paddw m3, m0, [t1+wq] +%endif + mova [t0+wq], m0 + punpcklwd m0, m2, m5 + pmaddwd m0, m15 + punpckhwd m2, m5 + pmaddwd m2, m15 + punpcklwd m1, 
m3, m4 + pmaddwd m1, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m2, m10 + paddd m0, m1 + paddd m2, m3 + psrad m0, 6 + psrad m2, 6 + packssdw m0, m2 + pmulhw m0, m11 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .hv_loop +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 +%else + mov r5, t5m + mov t1, t4m + mov t6m, r5 + mov t5m, t1 + mov r5, t3m + mov t1, t2m + mov t4m, r5 + mov t3m, t1 + mov r5, t1m + mov t1, t0 + mov t2m, r5 + mov t0, t6m + mov wq, wm +%endif + add dstq, dst_strideq + ret +.v: + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 +.v_loop: +%if ARCH_X86_64 + mova m1, [t4+wq] + paddw m1, [t2+wq] + mova m2, [t3+wq] + mova m4, [t1+wq] + paddw m3, m4, [t6+wq] + paddw m4, [t5+wq] +%else + mov t0, t4m + mov t1, t2m + mova m1, [t0+wq] + paddw m1, [t1+wq] + mov t0, t3m + mov t1, t1m + mova m2, [t0+wq] + mova m4, [t1+wq] + mov t0, t6m + mov t1, t5m + paddw m3, m4, [t0+wq] + paddw m4, [t1+wq] +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m15 + punpckhwd m1, m2 + pmaddwd m1, m15 + punpcklwd m2, m3, m4 + pmaddwd m2, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m1, m10 + paddd m0, m2 + paddd m1, m3 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pmulhw m0, m11 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .v_loop +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 +%else + mov t0, t5m + mov t1, t4m + mov r5, t3m + mov t6m, t0 + mov t5m, t1 + mov t4m, r5 + mov r5, t2m + mov t1, t1m + mov t0, t0m + mov t3m, r5 + mov t2m, t1 +%endif + add dstq, dst_strideq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign stack_size 12*16+384*8 + %else + %assign stack_size 11*16+384*8 + %endif +cglobal wiener_filter5_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ + lpf, lpf_stride, w, flt + %if STACK_ALIGNMENT < 16 + %define lpfm dword [esp+calloff+4*6] + %define lpf_stridem dword [esp+calloff+4*7] + %define wm dword [esp+calloff+16*10+0] + %define hd dword [esp+calloff+16*10+4] + %define edgeb byte [esp+calloff+16*10+8] + %define edged dword [esp+calloff+16*10+8] + %else + %define hd dword r6m + %define edgeb byte r8m + %endif + %define PICmem dword [esp+calloff+4*0] + %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers + %define t1m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define t2 t2m + %define t3 t3m + %define t4 t4m + %define m8 [esp+calloff+16*2] + %define m9 [esp+calloff+16*3] + %define m10 [esp+calloff+16*4] + %define m11 [esp+calloff+16*5] + %define m12 [esp+calloff+16*6] + %define m13 [esp+calloff+16*7] + %define m14 [esp+calloff+16*8] + %define m15 [esp+calloff+16*9] + %define base t0-wiener_shifts + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov wd, [rstk+stack_offset+24] + mov lpf_stridem, lpf_strideq + mov wm, wd + mov r4, [rstk+stack_offset+28] + mov hd, r4 + mov r4, [rstk+stack_offset+36] + mov edged, r4 ; edge + %endif +%else +cglobal wiener_filter5_16bpc, 5, 14, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h + %define base +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov fltq, fltmp + mov edged, r8m + mov hd, r6m + mov t3d, r9m ; pixel_max + movq m12, [fltq] + movq m14, [fltq+16] +%else + %if STACK_ALIGNMENT < 16 + mov t0, [rstk+stack_offset+32] + mov t1, [rstk+stack_offset+40] ; pixel_max + movq m1, 
[t0] ; fx + movq m3, [t0+16] ; fy + LEA t0, wiener_shifts + %else + LEA t0, wiener_shifts + mov fltq, r7m + movq m1, [fltq] + movq m3, [fltq+16] + mov t1, r9m ; pixel_max + %endif + mov PICmem, t0 +%endif + mova m5, [base+wiener_shufE] + mova m6, [base+wiener_shufB] + mova m7, [base+wiener_shufD] +%if ARCH_X86_64 + lea t4, [wiener_shifts] + add wd, wd + punpcklwd m11, m12, m12 + pshufd m11, m11, q1111 ; x1 + pshufd m12, m12, q1111 ; x2 x3 + punpcklwd m13, m14, m14 + pshufd m13, m13, q1111 ; y1 + pshufd m14, m14, q1111 ; y2 y3 + shr t3d, 11 + mova m8, [pd_m262128] ; (1 << 4) - (1 << 18) + add lpfq, wq + lea t1, [rsp+wq+16] + add dstq, wq + neg wq + %define base t4-wiener_shifts + movd m9, [base+wiener_round+t3*4] + movq m10, [base+wiener_shifts+t3*8] + pshufd m9, m9, q0000 + pshufd m0, m10, q0000 + pshufd m10, m10, q1111 + mova m15, [wiener_lshuf5] + pmullw m11, m0 + pmullw m12, m0 + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp+0] + %define lpf_stridem [rsp+8] + %define base +%else + add wd, wd + punpcklwd m0, m1, m1 + pshufd m0, m0, q1111 ; x1 + pshufd m1, m1, q1111 ; x2 x3 + punpcklwd m2, m3, m3 + pshufd m2, m2, q1111 ; y1 + pshufd m3, m3, q1111 ; y2 y3 + mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18) + mova m13, m2 + mova m14, m3 + mova m8, m4 + shr t1, 11 + add lpfq, wq + movd m2, [base+wiener_round+t1*4] + movq m3, [base+wiener_shifts+t1*8] + %if STACK_ALIGNMENT < 16 + lea t1, [esp+16*11+wq+16] + %else + lea t1, [esp+16*10+wq+16] + %endif + add dstq, wq + neg wq + pshufd m2, m2, q0000 + pshufd m4, m3, q0000 + pshufd m3, m3, q1111 + mov wm, wq + pmullw m0, m4 + pmullw m1, m4 + mova m4, [base+wiener_lshuf5] + mova m9, m2 + mova m10, m3 + mova m11, m0 + mova m12, m1 + mova m15, m4 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + add lpfq, lpf_stridem +%endif + mov t4, t1 + add t1, 384*2 + call .h_top + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + movif64 lpf_stridem, lpf_strideq + add r10, lpf_strideq + mov lpfm, r10 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, lpfm + call .hv_bottom + add lpfq, lpf_stridem + call .hv_bottom +.end: + RET +.no_top: + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + movif64 lpf_stridem, lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v +%if ARCH_X86_64 + mov t4, t3 + mov t3, t2 + mov t2, t1 +%else + mov t0, t3m + mov r5, t2m + mov t1, t1m + mov t4m, t0 + mov t3m, r5 + mov t2m, t1 + mov wq, wm +%endif + add dstq, dst_strideq +.v1: + call .v + jmp .end +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movif32 t0, PICmem + pxor m1, m1 + movd m2, wd + mova m0, [base+pb_2_3] + pshufb m2, m1 + mova m1, [base+pb_m6_m5] + psubb m0, m2 + psubb m1, m2 + mova m2, [base+pb_0to15] + pminub m0, m2 + pminub m1, m2 + pshufb m3, m0 + pshufb m4, m1 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: + movif64 wq, r5 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz 
.h_extend_left + mova m4, [lpfq+wq] + movd m3, [leftq+4] + pslldq m4, 4 + por m3, m4 + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, m15 ; before the start of the buffer + jmp .h_main +.h_top: + movif64 wq, r5 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+wq-4] +.h_main: + movu m4, [lpfq+wq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + paddd m1, m3 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+wq], m0 + add wq, 16 + jl .h_loop + movif32 wq, wm + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + mova m4, [lpfq+wq] + movd m3, [leftq+4] + pslldq m4, 4 + por m3, m4 + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m3, [lpfq+wq] + pshufb m3, m15 + jmp .hv_main +.hv_bottom: + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+wq-4] +.hv_main: + movu m4, [lpfq+wq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t1, t1m + movif32 t0, t3m + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 +%if ARCH_X86_64 + mova m2, [t3+wq] + paddw m2, [t1+wq] + paddd m1, m3 + mova m4, [t2+wq] +%else + mova m2, [t0+wq] + mov t0, t2m + paddw m2, [t1+wq] + mov t1, t4m + paddd m1, m3 + mova m4, [t0+wq] + mov t0, t0m +%endif + punpckhwd m3, m2, m4 + pmaddwd m3, m14 + punpcklwd m2, m4 +%if ARCH_X86_64 + mova m4, [t4+wq] +%else + mova m4, [t1+wq] +%endif + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + pmaddwd m2, m14 + psraw m0, 1 + mova [t0+wq], m0 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 6 + psrad m0, 6 + packssdw m0, m1 + pmulhw m0, m10 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .hv_loop +%if ARCH_X86_64 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 +%else + mov r5, t3m + mov t1, t2m + mov t4m, r5 + mov t3m, t1 + mov r5, t1m + mov t1, t0 + mov t2m, r5 + mov t0, t4m + mov wq, wm +%endif + add dstq, dst_strideq + ret +.v: + movif64 wq, r5 + movif32 t1m, t1 +.v_loop: +%if ARCH_X86_64 + mova m0, [t1+wq] + paddw m2, m0, [t3+wq] + mova m1, [t2+wq] + mova m4, [t4+wq] +%else + mov t0, t3m + mova m0, [t1+wq] + mov t1, t2m + paddw m2, m0, [t0+wq] + mov t0, t4m + mova m1, [t1+wq] + mova m4, [t0+wq] +%endif + punpckhwd m3, m2, m1 + pmaddwd m3, m14 + punpcklwd m2, m1 + pmaddwd m2, m14 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 6 + psrad m0, 6 + packssdw m0, m1 + pmulhw m0, m10 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 +%if ARCH_X86_64 + jl .v_loop +%else + jge .v_end + mov t1, t1m + jmp .v_loop 
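
Annotation: the per-row bookkeeping in the .hv/.v blocks above advances the 5-row (or 7-row) vertical filter window by renaming the t0..t4 (or t0..t6) row-buffer pointers rather than copying any row data. A minimal scalar sketch of the idea, with illustrative names that are not taken from the dav1d sources:

    #include <stdint.h>

    enum { ROWS = 5 };                 /* t0..t4 in the 5-tap filter above */

    /* Slide the vertical window down one line: every pointer is renamed to
     * the next-older row, and the slot that fell out of the window becomes
     * the scratch buffer for the next incoming row. */
    static void slide_window(int16_t *t[ROWS]) {
        int16_t *spare = t[ROWS - 1];  /* oldest row leaves the window */
        for (int i = ROWS - 1; i > 0; i--)
            t[i] = t[i - 1];           /* mirrors the mov t4,t3 / t3,t2 / ... chain */
        t[0] = spare;                  /* recycled as the next write target */
    }

The asm picks the recycled slot slightly differently (it overwrites a row it is about to consume, which is safe because each 16-pixel chunk reads that row before storing over it), but the principle is the same: the vertical window moves by pointer rotation only.
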
+.v_end: +%endif + ret + +%macro GATHERDD 3 ; dst, src, tmp + movd %3d, %2 + %if ARCH_X86_64 + movd %1, [r13+%3] + pextrw %3d, %2, 2 + pinsrw %1, [r13+%3+2], 3 + pextrw %3d, %2, 4 + pinsrw %1, [r13+%3+2], 5 + pextrw %3d, %2, 6 + pinsrw %1, [r13+%3+2], 7 + %else + movd %1, [base+sgr_x_by_x-0xf03+%3] + pextrw %3, %2, 2 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 + pextrw %3, %2, 4 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 + pextrw %3, %2, 6 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 + %endif +%endmacro + +%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore + %if ARCH_X86_64 + %define tmp r14 + %else + %define tmp %4 + %endif + GATHERDD %1, %2, tmp + GATHERDD %2, %3, tmp + movif32 %4, %5 + psrld %1, 24 + psrld %2, 24 + packssdw %1, %2 +%endmacro + +%macro MAXSD 3-4 0 ; dst, src, restore_tmp + pcmpgtd %3, %1, %2 + pand %1, %3 + pandn %3, %2 + por %1, %3 + %if %4 == 1 + pxor %3, %3 + %endif +%endmacro + +%macro MULLD 3 ; dst, src, tmp + pmulhuw %3, %1, %2 + pmullw %1, %2 + pslld %3, 16 + paddd %1, %3 +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 0, 1, 2, 3, 4 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 5*16 + %else + %assign extra_stack 3*16 + %endif +cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, params, h + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*0+4*6] + %define dst_stridemp dword [esp+calloff+16*3+4*7] + %define leftm dword [esp+calloff+16*3+4*0] + %define lpfm dword [esp+calloff+16*3+4*1] + %define lpf_stridem dword [esp+calloff+16*3+4*2] + %define w0m dword [esp+calloff+16*3+4*3] + %define hd dword [esp+calloff+16*3+4*4] + %define edgeb byte [esp+calloff+16*3+4*5] + %define edged dword [esp+calloff+16*3+4*5] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r6m + %define edgeb byte r8m + %define edged dword r8m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t0m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define m8 [base+pd_8] + %define m9 [base+pw_25] + %define m10 [esp+calloff+16*2] + %define m11 [base+pd_0xf00800a4] + %define m12 [base+pw_256] + %define m13 [base+pd_34816] + %define m14 [base+pw_1023] + %define m15 [base+sgr_lshuf5] + %define r10 r5 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov dst_strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov lpf_strideq, [rstk+stack_offset+20] + mov wd, [rstk+stack_offset+24] + mov dstm, dstq + mov dst_stridemp, dst_strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+28] + mov r2, [rstk+stack_offset+36] + mov lpfm, lpfq + mov lpf_stridem, lpf_strideq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_5x5_16bpc, 5, 15, 16, -400*24-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, paramsmp + lea r13, [sgr_x_by_x-0xf03] + mov edged, r8m + add wd, wd + mov hd, r6m + movu m10, [paramsq] + mova m12, [pw_256] + add lpfq, wq + mova m8, [pd_8] + lea t1, [rsp+wq+20] + mova m9, [pw_25] + add dstq, wq + lea t3, [rsp+wq*2+400*12+16] + mova m11, [pd_0xf00800a4] + lea t4, [rsp+wq+400*20+16] + pshufhw m7, m10, q0000 + pshufb m10, m12 ; s0 + punpckhqdq m7, m7 ; w0 + neg wq + mova m13, [pd_34816] ; (1 << 11) + (1 << 15) + pxor m6, m6 + mova m14, [pw_1023] + psllw m7, 4 + 
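
Annotation: the helper macros defined just above work around instructions missing before SSE4.1. MAXSD builds a per-lane signed 32-bit maximum (pmaxsd is SSE4.1) from a compare mask and a blend, MULLD builds a 32-bit multiply (pmulld is SSE4.1) from 16-bit multiplies, relying on the multiplier being a 16-bit value replicated into both halves of each dword (pw_25, the s parameters and the gathered x values are all stored that way), and GATHERDD/GATHER_X_BY_X emulate a per-lane table gather with scalar pextrw/pinsrw accesses. A rough per-lane C equivalent of the first two, illustrative only and not taken from the dav1d sources:

    #include <stdint.h>

    /* MAXSD: dst = max(dst, src), done as pcmpgtd + pand + pandn + por. */
    static int32_t maxsd_lane(int32_t dst, int32_t src) {
        int32_t mask = -(dst > src);        /* all ones if dst > src, else 0 */
        return (dst & mask) | (src & ~mask);
    }

    /* MULLD: dst = (dst * mul) mod 2^32, where mul is a 16-bit constant kept
     * replicated in both halves of each dword.  pmullw/pmulhuw produce the
     * low product, pmullw on the high half plus pslld 16 adds the carry term. */
    static uint32_t mulld_lane(uint32_t dst, uint16_t mul) {
        uint32_t lo = (dst & 0xffff) * mul;          /* pmullw + pmulhuw pair */
        uint32_t hi = ((dst >> 16) * mul) & 0xffff;  /* low 16 bits of pmullw */
        return lo + (hi << 16);                      /* pslld 16 + paddd */
    }
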
mova m15, [sgr_lshuf5] + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp+0] + %define lpf_stridem [rsp+8] +%else + mov r1, [rstk+stack_offset+32] ; params + LEA r6, $$ + add wd, wd + movu m1, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*20+16] + mov t3m, t3 + pshufhw m7, m1, q0000 + mov t4m, t4 + pshufb m1, m12 ; s0 + punpckhqdq m7, m7 ; w0 + psllw m7, 4 + neg wq + mova m10, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 4 + mov lpfq, lpfm + mov lpf_strideq, lpf_stridem + mov w0m, wd +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + movif32 t2m, t1 + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + movif64 lpf_stridem, lpf_strideq + add r10, lpf_strideq + mov lpfm, r10 ; below + movif32 t0m, t2 + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, dst_stridemp + movif32 t4, t4m + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp +%if ARCH_X86_64 + test hb, hb +%else + mov r5, hd + test r5, r5 +%endif + jz .odd_height + call .h + add lpfq, dst_stridemp + call .hv + movif32 dstq, dstm + call .n0 + call .n1 + sub hd, 2 + movif32 t0, t0m + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .h_top + add lpfq, lpf_stridem + call .hv_bottom +.end: + movif32 dstq, dstm + call .n0 + call .n1 +.end2: + RET +.height1: + movif32 t4, t4m + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + movif32 dstq, dstm + call .n0 + call .n1 +.odd_height_end: + call .v + movif32 dstq, dstm + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + movif64 lpf_stridem, lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h + lea t2, [t1+400*6] + movif32 t2m, t2 + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + movif32 t0m, t0 + jmp .main +.no_top_height1: + movif32 t3, t3m + movif32 t4, t4m + call .v + call .prep_n + jmp .odd_height_end +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movd m1, wd + mova m3, [base+pb_m14_m13] + mova m0, [base+pb_0to15] + pshufb m1, m6 + psubb m2, m12, m1 + psubb m3, m1 + movd m1, [lpfq-2] + pcmpgtb m2, m0 + pcmpgtb m3, m0 + pshufb m1, m12 + pand m4, m2 + pand m5, m3 + pandn m2, m1 + pandn m3, m1 + por m4, m2 + por m5, m3 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r5-4] +%else + %define leftq r5 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m15 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r5-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq- 2] +.h_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m2, m5, m4, 2 + paddw m0, m4, m2 + palignr m3, m5, m4, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + palignr m5, m4, 8 + paddw m0, 
m5 + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + paddd m1, m3 + punpckhwd m3, m4, m5 + pmaddwd m3, m3 + shufps m4, m5, q2121 + paddw m0, m4 ; sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m2, m3 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+wq+400*0] + paddd m1, [t1+wq+400*2] + paddd m2, [t1+wq+400*4] +.h_loop_end: + paddd m1, m5 ; sumsq + paddd m2, m4 + mova [t1+wq+400*0], m0 + mova [t1+wq+400*2], m1 + mova [t1+wq+400*4], m2 + add wq, 16 + jl .h_loop + ret +.top_fixup: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+wq+400*0] + mova m1, [t1+wq+400*2] + mova m2, [t1+wq+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m1 + mova [t2+wq+400*4], m2 + add wq, 16 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv_main +.hv_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m15 + jmp .hv_main +.hv_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv_loop_start +%endif +.hv_loop: + movif32 lpfq, hvsrcm +.hv_loop_start: + movu m4, [lpfq+wq- 2] +.hv_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -20 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t3, hd + palignr m3, m5, m4, 2 + paddw m0, m4, m3 + palignr m1, m5, m4, 6 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + palignr m5, m4, 8 + paddw m0, m5 + punpcklwd m1, m4, m5 + pmaddwd m1, m1 + paddd m2, m1 + punpckhwd m1, m4, m5 + pmaddwd m1, m1 + shufps m4, m5, q2121 + paddw m0, m4 ; h sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m3, m1 + paddd m2, m5 ; h sumsq + paddd m3, m4 + paddw m1, m0, [t1+wq+400*0] + paddd m4, m2, [t1+wq+400*2] + paddd m5, m3, [t1+wq+400*4] +%if ARCH_X86_64 + test hd, hd +%else + test t3, t3 +%endif + jz .hv_last_row +.hv_main2: + paddw m1, [t2+wq+400*0] ; hv sum + paddd m4, [t2+wq+400*2] ; hv sumsq + paddd m5, [t2+wq+400*4] + mova [t0+wq+400*0], m0 + mova [t0+wq+400*2], m2 + mova [t0+wq+400*4], m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + MULLD m4, m9, m0 ; a * 25 + MULLD m5, m9, m0 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m6 + MAXSD m5, m3, m6, 1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m10, m2 ; p * s + MULLD m5, m10, m2 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, t2, t2m + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m5 + MULLD m1, m4, m5 + psubw m5, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+wq+4], m5 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + movif32 t2m, t2 + 
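
Annotation: the .hv/.v blocks above convert the 5x5 box sums into the self-guided filter's per-pixel coefficients. Following the inline comments (a * 25, p, p * s, min(z, 255), b * 164, the 34816 = (1 << 11) + (1 << 15) bias and the final >> 12), a hedged scalar sketch of what each SIMD lane computes is given below. The function name and the x_by_x argument (standing in for the sgr_x_by_x table that GATHER_X_BY_X reads) are illustrative, and the exact rounding/saturation of the z clamp done with paddusw is not reproduced:

    #include <stdint.h>

    /* One lane of the 5x5 (n = 25) box: sum is the box sum of pixels, sumsq the
     * box sum of squared pixels, s the strength parameter from the SGR params. */
    static void sgr_ab_5x5(uint32_t sum, uint32_t sumsq, uint32_t s,
                           const uint8_t x_by_x[256],
                           uint16_t *a_out, uint32_t *b_out) {
        uint32_t a = (sumsq + 8) >> 4;                       /* "(a + 8) >> 4" */
        uint32_t b = (sum + 2) >> 2;                         /* "(b + 2) >> 2" */
        uint32_t p = a * 25 > b * b ? a * 25 - b * b : 0;    /* MAXSD + psubd */
        uint32_t z = (p * s) >> 20;         /* rounding/saturation details omitted */
        if (z > 255) z = 255;                                /* "min(z, 255)" */
        uint32_t x = x_by_x[z];
        *a_out = (uint16_t)(256 - x);                        /* psubw with pw_256 */
        *b_out = (x * b * 164 + (1 << 11) + (1 << 15)) >> 12;
    }
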
movif32 t0m, t0 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+wq+400*0], m1 + paddw m1, m0 + mova [t1+wq+400*2], m4 + paddd m4, m2 + mova [t1+wq+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v_loop: + mova m0, [t1+wq+400*0] + mova m2, [t1+wq+400*2] + mova m3, [t1+wq+400*4] + paddw m1, m0, [t2+wq+400*0] + paddd m4, m2, [t2+wq+400*2] + paddd m5, m3, [t2+wq+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + MULLD m4, m9, m0 ; a * 25 + MULLD m5, m9, m0 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m6 + MAXSD m5, m3, m6, 1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m10, m2 ; p * s + MULLD m5, m10, m2 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, t2, t2m + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m5 + MULLD m1, m4, m5 + psubw m5, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+wq+4], m5 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + movif64 wq, r5 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+ 2] + movu m3, [t4+wq*1+ 4] + movu m1, [t3+wq*2+ 4] + movu m4, [t3+wq*2+ 8] + movu m2, [t3+wq*2+20] + movu m5, [t3+wq*2+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*1+ 0] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + mova [t4+wq*1+400*2+ 0], m0 + mova [t3+wq*2+400*4+ 0], m1 + mova [t3+wq*2+400*4+16], m2 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r5 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*1+ 2] + movu m3, [t4+wq*1+ 4] + movu m1, [t3+wq*2+ 4] + movu m4, [t3+wq*2+ 8] + movu m2, [t3+wq*2+20] + movu m5, [t3+wq*2+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*1+ 0] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + paddw m3, m0, [t4+wq*1+400*2+ 0] + paddd m4, m1, [t3+wq*2+400*4+ 0] + paddd m5, m2, [t3+wq*2+400*4+16] + mova [t4+wq*1+400*2+ 0], m0 + mova [t3+wq*2+400*4+ 0], m1 + mova [t3+wq*2+400*4+16], m2 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 8) + paddd m3, m5 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, dst_stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r5 + movif32 wd, w1m +.n1_loop: + mova m0, [dstq+wq] + mova m3, [t4+wq*1+400*2+ 0] + mova m4, [t3+wq*2+400*4+ 0] + mova m5, [t3+wq*2+400*4+16] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, 
m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 7) + paddd m3, m5 + psrld m2, 8 + psrld m3, 8 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, dst_stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 4*16 + %else + %assign extra_stack 2*16 + %endif +cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, params, h + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*2+4*0] + %define dst_stridemp dword [esp+calloff+16*2+4*1] + %define leftm dword [esp+calloff+16*2+4*2] + %define lpfm dword [esp+calloff+16*2+4*3] + %define lpf_stridem dword [esp+calloff+16*2+4*4] + %define w0m dword [esp+calloff+16*2+4*5] + %define hd dword [esp+calloff+16*2+4*6] + %define edgeb byte [esp+calloff+16*2+4*7] + %define edged dword [esp+calloff+16*2+4*7] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r6m + %define edgeb byte r8m + %define edged dword r8m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %define m8 [base+pd_8] + %define m9 [esp+calloff+16*1] + %define m10 [base+pd_0xf00801c7] + %define m11 [base+pd_34816] + %define m12 [base+pw_256] + %define m13 [base+pw_1023] + %define m14 [base+sgr_lshuf3] + %define m15 m6 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov dst_strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov lpf_strideq, [rstk+stack_offset+20] + mov wd, [rstk+stack_offset+24] + mov dstm, dstq + mov dst_stridemp, dst_strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+28] + mov r2, [rstk+stack_offset+36] + mov lpfm, lpfq + mov lpf_stridem, lpf_strideq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_3x3_16bpc, 5, 15, 16, 400*42+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, paramsmp + lea r13, [sgr_x_by_x-0xf03] + mov edged, r8m + add wd, wd + mov hd, r6m + movq m9, [paramsq+4] + mova m12, [pw_256] + add lpfq, wq + lea t1, [rsp+wq+12] + mova m8, [pd_8] + add dstq, wq + lea t3, [rsp+wq*2+400*12+8] + mova m10, [pd_0xf00801c7] + lea t4, [rsp+wq+400*32+8] + mova m11, [pd_34816] + pshuflw m7, m9, q3333 + pshufb m9, m12 ; s1 + punpcklqdq m7, m7 ; w1 + neg wq + pxor m6, m6 + mova m13, [pw_1023] + psllw m7, 4 + mova m14, [sgr_lshuf3] + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+32] ; params + LEA r6, $$ + add wd, wd + movq m1, [r1+4] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*32+16] + mov t3m, t3 + pshuflw m7, m1, q3333 + mov t4m, t4 + pshufb m1, m12 ; s1 + punpcklqdq m7, m7 ; w1 + psllw m7, 4 + neg wq + mova m9, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 4 + mov lpfq, lpfm + mov lpf_strideq, lpf_stridem + mov w0m, wd +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov lpfm, r10 ; below + movif32 t4, 
t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv0 +%if ARCH_X86_64 + test hb, hb +%else + mov r5, hd + test r5, r5 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + mov lpfq, hvsrcm + add lpfq, lpf_stridem +%endif + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*6] +.top_fixup_loop: + mova m0, [t1+wq+400*0] + mova m1, [t1+wq+400*2] + mova m2, [t1+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m1 + mova [t2+wq+400*4], m2 + add wq, 16 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call .v0 + jmp .main +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movd m1, wd + mova m2, [base+pb_m2_m1] + mova m3, [base+pb_0to15] + movd m5, [lpfq-2] + pshufb m1, m6 + pshufb m5, m12 + psubb m2, m1 + pcmpgtb m2, m3 + pand m4, m2 + pandn m2, m5 + por m4, m2 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r5-4] +%else + %define leftq r5 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m14 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r5-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq+ 0] +.h_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + mova [t1+wq+400*0], m1 + mova [t1+wq+400*2], m2 + mova [t1+wq+400*4], m3 + add wq, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m14 + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m4, [lpfq+wq+ 0] +.hv0_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + 
jnz .hv0_have_right + cmp wd, -18 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + paddw m0, m1, [t1+wq+400*0] + paddd m4, m2, [t1+wq+400*2] + paddd m5, m3, [t1+wq+400*4] + mova [t1+wq+400*0], m1 + mova [t1+wq+400*2], m2 + mova [t1+wq+400*4], m3 + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq+4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m14 + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m4, [lpfq+wq+ 0] +.hv1_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp wd, -18 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + palignr m1, m5, m4, 2 + paddw m0, m4, m1 + punpcklwd m2, m4, m1 + pmaddwd m2, m2 + punpckhwd m3, m4, m1 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m0, m5 ; h sum + punpcklwd m1, m5, m6 + pmaddwd m1, m1 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m1 ; h sumsq + paddd m3, m5 + paddw m1, m0, [t2+wq+400*0] + paddd m4, m2, [t2+wq+400*2] + paddd m5, m3, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m2 + mova [t2+wq+400*4], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + psrld m5, 4 + pslld m2, m4, 3 + pslld m3, m5, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + 
psrld m3, m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*2 +4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*0+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m1 + add wq, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v1_loop: + mova m0, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*2+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r5 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+400*0+ 4] + movu m1, [t3+wq*2+400*0+ 8] + movu m2, [t3+wq*2+400*0+24] + movu m3, [t4+wq*1+400*0+ 2] + movu m4, [t3+wq*2+400*0+ 4] + movu m5, 
[t3+wq*2+400*0+20] + paddw m0, [t4+wq*1+400*0+ 0] + paddd m1, [t3+wq*2+400*0+ 0] + paddd m2, [t3+wq*2+400*0+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[-1] 444 + pslld m4, 2 ; b[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a[-1] 343 + psubd m4, m1 ; b[-1] 343 + psubd m5, m2 + mova [t4+wq*1+400*4], m3 + mova [t3+wq*2+400*8+ 0], m4 + mova [t3+wq*2+400*8+16], m5 + movu m0, [t4+wq*1+400*2+ 4] + movu m1, [t3+wq*2+400*4+ 8] + movu m2, [t3+wq*2+400*4+24] + movu m3, [t4+wq*1+400*2+ 2] + movu m4, [t3+wq*2+400*4+ 4] + movu m5, [t3+wq*2+400*4+20] + paddw m0, [t4+wq*1+400*2+ 0] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m2, [t3+wq*2+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[ 0] 444 + pslld m4, 2 ; b[ 0] 444 + pslld m5, 2 + mova [t4+wq*1+400* 6], m3 + mova [t3+wq*2+400*12+ 0], m4 + mova [t3+wq*2+400*12+16], m5 + psubw m3, m0 ; a[ 0] 343 + psubd m4, m1 ; b[ 0] 343 + psubd m5, m2 + mova [t4+wq*1+400* 8], m3 + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m5 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r5 + movif32 wd, w1m +.n0_loop: + movu m3, [t4+wq*1+400*0+4] + movu m1, [t4+wq*1+400*0+2] + paddw m3, [t4+wq*1+400*0+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*1+400*4] + paddw m3, [t4+wq*1+400*6] + mova [t4+wq*1+400*4], m2 + mova [t4+wq*1+400*6], m1 + movu m4, [t3+wq*2+400*0+8] + movu m1, [t3+wq*2+400*0+4] + paddd m4, [t3+wq*2+400*0+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*2+400* 8+ 0] + paddd m4, [t3+wq*2+400*12+ 0] + mova [t3+wq*2+400* 8+ 0], m2 + mova [t3+wq*2+400*12+ 0], m1 + movu m5, [t3+wq*2+400*0+24] + movu m1, [t3+wq*2+400*0+20] + paddd m5, [t3+wq*2+400*0+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*2+400* 8+16] + paddd m5, [t3+wq*2+400*12+16] + mova [t3+wq*2+400* 8+16], m2 + mova [t3+wq*2+400*12+16], m1 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 8) + paddd m3, m5 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, dst_stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r5 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*1+400*2+4] + movu m1, [t4+wq*1+400*2+2] + paddw m3, [t4+wq*1+400*2+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*1+400*6] + paddw m3, [t4+wq*1+400*8] + mova [t4+wq*1+400*6], m1 + mova [t4+wq*1+400*8], m2 + movu m4, [t3+wq*2+400*4+8] + movu m1, [t3+wq*2+400*4+4] + paddd m4, [t3+wq*2+400*4+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*2+400*12+ 0] + paddd m4, [t3+wq*2+400*16+ 0] + mova [t3+wq*2+400*12+ 0], m1 + mova [t3+wq*2+400*16+ 0], m2 + movu m5, [t3+wq*2+400*4+24] + movu m1, [t3+wq*2+400*4+20] + paddd m5, [t3+wq*2+400*4+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*2+400*12+16] + paddd m5, [t3+wq*2+400*16+16] + mova [t3+wq*2+400*12+16], m1 + mova [t3+wq*2+400*16+16], m2 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 8) + 
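
Annotation: the output step in the .n0/.n1 blocks above blends the filtered value back into the source pixel using the per-plane weight, which was pre-shifted left by 4 (psllw m7, 4) so that a single pmulhrsw performs the scaled, rounded multiply. A hedged scalar sketch read off the instruction sequence (psrld 9, psllw 4, pmulhrsw, pmaxsw/pminsw); the names are illustrative, the "+ (1 << 8)" rounding mentioned in the comments is assumed to be folded into b, and the packssdw saturation is simplified to a cast:

    #include <stdint.h>

    /* pmulhrsw on 16-bit lanes: (a*b + (1 << 14)) >> 15.  With the weight
     * pre-shifted by 4 this becomes (t*w + (1 << 10)) >> 11. */
    static int16_t pmulhrsw_lane(int16_t a, int16_t b) {
        return (int16_t)(((int32_t)a * b + (1 << 14)) >> 15);
    }

    /* Final blend for one pixel: a and b are the neighbour-weighted
     * coefficients, w the signed per-plane SGR weight, pixel_max e.g. 1023. */
    static uint16_t sgr_blend(uint32_t a, uint32_t b, uint16_t src,
                              int16_t w, uint16_t pixel_max) {
        int32_t t = (int32_t)((a * src + b) >> 9) - (src << 4); /* psrld 9, psllw 4 */
        int32_t v = src + pmulhrsw_lane((int16_t)t, (int16_t)(w << 4));
        if (v < 0) v = 0;                       /* pmaxsw with zero */
        if (v > pixel_max) v = pixel_max;       /* pminsw with the bitdepth max */
        return (uint16_t)v;
    }
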
paddd m3, m5 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, dst_stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 10*16 + %else + %assign extra_stack 8*16 + %endif +cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, params, h + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*8+4*0] + %define dst_stridemp dword [esp+calloff+16*8+4*1] + %define leftm dword [esp+calloff+16*8+4*2] + %define lpfm dword [esp+calloff+16*8+4*3] + %define lpf_stridem dword [esp+calloff+16*8+4*4] + %define w0m dword [esp+calloff+16*8+4*5] + %define hd dword [esp+calloff+16*8+4*6] + %define edgeb byte [esp+calloff+16*8+4*7] + %define edged dword [esp+calloff+16*8+4*7] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r6m + %define edgeb byte r8m + %define edged dword r8m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %xdefine m8 m6 + %define m9 [base+pd_8] + %define m10 [base+pd_34816] + %define m11 [base+pd_0xf00801c7] + %define m12 [base+pw_256] + %define m13 [esp+calloff+16*4] + %define m14 [esp+calloff+16*5] + %define m15 [esp+calloff+16*6] + %define m6 [esp+calloff+16*7] + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov dst_strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov lpf_strideq, [rstk+stack_offset+20] + mov wd, [rstk+stack_offset+24] + mov dstm, dstq + mov dst_stridemp, dst_strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+28] + mov r2, [rstk+stack_offset+36] + mov lpfm, lpfq + mov lpf_stridem, lpf_strideq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_mix_16bpc, 5, 15, 16, -400*66-40, dst, dst_stride, left, \ + lpf, lpf_stride, w, edge, \ + params, h +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, paramsmp + lea r13, [sgr_x_by_x-0xf03] + mov edged, r8m + add wd, wd + mov hd, r6m + mova m15, [paramsq] + add lpfq, wq + mova m9, [pd_8] + lea t1, [rsp+wq+44] + mova m10, [pd_34816] + add dstq, wq + mova m12, [pw_256] + lea t3, [rsp+wq*2+400*24+40] + mova m11, [pd_0xf00801c7] + lea t4, [rsp+wq+400*52+40] + neg wq + pshuflw m13, m15, q0000 + pshuflw m14, m15, q2222 + pshufhw m15, m15, q1010 + punpcklqdq m13, m13 ; s0 + punpcklqdq m14, m14 ; s1 + punpckhqdq m15, m15 ; w0 w1 + pxor m6, m6 + psllw m15, 2 + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+32] ; params + LEA r6, $$ + add wd, wd + mova m2, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+52] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*24+48] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*52+48] + mov t3m, t3 + mov t4m, t4 + neg wq + pshuflw m0, m2, q0000 + pshuflw m1, m2, q2222 + pshufhw m2, m2, q1010 + punpcklqdq m0, m0 ; s0 + punpcklqdq m1, m1 ; s1 + punpckhqdq m2, m2 ; w0 w1 + mov w1m, wd + pxor m3, m3 + psllw m2, 2 + mova m13, m0 + mova m14, m1 + sub wd, 4 + mova m15, m2 + mova m6, m3 + mov lpfq, lpfm + mov lpf_strideq, lpf_stridem + mov w0m, wd +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 +%if ARCH_X86_64 + call 
mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup +%else + mov wq, w0m + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop +%endif + add t1, 400*12 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov lpfm, r10 ; below + movif32 t4, t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv0 +%if ARCH_X86_64 + test hd, hd +%else + mov r5, hd + test r5, r5 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + mov lpfq, hvsrcm + add lpfq, lpf_stridem +%endif + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*12] +.top_fixup_loop: + mova m0, [t1+wq+400* 0] + mova m1, [t1+wq+400* 2] + mova m2, [t1+wq+400* 4] + paddw m0, m0 + mova m3, [t1+wq+400* 6] + paddd m1, m1 + mova m4, [t1+wq+400* 8] + paddd m2, m2 + mova m5, [t1+wq+400*10] + mova [t2+wq+400* 0], m0 + mova [t2+wq+400* 2], m1 + mova [t2+wq+400* 4], m2 + mova [t2+wq+400* 6], m3 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + add wq, 16 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call .v0 + jmp .main +.h: ; horizontal boxsum +%assign stack_offset stack_offset+4 +%assign calloff 4 +%if ARCH_X86_64 + lea wq, [r5-4] +%else + %define leftq r5 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r5-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq- 2] +.h_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.h_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; sum3 + punpcklwd m7, m0, m6 + pmaddwd m7, m7 + punpckhwd m0, m6 + pmaddwd m0, m0 + paddd m2, m7 ; sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + mova [t1+wq+400* 6], m1 + mova [t1+wq+400* 8], m2 + mova [t1+wq+400*10], m3 + paddw m8, m1 ; sum5 + paddd m7, m2 ; sumsq5 + paddd m5, m3 + mova [t1+wq+400* 0], m8 + mova [t1+wq+400* 2], m7 + mova [t1+wq+400* 4], m5 + add wq, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq 
+%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m4, [lpfq+wq- 2] +.hv0_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp wd, -20 + jl .hv0_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.hv0_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + movif32 t3, t3m + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; h sum3 + punpcklwd m7, m0, m6 + pmaddwd m7, m7 + punpckhwd m0, m6 + pmaddwd m0, m0 + paddd m2, m7 ; h sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + paddw m8, m1 ; h sum5 + paddd m7, m2 ; h sumsq5 + paddd m5, m3 + mova [t3+wq*2+400*8+ 8], m8 + mova [t3+wq*2+400*0+ 8], m7 + mova [t3+wq*2+400*0+24], m5 + paddw m8, [t1+wq+400* 0] + paddd m7, [t1+wq+400* 2] + paddd m5, [t1+wq+400* 4] + mova [t1+wq+400* 0], m8 + mova [t1+wq+400* 2], m7 + mova [t1+wq+400* 4], m5 + paddw m0, m1, [t1+wq+400* 6] + paddd m4, m2, [t1+wq+400* 8] + paddd m5, m3, [t1+wq+400*10] + mova [t1+wq+400* 6], m1 + mova [t1+wq+400* 8], m2 + mova [t1+wq+400*10], m3 + paddw m1, m0, [t2+wq+400* 6] + paddd m2, m4, [t2+wq+400* 8] + paddd m3, m5, [t2+wq+400*10] + mova [t2+wq+400* 6], m0 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z3, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m7 + MULLD m1, m4, m7 + psubw m5, m12, m2 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*2+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; 
LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m4, [lpfq+wq- 2] +.hv1_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp wd, -20 + jl .hv1_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.hv1_have_right: + palignr m7, m5, m4, 2 + palignr m3, m5, m4, 4 + paddw m2, m7, m3 + punpcklwd m0, m7, m3 + pmaddwd m0, m0 + punpckhwd m7, m3 + pmaddwd m7, m7 + palignr m3, m5, m4, 6 + paddw m2, m3 ; h sum3 + punpcklwd m1, m3, m6 + pmaddwd m1, m1 + punpckhwd m3, m6 + pmaddwd m3, m3 + paddd m0, m1 ; h sumsq3 + palignr m5, m4, 8 + punpckhwd m1, m4, m5 + paddw m8, m4, m5 + pmaddwd m1, m1 + punpcklwd m4, m5 + pmaddwd m4, m4 + paddd m7, m3 + paddw m5, m2, [t2+wq+400* 6] + mova [t2+wq+400* 6], m2 + paddw m8, m2 ; h sum5 + paddd m2, m0, [t2+wq+400* 8] + paddd m3, m7, [t2+wq+400*10] + mova [t2+wq+400* 8], m0 + mova [t2+wq+400*10], m7 + paddd m4, m0 ; h sumsq5 + paddd m1, m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m0, m2, 3 + pslld m7, m3, 3 + paddd m2, m0 ; ((a3 + 8) >> 4) * 9 + paddd m3, m7 + psrlw m7, m5, 1 + pavgw m7, m6 ; (b3 + 2) >> 2 + punpcklwd m0, m7, m6 + pmaddwd m0, m0 + punpckhwd m7, m6 + pmaddwd m7, m7 +%if ARCH_X86_32 + mova [esp+20], m8 +%else + SWAP m8, m6 +%endif + MAXSD m2, m0, m8 + MAXSD m3, m7, m8 + pxor m8, m8 + psubd m2, m0 ; p3 + psubd m3, m7 + punpcklwd m0, m5, m8 ; b3 + punpckhwd m5, m8 + MULLD m2, m14, m8 ; p3 * s1 + MULLD m3, m14, m8 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m5, m11 + paddusw m2, m11 + paddusw m3, m11 + psrld m8, m2, 20 ; min(z3, 255) + movif32 t3, t3m + psrld m2, m3, 20 + GATHER_X_BY_X m7, m8, m2, r0, dstm + punpcklwd m2, m7, m7 + punpckhwd m8, m7, m7 + MULLD m0, m2, m3 + MULLD m5, m8, m3 + psubw m3, m12, m7 +%if ARCH_X86_32 + mova m8, [esp+20] +%endif + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m5, m10 + psrld m0, 12 + psrld m5, 12 + mova [t4+wq*1+400*4+4], m3 + mova [t3+wq*2+400*8+ 8], m0 + mova [t3+wq*2+400*8+24], m5 +%if ARCH_X86_64 + SWAP m6, m8 + pxor m6, m6 +%endif + paddw m5, m8, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m1, [t2+wq+400*4] + paddw m5, [t1+wq+400*0] + paddd m2, [t1+wq+400*2] + paddd m3, [t1+wq+400*4] + mova [t2+wq+400*0], m8 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m1 + mova m4, [base+pw_25] + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + MULLD m2, m4, m7 ; ((a5 + 8) >> 4) * 25 + MULLD m3, m4, m7 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + psrlw m1, m5, 1 + pavgw m1, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m1, m7 + pmaddwd m4, m4 + punpckhwd m1, m7 + pmaddwd m1, m1 + punpcklwd m0, m5, m7 ; b5 + punpckhwd m5, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m2, m4, m7 + psubd m2, m4 ; p5 + mova m4, [base+pd_0xf00800a4] + MAXSD m3, m1, m7 + psubd m3, m1 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m5, m4 + paddusw m2, m4 + paddusw m3, m4 + psrld m1, m2, 20 ; min(z5, 255) + psrld m2, m3, 20 + GATHER_X_BY_X m4, m1, m2, r0, dstm + punpcklwd m2, m4, m4 + punpckhwd m3, m4, m4 + MULLD m0, m2, m7 + MULLD m5, m3, m7 + psubw m1, m12, m4 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m5, m10 + mova [t4+wq*1+400*0+ 4], m1 + psrld m0, 12 + psrld m5, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m5 + add wq, 16 + jl .hv1_loop + mov r10, t2 + mov 
t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq+400* 6] + mova m4, [t1+wq+400* 8] + mova m5, [t1+wq+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq+400* 6] + paddd m2, m4, [t2+wq+400* 8] + paddd m3, m5, [t2+wq+400*10] + mova [t2+wq+400* 6], m0 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z3, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m7 + MULLD m1, m4, m7 + psubw m5, m12, m2 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*2+4], m5 + psrld m0, 12 + psrld m1, 12 + mova m3, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + mova [t3+wq*2+400*8+ 8], m3 + mova [t3+wq*2+400*0+ 8], m4 + mova [t3+wq*2+400*0+24], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+wq+400*0], m3 + mova [t1+wq+400*2], m4 + mova [t1+wq+400*4], m5 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v1_loop: + mova m4, [t1+wq+400* 6] + mova m5, [t1+wq+400* 8] + mova m7, [t1+wq+400*10] + paddw m1, m4, [t2+wq+400* 6] + paddd m2, m5, [t2+wq+400* 8] + paddd m3, m7, [t2+wq+400*10] + mova [t2+wq+400* 6], m4 + mova [t2+wq+400* 8], m5 + mova [t2+wq+400*10], m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z3, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m7 + MULLD m1, m4, m7 + psubw m5, m12, m2 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*4+4], m5 + psrld m0, 12 + psrld m8, m1, 12 + mova m4, [t3+wq*2+400*8+ 8] + mova m5, [t3+wq*2+400*0+ 8] + mova m7, [t3+wq*2+400*0+24] + paddw m1, m4, [t2+wq+400*0] + paddd m2, m5, [t2+wq+400*2] + paddd m3, m7, [t2+wq+400*4] + paddw m1, [t1+wq+400*0] + paddd m2, [t1+wq+400*2] + paddd m3, [t1+wq+400*4] + mova [t2+wq+400*0], m4 + mova [t2+wq+400*2], m5 + mova [t2+wq+400*4], m7 + mova m4, [base+pw_25] + mova [t3+wq*2+400*8+ 8], m0 + mova [t3+wq*2+400*8+24], m8 + paddd m2, m9 + 
paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + MULLD m2, m4, m7 ; ((a5 + 8) >> 4) * 25 + MULLD m3, m4, m7 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m2, m4, m7 + psubd m2, m4 ; p5 + mova m4, [base+pd_0xf00800a4] + MAXSD m3, m5, m7 + psubd m3, m5 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrld m5, m2, 20 ; min(z5, 255) + psrld m2, m3, 20 + GATHER_X_BY_X m4, m5, m2, r0, dstm + punpcklwd m2, m4, m4 + punpckhwd m3, m4, m4 + psubw m5, m12, m4 + MULLD m0, m2, m7 + MULLD m1, m3, m7 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*0+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m1 + add wq, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r5 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+400*0+ 2] + movu m1, [t3+wq*2+400*0+ 4] + movu m2, [t3+wq*2+400*0+20] + movu m7, [t4+wq*1+400*0+ 4] + movu m8, [t3+wq*2+400*0+ 8] + paddw m3, m0, [t4+wq*1+400*0+ 0] + paddd m4, m1, [t3+wq*2+400*0+ 0] + paddd m5, m2, [t3+wq*2+400*0+16] + paddw m3, m7 + paddd m4, m8 + movu m7, [t3+wq*2+400*0+24] + paddw m0, m3 + paddd m1, m4 + psllw m3, 2 + pslld m4, 2 + paddd m5, m7 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a5 565 + paddd m1, m4 ; b5 565 + paddd m2, m5 + mova [t4+wq*1+400* 6+ 0], m0 + mova [t3+wq*2+400*12+ 0], m1 + mova [t3+wq*2+400*12+16], m2 + movu m0, [t4+wq*1+400*2+ 4] + movu m3, [t4+wq*1+400*2+ 2] + paddw m0, [t4+wq*1+400*2+ 0] + movu m1, [t3+wq*2+400*4+ 8] + movu m4, [t3+wq*2+400*4+ 4] + paddd m1, [t3+wq*2+400*4+ 0] + movu m2, [t3+wq*2+400*4+24] + movu m5, [t3+wq*2+400*4+20] + paddd m2, [t3+wq*2+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[-1] 444 + pslld m4, 2 ; b3[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a3[-1] 343 + psubd m4, m1 ; b3[-1] 343 + psubd m5, m2 + mova [t4+wq*1+400* 8+ 0], m3 + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m5 + movu m0, [t4+wq*1+400*4+ 4] + movu m3, [t4+wq*1+400*4+ 2] + paddw m0, [t4+wq*1+400*4+ 0] + movu m1, [t3+wq*2+400*8+ 8] + movu m4, [t3+wq*2+400*8+ 4] + paddd m1, [t3+wq*2+400*8+ 0] + movu m2, [t3+wq*2+400*8+24] + movu m5, [t3+wq*2+400*8+20] + paddd m2, [t3+wq*2+400*8+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[ 0] 444 + pslld m4, 2 ; b3[ 0] 444 + pslld m5, 2 + mova [t4+wq*1+400*10+ 0], m3 + mova [t3+wq*2+400*20+ 0], m4 + mova [t3+wq*2+400*20+16], m5 + psubw m3, m0 ; a3[ 0] 343 + psubd m4, m1 ; b3[ 0] 343 + psubd m5, m2 + mova [t4+wq*1+400*12+ 0], m3 + mova [t3+wq*2+400*24+ 0], m4 + mova [t3+wq*2+400*24+16], m5 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r5 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*1+ 4] + movu m2, [t4+wq*1+ 2] + paddw m0, [t4+wq*1+ 0] + paddw m0, m2 + paddw m2, m0 + psllw m0, 2 + paddw m0, m2 ; a5 + movu m4, [t3+wq*2+ 8] + movu m5, [t3+wq*2+24] + movu m1, [t3+wq*2+ 4] + movu m3, [t3+wq*2+20] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddd m4, m1 + paddd m5, m3 + paddd m1, m4 + paddd m3, m5 + pslld m4, 2 + pslld m5, 2 + paddd m4, m1 ; b5 + paddd m5, m3 + movu m2, [t4+wq*1+400* 6] + paddw m2, m0 + mova [t4+wq*1+400* 6], m0 
+ paddd m0, m4, [t3+wq*2+400*12+ 0] + paddd m1, m5, [t3+wq*2+400*12+16] + mova [t3+wq*2+400*12+ 0], m4 + mova [t3+wq*2+400*12+16], m5 + mova [rsp+16+ARCH_X86_32*4], m1 + movu m3, [t4+wq*1+400*2+4] + movu m5, [t4+wq*1+400*2+2] + paddw m3, [t4+wq*1+400*2+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + movu m3, [t4+wq*1+400* 8] + paddw m3, [t4+wq*1+400*10] + paddw m3, m4 + mova [t4+wq*1+400* 8], m4 + mova [t4+wq*1+400*10], m5 + movu m1, [t3+wq*2+400*4+ 8] + movu m5, [t3+wq*2+400*4+ 4] + movu m7, [t3+wq*2+400*4+24] + movu m8, [t3+wq*2+400*4+20] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m7, [t3+wq*2+400*4+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 +%if ARCH_X86_32 + mova [esp+52], m8 + psubd m8, m7 +%else + psubd m6, m8, m7 + SWAP m8, m6 +%endif + paddd m1, m4, [t3+wq*2+400*16+ 0] + paddd m7, m8, [t3+wq*2+400*16+16] + paddd m1, [t3+wq*2+400*20+ 0] + paddd m7, [t3+wq*2+400*20+16] + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m8 + mova [t3+wq*2+400*20+ 0], m5 +%if ARCH_X86_32 + mova m8, [esp+52] +%else + SWAP m8, m6 + pxor m6, m6 +%endif + mova [t3+wq*2+400*20+16], m8 + mova [rsp+32+ARCH_X86_32*4], m7 + movu m4, [dstq+wq] + punpcklwd m7, m2, m6 + punpckhwd m2, m6 + punpcklwd m8, m3, m6 + punpckhwd m3, m6 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + pmaddwd m7, m5 ; a5 * src + pmaddwd m8, m5 ; a3 * src + pmaddwd m2, m4 + pmaddwd m3, m4 + pslld m5, 13 + pslld m4, 13 + psubd m0, m5 + psubd m1, m5 + paddd m0, m7 ; a5 * src + b5 + (1 << 8) - (src << 13) + paddd m1, m8 ; a3 * src + b3 + (1 << 8) - (src << 13) + mova m7, [base+pd_0xffff] + psrld m0, 9 + pslld m1, 7 + pand m0, m7 + pandn m8, m7, m1 + por m0, m8 + psubd m1, m4, [rsp+16+ARCH_X86_32*4] + psubd m8, m4, [rsp+32+ARCH_X86_32*4] + psubd m2, m1 + psubd m3, m8 + mova m1, [base+pd_4096] + psrld m2, 9 + pslld m3, 7 + pand m2, m7 + pandn m7, m3 + por m2, m7 + pmaddwd m0, m15 + pmaddwd m2, m15 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + paddd m5, m1 + paddd m4, m1 + paddd m0, m5 + paddd m2, m4 + psrad m0, 8 + psrad m2, 8 + packssdw m0, m2 ; clip + pmaxsw m0, m7 + psrlw m0, 5 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, dst_stridemp + ret +%if ARCH_X86_64 + SWAP m6, m7 +%endif +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r5 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*1+400*4+4] + movu m5, [t4+wq*1+400*4+2] + paddw m3, [t4+wq*1+400*4+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + paddw m3, m4, [t4+wq*1+400*12] + paddw m3, [t4+wq*1+400*10] + mova [t4+wq*1+400*10], m5 + mova [t4+wq*1+400*12], m4 + movu m1, [t3+wq*2+400*8+ 8] + movu m5, [t3+wq*2+400*8+ 4] + movu m7, [t3+wq*2+400*8+24] + movu m8, [t3+wq*2+400*8+20] + paddd m1, [t3+wq*2+400*8+ 0] + paddd m7, [t3+wq*2+400*8+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 + psubd m0, m8, m7 + paddd m1, m4, [t3+wq*2+400*24+ 0] + paddd m7, m0, [t3+wq*2+400*24+16] + paddd m1, [t3+wq*2+400*20+ 0] + paddd m7, [t3+wq*2+400*20+16] + mova [t3+wq*2+400*20+ 0], m5 + mova [t3+wq*2+400*20+16], m8 + mova [t3+wq*2+400*24+ 0], m4 + mova [t3+wq*2+400*24+16], m0 + mova m5, [dstq+wq] + mova m8, [t4+wq*1+400* 6] + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m0, m8, m6 + punpckhwd m8, m6 + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + pmaddwd m0, m4 ; a5 * src + pmaddwd m2, m4 ; a3 * src + pmaddwd m8, m5 + pmaddwd m3, m5 + paddd m1, m2 ; a3 * src + b3 + (1 << 8) - (src << 
13) + pslld m4, 12 + pslld m5, 12 + psubd m2, m4, [t3+wq*2+400*12+ 0] + psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) + psubd m2, m5, [t3+wq*2+400*12+16] + psubd m8, m2 + paddd m4, m4 + paddd m5, m5 + paddd m7, m3 + mova m2, [base+pd_0xffff] + psubd m1, m4 + psubd m7, m5 + psrld m0, 8 + psrld m8, 8 + pslld m1, 7 + pslld m7, 7 + pand m0, m2 + pand m8, m2 + pandn m3, m2, m1 + pandn m2, m7 + por m0, m3 + por m8, m2 + mova m1, [base+pd_4096] + pmaddwd m0, m15 + pmaddwd m8, m15 +%if ARCH_X86_64 + pxor m6, m6 + SWAP m7, m6 +%else + pxor m7, m7 +%endif + paddd m4, m1 + paddd m5, m1 + paddd m0, m4 + paddd m8, m5 + psrad m0, 8 + psrad m8, 8 + packssdw m0, m8 ; clip + pmaxsw m0, m7 + psrlw m0, 5 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, dst_stridemp + movif32 dstm, dstq + ret diff -Nru dav1d-0.9.0/src/x86/looprestoration_init_tmpl.c dav1d-0.9.1/src/x86/looprestoration_init_tmpl.c --- dav1d-0.9.0/src/x86/looprestoration_init_tmpl.c 2021-05-16 16:47:22.550950800 +0000 +++ dav1d-0.9.1/src/x86/looprestoration_init_tmpl.c 2021-07-28 21:38:28.909852300 +0000 @@ -179,6 +179,7 @@ decl_wiener_filter_fns(sse2); decl_wiener_filter_fns(ssse3); decl_wiener_filter_fns(avx2); +decl_sgr_filter_fns(ssse3); decl_sgr_filter_fns(avx2); #if BITDEPTH == 8 @@ -197,12 +198,18 @@ #endif if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; -#if BITDEPTH == 8 c->wiener[0] = BF(dav1d_wiener_filter7, ssse3); c->wiener[1] = BF(dav1d_wiener_filter5, ssse3); +#if BITDEPTH == 8 c->sgr[0] = BF(sgr_filter_5x5, ssse3); c->sgr[1] = BF(sgr_filter_3x3, ssse3); c->sgr[2] = BF(sgr_filter_mix, ssse3); +#else + if (bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3); + c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3); + } #endif #if ARCH_X86_64 diff -Nru dav1d-0.9.0/src/x86/mc16_sse.asm dav1d-0.9.1/src/x86/mc16_sse.asm --- dav1d-0.9.0/src/x86/mc16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/mc16_sse.asm 2021-07-28 21:38:28.913852200 +0000 @@ -0,0 +1,4544 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
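The new mc16_sse.asm that follows dispatches put/prep on block width through the BASE_JMP_TABLE/BIDIR_JMP_TABLE macros defined just below: each table stores offsets of the .w2/.w4/.../.w128 labels, and the function prologue indexes it with tzcnt of the width before jumping. A rough C analogue of that dispatch, assuming a table of function pointers with illustrative names (the real code jumps to label offsets inside a single function):

    /* C analogue of the width dispatch used by put/prep below: the jump
     * tables hold per-width entries indexed by tzcnt(w).  The table type,
     * names and __builtin_ctz are illustrative, not dav1d API. */
    #include <stddef.h>
    #include <stdint.h>

    typedef void (*put_w_fn)(uint16_t *dst, ptrdiff_t ds,
                             const uint16_t *src, ptrdiff_t ss, int h);

    static void put_dispatch(const put_w_fn table[7], /* w = 2..128 */
                             int w, uint16_t *dst, ptrdiff_t ds,
                             const uint16_t *src, ptrdiff_t ss, int h)
    {
        /* w is a power of two, so ctz(w) - 1 selects the .w2..w128 entry,
         * mirroring "tzcnt wd, wd" + the put_ssse3_table load in .put. */
        table[__builtin_ctz(w) - 1](dst, ds, src, ss, h);
    }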
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +; dav1d_obmc_masks[] << 9 +obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0 + dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0 + dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120 + dw 4096, 3072, 2048, 1536, 0, 0, 0, 0 + dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240 + dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608 + dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024 + +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + +pw_2: times 8 dw 2 +pw_16: times 4 dw 16 +prep_mul: times 4 dw 16 + times 8 dw 4 +pw_64: times 8 dw 64 +pw_256: times 8 dw 256 +pw_2048: times 4 dw 2048 +bidir_mul: times 4 dw 2048 +pw_8192: times 8 dw 8192 +pw_27615: times 8 dw 27615 +pw_32766: times 8 dw 32766 +pw_m512: times 8 dw -512 +pd_512: times 4 dd 512 +pd_65538: times 2 dd 65538 + +put_bilin_h_rnd: times 4 dw 8 + times 4 dw 10 +bidir_rnd: times 4 dw -16400 + times 4 dw -16388 +put_8tap_h_rnd: dd 34, 34, 40, 40 +prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4) +prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5) + +warp8x8_shift: dd 11, 13 +warp8x8_rnd1: dd 1024, 1024, 4096, 4096 +warp8x8_rnd2: times 4 dw 4096 + times 4 dw 16384 +warp8x8t_rnd: times 2 dd 16384 - (8192 << 15) + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128 + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put) +%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep) + +BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +cextern mc_warp_filter + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +INIT_XMM ssse3 +cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy +%define base t0-put_ssse3 + mov mxyd, r6m ; mx + LEA t0, put_ssse3 + movifnidn wd, wm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + tzcnt wd, wd + movzx wd, word [base+put_ssse3_table+wq*2] + add wq, t0 + movifnidn hd, hm + jmp wq +.put_w2: + mov r4d, [srcq+ssq*0] + mov r6d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r4d + mov [dstq+dsq*1], r6d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + lea srcq, 
[srcq+ssq*2] + movq [dstq+dsq*0], m0 + movq [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu m0, [srcq+ssq*0+16*0] + movu m1, [srcq+ssq*0+16*1] + movu m2, [srcq+ssq*1+16*0] + movu m3, [srcq+ssq*1+16*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+16*0], m0 + mova [dstq+dsq*0+16*1], m1 + mova [dstq+dsq*1+16*0], m2 + mova [dstq+dsq*1+16*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, ssq + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + add dstq, dsq + dec hd + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, ssq + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add dstq, dsq + dec hd + jg .put_w64 + RET +.put_w128: + add srcq, 16*8 + add dstq, 16*8 +.put_w128_loop: + movu m0, [srcq-16*8] + movu m1, [srcq-16*7] + movu m2, [srcq-16*6] + movu m3, [srcq-16*5] + mova [dstq-16*8], m0 + mova [dstq-16*7], m1 + mova [dstq-16*6], m2 + mova [dstq-16*5], m3 + movu m0, [srcq-16*4] + movu m1, [srcq-16*3] + movu m2, [srcq-16*2] + movu m3, [srcq-16*1] + mova [dstq-16*4], m0 + mova [dstq-16*3], m1 + mova [dstq-16*2], m2 + mova [dstq-16*1], m3 + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, ssq + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add dstq, dsq + dec hd + jg .put_w128_loop + RET +.h: + movd m5, mxyd + mov mxyd, r7m ; my + mova m4, [base+pw_16] + pshufb m5, [base+pw_256] + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + mov r6d, r8m ; bitdepth_max + shr r6d, 11 + movddup m3, [base+put_bilin_h_rnd+r6*8] + movifnidn hd, hm + sub wd, 8 + jg .h_w16 + je .h_w8 + jp .h_w4 +.h_w2: + movq m1, [srcq+ssq*0] + movhps m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw m0, m4, m1 + psrlq m1, 16 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + movd [dstq+dsq*0], m0 + punpckhqdq m0, m0 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq m0, [srcq+ssq*0] + movhps m0, [srcq+ssq*1] + movq m1, [srcq+ssq*0+2] + movhps m1, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + 
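Each of the .h_w* paths above evaluates the same per-pixel expression, with m4 = 16 - mx, m5 = mx and m3 the rounding constant loaded from put_bilin_h_rnd (8 at 10 bpc, 10 at 12 bpc). A scalar sketch for reference; the helper name is illustrative:

    /* Scalar form of the horizontal bilinear filter vectorised above. */
    #include <stdint.h>

    static inline uint16_t put_bilin_h_px(const uint16_t *src, int x,
                                          int mx,  /* 1..15 */
                                          int rnd) /* put_bilin_h_rnd: 8 or 10 */
    {
        return (uint16_t)(((16 - mx) * src[x] + mx * src[x + 1] + rnd) >> 4);
    }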
RET +.h_w16: + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + neg wq +.h_w16_loop0: + mov r6, wq +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + movu m1, [srcq+r6*2+16] + movu m2, [srcq+r6*2+18] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+r6*2+16*0], m0 + mova [dstq+r6*2+16*1], m1 + add r6, 16 + jl .h_w16_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w16_loop0 + RET +.v: + shl mxyd, 11 + movd m5, mxyd + pshufb m5, [base+pw_256] + movifnidn hd, hm + cmp wd, 4 + jg .v_w8 + je .v_w4 +.v_w2: + movd m0, [srcq+ssq*0] +.v_w2_loop: + movd m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq m2, m0, m1 + movd m0, [srcq+ssq*0] + punpcklqdq m1, m0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + movd [dstq+dsq*0], m1 + punpckhqdq m1, m1 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq m0, [srcq+ssq*0] +.v_w4_loop: + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq m2, m0, m1 + movq m0, [srcq+ssq*0] + punpcklqdq m1, m0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: +%if ARCH_X86_64 +%if WIN64 + push r7 +%endif + shl wd, 5 + mov r7, srcq + lea r6d, [wq+hq-256] + mov r4, dstq +%else + mov r6, srcq +%endif +.v_w8_loop0: + movu m0, [srcq+ssq*0] +.v_w8_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m5 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m5 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop +%if ARCH_X86_64 + add r7, 16 + add r4, 16 + movzx hd, r6b + mov srcq, r7 + mov dstq, r4 + sub r6d, 1<<8 +%else + mov dstq, dstmp + add r6, 16 + mov hd, hm + add dstq, 16 + mov srcq, r6 + mov dstmp, dstq + sub wd, 8 +%endif + jg .v_w8_loop0 +%if WIN64 + pop r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 8 + shl mxyd, 11 + mova m3, [base+pw_2] + movd m6, mxyd + mova m7, [base+pw_8192] + pshufb m6, [base+pw_256] + test dword r8m, 0x800 + jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + mova m7, [base+pw_2048] +.hv_12bpc: + movifnidn hd, hm + cmp wd, 4 + jg .hv_w8 + je .hv_w4 +.hv_w2: + movddup m0, [srcq+ssq*0] + pshufhw m1, m0, q0321 + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w2_loop: + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps m2, [srcq+ssq*0] + pmullw m1, m4, m2 + psrlq m2, 16 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 _ 2 _ + shufpd m2, m0, m1, 0x01 ; 0 _ 1 _ + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + movd [dstq+dsq*0], m1 + punpckhqdq m1, m1 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + movddup m0, [srcq+ssq*0] + movddup m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w4_loop: + movq m1, [srcq+ssq*1] + movq m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps m1, [srcq+ssq*0] + movhps m2, [srcq+ssq*0+2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 2 + shufpd m2, m0, m1, 0x01 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: +%if ARCH_X86_64 
+%if WIN64 + push r7 +%endif + shl wd, 5 + lea r6d, [wq+hq-256] + mov r4, srcq + mov r7, dstq +%else + mov r6, srcq +%endif +.hv_w8_loop0: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w8_loop: + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 + psubw m2, m1, m0 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m0 + pmulhrsw m2, m7 + mova [dstq+dsq*0], m2 + movu m0, [srcq+ssq*0] + movu m2, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m2, m5 + paddw m0, m3 + paddw m0, m2 + psrlw m0, 2 + psubw m2, m0, m1 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m1 + pmulhrsw m2, m7 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop +%if ARCH_X86_64 + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 +%else + mov dstq, dstmp + add r6, 16 + mov hd, hm + add dstq, 16 + mov srcq, r6 + mov dstmp, dstq + sub wd, 8 +%endif + jg .hv_w8_loop0 +%if WIN64 + pop r7 +%endif + RET + +cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 +%define base r6-prep_ssse3 + movifnidn mxyd, r5m ; mx + LEA r6, prep_ssse3 + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + tzcnt wd, wd + movzx wd, word [base+prep_ssse3_table+wq*2] + mov r5d, r7m ; bitdepth_max + mova m5, [base+pw_8192] + add wq, r6 + shr r5d, 11 + movddup m4, [base+prep_mul+r5*8] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*2] + movhps m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + movu m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] + movu m2, [srcq+strideq*1+16*0] + movu m3, [srcq+strideq*1+16*1] + lea srcq, [srcq+strideq*2] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 2 + jg .prep_w16 + RET +.prep_w32: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + dec hd + jg .prep_w32 + RET +.prep_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*4], m0 + mova [tmpq+16*5], m1 + mova [tmpq+16*6], m2 + mova [tmpq+16*7], m3 + add tmpq, 16*8 + dec hd 
+ jg .prep_w64 + RET +.prep_w128: + movu m0, [srcq+16* 0] + movu m1, [srcq+16* 1] + movu m2, [srcq+16* 2] + movu m3, [srcq+16* 3] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + movu m0, [srcq+16* 4] + movu m1, [srcq+16* 5] + movu m2, [srcq+16* 6] + movu m3, [srcq+16* 7] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*4], m0 + mova [tmpq+16*5], m1 + mova [tmpq+16*6], m2 + mova [tmpq+16*7], m3 + movu m0, [srcq+16* 8] + movu m1, [srcq+16* 9] + movu m2, [srcq+16*10] + movu m3, [srcq+16*11] + add tmpq, 16*16 + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq-16*8], m0 + mova [tmpq-16*7], m1 + mova [tmpq-16*6], m2 + mova [tmpq-16*5], m3 + movu m0, [srcq+16*12] + movu m1, [srcq+16*13] + movu m2, [srcq+16*14] + movu m3, [srcq+16*15] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq-16*4], m0 + mova [tmpq-16*3], m1 + mova [tmpq-16*2], m2 + mova [tmpq-16*1], m3 + dec hd + jg .prep_w128 + RET +.h: + movd m4, mxyd + mov mxyd, r6m ; my + mova m3, [base+pw_16] + pshufb m4, [base+pw_256] + mova m5, [base+pw_32766] + psubw m3, m4 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m3, 2 + psllw m4, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + sub wd, 8 + je .h_w8 + jg .h_w16 +.h_w4: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*0+2] + movhps m1, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + lea srcq, [srcq+wq*2] + neg wq +.h_w16_loop0: + mov r6, wq +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + movu m1, [srcq+r6*2+16] + movu m2, [srcq+r6*2+18] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + add r6, 16 + jl .h_w16_loop + add srcq, strideq + dec hd + jg .h_w16_loop0 + RET +.v: + movd m4, mxyd + mova m3, [base+pw_16] + pshufb m4, [base+pw_256] + mova m5, [base+pw_32766] + psubw m3, m4 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m3, 2 + psllw m4, 2 +.v_12bpc: + cmp wd, 8 + je .v_w8 + jg .v_w16 +.v_w4: + movq m0, [srcq+strideq*0] +.v_w4_loop: + movq m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklqdq m1, m0, m2 ; 0 1 + movq m0, [srcq+strideq*0] + punpcklqdq m2, m0 ; 1 2 + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu m0, [srcq+strideq*0] +.v_w8_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4, m2 + psubw m0, m5 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m3 + mova [tmpq+16*0], m1 + pmullw m1, m4, m0 + psubw m2, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 2 + jg .v_w8_loop 
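The prep_bilin paths above (and the prep_8tap code further down) store results in a signed intermediate format rather than as pixels: samples are scaled by prep_mul (16 at 10 bpc, 4 at 12 bpc) and offset by pw_8192 so they fit in int16_t. A minimal sketch of the no-filter .prep case, with an illustrative helper name:

    /* What the plain .prep path stores per pixel: source scaled by prep_mul
     * and biased by pw_8192 into int16_t range.  Name is illustrative. */
    #include <stdint.h>

    static inline int16_t prep_px(uint16_t px, int bitdepth) /* 10 or 12 */
    {
        const int mul = bitdepth == 12 ? 4 : 16;     /* prep_mul     */
        return (int16_t)(px * mul - 8192);           /* pw_8192 bias */
    }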
+ RET +.v_w16: +%if WIN64 + push r7 +%endif + mov r5, srcq +%if ARCH_X86_64 + lea r6d, [wq*4-32] + mov wd, wd + lea r6d, [hq+r6*8] + mov r7, tmpq +%else + mov r6d, wd +%endif +.v_w16_loop0: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4, m2 + psubw m0, m5 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m3 + mova [tmpq+wq*0], m1 + pmullw m1, m4, m0 + psubw m2, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+wq*2], m1 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w16_loop +%if ARCH_X86_64 + add r5, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 +%else + mov tmpq, tmpmp + add r5, 16 + mov hd, hm + add tmpq, 16 + mov srcq, r5 + mov tmpmp, tmpq + sub r6d, 8 +%endif + jg .v_w16_loop0 +%if WIN64 + pop r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 7 + shl mxyd, 11 + movd m6, mxyd + pshufb m6, [base+pw_256] + cmp wd, 8 + je .hv_w8 + jg .hv_w16 +.hv_w4: + movddup m0, [srcq+strideq*0] + movddup m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w4_loop: + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + movhps m1, [srcq+strideq*0] + movhps m2, [srcq+strideq*0+2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 ; 1 2 + shufpd m2, m0, m1, 0x01 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w8_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+16*0], m2 + movu m0, [srcq+strideq*0] + movu m2, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m2, m4 + psubw m0, m5 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +%if WIN64 + push r7 +%endif + mov r5, srcq +%if ARCH_X86_64 + lea r6d, [wq*4-32] + mov wd, wd + lea r6d, [hq+r6*8] + mov r7, tmpq +%else + mov r6d, wd +%endif +.hv_w16_loop0: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w16_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+wq*0], m2 + movu m0, [srcq+strideq*0] + movu m2, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m2, m4 + psubw m0, m5 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+wq*2], m2 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w16_loop +%if ARCH_X86_64 + add r5, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 +%else + mov tmpq, tmpmp + add r5, 16 + mov hd, hm + add tmpq, 16 + mov srcq, r5 + mov tmpmp, tmpq + sub r6d, 8 +%endif + jg .hv_w16_loop0 +%if WIN64 + pop r7 +%endif + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v +cglobal %1_8tap_%2_16bpc + mov 
t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) +%endif +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 1, 2, 6 +%elif WIN64 +DECLARE_REG_TMP 4, 5, 8 +%else +DECLARE_REG_TMP 7, 8, 8 +%endif + +MC_8TAP_FN put, sharp, SHARP, SHARP +MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN put, smooth, SMOOTH, SMOOTH +MC_8TAP_FN put, sharp_regular, SHARP, REGULAR +MC_8TAP_FN put, regular_sharp, REGULAR, SHARP +MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN put, regular, REGULAR, REGULAR + +%if ARCH_X86_32 +cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my +%define mxb r0b +%define mxd r0 +%define mxq r0 +%define myb r1b +%define myd r1 +%define myq r1 +%define m8 [esp+16*0] +%define m9 [esp+16*1] +%define m10 [esp+16*2] +%define m11 [esp+16*3] +%define m12 [esp+16*4] +%define m13 [esp+16*5] +%define m14 [esp+16*6] +%define m15 [esp+16*7] +%else +cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +%endif +%define base t2-put_ssse3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, put_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + movifnidn ssq, ssmp + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [base+put_ssse3_table+wq*2] + movifnidn dstq, dstmp + movifnidn dsq, dsmp + add wq, t2 +%if WIN64 + pop r8 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + mov myd, r8m + movd m5, r8m + shr myd, 11 + movddup m4, [base+put_8tap_h_rnd+myq*8] + movifnidn dsq, dsmp + pshufb m5, [base+pw_256] + cmp wd, 4 + jg .h_w8 + movzx mxd, mxb + lea srcq, [srcq-2] + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + je .h_w4 +.h_w2: + mova m2, [base+spel_h_shuf2] + pshufd m3, m3, q2121 +.h_w2_loop: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m2 + pshufb m1, m2 + pmaddwd m0, m3 + pmaddwd m1, m3 + phaddd m0, m1 + paddd m0, m4 + psrad m0, 6 + packssdw m0, m0 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + movd [dstq+dsq*0], m0 + pshuflw m0, m0, q3232 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + WIN64_SPILL_XMM 8 + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] + pshufd m2, m3, q1111 + pshufd m3, m3, q2222 +.h_w4_loop: + movu m1, [srcq] + add srcq, ssq + pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 + pshufb m1, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m2 + pmaddwd m1, m3 + paddd m0, m4 + paddd m0, m1 + psrad m0, 6 + packssdw m0, m0 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + movq [dstq], m0 + add dstq, dsq + dec hd + jg .h_w4_loop + RET +.h_w8: +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 +%endif + shr mxd, 16 + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] +%if UNIX64 + mov wd, wd +%endif + lea srcq, [srcq+wq*2] + punpcklbw m3, m3 + lea dstq, [dstq+wq*2] + psraw m3, 8 + neg wq +%if ARCH_X86_32 + ALLOC_STACK -16*4 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, 
q2222 + pshufd m11, m3, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6*2- 6] + movu m1, [srcq+r6*2+ 2] + pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 + pshufb m0, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m8 ; abcd0 + pmaddwd m0, m9 ; abcd1 + pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8 + pshufb m1, m7 ; 6 7 7 8 8 9 9 a + paddd m2, m4 + paddd m0, m2 + pmaddwd m2, m10, m3 ; abcd2 + pmaddwd m3, m8 ; efgh0 + paddd m0, m2 + pmaddwd m2, m11, m1 ; abcd3 + pmaddwd m1, m9 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6*2+10] + paddd m3, m4 + paddd m1, m3 + pshufb m3, m2, m6 ; 8 9 9 a a b b c + pshufb m2, m7 ; a b b c c d d e + pmaddwd m3, m10 ; efgh2 + pmaddwd m2, m11 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + mova [dstq+r6*2], m0 + add r6, 8 + jl .h_w8_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif +%if WIN64 + WIN64_SPILL_XMM 15 +%endif + movd m7, r8m + movifnidn dstq, dstmp + movifnidn dsq, dsmp + punpcklbw m3, m3 + pshufb m7, [base+pw_256] + psraw m3, 8 ; sign-extend +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, r6 + cmp wd, 2 + jne .v_w4 +.v_w2: + movd m1, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + movd m2, [srcq+ssq*2] + add srcq, r6 + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m6, [srcq+ssq*2] + add srcq, r6 + movd m0, [srcq+ssq*0] + punpckldq m1, m4 ; 0 1 + punpckldq m4, m2 ; 1 2 + punpckldq m2, m5 ; 2 3 + punpckldq m5, m3 ; 3 4 + punpckldq m3, m6 ; 4 5 + punpckldq m6, m0 ; 5 6 + punpcklwd m1, m4 ; 01 12 + punpcklwd m2, m5 ; 23 34 + punpcklwd m3, m6 ; 45 56 + pxor m6, m6 +.v_w2_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + punpckldq m3, m0, m4 ; 6 7 + movd m0, [srcq+ssq*0] + punpckldq m4, m0 ; 7 8 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m6 + pavgw m5, m6 + pminsw m5, m7 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: +%if ARCH_X86_32 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcmp, srcq +%endif + lea wd, [wq+hq-(1<<16)] +%else + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] +%endif +.v_w4_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + movq m3, [srcq+ssq*2] + add srcq, r6 + movq m4, [srcq+ssq*0] + movq m5, [srcq+ssq*1] + movq m6, [srcq+ssq*2] + add srcq, r6 + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if ARCH_X86_32 + jmp .v_w4_loop_start +.v_w4_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_w4_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m1, m3 + paddd m2, m4 + mova m14, 
m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m3 + psrad m1, 5 + psrad m2, 5 + packssdw m1, m2 + pxor m2, m2 + pmaxsw m1, m2 + pavgw m1, m2 + pminsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*29] + mov dstq, [esp+4*30] + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + sub wd, 1<<16 +%else +.v_w4_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 5 + psrad m13, 5 + packssdw m12, m13 + pxor m13, m13 + pmaxsw m12, m13 + pavgw m12, m13 + pminsw m12, m7 + movq [dstq+dsq*0], m12 + movhps [dstq+dsq*1], m12 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .v_w4_loop0 + RET +.hv: +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif +%if ARCH_X86_32 + movd m4, r8m + mova m6, [base+pd_512] + pshufb m4, [base+pw_256] +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + movd m15, r8m + pshufb m15, [base+pw_256] +%endif + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + je .hv_w4 + movq m0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + mov dstq, dstmp + mov dsq, dsmp + mova m5, [base+spel_h_shuf2] + ALLOC_STACK -16*8 +%else + mova m6, [base+pd_512] + mova m9, [base+spel_h_shuf2] +%endif + pshuflw m0, m0, q2121 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_w2_10bpc + psraw m7, 2 + psllw m3, 2 +.hv_w2_10bpc: + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 +%if ARCH_X86_32 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m9, m5 + mova m11, m0 + mova m12, m1 + mova m13, m2 + mova m14, m3 + mova m15, m4 +%else + pshufd m11, m3, q0000 + pshufd m12, m3, q1111 + pshufd m13, m3, q2222 + pshufd m14, m3, q3333 +%endif + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + movu m1, [srcq+ssq*2] + add srcq, r6 + movu m4, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m2, m3, m1, m4 +%else + REPX {pshufb x, m9}, m2, m3, m1, m4 +%endif + REPX {pmaddwd x, m7}, m2, m3, m1, m4 + phaddd m2, m3 ; 0 1 + phaddd m1, m4 ; 2 3 + movu m3, [srcq+ssq*1] + movu m4, [srcq+ssq*2] + add srcq, r6 + movu m0, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m3, m4, m0 +%else + REPX {pshufb x, m9}, m3, m4, m0 +%endif + REPX {pmaddwd x, m7}, m3, m4, m0 + phaddd m3, m4 ; 4 5 + phaddd m0, m0 ; 6 6 + REPX {paddd x, m6}, m2, m1, m3, m0 + REPX {psrad x, 10}, m2, m1, m3, m0 + packssdw m2, m1 ; 0 1 2 3 + packssdw 
m3, m0 ; 4 5 6 _ + palignr m4, m3, m2, 4 ; 1 2 3 4 + pshufd m5, m3, q0321 ; 5 6 _ _ + punpcklwd m1, m2, m4 ; 01 12 + punpckhwd m2, m4 ; 23 34 + punpcklwd m3, m5 ; 45 56 +.hv_w2_loop: + movu m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu m5, [srcq+ssq*0] + pshufb m4, m9 + pshufb m5, m9 + pmaddwd m4, m7 + pmaddwd m5, m7 + phaddd m4, m5 + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + paddd m4, m6 + psrad m4, 10 ; 7 8 + packssdw m0, m4 + pshufd m3, m0, q2103 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m5, m6 + paddd m5, m4 + psrad m5, 10 + packssdw m5, m5 + pxor m4, m4 + pminsw m5, m15 + pmaxsw m5, m4 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w8: + shr mxd, 16 +.hv_w4: + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + mov dstq, dstmp + mov dsq, dsmp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + ALLOC_STACK -16*15 + mova m8, m0 + mova m9, m1 + mova m14, m6 +%else + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m3, 8 + test dword r8m, 0x800 + jz .hv_w4_10bpc + psraw m0, 2 + psllw m3, 2 +.hv_w4_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 +%if ARCH_X86_32 + %define tmp esp+16*8 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcmp, srcq +%endif + mova [tmp+16*5], m4 + lea wd, [wq+hq-(1<<16)] + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-104 ; red zone +%endif + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 + mova [tmp+16*5], m15 +%endif + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] + pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 + pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 + pmaddwd m%3, m10 + pmaddwd m%1, m11 + paddd m%3, %5 + paddd m%1, m%3 + pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + pmaddwd m%3, m12 + pmaddwd m%2, m13 + paddd m%1, m%3 + paddd m%1, m%2 + psrad m%1, %4 +%endmacro +.hv_w4_loop0: +%if ARCH_X86_64 + mova m14, [pd_512] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + movu m6, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 4, 1, 0, 10 + PUT_8TAP_HV_H 5, 2, 0, 10 + PUT_8TAP_HV_H 6, 3, 0, 10 + movu m7, [srcq+ssq*0+0] + movu m2, [srcq+ssq*0+8] + movu m1, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + PUT_8TAP_HV_H 7, 2, 0, 10 + PUT_8TAP_HV_H 1, 3, 0, 10 + movu m2, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 2, 3, 0, 10 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 10 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 
; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_w4_loop_start +.hv_w4_loop: + mova m1, [tmp+16*6] + mova m2, m15 +.hv_w4_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*6], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 9 + psrad m2, 9 + packssdw m1, m2 + pxor m7, m7 + pmaxsw m1, m7 + pavgw m7, m1 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*61] + mov dstq, [esp+4*62] + add srcq, 8 + add dstq, 8 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + movzx hd, ww + sub wd, 1<<16 +%else +.hv_w4_loop: + mova m15, [tmp+16*1] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + mova m7, [tmp+16*2] + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 9 + psrad m15, 9 + packssdw m14, m15 + pxor m7, m7 + pmaxsw m14, m7 + pavgw m7, m14 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .hv_w4_loop0 + RET +%undef tmp + +%if ARCH_X86_32 +DECLARE_REG_TMP 2, 1, 6, 4 +%elif WIN64 +DECLARE_REG_TMP 6, 4, 7, 4 +%else +DECLARE_REG_TMP 6, 7, 7, 8 +%endif + +MC_8TAP_FN prep, sharp, SHARP, SHARP +MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH +MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR +MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP +MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN prep, regular, REGULAR, REGULAR + +%if ARCH_X86_32 +cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my +%define mxb r0b +%define mxd r0 +%define mxq r0 +%define myb r2b +%define myd r2 +%define myq r2 +%else +cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my +%endif +%define base t2-prep_ssse3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 
8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, prep_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + test mxd, 0xf00 + jnz .h + movifnidn hd, hm + test myd, 0xf00 + jnz .v + tzcnt wd, wd + mov myd, r7m ; bitdepth_max + movzx wd, word [base+prep_ssse3_table+wq*2] + mova m5, [base+pw_8192] + shr myd, 11 + add wq, t2 + movddup m4, [base+prep_mul+myq*8] + movifnidn ssq, ssmp + movifnidn tmpq, tmpmp + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + movifnidn ssq, r2mp + movifnidn hd, r4m + movddup m5, [base+prep_8tap_1d_rnd] + cmp wd, 4 + jne .h_w8 + movzx mxd, mxb + movq m0, [base+subpel_filters+mxq*8] + mova m3, [base+spel_h_shufA] + mova m4, [base+spel_h_shufB] + movifnidn tmpq, tmpmp + sub srcq, 2 + WIN64_SPILL_XMM 8 + punpcklbw m0, m0 + psraw m0, 8 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw m0, 2 +.h_w4_12bpc: + pshufd m6, m0, q1111 + pshufd m7, m0, q2222 +.h_w4_loop: + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + WIN64_SPILL_XMM 11 + shr mxd, 16 + movq m2, [base+subpel_filters+mxq*8] + mova m4, [base+spel_h_shufA] + mova m6, [base+spel_h_shufB] + movifnidn tmpq, r0mp + add wd, wd + punpcklbw m2, m2 + add srcq, wq + psraw m2, 8 + add tmpq, wq + neg wq + test dword r7m, 0x800 + jnz .h_w8_12bpc + psllw m2, 2 +.h_w8_12bpc: + pshufd m7, m2, q0000 +%if ARCH_X86_32 + ALLOC_STACK -16*3 + pshufd m0, m2, q1111 + pshufd m1, m2, q2222 + pshufd m2, m2, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 +%else + pshufd m8, m2, q1111 + pshufd m9, m2, q2222 + pshufd m10, m2, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6- 6] + movu m1, [srcq+r6+ 2] + pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 + pshufb m0, m6 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m7 ; abcd0 + pmaddwd m0, m8 ; abcd1 + pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8 + pshufb m1, m6 ; 6 7 7 8 8 9 9 a + paddd m2, m5 + paddd m0, m2 + pmaddwd m2, m9, m3 ; abcd2 + pmaddwd m3, m7 ; efgh0 + paddd m0, m2 + pmaddwd m2, m10, m1 ; abcd3 + pmaddwd m1, m8 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6+10] + paddd m3, m5 + paddd m1, m3 + pshufb m3, m2, m4 ; a b b c c d d e + pshufb m2, m6 ; 8 9 9 a a b b c + pmaddwd m3, m9 ; efgh2 + pmaddwd m2, m10 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq+r6], m0 + add r6, 16 + jl .h_w8_loop + add srcq, ssq + sub tmpq, wq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + WIN64_SPILL_XMM 15 + movddup m7, [base+prep_8tap_1d_rnd] + movifnidn ssq, r2mp + movifnidn tmpq, r0mp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m3, 2 +.v_12bpc: +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, 
r6 + mov r6d, wd + shl wd, 6 + mov r5, srcq +%if ARCH_X86_64 + mov r7, tmpq +%elif STACK_ALIGNMENT < 16 + mov [esp+4*29], tmpq +%endif + lea wd, [wq+hq-(1<<8)] +.v_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m5, [srcq+ssq*0] + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if ARCH_X86_32 + jmp .v_loop_start +.v_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m1, m3 + paddd m2, m4 + mova m14, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m7 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m7 + paddd m2, m3 + psrad m1, 4 + psrad m2, 4 + packssdw m1, m2 + movq [tmpq+r6*0], m1 + movhps [tmpq+r6*2], m1 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .v_loop +%if STACK_ALIGNMENT < 16 + mov tmpq, [esp+4*29] + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov [esp+4*29], tmpq +%else + mov tmpq, tmpmp + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov tmpmp, tmpq +%endif +%else +.v_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m7 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m7 + paddd m13, m14 + psrad m12, 4 + psrad m13, 4 + packssdw m12, m13 + movq [tmpq+r6*0], m12 + movhps [tmpq+r6*2], m12 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .v_loop + add r5, 8 + add r7, 8 + mov srcq, r5 + mov tmpq, r7 +%endif + movzx hd, wb + sub wd, 1<<8 + jg .v_loop0 + RET +.hv: +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + movzx t3d, mxb + shr mxd, 16 + cmp wd, 4 + cmove mxd, t3d + movifnidn hd, r4m + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + mov ssq, r2mp + mov tmpq, r0mp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + mova m4, [base+prep_8tap_2d_rnd] + ALLOC_STACK -16*14 + mova m8, m0 + mova m9, m1 + mova m14, m4 +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m0, 4 + psraw m3, 8 + test dword r7m, 0x800 + jz .hv_10bpc + psraw m0, 2 +.hv_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 + mov r6d, wd + shl wd, 6 + mov r5, srcq +%if ARCH_X86_32 + %define tmp esp+16*8 +%if STACK_ALIGNMENT < 16 + mov [esp+4*61], tmpq +%endif + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-88 ; red zone +%endif + mov r7, tmpq + pshufd 
m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 +%endif + lea wd, [wq+hq-(1<<8)] + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +.hv_loop0: +%if ARCH_X86_64 + mova m14, [prep_8tap_2d_rnd] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + movu m6, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+8] + PUT_8TAP_HV_H 4, 1, 0, 6 + PUT_8TAP_HV_H 5, 2, 0, 6 + PUT_8TAP_HV_H 6, 3, 0, 6 + movu m7, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + movu m1, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+8] + PUT_8TAP_HV_H 7, 2, 0, 6 + PUT_8TAP_HV_H 1, 3, 0, 6 + movu m2, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 2, 3, 0, 6 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 6 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 ; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_loop_start +.hv_loop: + mova m1, [tmp+16*5] + mova m2, m15 +.hv_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*5], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m14 + paddd m2, m14 + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 6 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 6 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 6 + psrad m2, 6 + packssdw m1, m2 + movq [tmpq+r6*0], m1 + movhps [tmpq+r6*2], m1 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .hv_loop +%if STACK_ALIGNMENT < 16 + mov tmpq, [esp+4*61] + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov [esp+4*61], tmpq +%else + mov tmpq, tmpmp + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov tmpmp, tmpq +%endif +%else +.hv_loop: + mova m15, [tmp+16*1] + mova m7, [prep_8tap_2d_rnd] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + paddd m14, m7 + paddd m15, m7 + mova m7, [tmp+16*2] + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 6 + psrad m15, 6 + packssdw m14, m15 + movq [tmpq+r6*0], m14 + movhps [tmpq+r6*2], m14 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .hv_loop + add r5, 8 + add r7, 8 + mov srcq, r5 + mov tmpq, 
r7 +%endif + movzx hd, wb + sub wd, 1<<8 + jg .hv_loop0 + RET +%undef tmp + +%if ARCH_X86_64 +; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that +; by allocating 16 bytes more stack space so that stack offsets match up. +%if WIN64 && STACK_ALIGNMENT == 16 +%assign stksz 16*14 +%else +%assign stksz 16*13 +%endif +cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \ + mx, tmp, alpha, beta, \ + filter, my, gamma, cnt +%assign stack_size_padded_8x8t stack_size_padded +%else +cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ + filter, mx, my +%define m8 [esp+16*13] +%define m9 [esp+16*14] +%define cntd dword [esp+4*63] +%define dstq tmpq +%define dsq 0 +%if STACK_ALIGNMENT < 16 +%define dstm [esp+4*65] +%define dsm [esp+4*66] +%else +%define dstm r0m +%define dsm r1m +%endif +%endif +%define base filterq-$$ + mov t0d, r7m + LEA filterq, $$ + shr t0d, 11 +%if ARCH_X86_64 + movddup m8, [base+warp8x8t_rnd] +%else + movddup m1, [base+warp8x8t_rnd] + mov r1, r1m + add r1, r1 + mova m8, m1 + mov r1m, r1 ; ds *= 2 +%endif + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main + jmp .start +.loop: +%if ARCH_X86_64 + lea dstq, [dstq+dsq*4] +%else + add dstq, dsm + mov dstm, dstq +%endif + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2 +.start: +%if ARCH_X86_32 + mov dstq, dstm +%endif + paddd m1, m8 + paddd m2, m8 + psrad m1, 15 + psrad m2, 15 + packssdw m1, m2 + mova [dstq+dsq*0], m1 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3 +%if ARCH_X86_32 + mov dstq, dstm + add dstq, dsm +%endif + paddd m1, m8 + paddd m2, m8 + psrad m1, 15 + psrad m2, 15 + packssdw m1, m2 + mova [dstq+dsq*2], m1 + dec cntd + jg .loop + RET + +%if ARCH_X86_64 +cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \ + mx, tmp, alpha, beta, \ + filter, my, gamma, cnt +ASSERT stack_size_padded == stack_size_padded_8x8t +%else +cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ + filter, mx, my +%endif + mov t0d, r7m + LEA filterq, $$ + shr t0d, 11 +%if ARCH_X86_64 + movddup m8, [base+warp8x8_rnd2+t0*8] + movd m9, r7m ; pixel_max + pshufb m9, [base+pw_256] +%else + movddup m1, [base+warp8x8_rnd2+t0*8] + movd m2, r7m ; pixel_max + pshufb m2, [base+pw_256] + mova m8, m1 + mova m9, m2 +%endif + call .main + jmp .start +.loop: +%if ARCH_X86_64 + lea dstq, [dstq+dsq*2] +%else + add dstq, dsm + mov dstm, dstq +%endif + call .main2 +.start: +%if ARCH_X86_32 + mov dstq, dstm +%endif + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + pmaxsw m1, m6 + pmulhrsw m1, m8 + pminsw m1, m9 + mova [dstq+dsq*0], m1 + call .main3 +%if ARCH_X86_32 + mov dstq, dstm + add dstq, dsm +%endif + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + pmaxsw m1, m6 + pmulhrsw m1, m8 + pminsw m1, m9 + mova [dstq+dsq*1], m1 + dec cntd + jg .loop + RET +ALIGN function_align +.main: + ; Stack args offset by one (r4m -> r5m etc.) 
due to call +%if WIN64 + mov deltaq, r5m + mov mxd, r6m +%endif + movd m0, [base+warp8x8_shift+t0*4] + movddup m7, [base+warp8x8_rnd1+t0*8] + add filterq, mc_warp_filter-$$ +%if ARCH_X86_64 + movsx alphad, word [deltaq+2*0] + movsx betad, word [deltaq+2*1] + movsx gammad, word [deltaq+2*2] + movsx deltad, word [deltaq+2*3] + lea tmpq, [ssq*3] + add mxd, 512+(64<<10) + sub srcq, tmpq ; src -= ss*3 + imul tmpd, alphad, -7 + mov myd, r7m + add betad, tmpd ; beta -= alpha*7 + imul tmpd, gammad, -7 + add myd, 512+(64<<10) + mov cntd, 4 + add deltad, tmpd ; delta -= gamma*7 +%else +%if STACK_ALIGNMENT < 16 + %assign stack_offset stack_offset - gprsize +%endif + mov r3d, r5m ; abcd +%if STACK_ALIGNMENT < 16 + mov r0, r1m ; dst + mov r1, r2m ; ds + mov [esp+gprsize+4*65], r0 + mov [esp+gprsize+4*66], r1 +%endif + movsx alphad, word [r3+2*0] + movsx r2d, word [r3+2*1] + movsx gammad, word [r3+2*2] + movsx r3d, word [r3+2*3] + imul r5d, alphad, -7 + add r2d, r5d ; beta -= alpha*7 + imul r5d, gammad, -7 + mov [esp+gprsize+4*60], r2d + add r3d, r5d ; delta -= gamma*7 + mov [esp+gprsize+4*61], r3d + mov r3d, r4m ; ss + mov srcq, r3m + mov mxd, r6m + mov myd, r7m + mov dword [esp+gprsize+4*63], 4 ; cnt + mov [esp+gprsize+4*62], r3 + lea r3, [r3*3] + add mxd, 512+(64<<10) + add myd, 512+(64<<10) + sub srcq, r3 ; src -= ss*3 +%if STACK_ALIGNMENT < 16 + %assign stack_offset stack_offset + gprsize +%endif +%endif + mova [rsp+gprsize], m0 + pxor m6, m6 + call .h + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 01 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 1], m1 + mova [rsp+gprsize+16* 4], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 12 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 7], m1 + mova [rsp+gprsize+16*10], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 23 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 2], m1 + mova [rsp+gprsize+16* 5], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 34 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 8], m1 + mova [rsp+gprsize+16*11], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 45 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 3], m1 + mova [rsp+gprsize+16* 6], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 56 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 9], m1 + mova [rsp+gprsize+16*12], m5 + mova m5, m0 +.main2: + call .h +%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m4, [filterq+myq*8] ; a + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m2, [filterq+tmpq*8] ; b + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m3, [filterq+myq*8] ; c + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m1, [filterq+tmpq*8] ; d + lea tmpd, [myq+gammaq] + shr myd, 10 + punpcklwd m4, m2 + punpcklwd m3, m1 + punpckldq m2, m4, m3 + punpckhdq m4, m3 + punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 + pmaddwd m1, [rsp+gprsize+16*%1] + punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 + mova m2, [rsp+gprsize+16*%2] + pmaddwd m3, m2 + mova [rsp+gprsize+16*%1], m2 + paddd m1, m3 + punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 + mova m2, [rsp+gprsize+16*%3] + pmaddwd m3, m2 + mova [rsp+gprsize+16*%2], m2 + paddd m1, m3 + punpcklwd m3, m5, m0 ; 67 + punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 + pmaddwd m2, m3 + mova [rsp+gprsize+16*%3], m3 + paddd m1, m2 + movq m4, [filterq+myq*8] ; e + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] ; f + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m2, [filterq+myq*8] ; g +%if ARCH_X86_64 + lea myd, [tmpq+deltaq] ; my += delta +%else + mov myd, [esp+gprsize+4*61] + add 
myd, tmpd +%endif + shr tmpd, 10 + punpcklwd m4, m3 + movq m3, [filterq+tmpq*8] ; h + punpcklwd m2, m3 + punpckldq m3, m4, m2 + punpckhdq m4, m2 + punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8 + pmaddwd m2, [rsp+gprsize+16*%4] + punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8 + mova m3, [rsp+gprsize+16*%5] + pmaddwd m6, m3 + mova [rsp+gprsize+16*%4], m3 + pxor m3, m3 + paddd m2, m6 + punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8 + mova m6, [rsp+gprsize+16*%6] + pmaddwd m3, m6 + mova [rsp+gprsize+16*%5], m6 + punpckhwd m5, m0 + pxor m6, m6 + paddd m2, m3 + punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8 + pmaddwd m3, m5 + mova [rsp+gprsize+16*%6], m5 + mova m5, m0 + paddd m2, m3 +%endmacro + WARP_V 1, 2, 3, 4, 5, 6 + ret +.main3: + call .h + WARP_V 7, 8, 9, 10, 11, 12 + ret +ALIGN function_align +.h: + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + punpcklbw m0, m6, m3 + movu m3, [srcq-6] + pmaddwd m0, m3 ; 0 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m2, m6, m3 + movu m3, [srcq-4] + pmaddwd m2, m3 ; 1 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m0, m2 ; 0 1 + punpcklbw m2, m6, m3 + movu m3, [srcq-2] + pmaddwd m2, m3 ; 2 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m1, m6, m3 + movu m3, [srcq+0] + pmaddwd m1, m3 ; 3 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m2, m1 ; 2 3 + punpcklbw m1, m6, m3 + movu m3, [srcq+2] + pmaddwd m1, m3 ; 4 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + phaddd m0, m2 ; 0 1 2 3 + punpcklbw m2, m6, m3 + movu m3, [srcq+4] + pmaddwd m2, m3 ; 5 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m1, m2 ; 4 5 + punpcklbw m2, m6, m3 + movu m3, [srcq+6] + pmaddwd m2, m3 ; 6 +%if ARCH_X86_64 + lea mxd, [tmpq+betaq] ; mx += beta +%else + mov mxd, [esp+gprsize*2+4*60] + add mxd, tmpd +%endif + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m4, m6, m3 + movu m3, [srcq+8] +%if ARCH_X86_64 + add srcq, ssq +%else + add srcq, [esp+gprsize*2+4*62] +%endif + pmaddwd m3, m4 ; 7 + phaddd m2, m3 ; 6 7 + phaddd m1, m2 ; 4 5 6 7 + paddd m0, m7 + paddd m1, m7 + psrad m0, [rsp+gprsize*2] + psrad m1, [rsp+gprsize*2] + packssdw m0, m1 + ret + +%macro BIDIR_FN 0 + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.ret: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jne .w8_loop + RET +.w16_loop: + call .main + add dstq, strideq +.w16: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + dec hd + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova 
[dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +%endmacro + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h +%define base r6-avg_ssse3_table + LEA r6, avg_ssse3_table + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + movddup m2, [base+bidir_rnd+t0*8] + movddup m3, [base+bidir_mul+t0*8] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+16*0] + paddsw m0, [tmp2q+16*0] + mova m1, [tmp1q+16*1] + paddsw m1, [tmp2q+16*1] + add tmp1q, 16*2 + add tmp2q, 16*2 + pmaxsw m0, m2 + pmaxsw m1, m2 + psubsw m0, m2 + psubsw m1, m2 + pmulhw m0, m3 + pmulhw m1, m3 + ret + +cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h +%define base r6-w_avg_ssse3_table + LEA r6, w_avg_ssse3_table + tzcnt wd, wm + mov t0d, r6m ; weight + movd m6, r7m ; pixel_max + movddup m5, [base+pd_65538] + movsxd wq, [r6+wq*4] + pshufb m6, [base+pw_256] + add wq, r6 + lea r6d, [t0-16] + shl t0d, 16 + sub t0d, r6d ; 16-weight, weight + paddw m5, m6 + mov r6d, t0d + shl t0d, 2 + test dword r7m, 0x800 + cmovnz r6d, t0d + movifnidn hd, hm + movd m4, r6d + pslld m5, 7 + pxor m7, m7 + pshufd m4, m4, q0000 + BIDIR_FN +ALIGN function_align +.main: + mova m2, [tmp1q+16*0] + mova m0, [tmp2q+16*0] + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + mova m2, [tmp1q+16*1] + mova m1, [tmp2q+16*1] + add tmp1q, 16*2 + add tmp2q, 16*2 + pmaddwd m3, m4 + pmaddwd m0, m4 + paddd m3, m5 + paddd m0, m5 + psrad m3, 8 + psrad m0, 8 + packssdw m0, m3 + punpckhwd m3, m1, m2 + punpcklwd m1, m2 + pmaddwd m3, m4 + pmaddwd m1, m4 + paddd m3, m5 + paddd m1, m5 + psrad m3, 8 + psrad m1, 8 + packssdw m1, m3 + pminsw m0, m6 + pminsw m1, m6 + pmaxsw m0, m7 + pmaxsw m1, m7 + ret + +%if ARCH_X86_64 +cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask +%else +cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask +%define hd dword r5m +%define m8 [base+pw_64] +%endif +%define base r6-mask_ssse3_table + LEA r6, mask_ssse3_table + tzcnt wd, wm + mov t0d, r7m ; pixel_max + shr t0d, 11 + movsxd wq, [r6+wq*4] + movddup m6, [base+bidir_rnd+t0*8] + movddup m7, [base+bidir_mul+t0*8] +%if ARCH_X86_64 + mova m8, [base+pw_64] + movifnidn hd, hm +%endif + add wq, r6 + mov maskq, r6mp + BIDIR_FN +ALIGN function_align +.main: + movq m3, [maskq+8*0] + mova m0, [tmp1q+16*0] + mova m4, [tmp2q+16*0] + pxor m5, m5 + punpcklbw m3, m5 + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + psubw m1, m8, m3 + punpckhwd m4, m3, m1 ; m, 64-m + punpcklwd m3, m1 + pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m0, m3 + movq m3, [maskq+8*1] + mova m1, [tmp1q+16*1] + mova m4, [tmp2q+16*1] + add maskq, 8*2 + add tmp1q, 16*2 + add tmp2q, 16*2 + psrad m2, 5 + psrad m0, 5 + packssdw m0, m2 + punpcklbw m3, m5 + punpckhwd m2, m1, m4 + punpcklwd m1, m4 + psubw m5, m8, m3 + punpckhwd m4, m3, m5 ; m, 64-m + punpcklwd m3, m5 + pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m1, m3 + psrad m2, 5 + psrad m1, 5 + packssdw m1, m2 + pmaxsw m0, m6 + pmaxsw m1, m6 + psubsw m0, m6 + psubsw m1, m6 + pmulhw m0, m7 + pmulhw m1, m7 + ret + +cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_420_ssse3_table + LEA 
t0, w_mask_420_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd m0, r7m ; sign + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m4, [base+bidir_mul+r6*8] + ALLOC_STACK -16*4 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + mova [rsp+16*3], m4 + %define m8 [rsp+gprsize+16*0] + %define m9 [rsp+gprsize+16*1] + %define m10 [rsp+gprsize+16*2] + %define m11 [rsp+gprsize+16*3] +%endif + movd m7, [base+pw_2] + psubw m7, m0 + pshufb m7, [base+pw_256] + add wq, t0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w4: + movq [dstq+strideq*0], m0 + phaddw m2, m3 + movhps [dstq+strideq*1], m0 + phaddd m2, m2 + lea dstq, [dstq+strideq*2] + paddw m2, m7 + movq [dstq+strideq*0], m1 + psrlw m2, 2 + movhps [dstq+strideq*1], m1 + packuswb m2, m2 + movd [maskq], m2 + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w8: + mova [dstq+strideq*0], m0 + paddw m2, m3 + phaddw m2, m2 + mova [dstq+strideq*1], m1 + paddw m2, m7 + psrlw m2, 2 + packuswb m2, m2 + movd [maskq], m2 + sub hd, 2 + jg .w8_loop + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 8 +.w16: + mova [dstq+strideq*1+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*1], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + paddw m2, [dstq+strideq*1+16*0] + paddw m3, [dstq+strideq*1+16*1] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*1], m1 + paddw m2, m7 + psrlw m2, 2 + packuswb m2, m2 + movq [maskq], m2 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16 +.w32: + mova [dstq+strideq*1+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*1], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*0+16*2], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*3], m1 + call .main + paddw m2, [dstq+strideq*1+16*0] + paddw m3, [dstq+strideq*1+16*1] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*1], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*2] + paddw m2, [dstq+strideq*1+16*3] + mova [dstq+strideq*1+16*2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*3], m1 + packuswb m3, m2 + mova [maskq], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16*2 +.w64: + mova [dstq+strideq*1+16*1], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*2], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*2], m0 + mova [dstq+strideq*1+16*4], m3 + mova [dstq+strideq*0+16*3], m1 + call .main + mova [dstq+strideq*1+16*5], m2 + mova [dstq+strideq*0+16*4], m0 + mova [dstq+strideq*1+16*6], m3 + mova [dstq+strideq*0+16*5], m1 + call .main + mova [dstq+strideq*0+16*6], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*7], m2 + mova [dstq+strideq*0+16*7], m1 + call .main + paddw m2, [dstq+strideq*1+16*1] + paddw m3, [dstq+strideq*1+16*2] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*2], m2 + mova 
[dstq+strideq*1+16*1], m1 + call .main + paddw m2, [dstq+strideq*1+16*3] + paddw m3, [dstq+strideq*1+16*4] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*2] + mova [dstq+strideq*1+16*2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*3], m1 + packuswb m3, m2 + mova [maskq+16*0], m3 + call .main + paddw m2, [dstq+strideq*1+16*5] + paddw m3, [dstq+strideq*1+16*6] + mova [dstq+strideq*1+16*4], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*6], m2 + mova [dstq+strideq*1+16*5], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*6] + paddw m2, [dstq+strideq*1+16*7] + mova [dstq+strideq*1+16*6], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*7], m1 + packuswb m3, m2 + mova [maskq+16*1], m3 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16*4 +.w128: + mova [dstq+strideq*1+16* 1], m2 + mova [dstq+strideq*0+16* 0], m0 + mova [dstq+strideq*1+16* 2], m3 + mova [dstq+strideq*0+16* 1], m1 + call .main + mova [dstq+strideq*1+16* 3], m2 + mova [dstq+strideq*0+16* 2], m0 + mova [dstq+strideq*1+16* 4], m3 + mova [dstq+strideq*0+16* 3], m1 + call .main + mova [dstq+strideq*1+16* 5], m2 + mova [dstq+strideq*0+16* 4], m0 + mova [dstq+strideq*1+16* 6], m3 + mova [dstq+strideq*0+16* 5], m1 + call .main + mova [dstq+strideq*1+16* 7], m2 + mova [dstq+strideq*0+16* 6], m0 + mova [dstq+strideq*1+16* 8], m3 + mova [dstq+strideq*0+16* 7], m1 + call .main + mova [dstq+strideq*1+16* 9], m2 + mova [dstq+strideq*0+16* 8], m0 + mova [dstq+strideq*1+16*10], m3 + mova [dstq+strideq*0+16* 9], m1 + call .main + mova [dstq+strideq*1+16*11], m2 + mova [dstq+strideq*0+16*10], m0 + mova [dstq+strideq*1+16*12], m3 + mova [dstq+strideq*0+16*11], m1 + call .main + mova [dstq+strideq*1+16*13], m2 + mova [dstq+strideq*0+16*12], m0 + mova [dstq+strideq*1+16*14], m3 + mova [dstq+strideq*0+16*13], m1 + call .main + mova [dstq+strideq*0+16*14], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*15], m2 + mova [dstq+strideq*0+16*15], m1 + call .main + paddw m2, [dstq+strideq*1+16* 1] + paddw m3, [dstq+strideq*1+16* 2] + mova [dstq+strideq*1+16* 0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16* 2], m2 + mova [dstq+strideq*1+16* 1], m1 + call .main + paddw m2, [dstq+strideq*1+16* 3] + paddw m3, [dstq+strideq*1+16* 4] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16* 2] + mova [dstq+strideq*1+16* 2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16* 3], m1 + packuswb m3, m2 + mova [maskq+16*0], m3 + call .main + paddw m2, [dstq+strideq*1+16* 5] + paddw m3, [dstq+strideq*1+16* 6] + mova [dstq+strideq*1+16* 4], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16* 6], m2 + mova [dstq+strideq*1+16* 5], m1 + call .main + paddw m2, [dstq+strideq*1+16* 7] + paddw m3, [dstq+strideq*1+16* 8] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16* 6] + mova [dstq+strideq*1+16* 6], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16* 7], m1 + packuswb m3, m2 + mova [maskq+16*1], m3 + call .main + paddw m2, [dstq+strideq*1+16* 9] + paddw m3, [dstq+strideq*1+16*10] + mova [dstq+strideq*1+16* 8], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*10], m2 + mova [dstq+strideq*1+16* 9], m1 + call .main + paddw m2, [dstq+strideq*1+16*11] + paddw m3, [dstq+strideq*1+16*12] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*10] + mova [dstq+strideq*1+16*10], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*11], m1 + packuswb m3, m2 + mova [maskq+16*2], m3 + call .main + paddw m2, 
[dstq+strideq*1+16*13] + paddw m3, [dstq+strideq*1+16*14] + mova [dstq+strideq*1+16*12], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*14], m2 + mova [dstq+strideq*1+16*13], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*14] + paddw m2, [dstq+strideq*1+16*15] + mova [dstq+strideq*1+16*14], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*15], m1 + packuswb m3, m2 + mova [maskq+16*3], m3 + sub hd, 2 + jg .w128_loop + RET +ALIGN function_align +.main: +%macro W_MASK 2 ; dst/tmp_offset, mask + mova m%1, [tmp1q+16*%1] + mova m%2, [tmp2q+16*%1] + punpcklwd m4, m%2, m%1 + punpckhwd m5, m%2, m%1 + psubsw m%1, m%2 + pabsw m%1, m%1 + psubusw m6, m8, m%1 + psrlw m6, 10 ; 64-m + psubw m%2, m9, m6 ; m + punpcklwd m%1, m6, m%2 + punpckhwd m6, m%2 + pmaddwd m%1, m4 + pmaddwd m6, m5 + psrad m%1, 5 + psrad m6, 5 + packssdw m%1, m6 + pmaxsw m%1, m10 + psubsw m%1, m10 + pmulhw m%1, m11 +%endmacro + W_MASK 0, 2 + W_MASK 1, 3 + add tmp1q, 16*2 + add tmp2q, 16*2 + ret + +cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_422_ssse3_table + LEA t0, w_mask_422_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd m7, r7m ; sign + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m4, [base+bidir_mul+r6*8] + ALLOC_STACK -16*4 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + mova [rsp+16*3], m4 +%endif + pxor m0, m0 + add wq, t0 + pshufb m7, m0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2 + W_MASK 1, 3 + phaddw m2, m3 + add tmp1q, 16*2 + add tmp2q, 16*2 + 
packuswb m2, m2 + pxor m3, m3 + psubb m2, m7 + pavgb m2, m3 + movq [maskq], m2 + add maskq, 8 + ret + +cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_444_ssse3_table + LEA t0, w_mask_444_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m7, [base+bidir_mul+r6*8] + ALLOC_STACK -16*3 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + %define m11 m7 +%endif + add wq, t0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2 + W_MASK 1, 3 + packuswb m2, m3 + add tmp1q, 16*2 + add tmp2q, 16*2 + mova [maskq], m2 + add maskq, 16 + ret + +; (a * (64 - m) + b * m + 32) >> 6 +; = (((b - a) * m + 32) >> 6) + a +; = (((b - a) * (m << 9) + 16384) >> 15) + a +; except m << 9 overflows int16_t when m == 64 (which is possible), +; but if we negate m it works out (-64 << 9 == -32768). 
+; = (((a - b) * (m * -512) + 16384) >> 15) + a +cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3 +%define base r6-blend_ssse3_table + LEA r6, blend_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + mova m7, [base+pw_m512] + add wq, r6 + lea stride3q, [strideq*3] + pxor m6, m6 + jmp wq +.w4: + mova m5, [maskq] + movq m0, [dstq+strideq*0] + movhps m0, [dstq+strideq*1] + movq m1, [dstq+strideq*2] + movhps m1, [dstq+stride3q ] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + mova m5, [maskq] + mova m0, [dstq+strideq*0] + mova m1, [dstq+strideq*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8 + RET +.w16: + mova m5, [maskq] + mova m0, [dstq+16*0] + mova m1, [dstq+16*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16 + RET +.w32: + mova m5, [maskq+16*0] + mova m0, [dstq+16*0] + mova m1, [dstq+16*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova m5, [maskq+16*1] + mova m0, [dstq+16*2] + mova m1, [dstq+16*3] + psubw m2, m0, [tmpq+16*2] + psubw m3, m1, [tmpq+16*3] + add maskq, 32 + add tmpq, 64 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + add dstq, strideq + dec hd + jg .w32 + RET + +cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h +%define base r5-blend_v_ssse3_table + LEA r5, blend_v_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + movd m4, [base+obmc_masks+2*2] +.w2_loop: + movd m0, [dstq+strideq*0] + movd m2, [tmpq+4*0] + movd m1, [dstq+strideq*1] + movd m3, [tmpq+4*1] + add tmpq, 4*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + movddup m2, [base+obmc_masks+4*2] +.w4_loop: + movq m0, [dstq+strideq*0] + movhps m0, [dstq+strideq*1] + mova m1, [tmpq] + add tmpq, 8*2 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + mova m4, [base+obmc_masks+8*2] +.w8_loop: + mova m0, [dstq+strideq*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+strideq*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 
+ paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + mova m4, [base+obmc_masks+16*2] + movq m5, [base+obmc_masks+16*3] +.w16_loop: + mova m0, [dstq+16*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+16*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16_loop + RET +.w32: +%if WIN64 + movaps [rsp+8], m6 +%endif + mova m4, [base+obmc_masks+16*4] + mova m5, [base+obmc_masks+16*5] + mova m6, [base+obmc_masks+16*6] +.w32_loop: + mova m0, [dstq+16*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+16*1] + mova m3, [tmpq+16*1] + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + mova m2, [dstq+16*2] + paddw m1, m3 + mova m3, [tmpq+16*2] + add tmpq, 16*4 + psubw m3, m2 + pmulhrsw m3, m6 + paddw m2, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + add dstq, strideq + dec hd + jg .w32_loop +%if WIN64 + movaps m6, [rsp+8] +%endif + RET + +%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp + mova m0, [dstq+16*(%1+0)] + mova m2, [tmpq+16*(%2+0)] + mova m1, [dstq+16*(%1+1)] + mova m3, [tmpq+16*(%2+1)] +%if %3 + add tmpq, 16*%3 +%endif + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*(%1+0)], m0 + mova [dstq+16*(%1+1)], m1 +%endmacro + +cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask +%define base r6-blend_h_ssse3_table + LEA r6, blend_h_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m4, [base+blend_shuf] + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + add wq, r6 + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd m0, [dstq+dsq*0] + movd m2, [dstq+dsq*1] + movd m3, [maskq+hq*2] + movq m1, [tmpq] + add tmpq, 4*2 + punpckldq m0, m2 + punpcklwd m3, m3 + psubw m1, m0 + pmulhrsw m1, m3 + paddw m0, m1 + movd [dstq+dsq*0], m0 + psrlq m0, 32 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova m3, [base+blend_shuf] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + movd m2, [maskq+hq*2] + mova m1, [tmpq] + add tmpq, 8*2 + psubw m1, m0 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +.w8: + movddup m5, [base+blend_shuf+8] +%if WIN64 + movaps [rsp+ 8], m6 + movaps [rsp+24], m7 +%endif +.w8_loop: + movd m7, [maskq+hq*2] + mova m0, [dstq+dsq*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+dsq*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + pshufb m6, m7, m4 + psubw m2, m0 + pshufb m7, m5 + psubw m3, m1 + pmulhrsw m2, m6 + pmulhrsw m3, m7 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop +%if WIN64 + movaps m6, [rsp+ 8] + movaps m7, [rsp+24] +%endif + RET +.w16: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0, 2 + add dstq, dsq + inc hq + jl .w16 + RET +.w32: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 4 + add dstq, dsq + inc hq + jl .w32 + RET +.w64: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2 + BLEND_H_ROW 4, 4 + BLEND_H_ROW 6, 6, 8 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2 + BLEND_H_ROW 4, 4 + 
BLEND_H_ROW 6, 6, 16 + BLEND_H_ROW 8, -8 + BLEND_H_ROW 10, -6 + BLEND_H_ROW 12, -4 + BLEND_H_ROW 14, -2 + add dstq, dsq + inc hq + jl .w128 + RET + +; emu_edge args: +; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, +; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, +; const pixel *ref, const ptrdiff_t ref_stride +; +; bw, bh total filled size +; iw, ih, copied block -> fill bottom, right +; x, y, offset in bw/bh -> fill top, left +cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \ + y, dst, dstride, src, sstride, \ + bottomext, rightext, blk + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + +%if ARCH_X86_64 + %define reg_zero r12q + %define reg_tmp r10 + %define reg_src srcq + %define reg_bottomext bottomextq + %define reg_rightext rightextq + %define reg_blkm r9m +%else + %define reg_zero r6 + %define reg_tmp r0 + %define reg_src r1 + %define reg_bottomext r0 + %define reg_rightext r1 + %define reg_blkm r2m +%endif + ; + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor reg_zero, reg_zero + lea reg_tmp, [ihq-1] + cmp yq, ihq + cmovs reg_tmp, yq + test yq, yq + cmovs reg_tmp, reg_zero +%if ARCH_X86_64 + imul reg_tmp, sstrideq + add srcq, reg_tmp +%else + imul reg_tmp, sstridem + mov reg_src, srcm + add reg_src, reg_tmp +%endif + ; + ; ref += iclip(x, 0, iw - 1) + lea reg_tmp, [iwq-1] + cmp xq, iwq + cmovs reg_tmp, xq + test xq, xq + cmovs reg_tmp, reg_zero + lea reg_src, [reg_src+reg_tmp*2] +%if ARCH_X86_32 + mov srcm, reg_src +%endif + ; + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) +%if ARCH_X86_32 + mov r1, r1m ; restore bh +%endif + lea reg_bottomext, [yq+bhq] + sub reg_bottomext, ihq + lea r3, [bhq-1] + cmovs reg_bottomext, reg_zero + ; + + DEFINE_ARGS bw, bh, iw, ih, x, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, reg_zero + cmp reg_bottomext, bhq + cmovns reg_bottomext, r3 + cmp topextq, bhq + cmovg topextq, r3 + %if ARCH_X86_32 + mov r4m, reg_bottomext + ; + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + mov r0, r0m ; restore bw + %endif + lea reg_rightext, [xq+bwq] + sub reg_rightext, iwq + lea r2, [bwq-1] + cmovs reg_rightext, reg_zero + + DEFINE_ARGS bw, bh, iw, ih, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, reg_zero + cmp reg_rightext, bwq + cmovns reg_rightext, r2 + %if ARCH_X86_32 + mov r3m, r1 + %endif + cmp leftextq, bwq + cmovns leftextq, r2 + +%undef reg_zero +%undef reg_tmp +%undef reg_src +%undef reg_bottomext +%undef reg_rightext + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; center_h = bh - top_ext - bottom_ext +%if ARCH_X86_64 + lea r3, [bottomextq+topextq] + sub centerhq, r3 +%else + mov r1, centerhm ; restore r1 + sub centerhq, topextq + sub centerhq, r4m + mov r1m, centerhq +%endif + ; + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq +%if ARCH_X86_64 + imul r2, dstrideq +%else + mov r6, r6m ; restore dstq + imul r2, dstridem +%endif + add dstq, r2 + mov reg_blkm, dstq ; save pointer for ext + ; + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq +%if ARCH_X86_64 + lea r3, [rightextq+leftextq] + sub centerwq, r3 +%else + sub centerwq, r3m + sub centerwq, leftextq +%endif + +; vloop Macro +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix 
+ %if ARCH_X86_64 + %define reg_tmp r12 + %else + %define reg_tmp r0 + %endif +.v_loop_%3: + %if ARCH_X86_32 + mov r0, r0m + mov r1, r1m + %endif +%if %1 + ; left extension + %if ARCH_X86_64 + movd m0, [srcq] + %else + mov r3, srcm + movd m0, [r3] + %endif + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + xor r3, r3 +.left_loop_%3: + mova [dstq+r3*2], m0 + add r3, mmsize/2 + cmp r3, leftextq + jl .left_loop_%3 + ; body + lea reg_tmp, [dstq+leftextq*2] +%endif + xor r3, r3 +.body_loop_%3: + %if ARCH_X86_64 + movu m0, [srcq+r3*2] + %else + mov r1, srcm + movu m0, [r1+r3*2] + %endif +%if %1 + movu [reg_tmp+r3*2], m0 +%else + movu [dstq+r3*2], m0 +%endif + add r3, mmsize/2 + cmp r3, centerwq + jl .body_loop_%3 +%if %2 + ; right extension +%if %1 + lea reg_tmp, [reg_tmp+centerwq*2] +%else + lea reg_tmp, [dstq+centerwq*2] +%endif + %if ARCH_X86_64 + movd m0, [srcq+centerwq*2-2] + %else + mov r3, srcm + movd m0, [r3+centerwq*2-2] + %endif + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + xor r3, r3 +.right_loop_%3: + movu [reg_tmp+r3*2], m0 + add r3, mmsize/2 + %if ARCH_X86_64 + cmp r3, rightextq + %else + cmp r3, r3m + %endif + jl .right_loop_%3 +%endif + %if ARCH_X86_64 + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 + %else + add dstq, dstridem + mov r0, sstridem + add srcm, r0 + sub dword centerhm, 1 + jg .v_loop_%3 + mov r0, r0m ; restore r0 + %endif +%endmacro ; vloop MACRO + + test leftextq, leftextq + jnz .need_left_ext + %if ARCH_X86_64 + test rightextq, rightextq + jnz .need_right_ext + %else + cmp leftextq, r3m ; leftextq == 0 + jne .need_right_ext + %endif + v_loop 0, 0, 0 + jmp .body_done + + ;left right extensions +.need_left_ext: + %if ARCH_X86_64 + test rightextq, rightextq + %else + mov r3, r3m + test r3, r3 + %endif + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: +; r0 ; bw +; r1 ;; x loop +; r4 ;; y loop +; r5 ; topextq +; r6 ;dstq +; r7 ;dstrideq +; r8 ; srcq +%if ARCH_X86_64 + %define reg_dstride dstrideq +%else + %define reg_dstride r2 +%endif + ; + ; bottom edge extension + %if ARCH_X86_64 + test bottomextq, bottomextq + jz .top + %else + xor r1, r1 + cmp r1, r4m + je .top + %endif + ; + %if ARCH_X86_64 + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 + %else + mov r3, dstq + mov reg_dstride, dstridem + sub r3, reg_dstride + mov srcm, r3 + %endif + ; +.bottom_x_loop: + %if ARCH_X86_64 + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, bottomextq + %else + mov r3, srcm + mova m0, [r3+r1*2] + lea r3, [dstq+r1*2] + mov r4, r4m + %endif + ; +.bottom_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .bottom_y_loop + add r1, mmsize/2 + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end +%if ARCH_X86_64 + mov srcq, reg_blkm +%else + mov r3, reg_blkm + mov reg_dstride, dstridem +%endif + mov dstq, dstm + xor r1, r1 + ; +.top_x_loop: +%if ARCH_X86_64 + mova m0, [srcq+r1*2] +%else + mov r3, reg_blkm + mova m0, [r3+r1*2] +%endif + lea r3, [dstq+r1*2] + mov r4, topextq + ; +.top_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .top_y_loop + add r1, mmsize/2 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +%undef reg_dstride +%undef reg_blkm +%undef reg_tmp diff -Nru dav1d-0.9.0/src/x86/mc_init_tmpl.c dav1d-0.9.1/src/x86/mc_init_tmpl.c --- dav1d-0.9.0/src/x86/mc_init_tmpl.c 2021-05-16 16:47:22.554951000 +0000 +++ dav1d-0.9.1/src/x86/mc_init_tmpl.c 2021-07-28 
21:38:28.913852200 +0000 @@ -47,7 +47,7 @@ decl_##type##_fn(name##_16bpc_sse2); \ decl_##type##_fn(name##_16bpc_ssse3); \ decl_##type##_fn(name##_16bpc_avx2); \ - decl_##type##_fn(name##_avx512icl); + decl_##type##_fn(name##_16bpc_avx512icl); #define init_mc_fn(type, name, suffix) \ c->mc[type] = dav1d_put_##name##_16bpc_##suffix #define init_mct_fn(type, name, suffix) \ @@ -147,8 +147,6 @@ if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; -#if BITDEPTH == 8 - init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); @@ -158,8 +156,8 @@ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); - init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); @@ -169,8 +167,9 @@ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); -#if ARCH_X86_64 +#if BITDEPTH == 8 && ARCH_X86_64 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); @@ -194,6 +193,7 @@ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); #endif +#if BITDEPTH == 8 c->avg = dav1d_avg_ssse3; c->w_avg = dav1d_w_avg_ssse3; c->mask = dav1d_mask_ssse3; @@ -207,6 +207,19 @@ c->emu_edge = dav1d_emu_edge_ssse3; c->resize = dav1d_resize_ssse3; +#else + c->avg = dav1d_avg_16bpc_ssse3; + c->w_avg = dav1d_w_avg_16bpc_ssse3; + c->mask = dav1d_mask_16bpc_ssse3; + c->w_mask[0] = dav1d_w_mask_444_16bpc_ssse3; + c->w_mask[1] = dav1d_w_mask_422_16bpc_ssse3; + c->w_mask[2] = dav1d_w_mask_420_16bpc_ssse3; + c->blend = dav1d_blend_16bpc_ssse3; + c->blend_v = dav1d_blend_v_16bpc_ssse3; + c->blend_h = dav1d_blend_h_16bpc_ssse3; + c->warp8x8 = dav1d_warp_affine_8x8_16bpc_ssse3; + c->warp8x8t = dav1d_warp_affine_8x8t_16bpc_ssse3; + c->emu_edge = dav1d_emu_edge_16bpc_ssse3; #endif if(!(flags & DAV1D_X86_CPU_FLAG_SSE41)) diff -Nru dav1d-0.9.0/tests/checkasm/checkasm.h dav1d-0.9.1/tests/checkasm/checkasm.h --- dav1d-0.9.0/tests/checkasm/checkasm.h 2021-05-16 16:47:22.558951000 +0000 +++ dav1d-0.9.1/tests/checkasm/checkasm.h 2021-07-28 21:38:28.917852400 +0000 @@ -282,9 +282,9 @@ #ifdef readtime #define bench_new(...)\ do {\ + func_type *tfunc = func_new;\ + checkasm_set_signal_handler_state(1);\ if (checkasm_bench_func()) {\ - checkasm_set_signal_handler_state(1);\ - func_type *tfunc = func_new;\ uint64_t tsum = 0;\ int tcount = 0;\ for (int ti = 0; ti < BENCH_RUNS; ti++) {\ @@ -299,9 +299,11 @@ tcount++;\ }\ }\ - checkasm_set_signal_handler_state(0);\ checkasm_update_bench(tcount, tsum);\ + } else {\ + tfunc(__VA_ARGS__);\ }\ + checkasm_set_signal_handler_state(0);\ } while (0) #else #define bench_new(...) 
do {} while (0) diff -Nru dav1d-0.9.0/tests/checkasm/filmgrain.c dav1d-0.9.1/tests/checkasm/filmgrain.c --- dav1d-0.9.0/tests/checkasm/filmgrain.c 2021-05-16 16:47:22.558951000 +0000 +++ dav1d-0.9.1/tests/checkasm/filmgrain.c 2021-07-28 21:38:28.917852400 +0000 @@ -188,12 +188,21 @@ for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1; fg_data[0].overlap_flag++) { - for (int i = 0; i <= fg_data[0].overlap_flag; i++) { + for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) { int w, h, row_num; if (fg_data[0].overlap_flag) { w = 35 + (rnd() % 93); - h = 3 + (rnd() % 29); - row_num = i ? 1 + (rnd() & 0x7ff) : 0; + if (i == 0) { + row_num = 0; + h = 1 + (rnd() % 31); + } else { + row_num = 1 + (rnd() & 0x7ff); + if (i == 1) { + h = 3 + (rnd() % 30); + } else { + h = 1 + (rnd() & 1); + } + } } else { w = 1 + (rnd() & 127); h = 1 + (rnd() & 31); @@ -220,6 +229,11 @@ } } fg_data[0].overlap_flag = 1; + for (int y = 0; y < 32; y++) { + // Make sure all pixels are in range + for (int x = 0; x < 128; x++) + src[y * PXSTRIDE(stride) + x] &= bitdepth_max; + } bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32, 1 HIGHBD_TAIL_SUFFIX); } @@ -311,12 +325,21 @@ for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1; fg_data[0].overlap_flag++) { - for (int i = 0; i <= fg_data[0].overlap_flag; i++) { + for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) { int w, h, row_num; if (fg_data[0].overlap_flag) { w = (36 >> ss_x) + (rnd() % (92 >> ss_x)); - h = (4 >> ss_y) + (rnd() % (28 >> ss_y)); - row_num = i ? 1 + (rnd() & 0x7ff) : 0; + if (i == 0) { + row_num = 0; + h = 1 + (rnd() & (31 >> ss_y)); + } else { + row_num = 1 + (rnd() & 0x7ff); + if (i == 1) { + h = (ss_y ? 2 : 3) + (rnd() % (ss_y ? 15 : 30)); + } else { + h = ss_y ? 1 : 1 + (rnd() & 1); + } + } } else { w = 1 + (rnd() & (127 >> ss_x)); h = 1 + (rnd() & (31 >> ss_y)); @@ -350,6 +373,13 @@ } fg_data[0].overlap_flag = 1; + for (int y = 0; y < 32; y++) { + // Make sure all pixels are in range + for (int x = 0; x < 128; x++) { + src[y * PXSTRIDE(stride) + x] &= bitdepth_max; + luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max; + } + } bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16, 1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); } diff -Nru dav1d-0.9.0/tests/checkasm/loopfilter.c dav1d-0.9.1/tests/checkasm/loopfilter.c --- dav1d-0.9.0/tests/checkasm/loopfilter.c 2021-05-16 16:47:22.558951000 +0000 +++ dav1d-0.9.1/tests/checkasm/loopfilter.c 2021-07-28 21:38:28.917852400 +0000 @@ -33,13 +33,12 @@ #include "src/loopfilter.h" static void init_lpf_border(pixel *const dst, const ptrdiff_t stride, - int E, int I, int H, const int bitdepth_max) + int E, int I, const int bitdepth_max) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; const int F = 1 << bitdepth_min_8; E <<= bitdepth_min_8; I <<= bitdepth_min_8; - H <<= bitdepth_min_8; const int filter_type = rnd() % 4; const int edge_diff = rnd() % ((E + 2) * 4) - 2 * (E + 2); @@ -171,7 +170,7 @@ L = l[2 * x + 1][lf_idx] ? l[2 * x + 1][lf_idx] : l[2 * x][lf_idx]; } init_lpf_border(c_dst + i * (dir ? 1 : 16), dir ? 
128 : 1, - lut.e[L], lut.i[L], L >> 4, bitdepth_max); + lut.e[L], lut.i[L], bitdepth_max); } memcpy(a_dst_mem, c_dst_mem, 128 * sizeof(pixel) * 16); diff -Nru dav1d-0.9.0/tests/header_test.c dav1d-0.9.1/tests/header_test.c --- dav1d-0.9.0/tests/header_test.c 2021-05-16 16:47:22.558951000 +0000 +++ dav1d-0.9.1/tests/header_test.c 2021-07-28 21:38:28.917852400 +0000 @@ -27,7 +27,7 @@ #include DAV1D_TEST_HEADER -int main() +int main(void) { return 0; }
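
In the warp_affine_8x8_16bpc .main setup above, beta and delta are pre-adjusted ("beta -= alpha*7", "delta -= gamma*7"): the horizontal pass steps mx by alpha seven times across the 8 output pixels, so adding the adjusted beta at the end of a row lands directly on the next row's starting mx without reloading a base value. A tiny self-contained C check of that bookkeeping, using arbitrary illustrative numbers rather than real warp parameters:

#include <assert.h>

int main(void)
{
    const int alpha = 173, beta = -510;    /* illustrative deltas only */
    const int mx0 = 512 + (64 << 10);      /* biased start, as in the asm */
    const int beta_adj = beta - 7 * alpha; /* "beta -= alpha*7" */
    int mx = mx0;
    for (int row = 1; row <= 8; row++) {
        for (int i = 0; i < 7; i++)        /* 7 alpha steps between 8 pixels */
            mx += alpha;
        mx += beta_adj;                    /* end of row: net change is beta */
        assert(mx == mx0 + row * beta);
    }
    return 0;
}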
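
BIDIR_FN and the other per-width entry points above (avg, w_avg, mask, w_mask_*, blend_h) all dispatch the same way: tzcnt of the block width selects a table entry, the entry is stored relative to the table address (hence the later "add wq, r6"), and "jmp wq" enters the matching .wN loop. A rough C analogue of that dispatch, using a function-pointer table instead of position-independent label offsets; all names here are illustrative only:

#include <stdio.h>

static void w4(void)   { puts("w4 loop");   }
static void w8(void)   { puts("w8 loop");   }
static void w16(void)  { puts("w16 loop");  }
static void w32(void)  { puts("w32 loop");  }
static void w64(void)  { puts("w64 loop");  }
static void w128(void) { puts("w128 loop"); }

static int tzcnt(unsigned x)          /* trailing zero count, as in the asm */
{
    int n = 0;
    while (!(x & 1)) { x >>= 1; n++; }
    return n;
}

int main(void)
{
    static void (*const dispatch[])(void) = { w4, w8, w16, w32, w64, w128 };
    for (unsigned w = 4; w <= 128; w *= 2)
        dispatch[tzcnt(w) - 2]();     /* w = 4 -> index 0, ..., 128 -> 5 */
    return 0;
}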
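
The comment block above blend_16bpc derives the form used with pmulhrsw: the mask is negated so that the scaled weight never overflows int16_t (pw_m512, with 64 * -512 = -32768 still representable). A minimal stand-alone C check of that identity, assuming 10-bit pixel values and an arithmetic right shift on negative operands (which is what pmulhrsw implements); the helper names are hypothetical and not part of dav1d:

#include <assert.h>
#include <stdint.h>

/* reference form: (a * (64 - m) + b * m + 32) >> 6 */
static int blend_ref(int a, int b, int m)
{
    return (a * (64 - m) + b * m + 32) >> 6;
}

/* rewritten form: (((a - b) * (m * -512) + 16384) >> 15) + a,
 * i.e. psubw (a - b), pmullw by pw_m512, pmulhrsw, paddw */
static int blend_hrsw(int a, int b, int m)
{
    const int16_t w = (int16_t)(m * -512); /* fits int16_t even for m == 64 */
    return (((a - b) * w + 16384) >> 15) + a;
}

int main(void)
{
    for (int m = 0; m <= 64; m++)
        for (int a = 0; a <= 1023; a += 31)
            for (int b = 0; b <= 1023; b += 31)
                assert(blend_ref(a, b, m) == blend_hrsw(a, b, m));
    return 0;
}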
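
The emu_edge_16bpc block above follows the geometry spelled out in its header comment: (bw, bh) is the full block to produce, (iw, ih) the valid part of the reference, and (x, y) the block's offset into it; the assembly splits the work into a clamped interior copy plus explicit left/right/top/bottom replication, with the extension counts coming from the clamps noted in its comments (top_ext = iclip(-y, 0, bh - 1), bottom_ext = iclip(y + bh - ih, 0, bh - 1), and likewise for left/right). A scalar C sketch of the same geometry, clamping per pixel rather than per edge and counting strides in pixels instead of bytes (a simplification, not dav1d's calling convention):

#include <stddef.h>
#include <stdint.h>

typedef uint16_t pixel;

static ptrdiff_t iclip(ptrdiff_t v, ptrdiff_t lo, ptrdiff_t hi)
{
    return v < lo ? lo : v > hi ? hi : v;
}

/* every output pixel reads the nearest valid reference pixel; the asm gets
 * the same result by copying the clamped interior once and then replicating
 * left_ext/right_ext columns and top_ext/bottom_ext rows */
static void emu_edge_scalar(ptrdiff_t bw, ptrdiff_t bh,
                            ptrdiff_t iw, ptrdiff_t ih,
                            ptrdiff_t x, ptrdiff_t y,
                            pixel *dst, ptrdiff_t dst_stride,
                            const pixel *ref, ptrdiff_t ref_stride)
{
    for (ptrdiff_t j = 0; j < bh; j++)
        for (ptrdiff_t i = 0; i < bw; i++)
            dst[j * dst_stride + i] =
                ref[iclip(y + j, 0, ih - 1) * ref_stride +
                    iclip(x + i, 0, iw - 1)];
}

int main(void)
{
    static const pixel ref[2 * 2] = { 1, 2, 3, 4 };  /* 2x2 valid area */
    pixel dst[4 * 4];
    /* 4x4 block whose top-left lies one pixel above and left of the valid
     * area: the borders replicate the nearest edge pixels */
    emu_edge_scalar(4, 4, 2, 2, -1, -1, dst, 4, ref, 2);
    return 0;
}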