Projects
Essentials
kvazaar
Sign Up
Log In
Username
Password
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
Expand all
Collapse all
Changes of Revision 20
View file
kvazaar.changes
Changed
@@ -1,4 +1,35 @@ ------------------------------------------------------------------- +Wed Apr 10 11:16:02 UTC 2024 - Luigi Baldoni <aloisio@gmx.com> + +- Update to version 2.3.1 + Features: + * --(no-)enable-logging to enable/disable logging of normal + encoder perfomance into stderr, errors are still outputted + to stderr. + * AVX2 optimisations for finding last non zero coefficient in + RDOQ + * Remove YASM to make compilation with visual studio easier + * Experimental support for CMake, in the future we would like + to get rid of automake and visual studio so if there are any + issues with the CMakeLists.txt please report them + Fixes: + * Fix a bug when requesting encoder_headers before any frame + has been pushed in + * Fix a problem with win+gcc (mingw, msys, cygwin) causing + some optimized functions to segfault + * Fix GCC detection for automake + External contributions: + * add config option to turn off logging output + * Don't export MD5 byteReverse symbol on big-endian + Performance: + * The RD performance should be exactly the same as in v2.2.0 + and configurations using RDOQ should be around 1-3% faster + with the AVX2 optimizations +- Drop kvazaar-fix_libm_underlinking.patch, + kvazaar-add_soversion.patch, kvazaar-fix_install_libdir.patch + and kvazaar-fix_install_mandir.patch (merged upstream) + +------------------------------------------------------------------- Wed Jan 17 18:39:21 UTC 2024 - Luigi Baldoni <aloisio@gmx.com> - Update to version 2.3.0
View file
kvazaar.spec
Changed
@@ -19,21 +19,13 @@ %define libname libkvazaar %define libmver 7 Name: kvazaar -Version: 2.3.0 +Version: 2.3.1 Release: 0.pm.0 Summary: HEVC encoder License: BSD-3-Clause Group: Productivity/Multimedia/Video/Editors and Convertors URL: http://ultravideo.cs.tut.fi/#encoder Source0: https://github.com/ultravideo/%{name}/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz -# PATCH-FIX-OPENSUSE kvazaar-fix_libm_underlinking.patch -Patch1: kvazaar-fix_libm_underlinking.patch -# PATCH-FIX-UPSTREAM kvazaar-add_soversion.patch -Patch2: kvazaar-add_soversion.patch -# PATCH-FIX-OPENSUSE kvazaar-fix_install_libdir.patch -Patch3: kvazaar-fix_install_libdir.patch -# PATCH-FIX-OPENSUSE kvazaar-fix_install_mandir.patch -Patch4: kvazaar-fix_install_mandir.patch BuildRequires: cmake >= 3.12 BuildRequires: gcc >= 4.4 BuildRequires: pkgconfig @@ -82,6 +74,8 @@ %install %cmake_install +%check + %post -n %{libname}%{libmver} -p /sbin/ldconfig %postun -n %{libname}%{libmver} -p /sbin/ldconfig
View file
kvazaar-add_soversion.patch
Deleted
@@ -1,21 +0,0 @@ -From 621a2bba8f12c9fed07c266e590bc05dea2861b2 Mon Sep 17 00:00:00 2001 -From: Joose Sainio <joose.sainio@tuni.fi> -Date: Thu, 18 Jan 2024 09:14:35 +0200 -Subject: PATCH CMake versions .so file - ---- - CMakeLists.txt | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 278939d9..1f459c44 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -165,6 +165,7 @@ if(MSVC) - set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) - else() - set_target_properties(kvazaar-bin PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src) -+ set_target_properties(kvazaar PROPERTIES SOVERSION "7" VERSION "7.3.0") - list(APPEND ALLOW_AVX2 "x86_64" "AMD64") - if(${CMAKE_SYSTEM_PROCESSOR} IN_LIST ALLOW_AVX2) - set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "-mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2" )
View file
kvazaar-fix_install_libdir.patch
Deleted
@@ -1,37 +0,0 @@ -Index: kvazaar-2.3.0/CMakeLists.txt -=================================================================== ---- kvazaar-2.3.0.orig/CMakeLists.txt -+++ kvazaar-2.3.0/CMakeLists.txt -@@ -128,7 +128,7 @@ if(MSVC) - endif() - - if(BUILD_SHARED_LIBS) -- list( APPEND CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib" "./" "../lib" ) -+ list( APPEND CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_FULL_LIBDIR}" "./" "../lib" ) - set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - add_library(kvazaar SHARED ${LIB_SOURCES}) - else() -@@ -233,9 +233,9 @@ source_group( "" FILES ${SOURCE_GROUP_TO - - # ToDo: make configurable - --install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.pc DESTINATION ${CMAKE_INSTALL_PREFIX}/share/pkgconfig) -+install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.pc DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/pkgconfig) - install(TARGETS kvazaar-bin DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) --install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) -+install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}) - if(BUILD_SHARED_LIBS) # Just add the lib to the bin directory for now - if(MSVC) - install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) -Index: kvazaar-2.3.0/src/kvazaar.pc.in -=================================================================== ---- kvazaar-2.3.0.orig/src/kvazaar.pc.in -+++ kvazaar-2.3.0/src/kvazaar.pc.in -@@ -1,6 +1,6 @@ - prefix=@CMAKE_INSTALL_PREFIX@ - exec_prefix=${prefix} --libdir=${prefix}/lib -+libdir=@CMAKE_INSTALL_FULL_LIBDIR@ - incdir=${prefix}/include - - Name: libkvazaar
View file
kvazaar-fix_install_mandir.patch
Deleted
@@ -1,13 +0,0 @@ -Index: kvazaar-2.3.0/CMakeLists.txt -=================================================================== ---- kvazaar-2.3.0.orig/CMakeLists.txt -+++ kvazaar-2.3.0/CMakeLists.txt -@@ -242,7 +242,7 @@ if(BUILD_SHARED_LIBS) # Just add the lib - endif() - endif() - install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include) --install(FILES ${PROJECT_SOURCE_DIR}/doc/kvazaar.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man) -+install(FILES ${PROJECT_SOURCE_DIR}/doc/kvazaar.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man/man1) - - IF(UNIX) - # DIST
View file
kvazaar-fix_libm_underlinking.patch
Deleted
@@ -1,12 +0,0 @@ -Index: kvazaar-2.3.0/CMakeLists.txt -=================================================================== ---- kvazaar-2.3.0.orig/CMakeLists.txt -+++ kvazaar-2.3.0/CMakeLists.txt -@@ -182,6 +182,7 @@ else() - set(EXTRA_LIBS ${EXTRA_LIBS} m) - endif (HAVE_LIB_M) - -+ target_link_libraries(kvazaar PUBLIC ${EXTRA_LIBS}) - target_link_libraries(kvazaar-bin PUBLIC ${EXTRA_LIBS}) - endif() -
View file
kvazaar-2.3.0.tar.gz/CMakeLists.txt -> kvazaar-2.3.1.tar.gz/CMakeLists.txt
Changed
@@ -3,15 +3,17 @@ project(kvazaar LANGUAGES C CXX HOMEPAGE_URL https://github.com/ultravideo/kvazaar -DESCRIPTION "An open-source VVC encoder licensed under 3-clause BSD" -VERSION 2.3.0 ) +DESCRIPTION "An open-source HEVC encoder licensed under 3-clause BSD" +VERSION 2.3.1 ) option(BUILD_SHARED_LIBS "Build using shared kvazaar library" ON) option(BUILD_TESTS "Build tests" ON) +option(USE_CRYPTO "Use crypto library" OFF) include(GNUInstallDirs) #Helps to define correct distro specific install directories +set(DEFERRED "@") set(KVAZAAR_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}" CACHE PATH "kvazaar library install path") set(KVAZAAR_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH "kvazaar binary install path") @@ -97,8 +99,22 @@ add_definitions(-DCMAKE_BUILD) +# Set correct pkgconfig libdir variable +if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}") + set(KVAZAAR_PC_LIBDIR "${CMAKE_INSTALL_LIBDIR}") +else() + set(KVAZAAR_PC_LIBDIR "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}") +endif() + +# Set correct pkgconfig include variable +if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}") + set(KVAZAAR_PC_INCDIR "${CMAKE_INSTALL_INCLUDEDIR}") +else() + set(KVAZAAR_PC_INCDIR "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") +endif() + # Apply dynamic info to the config files -configure_file("${PROJECT_SOURCE_DIR}/src/kvazaar.pc.in" "${PROJECT_SOURCE_DIR}/src/kvazaar.pc" @ONLY) +configure_file("${PROJECT_SOURCE_DIR}/src/kvazaarCMake.pc.in" "${PROJECT_SOURCE_DIR}/src/kvazaar.pc.temp" @ONLY) configure_file("${PROJECT_SOURCE_DIR}/src/version.h.in" "${PROJECT_SOURCE_DIR}/src/version.h" @ONLY) # Add all sources in src/ base @@ -127,8 +143,52 @@ add_definitions(-DWIN32_LEAN_AND_MEAN -D_WIN32 -DWIN32 -DWIN64) endif() + +if (USE_CRYPTO) + if(BUILD_TESTS) + message(WARNING "Crypto++ is not compatible with the tests, disabling tests") + set(BUILD_TESTS OFF) + endif() + if (BUILD_SHARED_LIBS) + message(WARNING "Crypto++ is not compatible with shared libraries, disabling shared libraries") + set(BUILD_SHARED_LIBS 
OFF) + endif () + include(FetchContent) + + add_definitions(-DKVZ_SEL_ENCRYPTION) + list(APPEND LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/extras/crypto.cpp) + list(APPEND CLI_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/extras/crypto.cpp) + + if (NOT CRYPTOPP_FOUND) + message(STATUS "Fetching and building Crypto++") + + # CMake for Crypto++ + FetchContent_Declare( + cryptopp-cmake + GIT_REPOSITORY https://github.com/abdes/cryptopp-cmake.git + GIT_TAG 43367a9cef6576b34179427a31a619802205406e + ) + + set(CRYPTOPP_INSTALL OFF CACHE BOOL "" FORCE) # we don't want to install Crypto++ + set(CRYPTOPP_BUILD_TESTING OFF CACHE BOOL "" FORCE) + set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE) + + FetchContent_MakeAvailable(cryptopp-cmake) + + unset(BUILD_SHARED_LIBS) # unset so it does not affect other projects + + # copy lib binary so it is found later + file(GLOB CRYPTOPP_BIN "${cryptopp-cmake_BINARY_DIR}/cryptopp/cryptopp.*") + file(COPY ${CRYPTOPP_BIN} DESTINATION ${CMAKE_BINARY_DIR}/lib/) + file(GLOB CRYPTOPP_BIN "${cryptopp-cmake_BINARY_DIR}/cryptopp/libcryptopp.*") + file(COPY ${CRYPTOPP_BIN} DESTINATION ${CMAKE_BINARY_DIR}/lib/) + endif () + + +endif () + if(BUILD_SHARED_LIBS) - list( APPEND CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib" "./" "../lib" ) + list( APPEND CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_LIBDIR}" "./" "../lib" ) set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) add_library(kvazaar SHARED ${LIB_SOURCES}) else() @@ -165,11 +225,21 @@ set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) else() set_target_properties(kvazaar-bin PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src) + set_target_properties(kvazaar PROPERTIES SOVERSION "7" VERSION "7.4.0") list(APPEND ALLOW_AVX2 "x86_64" "AMD64") if(${CMAKE_SYSTEM_PROCESSOR} IN_LIST ALLOW_AVX2) set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "-mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2" ) set_property( SOURCE 
${LIB_SOURCES_STRATEGIES_SSE41} APPEND PROPERTY COMPILE_FLAGS "-msse4.1" ) endif() + + # CYGWIN, MSYS, and MINGW seem to be needing this but in some cases + # it might be that the toolset is not properly set, so also use this + # in cases where we are not sure that it is not needed + if((NOT MSVC AND NOT LINUX AND NOT APPLE) OR (CYGWIN OR MSYS OR MINGW)) + set(CMAKE_C_FLAGS "-Wa,-muse-unaligned-vector-move ${CMAKE_C_FLAGS}") + endif() + + set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) target_link_libraries(kvazaar PUBLIC Threads::Threads) @@ -181,10 +251,15 @@ if (HAVE_LIB_M) set(EXTRA_LIBS ${EXTRA_LIBS} m) endif (HAVE_LIB_M) + if (USE_CRYPTO) + set(EXTRA_LIBS ${EXTRA_LIBS} ${CMAKE_BINARY_DIR}/lib/libcryptopp.a) + endif () - target_link_libraries(kvazaar-bin PUBLIC ${EXTRA_LIBS}) + target_link_libraries(kvazaar PUBLIC ${EXTRA_LIBS}) + target_link_libraries(kvazaar-bin PUBLIC ${EXTRA_LIBS} ) endif() + # Source grouping # Some basic structuring of the files based on previous visual studio project files @@ -230,17 +305,16 @@ # INSTALL # ToDo: make configurable +install(CODE "configure_file(\"${PROJECT_SOURCE_DIR}/src/kvazaar.pc.temp\" \"${PROJECT_SOURCE_DIR}/src/kvazaar.pc\" @ONLY)") -install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.pc DESTINATION ${CMAKE_INSTALL_PREFIX}/share/pkgconfig) -install(TARGETS kvazaar-bin DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) -install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) -if(BUILD_SHARED_LIBS) # Just add the lib to the bin directory for now - if(MSVC) - install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) - endif() -endif() -install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include) -install(FILES ${PROJECT_SOURCE_DIR}/doc/kvazaar.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man) +install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) +install(TARGETS kvazaar-bin DESTINATION ${CMAKE_INSTALL_BINDIR}) 
+install(TARGETS kvazaar + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) +install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(FILES ${PROJECT_SOURCE_DIR}/doc/kvazaar.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1) IF(UNIX) # DIST @@ -316,7 +390,7 @@ enable_testing() if(MSVC OR MINGW OR MSYS) - if(BUILD_SHARED_LIBS) + if(BUILD_SHARED_LIBS) set(BUILD_TESTS OFF) message(INFO " Disable test building, fails in MSVC/MINGW/MSYS2 when building shared binaries") endif()
View file
kvazaar-2.3.0.tar.gz/Makefile.am -> kvazaar-2.3.1.tar.gz/Makefile.am
Changed
@@ -11,7 +11,8 @@ doc \ docs.doxy \ greatest \ - tools + tools \ + CMakeLists.txt # Run scripts to maintain autogenerated documentation # in the version control.
View file
kvazaar-2.3.0.tar.gz/README.md -> kvazaar-2.3.1.tar.gz/README.md
Changed
@@ -51,7 +51,7 @@ comment: # "BEGIN KVAZAAR HELP MESSAGE" ``` -Kvazaar v2.3.0 2024-01-17 +Kvazaar v2.3.1 2024-04-10 Kvazaar license: 3-clause BSD Usage: kvazaar -i <input> --input-res <width>x<height> -o <output>
View file
kvazaar-2.3.0.tar.gz/configure.ac -> kvazaar-2.3.1.tar.gz/configure.ac
Changed
@@ -23,7 +23,7 @@ # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html ver_major=7 -ver_minor=3 +ver_minor=4 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS
View file
kvazaar-2.3.0.tar.gz/doc/kvazaar.1 -> kvazaar-2.3.1.tar.gz/doc/kvazaar.1
Changed
@@ -1,4 +1,4 @@ -.TH KVAZAAR "1" "tammikuu 2024" "kvazaar v2.3.0" "User Commands" +.TH KVAZAAR "1" "April 2024" "kvazaar v2.3.1" "User Commands" .SH NAME kvazaar \- open source HEVC encoder .SH SYNOPSIS
View file
kvazaar-2.3.0.tar.gz/src/cfg.c -> kvazaar-2.3.1.tar.gz/src/cfg.c
Changed
@@ -1112,13 +1112,20 @@ } if (preset_values[preset_line][0] != NULL) { - fprintf(stderr, "Using preset %s: ", value); + if (cfg->enable_logging_output) { + fprintf(stderr, "Using preset %s: ", value); + } + // Loop all the name and value pairs and push to the config parser for (int preset_value = 1; preset_values[preset_line][preset_value] != NULL; preset_value += 2) { - fprintf(stderr, "--%s=%s ", preset_values[preset_line][preset_value], preset_values[preset_line][preset_value + 1]); + if (cfg->enable_logging_output) { + fprintf(stderr, "--%s=%s ", preset_values[preset_line][preset_value], preset_values[preset_line][preset_value + 1]); + } kvz_config_parse(cfg, preset_values[preset_line][preset_value], preset_values[preset_line][preset_value + 1]); } - fprintf(stderr, "\n"); + if (cfg->enable_logging_output) { + fprintf(stderr, "\n"); + } } else { fprintf(stderr, "Input error: unknown preset \"%s\"\n", value); return 0; @@ -1727,7 +1734,7 @@ } if (cfg->implicit_rdpcm && !cfg->lossless) { - fprintf(stderr, "Input error: --implicit-rdpcm is not suppoted without --lossless\n"); + fprintf(stderr, "Input error: --implicit-rdpcm is not supported without --lossless\n"); error = 1; }
View file
kvazaar-2.3.0.tar.gz/src/global.h -> kvazaar-2.3.1.tar.gz/src/global.h
Changed
@@ -220,7 +220,7 @@ #define QUOTE_EXPAND(x) QUOTE(x) #ifndef KVZ_VERSION -#define KVZ_VERSION 2.3.0 +#define KVZ_VERSION 2.3.1 #endif #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION) @@ -252,6 +252,13 @@ #endif #ifdef _MSC_VER +#define NO_ASAN +#else +#define NO_ASAN __attribute__((no_sanitize("address"))) +#endif + + +#ifdef _MSC_VER // Buggy VS2010 throws intellisense warnings if void* is not casted. #define MALLOC(type, num) (type *)malloc(sizeof(type) * (num)) #define MALLOC_SIMD_PADDED(type, num, padding) (type *)malloc(sizeof(type) * (num) + (padding))
View file
kvazaar-2.3.0.tar.gz/src/intra.c -> kvazaar-2.3.1.tar.gz/src/intra.c
Changed
@@ -588,7 +588,7 @@ kvz_intra_references refs; kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs); - kvz_pixel pred[32 * 32]; + ALIGNED(32) kvz_pixel pred[TR_MAX_WIDTH * TR_MAX_WIDTH]; const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); kvz_intra_predict(&refs, log2width, intra_mode, color, pred, filter_boundary);
View file
kvazaar-2.3.0.tar.gz/src/kvazaar.pc.in -> kvazaar-2.3.1.tar.gz/src/kvazaar.pc.in
Changed
@@ -1,12 +1,11 @@ -prefix=@CMAKE_INSTALL_PREFIX@ +prefix=@prefix@ exec_prefix=${prefix} -libdir=${prefix}/lib +libdir=@libdir@ incdir=${prefix}/include Name: libkvazaar -Description: @CMAKE_PROJECT_DESCRIPTION@ -URL: @CMAKE_PROJECT_HOMEPAGE_URL@ -Version: @PROJECT_VERSION@ +Description: Open-source HEVC encoder +Version: @VERSION@ Libs: -L${libdir} -lkvazaar Libs.private: @LIBS@ -Cflags: -I${incdir} +Cflags: -I${incdir} \ No newline at end of file
View file
kvazaar-2.3.1.tar.gz/src/kvazaarCMake.pc.in
Added
@@ -0,0 +1,12 @@ +prefix=@DEFERRED@CMAKE_INSTALL_PREFIX@DEFERRED@ +exec_prefix=${prefix} +libdir=@KVAZAAR_PC_LIBDIR@ +incdir=@KVAZAAR_PC_INCDIR@ + +Name: libkvazaar +Description: @CMAKE_PROJECT_DESCRIPTION@ +URL: @CMAKE_PROJECT_HOMEPAGE_URL@ +Version: @PROJECT_VERSION@ +Libs: -L${libdir} -lkvazaar +Libs.private: @LIBS@ +Cflags: -I${incdir}
View file
kvazaar-2.3.0.tar.gz/src/strategies/avx2/intra-avx2.c -> kvazaar-2.3.1.tar.gz/src/strategies/avx2/intra-avx2.c
Changed
@@ -31,7 +31,6 @@ ****************************************************************************/ #include "strategies/avx2/intra-avx2.h" - #if COMPILE_INTEL_AVX2 && defined X86_64 #include "kvazaar.h" #if KVZ_BIT_DEPTH == 8 @@ -43,339 +42,771 @@ #include "strategies/missing-intel-intrinsics.h" - /** - * \brief Linear interpolation for 4 pixels. Returns 4 filtered pixels in lowest 32-bits of the register. - * \param ref_main Reference pixels - * \param delta_pos Fractional pixel precise position of sample displacement - * \param x Sample offset in direction x in ref_main array - */ -static INLINE __m128i filter_4x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){ - - int8_t delta_int = delta_pos >> 5; - int8_t delta_fract = delta_pos & (32-1); - __m128i sample0 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_mainx + delta_int)); - __m128i sample1 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_mainx + delta_int + 1)); - __m128i pairs = _mm_unpacklo_epi8(sample0, sample1); - __m128i weight = _mm_set1_epi16( (delta_fract << 8) | (32 - delta_fract) ); - sample0 = _mm_maddubs_epi16(pairs, weight); - sample0 = _mm_add_epi16(sample0, _mm_set1_epi16(16)); - sample0 = _mm_srli_epi16(sample0, 5); - sample0 = _mm_packus_epi16(sample0, sample0); - - return sample0; +static const int16_t delta_int_table = { +1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, +0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, +0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21, +0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17, +0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, +0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, +0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, +-1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, +-1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, +-1, -1, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -13, +-1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, +-1, -2, -2, -3, -4, -4, -5, -6, -6, -7, -8, -8, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -16, -16, -17, -18, -18, -19, -20, -20, -21, -21, +-1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, +-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, +}; + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver = { +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 3 +11, 21, 22, 10, 1, 31, 12, 20, 23, 9, 2, 
30, 13, 19, 24, 8, 3, 29, 14, 18, 25, 7, 4, 28, 15, 17, 26, 6, 5, 27, 16, 16, 27, 5, 6, 26, 17, 15, 28, 4, 7, 25, 18, 14, 29, 3, 8, 24, 19, 13, 30, 2, 9, 23, 20, 12, 31, 1, 10, 22, 21, 11, 32, 0, // Mode 4 +15, 17, 30, 2, 13, 19, 28, 4, 11, 21, 26, 6, 9, 23, 24, 8, 7, 25, 22, 10, 5, 27, 20, 12, 3, 29, 18, 14, 1, 31, 16, 16, 31, 1, 14, 18, 29, 3, 12, 20, 27, 5, 10, 22, 25, 7, 8, 24, 23, 9, 6, 26, 21, 11, 4, 28, 19, 13, 2, 30, 17, 15, 32, 0, // Mode 5 +19, 13, 6, 26, 25, 7, 12, 20, 31, 1, 18, 14, 5, 27, 24, 8, 11, 21, 30, 2, 17, 15, 4, 28, 23, 9, 10, 22, 29, 3, 16, 16, 3, 29, 22, 10, 9, 23, 28, 4, 15, 17, 2, 30, 21, 11, 8, 24, 27, 5, 14, 18, 1, 31, 20, 12, 7, 25, 26, 6, 13, 19, 32, 0, // Mode 6 +23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, 7, 25, 30, 2, 21, 11, 12, 20, 3, 29, 26, 6, 17, 15, 8, 24, 31, 1, 22, 10, 13, 19, 4, 28, 27, 5, 18, 14, 9, 23, 32, 0, // Mode 7 +27, 5, 22, 10, 17, 15, 12, 20, 7, 25, 2, 30, 29, 3, 24, 8, 19, 13, 14, 18, 9, 23, 4, 28, 31, 1, 26, 6, 21, 11, 16, 16, 11, 21, 6, 26, 1, 31, 28, 4, 23, 9, 18, 14, 13, 19, 8, 24, 3, 29, 30, 2, 25, 7, 20, 12, 15, 17, 10, 22, 5, 27, 32, 0, // Mode 8 +30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, // Mode 9 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 10 + 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, // Mode 11 + 5, 27, 10, 22, 15, 17, 20, 12, 25, 7, 30, 2, 3, 29, 8, 24, 13, 
19, 18, 14, 23, 9, 28, 4, 1, 31, 6, 26, 11, 21, 16, 16, 21, 11, 26, 6, 31, 1, 4, 28, 9, 23, 14, 18, 19, 13, 24, 8, 29, 3, 2, 30, 7, 25, 12, 20, 17, 15, 22, 10, 27, 5, 32, 0, // Mode 12 + 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, 25, 7, 2, 30, 11, 21, 20, 12, 29, 3, 6, 26, 15, 17, 24, 8, 1, 31, 10, 22, 19, 13, 28, 4, 5, 27, 14, 18, 23, 9, 32, 0, // Mode 13 +13, 19, 26, 6, 7, 25, 20, 12, 1, 31, 14, 18, 27, 5, 8, 24, 21, 11, 2, 30, 15, 17, 28, 4, 9, 23, 22, 10, 3, 29, 16, 16, 29, 3, 10, 22, 23, 9, 4, 28, 17, 15, 30, 2, 11, 21, 24, 8, 5, 27, 18, 14, 31, 1, 12, 20, 25, 7, 6, 26, 19, 13, 32, 0, // Mode 14 +17, 15, 2, 30, 19, 13, 4, 28, 21, 11, 6, 26, 23, 9, 8, 24, 25, 7, 10, 22, 27, 5, 12, 20, 29, 3, 14, 18, 31, 1, 16, 16, 1, 31, 18, 14, 3, 29, 20, 12, 5, 27, 22, 10, 7, 25, 24, 8, 9, 23, 26, 6, 11, 21, 28, 4, 13, 19, 30, 2, 15, 17, 32, 0, // Mode 15 +21, 11, 10, 22, 31, 1, 20, 12, 9, 23, 30, 2, 19, 13, 8, 24, 29, 3, 18, 14, 7, 25, 28, 4, 17, 15, 6, 26, 27, 5, 16, 16, 5, 27, 26, 6, 15, 17, 4, 28, 25, 7, 14, 18, 3, 29, 24, 8, 13, 19, 2, 30, 23, 9, 12, 20, 1, 31, 22, 10, 11, 21, 32, 0, // Mode 16 +26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 17 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 +}; + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_hor = { +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, // Mode 3 +11, 21, 22, 10, 1, 31, 12, 20, 23, 9, 2, 30, 13, 19, 24, 8, // Mode 4 +15, 17, 30, 2, 13, 19, 28, 
4, 11, 21, 26, 6, 9, 23, 24, 8, // Mode 5 +19, 13, 6, 26, 25, 7, 12, 20, 31, 1, 18, 14, 5, 27, 24, 8, // Mode 6 +23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, // Mode 7 +27, 5, 22, 10, 17, 15, 12, 20, 7, 25, 2, 30, 29, 3, 24, 8, // Mode 8 +30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, // Mode 9 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 10 + 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, // Mode 11 + 5, 27, 10, 22, 15, 17, 20, 12, 25, 7, 30, 2, 3, 29, 8, 24, // Mode 12 + 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, // Mode 13 +13, 19, 26, 6, 7, 25, 20, 12, 1, 31, 14, 18, 27, 5, 8, 24, // Mode 14 +17, 15, 2, 30, 19, 13, 4, 28, 21, 11, 6, 26, 23, 9, 8, 24, // Mode 15 +21, 11, 10, 22, 31, 1, 20, 12, 9, 23, 30, 2, 19, 13, 8, 24, // Mode 16 +26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, // Mode 17 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 +}; + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w8_hor = { +0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 , // Mode 2 +0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, +0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07 , // Mode 3 +0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, +0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06 , // Mode 4 +0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, +0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05 , // Mode 5 +0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, +0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04 , // Mode 6 +0x01, 
0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03 , // Mode 7 +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02 , // Mode 8 +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01 , // Mode 9 +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01 , // Mode 10 +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01 , // Mode 11 +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01 , // Mode 12 +0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, +0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01 , // Mode 13 +0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, +0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01 , // Mode 14 +0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, +0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01 , // Mode 15 +0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, +0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 
0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01 , // Mode 16 +0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, +0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01 , // Mode 17 +0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, +0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01 , // Mode 18 +0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, +}; + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor = { + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 3 + 11, 21, 22, 10, 1, 31, 12, 20, 23, 9, 2, 30, 13, 19, 24, 8, 3, 29, 14, 18, 25, 7, 4, 28, 15, 17, 26, 6, 5, 27, 16, 16, // Mode 4 + 15, 17, 30, 2, 13, 19, 28, 4, 11, 21, 26, 6, 9, 23, 24, 8, 7, 25, 22, 10, 5, 27, 20, 12, 3, 29, 18, 14, 1, 31, 16, 16, // Mode 5 + 19, 13, 6, 26, 25, 7, 12, 20, 31, 1, 18, 14, 5, 27, 24, 8, 11, 21, 30, 2, 17, 15, 4, 28, 23, 9, 10, 22, 29, 3, 16, 16, // Mode 6 + 23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, // Mode 7 + 27, 5, 22, 10, 17, 15, 12, 20, 7, 25, 2, 30, 29, 3, 24, 8, 19, 13, 14, 18, 9, 23, 4, 28, 31, 1, 26, 6, 21, 11, 16, 16, // Mode 8 + 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, // Mode 9 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 10 + 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, // Mode 11 + 5, 27, 10, 22, 15, 17, 20, 12, 25, 7, 30, 
2, 3, 29, 8, 24, 13, 19, 18, 14, 23, 9, 28, 4, 1, 31, 6, 26, 11, 21, 16, 16, // Mode 12 + 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, // Mode 13 + 13, 19, 26, 6, 7, 25, 20, 12, 1, 31, 14, 18, 27, 5, 8, 24, 21, 11, 2, 30, 15, 17, 28, 4, 9, 23, 22, 10, 3, 29, 16, 16, // Mode 14 + 17, 15, 2, 30, 19, 13, 4, 28, 21, 11, 6, 26, 23, 9, 8, 24, 25, 7, 10, 22, 27, 5, 12, 20, 29, 3, 14, 18, 31, 1, 16, 16, // Mode 15 + 21, 11, 10, 22, 31, 1, 20, 12, 9, 23, 30, 2, 19, 13, 8, 24, 29, 3, 18, 14, 7, 25, 28, 4, 17, 15, 6, 26, 27, 5, 16, 16, // Mode 16 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 17 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + }; + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w16_hor = { +0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // Mode 2 +0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, +0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, +0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, +0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 3 +0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, +0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, +0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, +0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // Mode 4 +0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, +0x01, 0x02, 0x02, 0x03, 
0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, +0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, +0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 5 +0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, +0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, +0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, +0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, // Mode 6 +0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, +0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, +0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 7 +0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, +0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, // Mode 8 +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 9 +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, +0x01, 0x02, 0x01, 0x02, 0x01, 
0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 10 +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 11 +0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 12 +0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, +0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, +0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, +0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 13 +0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, +0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, +0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, +0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 14 +0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, +0x04, 0x05, 0x04, 0x05, 0x03, 
0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, +0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, +0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 15 +0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, +0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, +0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, +0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 16 +0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, +0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, +0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, +0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 17 +0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, +0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, +0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, +0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 18 +0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, +0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, +0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, }; + + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w32_hor = { +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 
0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 3 +11, 21, 22, 10, 1, 31, 12, 20, 23, 9, 2, 30, 13, 19, 24, 8, 3, 29, 14, 18, 25, 7, 4, 28, 15, 17, 26, 6, 5, 27, 16, 16, 27, 5, 6, 26, 17, 15, 28, 4, 7, 25, 18, 14, 29, 3, 8, 24, 19, 13, 30, 2, 9, 23, 20, 12, 31, 1, 10, 22, 21, 11, 32, 0, // Mode 4 +15, 17, 30, 2, 13, 19, 28, 4, 11, 21, 26, 6, 9, 23, 24, 8, 7, 25, 22, 10, 5, 27, 20, 12, 3, 29, 18, 14, 1, 31, 16, 16, 31, 1, 14, 18, 29, 3, 12, 20, 27, 5, 10, 22, 25, 7, 8, 24, 23, 9, 6, 26, 21, 11, 4, 28, 19, 13, 2, 30, 17, 15, 32, 0, // Mode 5 +19, 13, 6, 26, 25, 7, 12, 20, 31, 1, 18, 14, 5, 27, 24, 8, 11, 21, 30, 2, 17, 15, 4, 28, 23, 9, 10, 22, 29, 3, 16, 16, 3, 29, 22, 10, 9, 23, 28, 4, 15, 17, 2, 30, 21, 11, 8, 24, 27, 5, 14, 18, 1, 31, 20, 12, 7, 25, 26, 6, 13, 19, 32, 0, // Mode 6 +23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, 7, 25, 30, 2, 21, 11, 12, 20, 3, 29, 26, 6, 17, 15, 8, 24, 31, 1, 22, 10, 13, 19, 4, 28, 27, 5, 18, 14, 9, 23, 32, 0, // Mode 7 +27, 5, 22, 10, 17, 15, 12, 20, 7, 25, 2, 30, 29, 3, 24, 8, 19, 13, 14, 18, 9, 23, 4, 28, 31, 1, 26, 6, 21, 11, 16, 16, 11, 21, 6, 26, 1, 31, 28, 4, 23, 9, 18, 14, 13, 19, 8, 24, 3, 29, 30, 2, 25, 7, 20, 12, 15, 17, 10, 22, 5, 27, 32, 0, // Mode 8 +30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, // Mode 9 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 10 + 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, // Mode 11 + 5, 27, 10, 22, 15, 17, 20, 12, 25, 7, 30, 2, 3, 29, 8, 24, 13, 19, 18, 14, 23, 9, 28, 4, 1, 31, 6, 26, 11, 21, 16, 16, 21, 11, 26, 6, 31, 1, 4, 28, 9, 23, 14, 18, 19, 13, 24, 8, 29, 3, 2, 30, 7, 25, 12, 20, 17, 15, 22, 10, 27, 5, 32, 0, // Mode 12 + 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, 25, 7, 2, 30, 11, 21, 20, 12, 29, 3, 6, 26, 15, 17, 24, 8, 1, 31, 10, 22, 19, 13, 28, 4, 5, 27, 14, 18, 23, 9, 32, 0, // Mode 13 +13, 19, 26, 6, 7, 25, 20, 12, 1, 31, 14, 18, 27, 5, 8, 24, 21, 11, 2, 30, 15, 17, 28, 4, 9, 23, 22, 10, 3, 29, 16, 16, 29, 3, 10, 22, 23, 9, 4, 28, 17, 15, 30, 2, 11, 21, 24, 8, 5, 27, 18, 14, 31, 1, 12, 20, 25, 7, 6, 26, 19, 13, 32, 0, // Mode 14 +17, 15, 2, 30, 19, 13, 4, 28, 21, 11, 6, 26, 23, 9, 8, 24, 25, 7, 10, 22, 27, 5, 12, 20, 29, 3, 14, 18, 31, 1, 16, 16, 1, 31, 18, 14, 3, 29, 20, 12, 5, 27, 22, 10, 7, 25, 24, 8, 9, 23, 26, 6, 11, 21, 28, 4, 13, 19, 30, 2, 15, 17, 32, 0, // Mode 15 +21, 11, 10, 22, 31, 1, 20, 12, 9, 23, 30, 2, 19, 13, 8, 24, 29, 3, 18, 14, 7, 25, 28, 4, 17, 15, 6, 26, 27, 5, 16, 16, 5, 27, 26, 6, 15, 17, 4, 28, 25, 7, 14, 18, 3, 29, 24, 8, 13, 19, 2, 30, 23, 9, 12, 20, 1, 31, 22, 10, 11, 21, 32, 0, // Mode 16 +26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 17 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 +}; + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w32_hor = { + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, // Mode 2 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, // Mode 3 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, // Mode 4 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, // Mode 5 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, // Mode 6 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 
0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // Mode 7 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, // Mode 8 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, // Mode 9 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 10 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 11 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 
0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 12 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 13 + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 14 + 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 15 + 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 16 + 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x07, 
0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 17 + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 18 + 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, +}; + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_ver = { + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 2 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 3 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 4 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 5 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 6 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x01, 0x02, 
0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 7 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 8 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 9 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 10 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 11 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 12 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, // Mode 13 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, // Mode 14 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // Mode 15 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x02, 0x03, 
0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // Mode 16 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 17 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 18 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, +}; + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver = { + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 3 + 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, + 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, // Mode 4 + 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, + 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, // Mode 5 + 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4, + 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 6 + 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, + 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, // Mode 7 + 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, + 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, // Mode 8 + 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, + 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 9 + 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 10 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 11 + 6, 26, 6, 26, 6, 26, 6, 26, 8, 
24, 8, 24, 8, 24, 8, 24, + 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, // Mode 12 + 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, + 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, // Mode 13 + 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, + 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 14 + 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, + 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, // Mode 15 + 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28, + 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, // Mode 16 + 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12, + 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 17 + 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, +}; + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor = { + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 6, 26, 12, 20, 18, 14, 24, 8, 6, 26, 12, 20, 18, 14, 24, 8, // Mode 3 + 11, 21, 22, 10, 1, 31, 12, 20, 11, 21, 22, 10, 1, 31, 12, 20, // Mode 4 + 15, 17, 30, 2, 13, 19, 28, 4, 15, 17, 30, 2, 13, 19, 28, 4, // Mode 5 + 19, 13, 6, 26, 25, 7, 12, 20, 19, 13, 6, 26, 25, 7, 12, 20, // Mode 6 + 23, 9, 14, 18, 5, 27, 28, 4, 23, 9, 14, 18, 5, 27, 28, 4, // Mode 7 + 27, 5, 22, 10, 17, 15, 12, 20, 27, 5, 22, 10, 17, 15, 12, 20, // Mode 8 + 30, 2, 28, 4, 26, 6, 24, 8, 30, 2, 28, 4, 26, 6, 24, 8, // Mode 9 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 10 + 2, 30, 4, 28, 6, 26, 8, 24, 2, 30, 4, 28, 6, 26, 8, 24, // Mode 11 + 5, 27, 10, 22, 15, 17, 20, 12, 5, 27, 10, 22, 15, 17, 20, 12, // Mode 12 + 9, 23, 18, 14, 27, 5, 4, 28, 9, 23, 18, 14, 27, 5, 4, 28, // Mode 13 + 13, 19, 26, 6, 7, 25, 20, 12, 13, 19, 26, 6, 7, 25, 20, 12, // Mode 14 + 17, 15, 2, 
30, 19, 13, 4, 28, 17, 15, 2, 30, 19, 13, 4, 28, // Mode 15 + 21, 11, 10, 22, 31, 1, 20, 12, 21, 11, 10, 22, 31, 1, 20, 12, // Mode 16 + 26, 6, 20, 12, 14, 18, 8, 24, 26, 6, 20, 12, 14, 18, 8, 24, // Mode 17 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 +}; + +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_hor = { + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 2 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 3 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 4 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, + 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 5 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, + 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, // Mode 6 + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, + 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, + 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, // Mode 7 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 8 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 9 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 
0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 10 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 11 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 12 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 13 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, + 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 14 + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, + 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 15 + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, + 0x0b, 0x0c, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 16 + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, + 0x0b, 0x0c, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 17 + 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 18 + 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, +}; + + +// Linear interpolation 
filter for width 4 has a different call, since it uses premade tables for coefficients +static INLINE void angular_pred_avx2_linear_filter_w4_ver(kvz_pixel* dst, const kvz_pixel* const ref, const int16_t* delta_int, const int32_t pred_mode) +{ + const int16_t* dint = delta_int; + const __m128i v16s = _mm_set1_epi16(16); + + const int mode_idx = (pred_mode <= 18 ? (pred_mode - 2) : (34 - pred_mode)); + const int table_offset = mode_idx * 32; + + // Load refs from smallest index onwards, shuffle will handle the rest. The smallest index will be at one of these delta int table indices + const int16_t min_offset = MIN(dint0, dint3); + dint += 4; + // Load enough reff samples to cover four 4 width lines. Shuffles will put the samples in correct places. + const __m128i vsrc_raw = _mm_loadu_si128((const __m128i*) & refmin_offset); + + const __m128i vcoeff0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w4_vertable_offset); + const __m128i vcoeff1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w4_vertable_offset + 16); + + const __m128i vshuf0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w4_vertable_offset + 0); + const __m128i vshuf1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w4_vertable_offset + 16); + + __m128i vsrc0 = _mm_shuffle_epi8(vsrc_raw, vshuf0); + __m128i vsrc1 = _mm_shuffle_epi8(vsrc_raw, vshuf1); + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); } - /** - * \brief Linear interpolation for 4x4 block. Writes filtered 4x4 block to dst. 
- * \param dst Destination buffer - * \param ref_main Reference pixels - * \param sample_disp Sample displacement per row - * \param vertical_mode Mode direction, true if vertical - */ -static void filter_4x4_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){ - - __m128i row0 = filter_4x1_avx2(ref_main, 1 * sample_disp, 0); - __m128i row1 = filter_4x1_avx2(ref_main, 2 * sample_disp, 0); - __m128i row2 = filter_4x1_avx2(ref_main, 3 * sample_disp, 0); - __m128i row3 = filter_4x1_avx2(ref_main, 4 * sample_disp, 0); - - //Transpose if horizontal mode - if (!vertical_mode) { - __m128i temp = _mm_unpacklo_epi16(_mm_unpacklo_epi8(row0, row1), _mm_unpacklo_epi8(row2, row3)); - row0 = _mm_cvtsi32_si128(_mm_extract_epi32(temp, 0)); - row1 = _mm_cvtsi32_si128(_mm_extract_epi32(temp, 1)); - row2 = _mm_cvtsi32_si128(_mm_extract_epi32(temp, 2)); - row3 = _mm_cvtsi32_si128(_mm_extract_epi32(temp, 3)); + +static INLINE void angular_pred_avx2_linear_filter_w8_ver(kvz_pixel* dst, const kvz_pixel* const ref, const int16_t* delta_int, const int pred_mode) +{ + const int height = 8; + const __m128i v16s = _mm_set1_epi16(16); + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 + ); + + const int mode_idx = (pred_mode <= 18 ? 
(pred_mode - 2) : (34 - pred_mode)); + const int coeff_table_offset = mode_idx * 64; + + // Height has to be at least 2, handle 2 lines at once + for (int y = 0; y < height; y += 2) { + const int16_t* coeff_tmp0 = (const int16_t*)&intra_chroma_linear_interpolation_weights_w8_vercoeff_table_offset + (y << 1) + 0; + const int16_t* coeff_tmp1 = (const int16_t*)&intra_chroma_linear_interpolation_weights_w8_vercoeff_table_offset + (y << 1) + 2; + + __m128i vsrc0 = _mm_loadu_si128((const __m128i*) & refdelta_inty + 0); + __m128i vsrc1 = _mm_loadu_si128((const __m128i*) & refdelta_inty + 1); + + vsrc0 = _mm_shuffle_epi8(vsrc0, vshuf); + vsrc1 = _mm_shuffle_epi8(vsrc1, vshuf); + + const __m128i vcoeff0 = _mm_set1_epi16(*coeff_tmp0); + const __m128i vcoeff1 = _mm_set1_epi16(*coeff_tmp1); + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + +static INLINE void angular_pred_avx2_linear_filter_w16_ver(kvz_pixel* dst, const kvz_pixel* const ref, const int16_t* delta_int, const int pred_mode) +{ + const int height = 16; + const __m128i v16s = _mm_set1_epi16(16); + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 + ); + + const int mode_idx = (pred_mode <= 18 ? 
(pred_mode - 2) : (34 - pred_mode)); + const int coeff_table_offset = mode_idx * 64; + + // Handle 1 line at a time + for (int y = 0; y < height; ++y) { + const int16_t* coeff_tmp = (const int16_t*)&intra_chroma_linear_interpolation_weights_w8_vercoeff_table_offset + (y << 1); + __m128i vcoeff = _mm_set1_epi16(*coeff_tmp); + + __m128i vsrc0 = _mm_loadu_si128((const __m128i*) & refdelta_inty + 0); + __m128i vsrc1 = _mm_loadu_si128((const __m128i*) & refdelta_inty + 8); + + vsrc0 = _mm_shuffle_epi8(vsrc0, vshuf); + vsrc1 = _mm_shuffle_epi8(vsrc1, vshuf); + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; } +} - *(int32_t*)(dst + 0 * 4) = _mm_cvtsi128_si32(row0); - *(int32_t*)(dst + 1 * 4) = _mm_cvtsi128_si32(row1); - *(int32_t*)(dst + 2 * 4) = _mm_cvtsi128_si32(row2); - *(int32_t*)(dst + 3 * 4) = _mm_cvtsi128_si32(row3); +NO_ASAN +static INLINE void angular_pred_avx2_linear_filter_w32_ver(kvz_pixel* dst, const kvz_pixel* const ref, const int16_t* delta_int, const int pred_mode) +{ + const int height = 32; + const __m256i v16s = _mm256_set1_epi16(16); + const __m256i vshuf = _mm256_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 + ); + + const int mode_idx = (pred_mode <= 18 ? 
(pred_mode - 2) : (34 - pred_mode)); + const int coeff_table_offset = mode_idx * 64; + + // Handle 1 line at a time + for (int y = 0; y < height; ++y) { + const int16_t* coeff_tmp = (const int16_t*)&intra_chroma_linear_interpolation_weights_w8_vercoeff_table_offset + (y << 1); + __m256i vcoeff = _mm256_set1_epi16(*coeff_tmp); + + ALIGNED(32) __m128i vsrc4; + vsrc0 = _mm_loadu_si128((const __m128i*) & refdelta_inty + 0 ); + vsrc1 = _mm_loadu_si128((const __m128i*) & refdelta_inty + 16); // Flip these two middle sources. They will be later flipped back into place by packus + vsrc2 = _mm_loadu_si128((const __m128i*) & refdelta_inty + 8 ); + vsrc3 = _mm_loadu_si128((const __m128i*) & refdelta_inty + 24); + + __m256i* vsrc256 = (__m256i*)vsrc; + vsrc2560 = _mm256_shuffle_epi8(vsrc2560, vshuf); + vsrc2561 = _mm256_shuffle_epi8(vsrc2561, vshuf); + + __m256i res0 = _mm256_maddubs_epi16(vsrc2560, vcoeff); + __m256i res1 = _mm256_maddubs_epi16(vsrc2561, vcoeff); + res0 = _mm256_add_epi16(res0, v16s); + res1 = _mm256_add_epi16(res1, v16s); + res0 = _mm256_srai_epi16(res0, 5); + res1 = _mm256_srai_epi16(res1, 5); + + _mm256_store_si256((__m256i*)dst, _mm256_packus_epi16(res0, res1)); + dst += 32; + } } - /** - * \brief Linear interpolation for 8 pixels. Returns 8 filtered pixels in lower 64-bits of the register. 
- * \param ref_main Reference pixels - * \param delta_pos Fractional pixel precise position of sample displacement - * \param x Sample offset in direction x in ref_main array - */ -static INLINE __m128i filter_8x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){ - int8_t delta_int = delta_pos >> 5; - int8_t delta_fract = delta_pos & (32-1); - __m128i sample0 = _mm_cvtsi64_si128(*(uint64_t*)&(ref_mainx + delta_int)); - __m128i sample1 = _mm_cvtsi64_si128(*(uint64_t*)&(ref_mainx + delta_int + 1)); - __m128i pairs_lo = _mm_unpacklo_epi8(sample0, sample1); +static INLINE void angular_pred_avx2_linear_filter_w4_hor(kvz_pixel* dst, const kvz_pixel* const ref, const int mode, const int16_t* delta_int) +{ + const int16_t* dint = delta_int; + const __m128i v16s = _mm_set1_epi16(16); - __m128i weight = _mm_set1_epi16( (delta_fract << 8) | (32 - delta_fract) ); - __m128i v_temp_lo = _mm_maddubs_epi16(pairs_lo, weight); - v_temp_lo = _mm_add_epi16(v_temp_lo, _mm_set1_epi16(16)); - v_temp_lo = _mm_srli_epi16(v_temp_lo, 5); - sample0 = _mm_packus_epi16(v_temp_lo, v_temp_lo); + const int16_t weigth_offset = (mode - 2) * 16; + const int16_t shuf_offset = (mode - 2) * 32; - return sample0; + __m128i vcoeff = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w4_horweigth_offset); + __m128i vshuf0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w4_horshuf_offset + 0); + __m128i vshuf1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w4_horshuf_offset + 16); + + // Load refs from smallest index onwards, shuffle will handle the rest. 
The smallest index will be at one of these delta int table indices + const int16_t min_offset = MIN(dint0, dint3); + + + // Prepare sources + __m128i vidx = _mm_set_epi64x((long long int)(min_offset + 2), (long long int)(min_offset + 0)); + __m128i vsrc_tmp = _mm_i64gather_epi64((const long long*)ref, vidx, 1); + __m128i vsrc0 = _mm_shuffle_epi8(vsrc_tmp, vshuf0); + __m128i vsrc1 = _mm_shuffle_epi8(vsrc_tmp, vshuf1); + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } - /** - * \brief Linear interpolation for 8x8 block. Writes filtered 8x8 block to dst. - * \param dst Destination buffer - * \param ref_main Reference pixels - * \param sample_disp Sample displacement per row - * \param vertical_mode Mode direction, true if vertical - */ -static void filter_8x8_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){ - __m128i row0 = filter_8x1_avx2(ref_main, 1 * sample_disp, 0); - __m128i row1 = filter_8x1_avx2(ref_main, 2 * sample_disp, 0); - __m128i row2 = filter_8x1_avx2(ref_main, 3 * sample_disp, 0); - __m128i row3 = filter_8x1_avx2(ref_main, 4 * sample_disp, 0); - __m128i row4 = filter_8x1_avx2(ref_main, 5 * sample_disp, 0); - __m128i row5 = filter_8x1_avx2(ref_main, 6 * sample_disp, 0); - __m128i row6 = filter_8x1_avx2(ref_main, 7 * sample_disp, 0); - __m128i row7 = filter_8x1_avx2(ref_main, 8 * sample_disp, 0); - - //Transpose if horizontal mode - if (!vertical_mode) { - __m128i q0 = _mm_unpacklo_epi8(row0, row1); - __m128i q1 = _mm_unpacklo_epi8(row2, row3); - __m128i q2 = _mm_unpacklo_epi8(row4, row5); - __m128i q3 = _mm_unpacklo_epi8(row6, row7); - - __m128i h0 = _mm_unpacklo_epi16(q0, q1); - __m128i h1 = _mm_unpacklo_epi16(q2, q3); - __m128i h2 = 
_mm_unpackhi_epi16(q0, q1); - __m128i h3 = _mm_unpackhi_epi16(q2, q3); - - __m128i temp0 = _mm_unpacklo_epi32(h0, h1); - __m128i temp1 = _mm_unpackhi_epi32(h0, h1); - __m128i temp2 = _mm_unpacklo_epi32(h2, h3); - __m128i temp3 = _mm_unpackhi_epi32(h2, h3); - - row0 = _mm_cvtsi64_si128(_mm_extract_epi64(temp0, 0)); - row1 = _mm_cvtsi64_si128(_mm_extract_epi64(temp0, 1)); - row2 = _mm_cvtsi64_si128(_mm_extract_epi64(temp1, 0)); - row3 = _mm_cvtsi64_si128(_mm_extract_epi64(temp1, 1)); - row4 = _mm_cvtsi64_si128(_mm_extract_epi64(temp2, 0)); - row5 = _mm_cvtsi64_si128(_mm_extract_epi64(temp2, 1)); - row6 = _mm_cvtsi64_si128(_mm_extract_epi64(temp3, 0)); - row7 = _mm_cvtsi64_si128(_mm_extract_epi64(temp3, 1)); - } - - _mm_storel_epi64((__m128i*)(dst + 0 * 8), row0); - _mm_storel_epi64((__m128i*)(dst + 1 * 8), row1); - _mm_storel_epi64((__m128i*)(dst + 2 * 8), row2); - _mm_storel_epi64((__m128i*)(dst + 3 * 8), row3); - _mm_storel_epi64((__m128i*)(dst + 4 * 8), row4); - _mm_storel_epi64((__m128i*)(dst + 5 * 8), row5); - _mm_storel_epi64((__m128i*)(dst + 6 * 8), row6); - _mm_storel_epi64((__m128i*)(dst + 7 * 8), row7); -} - /** - * \brief Linear interpolation for two 16 pixels. Returns 8 filtered pixels in lower 64-bits of both lanes of the YMM register. 
- * \param ref_main Reference pixels - * \param delta_pos Fractional pixel precise position of sample displacement - * \param x Sample offset in direction x in ref_main array - */ -static INLINE __m256i filter_16x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){ - - int8_t delta_int = delta_pos >> 5; - int8_t delta_fract = delta_pos & (32-1); - __m256i sample0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)&(ref_mainx + delta_int))); - sample0 = _mm256_packus_epi16(sample0, sample0); - __m256i sample1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)&(ref_mainx + delta_int + 1))); - sample1 = _mm256_packus_epi16(sample1, sample1); - __m256i pairs_lo = _mm256_unpacklo_epi8(sample0, sample1); - - __m256i weight = _mm256_set1_epi16( (delta_fract << 8) | (32 - delta_fract) ); - __m256i v_temp_lo = _mm256_maddubs_epi16(pairs_lo, weight); - v_temp_lo = _mm256_add_epi16(v_temp_lo, _mm256_set1_epi16(16)); - v_temp_lo = _mm256_srli_epi16(v_temp_lo, 5); - sample0 = _mm256_packus_epi16(v_temp_lo, v_temp_lo); - - return sample0; +static INLINE void angular_pred_avx2_linear_filter_w8_hor(kvz_pixel* dst, const kvz_pixel* const ref, const int mode, const int16_t* delta_int) +{ + const int height = 8; + const int16_t* dint = delta_int; + const __m128i v16s = _mm_set1_epi16(16); + const int16_t weigth_offset = (mode - 2) * 16; + const int16_t shuf_offset = (mode - 2) * 32; + + __m128i vcoeff = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w8_horweigth_offset); + __m128i vshuf0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w8_horshuf_offset + 0); + __m128i vshuf1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w8_horshuf_offset + 16); + + // Load refs from smallest index onwards, shuffle will handle the rest. 
The smallest index will be at one of these delta int table indices + const int16_t min_offset = MIN(dint0, dint7); + + // Height has to be at least 2, handle 2 lines at once + for (int y = 0; y < height; y += 2) { + // Prepare sources + __m128i vsrc_tmp = _mm_loadu_si128((__m128i*) & refmin_offset + y); + const __m128i vsrc0 = _mm_shuffle_epi8(vsrc_tmp, vshuf0); + const __m128i vsrc1 = _mm_shuffle_epi8(vsrc_tmp, vshuf1); + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } } - /** - * \brief Linear interpolation for 16x16 block. Writes filtered 16x16 block to dst. - * \param dst Destination buffer - * \param ref_main Reference pixels - * \param sample_disp Sample displacement per row - * \param vertical_mode Mode direction, true if vertical - */ -static void filter_16x16_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){ - for (int y = 0; y < 16; y += 8) { - __m256i row0 = filter_16x1_avx2(ref_main, (y + 1) * sample_disp, 0); - __m256i row1 = filter_16x1_avx2(ref_main, (y + 2) * sample_disp, 0); - __m256i row2 = filter_16x1_avx2(ref_main, (y + 3) * sample_disp, 0); - __m256i row3 = filter_16x1_avx2(ref_main, (y + 4) * sample_disp, 0); - __m256i row4 = filter_16x1_avx2(ref_main, (y + 5) * sample_disp, 0); - __m256i row5 = filter_16x1_avx2(ref_main, (y + 6) * sample_disp, 0); - __m256i row6 = filter_16x1_avx2(ref_main, (y + 7) * sample_disp, 0); - __m256i row7 = filter_16x1_avx2(ref_main, (y + 8) * sample_disp, 0); - - if (!vertical_mode) { - __m256i q0 = _mm256_unpacklo_epi8(row0, row1); - __m256i q1 = _mm256_unpacklo_epi8(row2, row3); - __m256i q2 = _mm256_unpacklo_epi8(row4, row5); - __m256i q3 = _mm256_unpacklo_epi8(row6, row7); - - __m256i h0 = 
_mm256_unpacklo_epi16(q0, q1); - __m256i h1 = _mm256_unpacklo_epi16(q2, q3); - __m256i h2 = _mm256_unpackhi_epi16(q0, q1); - __m256i h3 = _mm256_unpackhi_epi16(q2, q3); - - __m256i temp0 = _mm256_unpacklo_epi32(h0, h1); - __m256i temp1 = _mm256_unpackhi_epi32(h0, h1); - __m256i temp2 = _mm256_unpacklo_epi32(h2, h3); - __m256i temp3 = _mm256_unpackhi_epi32(h2, h3); - - row0 = _mm256_unpacklo_epi64(temp0, temp0); - row1 = _mm256_unpackhi_epi64(temp0, temp0); - row2 = _mm256_unpacklo_epi64(temp1, temp1); - row3 = _mm256_unpackhi_epi64(temp1, temp1); - row4 = _mm256_unpacklo_epi64(temp2, temp2); - row5 = _mm256_unpackhi_epi64(temp2, temp2); - row6 = _mm256_unpacklo_epi64(temp3, temp3); - row7 = _mm256_unpackhi_epi64(temp3, temp3); - - //x and y must be flipped due to transpose - int rx = y; - int ry = 0; - - *(int64_t*)(dst + (ry + 0) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row0)); - *(int64_t*)(dst + (ry + 1) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row1)); - *(int64_t*)(dst + (ry + 2) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row2)); - *(int64_t*)(dst + (ry + 3) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row3)); - *(int64_t*)(dst + (ry + 4) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row4)); - *(int64_t*)(dst + (ry + 5) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row5)); - *(int64_t*)(dst + (ry + 6) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row6)); - *(int64_t*)(dst + (ry + 7) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row7)); - - *(int64_t*)(dst + (ry + 8) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row0, 1)); - *(int64_t*)(dst + (ry + 9) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row1, 1)); - *(int64_t*)(dst + (ry + 10) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row2, 1)); - *(int64_t*)(dst + (ry + 11) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row3, 1)); - *(int64_t*)(dst + (ry + 12) * 16 + rx) = 
_mm_cvtsi128_si64(_mm256_extracti128_si256(row4, 1)); - *(int64_t*)(dst + (ry + 13) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row5, 1)); - *(int64_t*)(dst + (ry + 14) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row6, 1)); - *(int64_t*)(dst + (ry + 15) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row7, 1)); - } else { - //Set ry for the lower half of the block - int rx = 0; - int ry = y; - - row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0)); - row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1)); - row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0)); - row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1)); - row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0)); - row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1)); - row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0)); - row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1)); - - _mm_storeu_si128((__m128i*)(dst + (ry + 0) * 16 + rx), _mm256_castsi256_si128(row0)); - _mm_storeu_si128((__m128i*)(dst + (ry + 1) * 16 + rx), _mm256_castsi256_si128(row1)); - _mm_storeu_si128((__m128i*)(dst + (ry + 2) * 16 + rx), _mm256_castsi256_si128(row2)); - _mm_storeu_si128((__m128i*)(dst + (ry + 3) * 16 + rx), _mm256_castsi256_si128(row3)); - _mm_storeu_si128((__m128i*)(dst + (ry + 4) * 16 + rx), _mm256_castsi256_si128(row4)); - _mm_storeu_si128((__m128i*)(dst + (ry + 5) * 16 + rx), _mm256_castsi256_si128(row5)); - _mm_storeu_si128((__m128i*)(dst + (ry + 6) * 16 + rx), _mm256_castsi256_si128(row6)); - _mm_storeu_si128((__m128i*)(dst + (ry + 7) * 16 + rx), _mm256_castsi256_si128(row7)); - } +static INLINE void angular_pred_avx2_linear_filter_w16_hor(kvz_pixel* dst, const kvz_pixel* const ref, const int mode, const int16_t* delta_int) +{ + const int height = 16; + const int16_t* dint = delta_int; + const __m128i v16s = _mm_set1_epi16(16); + const int16_t weigth_offset = (mode - 2) * 32; + const int16_t shuf_offset = (mode - 2) * 64; + + 
__m128i vcoeff0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w16_horweigth_offset + 0); + __m128i vcoeff1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w16_horweigth_offset + 16); + __m128i vshuf0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_horshuf_offset + 0); + __m128i vshuf1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_horshuf_offset + 16); + __m128i vshuf2 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_horshuf_offset + 32); + __m128i vshuf3 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_horshuf_offset + 48); + + // Load refs from smallest index onwards, shuffle will handle the rest. The smallest index will be at one of these delta int table indices + const int16_t min_offset0 = MIN(dint0, dint7); + const int16_t min_offset1 = MIN(dint8, dint15); + + // Height has to be at least 2, there is no 16x1 block for chroma. 
+ for (int y = 0; y < height; y += 2) { + // Prepare sources + __m128i vsrc_tmp0 = _mm_loadu_si128((__m128i*) & refmin_offset0 + y); + __m128i vsrc_tmp1 = _mm_loadu_si128((__m128i*) & refmin_offset1 + y); + const __m128i vsrc0 = _mm_shuffle_epi8(vsrc_tmp0, vshuf0); + const __m128i vsrc1 = _mm_shuffle_epi8(vsrc_tmp1, vshuf1); + const __m128i vsrc2 = _mm_shuffle_epi8(vsrc_tmp0, vshuf2); + const __m128i vsrc3 = _mm_shuffle_epi8(vsrc_tmp1, vshuf3); + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); + __m128i res2 = _mm_maddubs_epi16(vsrc2, vcoeff0); + __m128i res3 = _mm_maddubs_epi16(vsrc3, vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res2 = _mm_add_epi16(res2, v16s); + res3 = _mm_add_epi16(res3, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + res2 = _mm_srai_epi16(res2, 5); + res3 = _mm_srai_epi16(res3, 5); + + _mm_store_si128((__m128i*) & dst0, _mm_packus_epi16(res0, res1)); + _mm_store_si128((__m128i*) & dst16, _mm_packus_epi16(res2, res3)); + dst += 32; } } - /** - * \brief Linear interpolation for NxN blocks 16x16 and larger. Writes filtered NxN block to dst. 
- * \param dst Destination buffer - * \param ref_main Reference pixels - * \param sample_disp Sample displacement per row - * \param vertical_mode Mode direction, true if vertical - * \param width Block width - */ -static void filter_NxN_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode, int width){ - for (int y = 0; y < width; y += 8) { - for (int x = 0; x < width; x += 16) { - __m256i row0 = filter_16x1_avx2(ref_main, (y + 1) * sample_disp, x); - __m256i row1 = filter_16x1_avx2(ref_main, (y + 2) * sample_disp, x); - __m256i row2 = filter_16x1_avx2(ref_main, (y + 3) * sample_disp, x); - __m256i row3 = filter_16x1_avx2(ref_main, (y + 4) * sample_disp, x); - __m256i row4 = filter_16x1_avx2(ref_main, (y + 5) * sample_disp, x); - __m256i row5 = filter_16x1_avx2(ref_main, (y + 6) * sample_disp, x); - __m256i row6 = filter_16x1_avx2(ref_main, (y + 7) * sample_disp, x); - __m256i row7 = filter_16x1_avx2(ref_main, (y + 8) * sample_disp, x); - - //Transpose if horizontal mode - if (!vertical_mode) { - __m256i q0 = _mm256_unpacklo_epi8(row0, row1); - __m256i q1 = _mm256_unpacklo_epi8(row2, row3); - __m256i q2 = _mm256_unpacklo_epi8(row4, row5); - __m256i q3 = _mm256_unpacklo_epi8(row6, row7); - - __m256i h0 = _mm256_unpacklo_epi16(q0, q1); - __m256i h1 = _mm256_unpacklo_epi16(q2, q3); - __m256i h2 = _mm256_unpackhi_epi16(q0, q1); - __m256i h3 = _mm256_unpackhi_epi16(q2, q3); - - __m256i temp0 = _mm256_unpacklo_epi32(h0, h1); - __m256i temp1 = _mm256_unpackhi_epi32(h0, h1); - __m256i temp2 = _mm256_unpacklo_epi32(h2, h3); - __m256i temp3 = _mm256_unpackhi_epi32(h2, h3); - - row0 = _mm256_unpacklo_epi64(temp0, temp0); - row1 = _mm256_unpackhi_epi64(temp0, temp0); - row2 = _mm256_unpacklo_epi64(temp1, temp1); - row3 = _mm256_unpackhi_epi64(temp1, temp1); - row4 = _mm256_unpacklo_epi64(temp2, temp2); - row5 = _mm256_unpackhi_epi64(temp2, temp2); - row6 = _mm256_unpacklo_epi64(temp3, temp3); - row7 = _mm256_unpackhi_epi64(temp3, temp3); - - //x and 
y must be flipped due to transpose - int rx = y; - int ry = x; - - *(int64_t*)(dst + (ry + 0) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row0)); - *(int64_t*)(dst + (ry + 1) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row1)); - *(int64_t*)(dst + (ry + 2) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row2)); - *(int64_t*)(dst + (ry + 3) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row3)); - *(int64_t*)(dst + (ry + 4) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row4)); - *(int64_t*)(dst + (ry + 5) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row5)); - *(int64_t*)(dst + (ry + 6) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row6)); - *(int64_t*)(dst + (ry + 7) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row7)); - - *(int64_t*)(dst + (ry + 8) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row0, 1)); - *(int64_t*)(dst + (ry + 9) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row1, 1)); - *(int64_t*)(dst + (ry + 10) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row2, 1)); - *(int64_t*)(dst + (ry + 11) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row3, 1)); - *(int64_t*)(dst + (ry + 12) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row4, 1)); - *(int64_t*)(dst + (ry + 13) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row5, 1)); - *(int64_t*)(dst + (ry + 14) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row6, 1)); - *(int64_t*)(dst + (ry + 15) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row7, 1)); - } else { - - //Move all filtered pixels to the lower lane to reduce memory accesses - row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0)); - row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1)); - row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0)); - row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1)); - row4 = _mm256_permute4x64_epi64(row4, 
_MM_SHUFFLE(3,1,2,0)); - row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1)); - row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0)); - row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1)); - - _mm_storeu_si128((__m128i*)(dst + (y + 0) * width + x), _mm256_castsi256_si128(row0)); - _mm_storeu_si128((__m128i*)(dst + (y + 1) * width + x), _mm256_castsi256_si128(row1)); - _mm_storeu_si128((__m128i*)(dst + (y + 2) * width + x), _mm256_castsi256_si128(row2)); - _mm_storeu_si128((__m128i*)(dst + (y + 3) * width + x), _mm256_castsi256_si128(row3)); - _mm_storeu_si128((__m128i*)(dst + (y + 4) * width + x), _mm256_castsi256_si128(row4)); - _mm_storeu_si128((__m128i*)(dst + (y + 5) * width + x), _mm256_castsi256_si128(row5)); - _mm_storeu_si128((__m128i*)(dst + (y + 6) * width + x), _mm256_castsi256_si128(row6)); - _mm_storeu_si128((__m128i*)(dst + (y + 7) * width + x), _mm256_castsi256_si128(row7)); - } - } + +NO_ASAN +static INLINE void angular_pred_avx2_linear_filter_w32_hor(kvz_pixel* dst, const kvz_pixel* const ref, const int mode, const int16_t* delta_int) +{ + const int height = 32; + const int16_t* dint = delta_int; + const __m128i v16s = _mm_set1_epi16(16); + const int16_t weigth_offset = (mode - 2) * 64; + const int16_t shuf_offset = (mode - 2) * 64; + + __m128i vcoeff0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w32_horweigth_offset + 0); + __m128i vcoeff1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w32_horweigth_offset + 16); + __m128i vcoeff2 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w32_horweigth_offset + 32); + __m128i vcoeff3 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w32_horweigth_offset + 48); + __m128i vshuf0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w32_horshuf_offset + 0); + __m128i vshuf1 = _mm_load_si128((const __m128i*) & 
intra_chroma_linear_interpolation_shuffle_vectors_w32_horshuf_offset + 16); + __m128i vshuf2 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w32_horshuf_offset + 32); + __m128i vshuf3 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w32_horshuf_offset + 48); + + // Load refs from smallest index onwards, shuffle will handle the rest. The smallest index will be at one of these delta int table indices + // Due to width, two loads are needed, and therefore two offsets. Cannot use 256-bit loads due to alignment issues. + const int16_t min_offset0 = MIN(dint0, dint15); + const int16_t min_offset1 = MIN(dint16, dint31); + + // Height has to be at least 2. Due to width, handle 1 line at a time + for (int y = 0; y < height; ++y) { + // Prepare sources + __m128i vsrc_tmp0 = _mm_loadu_si128((__m128i*) & refmin_offset0 + y); + __m128i vsrc_tmp1 = _mm_loadu_si128((__m128i*) & refmin_offset1 + y); + __m128i vsrc0 = _mm_shuffle_epi8(vsrc_tmp0, vshuf0); + __m128i vsrc1 = _mm_shuffle_epi8(vsrc_tmp0, vshuf1); + __m128i vsrc2 = _mm_shuffle_epi8(vsrc_tmp1, vshuf2); + __m128i vsrc3 = _mm_shuffle_epi8(vsrc_tmp1, vshuf3); + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); + __m128i res2 = _mm_maddubs_epi16(vsrc2, vcoeff2); + __m128i res3 = _mm_maddubs_epi16(vsrc3, vcoeff3); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res2 = _mm_add_epi16(res2, v16s); + res3 = _mm_add_epi16(res3, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + res2 = _mm_srai_epi16(res2, 5); + res3 = _mm_srai_epi16(res3, 5); + + _mm_store_si128((__m128i*) & dst0, _mm_packus_epi16(res0, res1)); + _mm_store_si128((__m128i*) & dst16, _mm_packus_epi16(res2, res3)); + dst += 32; } } @@ -451,20 +882,74 @@ } + if(sample_disp == 0) { + if (vertical_mode) { + for (int_fast32_t y = 0; y < width; ++y) { + switch (width) { + case 4: memcpy(&dsty * 
4, &ref_main0, 4 * sizeof(uint8_t)); break; + case 8: memcpy(&dsty * 8, &ref_main0, 8 * sizeof(uint8_t)); break; + case 16: memcpy(&dsty * 16, &ref_main0, 16 * sizeof(uint8_t)); break; + case 32: memcpy(&dsty * 32, &ref_main0, 32 * sizeof(uint8_t)); break; + default: + assert(false && "Intra angular predicion: illegal width.\n"); + break; + } + } + } + else { + for (int y = 0; y < width; ++y) { + switch (width) { + case 4: memset(&dsty * 4, ref_mainy, 4 * sizeof(uint8_t)); break; + case 8: memset(&dsty * 8, ref_mainy, 8 * sizeof(uint8_t)); break; + case 16: memset(&dsty * 16, ref_mainy, 16 * sizeof(uint8_t)); break; + case 32: memset(&dsty * 32, ref_mainy, 32 * sizeof(uint8_t)); break; + default: + assert(false && "Intra angular predicion: illegal width.\n"); + break; + } + } + } + return; + } + + if ((abs(sample_disp) & 0x1f )== 0) { + int table_offset = vertical_mode ? (34 - intra_mode) * 32 : (intra_mode - 2) * 32; + const int16_t* delta_int = &delta_int_tabletable_offset; + for (int y = 0; y < width; ++y) { + uint8_t* dst_row = dst + y * width; + const uint8_t* ref_row = ref_main + delta_inty; + switch (width) { + case 4: memcpy(dst_row, ref_row, 4 * sizeof(uint8_t)); break; + case 8: memcpy(dst_row, ref_row, 8 * sizeof(uint8_t)); break; + case 16: memcpy(dst_row, ref_row, 16 * sizeof(uint8_t)); break; + case 32: memcpy(dst_row, ref_row, 32 * sizeof(uint8_t)); break; + } + } + return; + } + // The mode is not horizontal or vertical, we have to do interpolation. 
- switch (width) { - case 4: - filter_4x4_avx2(dst, ref_main, sample_disp, vertical_mode); - break; - case 8: - filter_8x8_avx2(dst, ref_main, sample_disp, vertical_mode); - break; - case 16: - filter_16x16_avx2(dst, ref_main, sample_disp, vertical_mode); + if (vertical_mode) { + const int16_t* delta_int = &delta_int_table(34 - intra_mode) * 32; + switch (width) { + case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, delta_int, intra_mode); break; + case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, delta_int, intra_mode); break; + case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, delta_int, intra_mode); break; + case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, delta_int, intra_mode); break; + default: break; + } + } + else { + const int16_t* delta_int = &delta_int_table(intra_mode - 2) * 32; + switch (width) { + case 4: angular_pred_avx2_linear_filter_w4_hor(dst, ref_main, intra_mode, delta_int); break; + case 8: angular_pred_avx2_linear_filter_w8_hor(dst, ref_main, intra_mode, delta_int); break; + case 16: angular_pred_avx2_linear_filter_w16_hor(dst, ref_main, intra_mode, delta_int); break; + case 32: angular_pred_avx2_linear_filter_w32_hor(dst, ref_main, intra_mode, delta_int); break; default: - filter_NxN_avx2(dst, ref_main, sample_disp, vertical_mode, width); break; + } } }
View file
kvazaar-2.3.0.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-2.3.1.tar.gz/src/strategies/avx2/quant-avx2.c
Changed
@@ -40,6 +40,8 @@ #include <immintrin.h> #include <stdlib.h> +#include "strategies/missing-intel-intrinsics.h" + #include "avx2_common_functions.h" #include "cu.h" #include "encoder.h"
View file
kvazaar-2.3.0.tar.gz/src/strategies/generic/nal-generic.c -> kvazaar-2.3.1.tar.gz/src/strategies/generic/nal-generic.c
Changed
@@ -157,6 +157,13 @@ assert(SEI_HASH_MAX_LENGTH >= 4); for (y = 0; y < height; ++y) { + if (y*stride % 8 != 0) { + for (x = 0; x < width; ++x) { + uint8_t mask = (uint8_t)((x & 0xff) ^ (y & 0xff) ^ (x >> 8) ^ (y >> 8)); + checksum += (data(y * stride) + x & 0xff) ^ mask; + } + continue; + } for (xp = 0; xp < width/8; ++xp) { const int x = xp * 8; const uint64_t mask = ckmap(xp&31)+32*(y&255) ^ ((uint64_t)((x >> 8) ^ (y >> 8)) * 0x101010101010101);
View file
kvazaar-2.3.0.tar.gz/src/strategies/missing-intel-intrinsics.h -> kvazaar-2.3.1.tar.gz/src/strategies/missing-intel-intrinsics.h
Changed
@@ -30,6 +30,16 @@ #ifndef _mm256_extract_epi32 #define _mm256_extract_epi32(a, index) (_mm_extract_epi32(_mm256_extracti128_si256((a), (index) >> 2), (index) & 3)) #endif + + #ifndef _mm256_storeu2_m128i + #define _mm256_storeu2_m128i(/* __m128i* */ hiaddr, /* __m128i* */ loaddr, /* __m256i */ a) \ + do { __m256i _a = (a); \ + _mm_storeu_si128((__m128i*)(loaddr), _mm256_castsi256_si128(_a)); \ + _mm_storeu_si128((__m128i*)(hiaddr), _mm256_extractf128_si256(_a, 0x1)); \ + } while (0) + #endif + #endif #endif +
View file
kvazaar-2.3.0.tar.gz/tests/test_external_symbols.sh -> kvazaar-2.3.1.tar.gz/tests/test_external_symbols.sh
Changed
@@ -4,7 +4,7 @@ set -eu${BASH+o pipefail} -if nm -go --defined-only ../src/.libs/libkvazaar.a | grep -v ' kvz_'; then +if nm -go --defined-only ../src/.libs/libkvazaar.a | grep -Ev ' (kvz_|__a-z0-9+(_|\.)get_pc_thunk\.)'; then printf '%s\n' 'Only symbols prefixed with "kvz_" should be exported from libkvazaar.' false fi
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.